1 /*
2 * Copyright (c) 1995-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/xattr.h>
98 #include <sys/fcntl.h>
99 #include <sys/fsctl.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/disk.h>
102 #include <sys/content_protection.h>
103 #include <sys/clonefile.h>
104 #include <sys/snapshot.h>
105 #include <sys/priv.h>
106 #include <machine/cons.h>
107 #include <machine/limits.h>
108 #include <miscfs/specfs/specdev.h>
109
110 #include <security/audit/audit.h>
111 #include <bsm/audit_kevents.h>
112
113 #include <mach/mach_types.h>
114 #include <kern/kern_types.h>
115 #include <kern/kalloc.h>
116 #include <kern/task.h>
117
118 #include <vm/vm_pageout.h>
119 #include <vm/vm_protos.h>
120
121 #include <libkern/OSAtomic.h>
122 #include <pexpert/pexpert.h>
123 #include <IOKit/IOBSD.h>
124
125 #if ROUTEFS
126 #include <miscfs/routefs/routefs.h>
127 #endif /* ROUTEFS */
128
129 #if CONFIG_MACF
130 #include <security/mac.h>
131 #include <security/mac_framework.h>
132 #endif
133
134 #if CONFIG_FSE
135 #define GET_PATH(x) \
136 (x) = get_pathbuff();
137 #define RELEASE_PATH(x) \
138 release_pathbuff(x);
139 #else
140 #define GET_PATH(x) \
141 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
142 #define RELEASE_PATH(x) \
143 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
144 #endif /* CONFIG_FSE */
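
/*
 * GET_PATH/RELEASE_PATH manage the MAXPATHLEN-sized scratch path buffers
 * used in this file: with CONFIG_FSE they come from the path-buffer pool
 * (get_pathbuff/release_pathbuff); otherwise they fall back to plain
 * MALLOC_ZONE/FREE_ZONE allocations from the M_NAMEI zone.
 */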
145
146 /* struct for checkdirs iteration */
147 struct cdirargs {
148 vnode_t olddp;
149 vnode_t newdp;
150 };
151 /* callback for checkdirs iteration */
152 static int checkdirs_callback(proc_t p, void * arg);
153
154 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
155 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
156 void enablequotas(struct mount *mp, vfs_context_t ctx);
157 static int getfsstat_callback(mount_t mp, void * arg);
158 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
159 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
160 static int sync_callback(mount_t, void *);
161 static void sync_thread(void *, __unused wait_result_t);
162 static int sync_async(int);
163 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
164 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
165 boolean_t partial_copy);
166 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
167 user_addr_t bufp);
168 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
169 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
170 struct componentname *cnp, user_addr_t fsmountargs,
171 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
172 vfs_context_t ctx);
173 void vfs_notify_mount(vnode_t pdvp);
174
175 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
176
177 struct fd_vn_data * fg_vn_data_alloc(void);
178
179 /*
180 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
181 * Concurrent lookups (or lookups by ids) on hard links can cause the
182 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
183 * does) to return ENOENT as the path cannot be returned from the name cache
184 * alone. We have no option but to retry and hope to get one namei->reverse path
185 * generation done without an intervening lookup (or lookup by id) on the
186 * hard link item. This is only an issue for MAC hooks that cannot re-enter
187 * the filesystem, which currently are the hooks for rename, unlink and rmdir.
188 */
189 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
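
/*
 * Illustrative sketch (simplified, not part of this excerpt; the real code
 * re-drives the full lookup): callers of the MAC-hook authorization retry
 * along these lines:
 *
 *	retry_count = 0;
 * continue_lookup:
 *	error = namei(&nd);
 *	...
 *	error = vn_authorize_unlink(dvp, vp, &nd.ni_cnd, ctx, NULL);
 *	if (error == ENOENT && retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
 *		retry_count++;
 *		nameidone(&nd);
 *		goto continue_lookup;
 *	}
 */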
190
191 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
192
193 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
194
195 #ifdef CONFIG_IMGSRC_ACCESS
196 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
197 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
198 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
199 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
200 static void mount_end_update(mount_t mp);
201 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
202 #endif /* CONFIG_IMGSRC_ACCESS */
203
204 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
205
206 __private_extern__
207 int sync_internal(void);
208
209 __private_extern__
210 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
211
212 extern lck_grp_t *fd_vn_lck_grp;
213 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
214 extern lck_attr_t *fd_vn_lck_attr;
215
216 /*
217 * incremented each time a mount or unmount operation occurs
218 * used to invalidate the cached value of the rootvp in the
219 * mount structure utilized by cache_lookup_path
220 */
221 uint32_t mount_generation = 0;
222
223 /* counts number of mount and unmount operations */
224 unsigned int vfs_nummntops=0;
225
226 extern const struct fileops vnops;
227 #if CONFIG_APPLEDOUBLE
228 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
229 #endif /* CONFIG_APPLEDOUBLE */
230
231 /*
232 * Virtual File System System Calls
233 */
234
235 #if NFSCLIENT || DEVFS || ROUTEFS
236 /*
237 * Private in-kernel mounting spi (NFS only, not exported)
238 */
239 __private_extern__
240 boolean_t
241 vfs_iskernelmount(mount_t mp)
242 {
243 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
244 }
245
246 __private_extern__
247 int
248 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
249 void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
250 {
251 struct nameidata nd;
252 boolean_t did_namei;
253 int error;
254
255 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
256 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
257
258 /*
259 * Get the vnode to be covered if it's not supplied
260 */
261 if (vp == NULLVP) {
262 error = namei(&nd);
263 if (error)
264 return (error);
265 vp = nd.ni_vp;
266 pvp = nd.ni_dvp;
267 did_namei = TRUE;
268 } else {
269 char *pnbuf = CAST_DOWN(char *, path);
270
271 nd.ni_cnd.cn_pnbuf = pnbuf;
272 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
273 did_namei = FALSE;
274 }
275
276 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
277 syscall_flags, kern_flags, NULL, TRUE, ctx);
278
279 if (did_namei) {
280 vnode_put(vp);
281 vnode_put(pvp);
282 nameidone(&nd);
283 }
284
285 return (error);
286 }
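
/*
 * Illustrative sketch (hypothetical caller, not part of this file): an
 * in-kernel subsystem would typically invoke this as
 *
 *	error = kernel_mount("devfs", NULLVP, NULLVP, "/dev",
 *	    (void *)&args, sizeof(args), MNT_DONTBROWSE,
 *	    KERNEL_MOUNT_NOAUTH, vfs_context_kernel());
 *
 * where "devfs", "/dev" and args stand in for the caller's own filesystem
 * type, mount point and mount arguments.
 */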
287 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
288
289 /*
290 * Mount a file system.
291 */
292 /* ARGSUSED */
293 int
294 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
295 {
296 struct __mac_mount_args muap;
297
298 muap.type = uap->type;
299 muap.path = uap->path;
300 muap.flags = uap->flags;
301 muap.data = uap->data;
302 muap.mac_p = USER_ADDR_NULL;
303 return (__mac_mount(p, &muap, retval));
304 }
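
/*
 * For reference, a hypothetical user-space caller of mount(2) (not part of
 * xnu; the device and mount point names are made up):
 *
 *	struct hfs_mount_args args = { 0 };
 *	args.fspec = "/dev/disk2s1";
 *	if (mount("hfs", "/Volumes/Backup", MNT_RDONLY, &args) == -1)
 *		perror("mount");
 */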
305
306 void
307 vfs_notify_mount(vnode_t pdvp)
308 {
309 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
310 lock_vnode_and_post(pdvp, NOTE_WRITE);
311 }
312
313 /*
314 * __mac_mount:
315 * Mount a file system, taking into account MAC label behavior.
316 * See the mount(2) man page for more information.
317 *
318 * Parameters: p Process requesting the mount
319 * uap User argument descriptor (see below)
320 * retval (ignored)
321 *
322 * Indirect: uap->type Filesystem type
323 * uap->path Path to mount
324 * uap->data Mount arguments
325 * uap->mac_p MAC info
326 * uap->flags Mount flags
327 *
328 *
329 * Returns: 0 Success
330 * !0 Not success
331 */
332 boolean_t root_fs_upgrade_try = FALSE;
333
334 int
335 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
336 {
337 vnode_t pvp = NULL;
338 vnode_t vp = NULL;
339 int need_nameidone = 0;
340 vfs_context_t ctx = vfs_context_current();
341 char fstypename[MFSNAMELEN];
342 struct nameidata nd;
343 size_t dummy=0;
344 char *labelstr = NULL;
345 int flags = uap->flags;
346 int error;
347 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
348 boolean_t is_64bit = IS_64BIT_PROCESS(p);
349 #else
350 #pragma unused(p)
351 #endif
352 /*
353 * Get the fs type name from user space
354 */
355 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
356 if (error)
357 return (error);
358
359 /*
360 * Get the vnode to be covered
361 */
362 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
363 UIO_USERSPACE, uap->path, ctx);
364 error = namei(&nd);
365 if (error) {
366 goto out;
367 }
368 need_nameidone = 1;
369 vp = nd.ni_vp;
370 pvp = nd.ni_dvp;
371
372 #ifdef CONFIG_IMGSRC_ACCESS
373 /* Mounting image source cannot be batched with other operations */
374 if (flags == MNT_IMGSRC_BY_INDEX) {
375 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
376 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
377 goto out;
378 }
379 #endif /* CONFIG_IMGSRC_ACCESS */
380
381 #if CONFIG_MACF
382 /*
383 * Get the label string (if any) from user space
384 */
385 if (uap->mac_p != USER_ADDR_NULL) {
386 struct user_mac mac;
387 size_t ulen = 0;
388
389 if (is_64bit) {
390 struct user64_mac mac64;
391 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
392 mac.m_buflen = mac64.m_buflen;
393 mac.m_string = mac64.m_string;
394 } else {
395 struct user32_mac mac32;
396 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
397 mac.m_buflen = mac32.m_buflen;
398 mac.m_string = mac32.m_string;
399 }
400 if (error)
401 goto out;
402 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
403 (mac.m_buflen < 2)) {
404 error = EINVAL;
405 goto out;
406 }
407 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
408 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
409 if (error) {
410 goto out;
411 }
412 AUDIT_ARG(mac_string, labelstr);
413 }
414 #endif /* CONFIG_MACF */
415
416 AUDIT_ARG(fflags, flags);
417
418 #if SECURE_KERNEL
419 if (flags & MNT_UNION) {
420 /* No union mounts on release kernels */
421 error = EPERM;
422 goto out;
423 }
424 #endif
425
426 if ((vp->v_flag & VROOT) &&
427 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
428 if (!(flags & MNT_UNION)) {
429 flags |= MNT_UPDATE;
430 }
431 else {
432 /*
433 * For a union mount on '/', treat it as fresh
434 * mount instead of update.
435 * Otherwise, union mounting on '/' used to panic the
436 * system, since mnt_vnodecovered was found to
437 * be NULL for '/', and a non-NULL mnt_vnodecovered is
438 * required for unionlookup after it gets ENOENT on a union mount.
439 */
440 flags = (flags & ~(MNT_UPDATE));
441 }
442
443 #if SECURE_KERNEL
444 if ((flags & MNT_RDONLY) == 0) {
445 /* Release kernels are not allowed to mount "/" as rw */
446 error = EPERM;
447 goto out;
448 }
449 #endif
450 /*
451 * See 7392553 for more details on why this check exists.
452 * Suffice to say: If this check is ON and something tries
453 * to mount the rootFS RW, we'll turn off the codesign
454 * bitmap optimization.
455 */
456 #if CHECK_CS_VALIDATION_BITMAP
457 if ((flags & MNT_RDONLY) == 0 ) {
458 root_fs_upgrade_try = TRUE;
459 }
460 #endif
461 }
462
463 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
464 labelstr, FALSE, ctx);
465
466 out:
467
468 #if CONFIG_MACF
469 if (labelstr)
470 FREE(labelstr, M_MACTEMP);
471 #endif /* CONFIG_MACF */
472
473 if (vp) {
474 vnode_put(vp);
475 }
476 if (pvp) {
477 vnode_put(pvp);
478 }
479 if (need_nameidone) {
480 nameidone(&nd);
481 }
482
483 return (error);
484 }
485
486 /*
487 * common mount implementation (final stage of mounting)
488 *
489 * Arguments:
490 * fstypename file system type (i.e. its vfs name)
491 * pvp parent of covered vnode
492 * vp covered vnode
493 * cnp component name (i.e. path) of covered vnode
494 * flags generic mount flags
495 * fsmountargs file system specific data
496 * labelstr optional MAC label
497 * kernelmount TRUE for mounts initiated from inside the kernel
498 * ctx caller's context
499 */
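
/*
 * Rough flow (a summary of the code below): an MNT_UPDATE request re-uses
 * the existing mount_t and jumps to the "update:" label; a fresh mount
 * allocates and initializes a new mount_t, looks up and opens the backing
 * device for VFC_VFSLOCALARGS filesystems, then calls VFS_MOUNT() (or the
 * VFSIOC_MOUNT_SNAPSHOT ioctl) and, on success, hooks the new mount into
 * the covered vnode and the global mount list.
 */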
500 static int
501 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
502 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
503 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
504 {
505 #if !CONFIG_MACF
506 #pragma unused(labelstr)
507 #endif
508 struct vnode *devvp = NULLVP;
509 struct vnode *device_vnode = NULLVP;
510 #if CONFIG_MACF
511 struct vnode *rvp;
512 #endif
513 struct mount *mp;
514 struct vfstable *vfsp = (struct vfstable *)0;
515 struct proc *p = vfs_context_proc(ctx);
516 int error, flag = 0;
517 user_addr_t devpath = USER_ADDR_NULL;
518 int ronly = 0;
519 int mntalloc = 0;
520 boolean_t vfsp_ref = FALSE;
521 boolean_t is_rwlock_locked = FALSE;
522 boolean_t did_rele = FALSE;
523 boolean_t have_usecount = FALSE;
524
525 /*
526 * Process an update for an existing mount
527 */
528 if (flags & MNT_UPDATE) {
529 if ((vp->v_flag & VROOT) == 0) {
530 error = EINVAL;
531 goto out1;
532 }
533 mp = vp->v_mount;
534
535 /* unmount in progress return error */
536 mount_lock_spin(mp);
537 if (mp->mnt_lflag & MNT_LUNMOUNT) {
538 mount_unlock(mp);
539 error = EBUSY;
540 goto out1;
541 }
542 mount_unlock(mp);
543 lck_rw_lock_exclusive(&mp->mnt_rwlock);
544 is_rwlock_locked = TRUE;
545 /*
546 * We only allow the filesystem to be reloaded if it
547 * is currently mounted read-only.
548 */
549 if ((flags & MNT_RELOAD) &&
550 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
551 error = ENOTSUP;
552 goto out1;
553 }
554
555 /*
556 * If content protection is enabled, update mounts are not
557 * allowed to turn it off.
558 */
559 if ((mp->mnt_flag & MNT_CPROTECT) &&
560 ((flags & MNT_CPROTECT) == 0)) {
561 error = EINVAL;
562 goto out1;
563 }
564
565 #ifdef CONFIG_IMGSRC_ACCESS
566 /* Can't downgrade the backer of the root FS */
567 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
568 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
569 error = ENOTSUP;
570 goto out1;
571 }
572 #endif /* CONFIG_IMGSRC_ACCESS */
573
574 /*
575 * Only root, or the user that did the original mount is
576 * permitted to update it.
577 */
578 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
579 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
580 goto out1;
581 }
582 #if CONFIG_MACF
583 error = mac_mount_check_remount(ctx, mp);
584 if (error != 0) {
585 goto out1;
586 }
587 #endif
588 /*
589 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
590 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
591 */
592 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
593 flags |= MNT_NOSUID | MNT_NODEV;
594 if (mp->mnt_flag & MNT_NOEXEC)
595 flags |= MNT_NOEXEC;
596 }
597 flag = mp->mnt_flag;
598
599
600
601 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
602
603 vfsp = mp->mnt_vtable;
604 goto update;
605 }
606 /*
607 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
608 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
609 */
610 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
611 flags |= MNT_NOSUID | MNT_NODEV;
612 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
613 flags |= MNT_NOEXEC;
614 }
615
616 /* XXXAUDIT: Should we capture the type on the error path as well? */
617 AUDIT_ARG(text, fstypename);
618 mount_list_lock();
619 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
620 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
621 vfsp->vfc_refcount++;
622 vfsp_ref = TRUE;
623 break;
624 }
625 mount_list_unlock();
626 if (vfsp == NULL) {
627 error = ENODEV;
628 goto out1;
629 }
630
631 /*
632 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
633 */
634 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
635 error = EINVAL; /* unsupported request */
636 goto out1;
637 }
638
639 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
640 if (error != 0) {
641 goto out1;
642 }
643
644 /*
645 * Allocate and initialize the filesystem (mount_t)
646 */
647 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
648 M_MOUNT, M_WAITOK);
649 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
650 mntalloc = 1;
651
652 /* Initialize the default IO constraints */
653 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
654 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
655 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
656 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
657 mp->mnt_devblocksize = DEV_BSIZE;
658 mp->mnt_alignmentmask = PAGE_MASK;
659 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
660 mp->mnt_ioscale = 1;
661 mp->mnt_ioflags = 0;
662 mp->mnt_realrootvp = NULLVP;
663 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
664
665 TAILQ_INIT(&mp->mnt_vnodelist);
666 TAILQ_INIT(&mp->mnt_workerqueue);
667 TAILQ_INIT(&mp->mnt_newvnodes);
668 mount_lock_init(mp);
669 lck_rw_lock_exclusive(&mp->mnt_rwlock);
670 is_rwlock_locked = TRUE;
671 mp->mnt_op = vfsp->vfc_vfsops;
672 mp->mnt_vtable = vfsp;
673 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
674 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
675 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
676 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
677 mp->mnt_vnodecovered = vp;
678 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
679 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
680 mp->mnt_devbsdunit = 0;
681
682 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
683 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
684
685 #if NFSCLIENT || DEVFS || ROUTEFS
686 if (kernelmount)
687 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
688 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
689 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
690 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
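
/*
 * MNTK_PERMIT_UNMOUNT, set above for kernel mounts that request it, lets
 * safedounmount() (below) skip the owner/root check for non-forced
 * unmounts of this mount.
 */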
691
692 update:
693 /*
694 * Set the mount level flags.
695 */
696 if (flags & MNT_RDONLY)
697 mp->mnt_flag |= MNT_RDONLY;
698 else if (mp->mnt_flag & MNT_RDONLY) {
699 // disallow read/write upgrades of file systems that
700 // had the TYPENAME_OVERRIDE feature set.
701 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
702 error = EPERM;
703 goto out1;
704 }
705 mp->mnt_kern_flag |= MNTK_WANTRDWR;
706 }
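/*
 * Clear the user-settable mount flags and then re-apply only those passed
 * in by the caller, so an update cannot silently keep stale values from
 * the previous mount.
 */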
707 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
708 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
709 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
710 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
711 MNT_QUARANTINE | MNT_CPROTECT);
712
713 #if SECURE_KERNEL
714 #if !CONFIG_MNT_SUID
715 /*
716 * On release builds of iOS based platforms, always enforce NOSUID and NODEV on
717 * all mounts. We do this here because we can catch update mounts as well as
718 * non-update mounts in this case.
719 */
720 mp->mnt_flag |= (MNT_NOSUID);
721 #endif
722 #endif
723
724 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
725 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
726 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
727 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
728 MNT_QUARANTINE | MNT_CPROTECT);
729
730 #if CONFIG_MACF
731 if (flags & MNT_MULTILABEL) {
732 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
733 error = EINVAL;
734 goto out1;
735 }
736 mp->mnt_flag |= MNT_MULTILABEL;
737 }
738 #endif
739 /*
740 * Process device path for local file systems if requested
741 */
742 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
743 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
744 if (vfs_context_is64bit(ctx)) {
745 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
746 goto out1;
747 fsmountargs += sizeof(devpath);
748 } else {
749 user32_addr_t tmp;
750 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
751 goto out1;
752 /* munge into LP64 addr */
753 devpath = CAST_USER_ADDR_T(tmp);
754 fsmountargs += sizeof(tmp);
755 }
756
757 /* Lookup device and authorize access to it */
758 if ((devpath)) {
759 struct nameidata nd;
760
761 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
762 if ( (error = namei(&nd)) )
763 goto out1;
764
765 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
766 devvp = nd.ni_vp;
767
768 nameidone(&nd);
769
770 if (devvp->v_type != VBLK) {
771 error = ENOTBLK;
772 goto out2;
773 }
774 if (major(devvp->v_rdev) >= nblkdev) {
775 error = ENXIO;
776 goto out2;
777 }
778 /*
779 * If mount by non-root, then verify that user has necessary
780 * permissions on the device.
781 */
782 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
783 mode_t accessmode = KAUTH_VNODE_READ_DATA;
784
785 if ((mp->mnt_flag & MNT_RDONLY) == 0)
786 accessmode |= KAUTH_VNODE_WRITE_DATA;
787 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
788 goto out2;
789 }
790 }
791 /* On first mount, preflight and open device */
792 if (devpath && ((flags & MNT_UPDATE) == 0)) {
793 if ( (error = vnode_ref(devvp)) )
794 goto out2;
795 /*
796 * Disallow multiple mounts of the same device.
797 * Disallow mounting of a device that is currently in use
798 * (except for root, which might share swap device for miniroot).
799 * Flush out any old buffers remaining from a previous use.
800 */
801 if ( (error = vfs_mountedon(devvp)) )
802 goto out3;
803
804 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
805 error = EBUSY;
806 goto out3;
807 }
808 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
809 error = ENOTBLK;
810 goto out3;
811 }
812 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
813 goto out3;
814
815 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
816 #if CONFIG_MACF
817 error = mac_vnode_check_open(ctx,
818 devvp,
819 ronly ? FREAD : FREAD|FWRITE);
820 if (error)
821 goto out3;
822 #endif /* MAC */
823 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
824 goto out3;
825
826 mp->mnt_devvp = devvp;
827 device_vnode = devvp;
828
829 } else if ((mp->mnt_flag & MNT_RDONLY) &&
830 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
831 (device_vnode = mp->mnt_devvp)) {
832 dev_t dev;
833 int maj;
834 /*
835 * If upgrade to read-write by non-root, then verify
836 * that user has necessary permissions on the device.
837 */
838 vnode_getalways(device_vnode);
839
840 if (suser(vfs_context_ucred(ctx), NULL) &&
841 (error = vnode_authorize(device_vnode, NULL,
842 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
843 ctx)) != 0) {
844 vnode_put(device_vnode);
845 goto out2;
846 }
847
848 /* Tell the device that we're upgrading */
849 dev = (dev_t)device_vnode->v_rdev;
850 maj = major(dev);
851
852 if ((u_int)maj >= (u_int)nblkdev)
853 panic("Volume mounted on a device with invalid major number.");
854
855 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
856 vnode_put(device_vnode);
857 device_vnode = NULLVP;
858 if (error != 0) {
859 goto out2;
860 }
861 }
862 }
863 #if CONFIG_MACF
864 if ((flags & MNT_UPDATE) == 0) {
865 mac_mount_label_init(mp);
866 mac_mount_label_associate(ctx, mp);
867 }
868 if (labelstr) {
869 if ((flags & MNT_UPDATE) != 0) {
870 error = mac_mount_check_label_update(ctx, mp);
871 if (error != 0)
872 goto out3;
873 }
874 }
875 #endif
876 /*
877 * Mount the filesystem.
878 */
879 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
880 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
881 (caddr_t)fsmountargs, 0, ctx);
882 } else {
883 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
884 }
885
886 if (flags & MNT_UPDATE) {
887 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
888 mp->mnt_flag &= ~MNT_RDONLY;
889 mp->mnt_flag &=~
890 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
891 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
892 if (error)
893 mp->mnt_flag = flag; /* restore flag value */
894 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
895 lck_rw_done(&mp->mnt_rwlock);
896 is_rwlock_locked = FALSE;
897 if (!error)
898 enablequotas(mp, ctx);
899 goto exit;
900 }
901
902 /*
903 * Put the new filesystem on the mount list after root.
904 */
905 if (error == 0) {
906 struct vfs_attr vfsattr;
907 #if CONFIG_MACF
908 if (vfs_flags(mp) & MNT_MULTILABEL) {
909 error = VFS_ROOT(mp, &rvp, ctx);
910 if (error) {
911 printf("%s() VFS_ROOT returned %d\n", __func__, error);
912 goto out3;
913 }
914 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
915 /*
916 * drop reference provided by VFS_ROOT
917 */
918 vnode_put(rvp);
919
920 if (error)
921 goto out3;
922 }
923 #endif /* MAC */
924
925 vnode_lock_spin(vp);
926 CLR(vp->v_flag, VMOUNT);
927 vp->v_mountedhere = mp;
928 vnode_unlock(vp);
929
930 /*
931 * taking the name_cache_lock exclusively will
932 * ensure that everyone is out of the fast path who
933 * might be trying to use a now stale copy of
934 * vp->v_mountedhere->mnt_realrootvp;
935 * bumping mount_generation causes the cached values
936 * to be invalidated
937 */
938 name_cache_lock();
939 mount_generation++;
940 name_cache_unlock();
941
942 error = vnode_ref(vp);
943 if (error != 0) {
944 goto out4;
945 }
946
947 have_usecount = TRUE;
948
949 error = checkdirs(vp, ctx);
950 if (error != 0) {
951 /* Unmount the filesystem as cdir/rdirs cannot be updated */
952 goto out4;
953 }
954 /*
955 * there is no cleanup code here, so I have made it void;
956 * we need to revisit this
957 */
958 (void)VFS_START(mp, 0, ctx);
959
960 if (mount_list_add(mp) != 0) {
961 /*
962 * The system is shutting down trying to umount
963 * everything, so fail with a plausible errno.
964 */
965 error = EBUSY;
966 goto out4;
967 }
968 lck_rw_done(&mp->mnt_rwlock);
969 is_rwlock_locked = FALSE;
970
971 /* Check if this mounted file system supports EAs or named streams. */
972 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
973 VFSATTR_INIT(&vfsattr);
974 VFSATTR_WANTED(&vfsattr, f_capabilities);
975 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
976 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
977 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
978 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
979 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
980 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
981 }
982 #if NAMEDSTREAMS
983 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
984 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
985 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
986 }
987 #endif
988 /* Check if this file system supports path from id lookups. */
989 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
990 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
991 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
992 } else if (mp->mnt_flag & MNT_DOVOLFS) {
993 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
994 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
995 }
996
997 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
998 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
999 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1000 }
1001 }
1002 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1003 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1004 }
1005 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1006 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1007 }
1008 /* increment the operations count */
1009 OSAddAtomic(1, &vfs_nummntops);
1010 enablequotas(mp, ctx);
1011
1012 if (device_vnode) {
1013 device_vnode->v_specflags |= SI_MOUNTEDON;
1014
1015 /*
1016 * cache the IO attributes for the underlying physical media...
1017 * an error return indicates the underlying driver doesn't
1018 * support all the queries necessary... however, reasonable
1019 * defaults will have been set, so no reason to bail or care
1020 */
1021 vfs_init_io_attributes(device_vnode, mp);
1022 }
1023
1024 /* Now that mount is setup, notify the listeners */
1025 vfs_notify_mount(pvp);
1026 IOBSDMountChange(mp, kIOMountChangeMount);
1027
1028 } else {
1029 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1030 if (mp->mnt_vnodelist.tqh_first != NULL) {
1031 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1032 mp->mnt_vtable->vfc_name, error);
1033 }
1034
1035 vnode_lock_spin(vp);
1036 CLR(vp->v_flag, VMOUNT);
1037 vnode_unlock(vp);
1038 mount_list_lock();
1039 mp->mnt_vtable->vfc_refcount--;
1040 mount_list_unlock();
1041
1042 if (device_vnode ) {
1043 vnode_rele(device_vnode);
1044 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1045 }
1046 lck_rw_done(&mp->mnt_rwlock);
1047 is_rwlock_locked = FALSE;
1048
1049 /*
1050 * if we get here, we have a mount structure that needs to be freed,
1051 * but since the coveredvp hasn't yet been updated to point at it,
1052 * there is no need to worry about other threads holding a crossref on this mp,
1053 * so it's ok to just free it
1054 */
1055 mount_lock_destroy(mp);
1056 #if CONFIG_MACF
1057 mac_mount_label_destroy(mp);
1058 #endif
1059 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1060 }
1061 exit:
1062 /*
1063 * drop I/O count on the device vp if there was one
1064 */
1065 if (devpath && devvp)
1066 vnode_put(devvp);
1067
1068 return(error);
1069
1070 /* Error condition exits */
1071 out4:
1072 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1073
1074 /*
1075 * If the mount has been placed on the covered vp,
1076 * it may have been discovered by now, so we have
1077 * to treat this just like an unmount
1078 */
1079 mount_lock_spin(mp);
1080 mp->mnt_lflag |= MNT_LDEAD;
1081 mount_unlock(mp);
1082
1083 if (device_vnode != NULLVP) {
1084 vnode_rele(device_vnode);
1085 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1086 ctx);
1087 did_rele = TRUE;
1088 }
1089
1090 vnode_lock_spin(vp);
1091
1092 mp->mnt_crossref++;
1093 vp->v_mountedhere = (mount_t) 0;
1094
1095 vnode_unlock(vp);
1096
1097 if (have_usecount) {
1098 vnode_rele(vp);
1099 }
1100 out3:
1101 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1102 vnode_rele(devvp);
1103 out2:
1104 if (devpath && devvp)
1105 vnode_put(devvp);
1106 out1:
1107 /* Release mnt_rwlock only when it was taken */
1108 if (is_rwlock_locked == TRUE) {
1109 lck_rw_done(&mp->mnt_rwlock);
1110 }
1111
1112 if (mntalloc) {
1113 if (mp->mnt_crossref)
1114 mount_dropcrossref(mp, vp, 0);
1115 else {
1116 mount_lock_destroy(mp);
1117 #if CONFIG_MACF
1118 mac_mount_label_destroy(mp);
1119 #endif
1120 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1121 }
1122 }
1123 if (vfsp_ref) {
1124 mount_list_lock();
1125 vfsp->vfc_refcount--;
1126 mount_list_unlock();
1127 }
1128
1129 return(error);
1130 }
1131
1132 /*
1133 * Flush in-core data, check for competing mount attempts,
1134 * and set VMOUNT
1135 */
1136 int
1137 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1138 {
1139 #if !CONFIG_MACF
1140 #pragma unused(cnp,fsname)
1141 #endif
1142 struct vnode_attr va;
1143 int error;
1144
1145 if (!skip_auth) {
1146 /*
1147 * If the user is not root, ensure that they own the directory
1148 * onto which we are attempting to mount.
1149 */
1150 VATTR_INIT(&va);
1151 VATTR_WANTED(&va, va_uid);
1152 if ((error = vnode_getattr(vp, &va, ctx)) ||
1153 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1154 (!vfs_context_issuser(ctx)))) {
1155 error = EPERM;
1156 goto out;
1157 }
1158 }
1159
1160 if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1161 goto out;
1162
1163 if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1164 goto out;
1165
1166 if (vp->v_type != VDIR) {
1167 error = ENOTDIR;
1168 goto out;
1169 }
1170
1171 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1172 error = EBUSY;
1173 goto out;
1174 }
1175
1176 #if CONFIG_MACF
1177 error = mac_mount_check_mount(ctx, vp,
1178 cnp, fsname);
1179 if (error != 0)
1180 goto out;
1181 #endif
1182
1183 vnode_lock_spin(vp);
1184 SET(vp->v_flag, VMOUNT);
1185 vnode_unlock(vp);
1186
1187 out:
1188 return error;
1189 }
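
/*
 * Note on the VMOUNT protocol: prepare_coveredvp() leaves VMOUNT set on the
 * covered vnode to mark a mount in progress; callers clear it again once
 * v_mountedhere has been set (successful mount) or on their failure paths.
 */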
1190
1191 #if CONFIG_IMGSRC_ACCESS
1192
1193 #if DEBUG
1194 #define IMGSRC_DEBUG(args...) printf(args)
1195 #else
1196 #define IMGSRC_DEBUG(args...) do { } while(0)
1197 #endif
1198
1199 static int
1200 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1201 {
1202 struct nameidata nd;
1203 vnode_t vp, realdevvp;
1204 mode_t accessmode;
1205 int error;
1206
1207 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1208 if ( (error = namei(&nd)) ) {
1209 IMGSRC_DEBUG("namei() failed with %d\n", error);
1210 return error;
1211 }
1212
1213 vp = nd.ni_vp;
1214
1215 if (!vnode_isblk(vp)) {
1216 IMGSRC_DEBUG("Not block device.\n");
1217 error = ENOTBLK;
1218 goto out;
1219 }
1220
1221 realdevvp = mp->mnt_devvp;
1222 if (realdevvp == NULLVP) {
1223 IMGSRC_DEBUG("No device backs the mount.\n");
1224 error = ENXIO;
1225 goto out;
1226 }
1227
1228 error = vnode_getwithref(realdevvp);
1229 if (error != 0) {
1230 IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1231 goto out;
1232 }
1233
1234 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1235 IMGSRC_DEBUG("Wrong dev_t.\n");
1236 error = ENXIO;
1237 goto out1;
1238 }
1239
1240 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1241
1242 /*
1243 * If mount by non-root, then verify that user has necessary
1244 * permissions on the device.
1245 */
1246 if (!vfs_context_issuser(ctx)) {
1247 accessmode = KAUTH_VNODE_READ_DATA;
1248 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1249 accessmode |= KAUTH_VNODE_WRITE_DATA;
1250 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1251 IMGSRC_DEBUG("Access denied.\n");
1252 goto out1;
1253 }
1254 }
1255
1256 *devvpp = vp;
1257
1258 out1:
1259 vnode_put(realdevvp);
1260 out:
1261 nameidone(&nd);
1262 if (error) {
1263 vnode_put(vp);
1264 }
1265
1266 return error;
1267 }
1268
1269 /*
1270 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1271 * and call checkdirs()
1272 */
1273 static int
1274 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1275 {
1276 int error;
1277
1278 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1279
1280 vnode_lock_spin(vp);
1281 CLR(vp->v_flag, VMOUNT);
1282 vp->v_mountedhere = mp;
1283 vnode_unlock(vp);
1284
1285 /*
1286 * taking the name_cache_lock exclusively will
1287 * ensure that everyone is out of the fast path who
1288 * might be trying to use a now stale copy of
1289 * vp->v_mountedhere->mnt_realrootvp;
1290 * bumping mount_generation causes the cached values
1291 * to be invalidated
1292 */
1293 name_cache_lock();
1294 mount_generation++;
1295 name_cache_unlock();
1296
1297 error = vnode_ref(vp);
1298 if (error != 0) {
1299 goto out;
1300 }
1301
1302 error = checkdirs(vp, ctx);
1303 if (error != 0) {
1304 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1305 vnode_rele(vp);
1306 goto out;
1307 }
1308
1309 out:
1310 if (error != 0) {
1311 mp->mnt_vnodecovered = NULLVP;
1312 }
1313 return error;
1314 }
1315
1316 static void
1317 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1318 {
1319 vnode_rele(vp);
1320 vnode_lock_spin(vp);
1321 vp->v_mountedhere = (mount_t)NULL;
1322 vnode_unlock(vp);
1323
1324 mp->mnt_vnodecovered = NULLVP;
1325 }
1326
1327 static int
1328 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1329 {
1330 int error;
1331
1332 /* unmount in progress return error */
1333 mount_lock_spin(mp);
1334 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1335 mount_unlock(mp);
1336 return EBUSY;
1337 }
1338 mount_unlock(mp);
1339 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1340
1341 /*
1342 * We only allow the filesystem to be reloaded if it
1343 * is currently mounted read-only.
1344 */
1345 if ((flags & MNT_RELOAD) &&
1346 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1347 error = ENOTSUP;
1348 goto out;
1349 }
1350
1351 /*
1352 * Only root, or the user that did the original mount is
1353 * permitted to update it.
1354 */
1355 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1356 (!vfs_context_issuser(ctx))) {
1357 error = EPERM;
1358 goto out;
1359 }
1360 #if CONFIG_MACF
1361 error = mac_mount_check_remount(ctx, mp);
1362 if (error != 0) {
1363 goto out;
1364 }
1365 #endif
1366
1367 out:
1368 if (error) {
1369 lck_rw_done(&mp->mnt_rwlock);
1370 }
1371
1372 return error;
1373 }
1374
1375 static void
1376 mount_end_update(mount_t mp)
1377 {
1378 lck_rw_done(&mp->mnt_rwlock);
1379 }
1380
1381 static int
1382 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1383 {
1384 vnode_t vp;
1385
1386 if (height >= MAX_IMAGEBOOT_NESTING) {
1387 return EINVAL;
1388 }
1389
1390 vp = imgsrc_rootvnodes[height];
1391 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1392 *rvpp = vp;
1393 return 0;
1394 } else {
1395 return ENOENT;
1396 }
1397 }
1398
1399 static int
1400 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1401 const char *fsname, vfs_context_t ctx,
1402 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1403 {
1404 int error;
1405 mount_t mp;
1406 boolean_t placed = FALSE;
1407 vnode_t devvp = NULLVP;
1408 struct vfstable *vfsp;
1409 user_addr_t devpath;
1410 char *old_mntonname;
1411 vnode_t rvp;
1412 uint32_t height;
1413 uint32_t flags;
1414
1415 /* If we didn't imageboot, nothing to move */
1416 if (imgsrc_rootvnodes[0] == NULLVP) {
1417 return EINVAL;
1418 }
1419
1420 /* Only root can do this */
1421 if (!vfs_context_issuser(ctx)) {
1422 return EPERM;
1423 }
1424
1425 IMGSRC_DEBUG("looking for root vnode.\n");
1426
1427 /*
1428 * Get root vnode of filesystem we're moving.
1429 */
1430 if (by_index) {
1431 if (is64bit) {
1432 struct user64_mnt_imgsrc_args mia64;
1433 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1434 if (error != 0) {
1435 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1436 return error;
1437 }
1438
1439 height = mia64.mi_height;
1440 flags = mia64.mi_flags;
1441 devpath = mia64.mi_devpath;
1442 } else {
1443 struct user32_mnt_imgsrc_args mia32;
1444 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1445 if (error != 0) {
1446 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1447 return error;
1448 }
1449
1450 height = mia32.mi_height;
1451 flags = mia32.mi_flags;
1452 devpath = mia32.mi_devpath;
1453 }
1454 } else {
1455 /*
1456 * For binary compatibility--assumes one level of nesting.
1457 */
1458 if (is64bit) {
1459 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1460 return error;
1461 } else {
1462 user32_addr_t tmp;
1463 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1464 return error;
1465
1466 /* munge into LP64 addr */
1467 devpath = CAST_USER_ADDR_T(tmp);
1468 }
1469
1470 height = 0;
1471 flags = 0;
1472 }
1473
1474 if (flags != 0) {
1475 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1476 return EINVAL;
1477 }
1478
1479 error = get_imgsrc_rootvnode(height, &rvp);
1480 if (error != 0) {
1481 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1482 return error;
1483 }
1484
1485 IMGSRC_DEBUG("got root vnode.\n");
1486
1487 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1488
1489 /* Can only move once */
1490 mp = vnode_mount(rvp);
1491 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1492 IMGSRC_DEBUG("Already moved.\n");
1493 error = EBUSY;
1494 goto out0;
1495 }
1496
1497 IMGSRC_DEBUG("Starting update.\n");
1498
1499 /* Get exclusive rwlock on mount, authorize update on mp */
1500 error = mount_begin_update(mp, ctx, 0);
1501 if (error != 0) {
1502 IMGSRC_DEBUG("Starting update failed with %d\n", error);
1503 goto out0;
1504 }
1505
1506 /*
1507 * It can only be moved once. Flag is set under the rwlock,
1508 * so we're now safe to proceed.
1509 */
1510 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1511 IMGSRC_DEBUG("Already moved [2]\n");
1512 goto out1;
1513 }
1514
1515
1516 IMGSRC_DEBUG("Preparing coveredvp.\n");
1517
1518 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1519 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1520 if (error != 0) {
1521 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1522 goto out1;
1523 }
1524
1525 IMGSRC_DEBUG("Covered vp OK.\n");
1526
1527 /* Sanity check the name caller has provided */
1528 vfsp = mp->mnt_vtable;
1529 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1530 IMGSRC_DEBUG("Wrong fs name.\n");
1531 error = EINVAL;
1532 goto out2;
1533 }
1534
1535 /* Check the device vnode and update mount-from name, for local filesystems */
1536 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1537 IMGSRC_DEBUG("Local, doing device validation.\n");
1538
1539 if (devpath != USER_ADDR_NULL) {
1540 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1541 if (error) {
1542 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1543 goto out2;
1544 }
1545
1546 vnode_put(devvp);
1547 }
1548 }
1549
1550 /*
1551 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1552 * and increment the name cache's mount generation
1553 */
1554
1555 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1556 error = place_mount_and_checkdirs(mp, vp, ctx);
1557 if (error != 0) {
1558 goto out2;
1559 }
1560
1561 placed = TRUE;
1562
1563 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1564 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1565
1566 /* Forbid future moves */
1567 mount_lock(mp);
1568 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1569 mount_unlock(mp);
1570
1571 /* Finally, add to mount list, completely ready to go */
1572 if (mount_list_add(mp) != 0) {
1573 /*
1574 * The system is shutting down trying to umount
1575 * everything, so fail with a plausible errno.
1576 */
1577 error = EBUSY;
1578 goto out3;
1579 }
1580
1581 mount_end_update(mp);
1582 vnode_put(rvp);
1583 FREE(old_mntonname, M_TEMP);
1584
1585 vfs_notify_mount(pvp);
1586
1587 return 0;
1588 out3:
1589 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1590
1591 mount_lock(mp);
1592 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1593 mount_unlock(mp);
1594
1595 out2:
1596 /*
1597 * Placing the mp on the vnode clears VMOUNT,
1598 * so cleanup is different after that point
1599 */
1600 if (placed) {
1601 /* Rele the vp, clear VMOUNT and v_mountedhere */
1602 undo_place_on_covered_vp(mp, vp);
1603 } else {
1604 vnode_lock_spin(vp);
1605 CLR(vp->v_flag, VMOUNT);
1606 vnode_unlock(vp);
1607 }
1608 out1:
1609 mount_end_update(mp);
1610
1611 out0:
1612 vnode_put(rvp);
1613 FREE(old_mntonname, M_TEMP);
1614 return error;
1615 }
1616
1617 #endif /* CONFIG_IMGSRC_ACCESS */
1618
1619 void
1620 enablequotas(struct mount *mp, vfs_context_t ctx)
1621 {
1622 struct nameidata qnd;
1623 int type;
1624 char qfpath[MAXPATHLEN];
1625 const char *qfname = QUOTAFILENAME;
1626 const char *qfopsname = QUOTAOPSNAME;
1627 const char *qfextension[] = INITQFNAMES;
1628
1629 /* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1630 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1631 return;
1632 }
1633 /*
1634 * Enable filesystem disk quotas if necessary.
1635 * We ignore errors as this should not interfere with final mount
1636 */
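/*
 * The paths built below are, assuming the usual QUOTAOPSNAME/QUOTAFILENAME
 * definitions, of the form "<mnton>/.quota.ops.user" (the trigger file that
 * enables quotas) and "<mnton>/.quota.user" (the quota file handed to
 * VFS_QUOTACTL), and likewise for the group quota type.
 */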
1637 for (type=0; type < MAXQUOTAS; type++) {
1638 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1639 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1640 CAST_USER_ADDR_T(qfpath), ctx);
1641 if (namei(&qnd) != 0)
1642 continue; /* option file to trigger quotas is not present */
1643 vnode_put(qnd.ni_vp);
1644 nameidone(&qnd);
1645 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1646
1647 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1648 }
1649 return;
1650 }
1651
1652
1653 static int
1654 checkdirs_callback(proc_t p, void * arg)
1655 {
1656 struct cdirargs * cdrp = (struct cdirargs * )arg;
1657 vnode_t olddp = cdrp->olddp;
1658 vnode_t newdp = cdrp->newdp;
1659 struct filedesc *fdp;
1660 vnode_t tvp;
1661 vnode_t fdp_cvp;
1662 vnode_t fdp_rvp;
1663 int cdir_changed = 0;
1664 int rdir_changed = 0;
1665
1666 /*
1667 * XXX Also needs to iterate each thread in the process to see if it
1668 * XXX is using a per-thread current working directory, and, if so,
1669 * XXX update that as well.
1670 */
1671
1672 proc_fdlock(p);
1673 fdp = p->p_fd;
1674 if (fdp == (struct filedesc *)0) {
1675 proc_fdunlock(p);
1676 return(PROC_RETURNED);
1677 }
1678 fdp_cvp = fdp->fd_cdir;
1679 fdp_rvp = fdp->fd_rdir;
1680 proc_fdunlock(p);
1681
1682 if (fdp_cvp == olddp) {
1683 vnode_ref(newdp);
1684 tvp = fdp->fd_cdir;
1685 fdp_cvp = newdp;
1686 cdir_changed = 1;
1687 vnode_rele(tvp);
1688 }
1689 if (fdp_rvp == olddp) {
1690 vnode_ref(newdp);
1691 tvp = fdp->fd_rdir;
1692 fdp_rvp = newdp;
1693 rdir_changed = 1;
1694 vnode_rele(tvp);
1695 }
1696 if (cdir_changed || rdir_changed) {
1697 proc_fdlock(p);
1698 fdp->fd_cdir = fdp_cvp;
1699 fdp->fd_rdir = fdp_rvp;
1700 proc_fdunlock(p);
1701 }
1702 return(PROC_RETURNED);
1703 }
1704
1705
1706
1707 /*
1708 * Scan all active processes to see if any of them have a current
1709 * or root directory onto which the new filesystem has just been
1710 * mounted. If so, replace them with the new mount point.
1711 */
1712 static int
1713 checkdirs(vnode_t olddp, vfs_context_t ctx)
1714 {
1715 vnode_t newdp;
1716 vnode_t tvp;
1717 int err;
1718 struct cdirargs cdr;
1719
1720 if (olddp->v_usecount == 1)
1721 return(0);
1722 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1723
1724 if (err != 0) {
1725 #if DIAGNOSTIC
1726 panic("mount: lost mount: error %d", err);
1727 #endif
1728 return(err);
1729 }
1730
1731 cdr.olddp = olddp;
1732 cdr.newdp = newdp;
1733 /* do not block for exec/fork trans as the vps for cwd & rootdir are not changing */
1734 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1735
1736 if (rootvnode == olddp) {
1737 vnode_ref(newdp);
1738 tvp = rootvnode;
1739 rootvnode = newdp;
1740 vnode_rele(tvp);
1741 }
1742
1743 vnode_put(newdp);
1744 return(0);
1745 }
1746
1747 /*
1748 * Unmount a file system.
1749 *
1750 * Note: unmount takes a path to the vnode mounted on as argument,
1751 * not the special file (as before).
1752 */
1753 /* ARGSUSED */
1754 int
1755 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1756 {
1757 vnode_t vp;
1758 struct mount *mp;
1759 int error;
1760 struct nameidata nd;
1761 vfs_context_t ctx = vfs_context_current();
1762
1763 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1764 UIO_USERSPACE, uap->path, ctx);
1765 error = namei(&nd);
1766 if (error)
1767 return (error);
1768 vp = nd.ni_vp;
1769 mp = vp->v_mount;
1770 nameidone(&nd);
1771
1772 #if CONFIG_MACF
1773 error = mac_mount_check_umount(ctx, mp);
1774 if (error != 0) {
1775 vnode_put(vp);
1776 return (error);
1777 }
1778 #endif
1779 /*
1780 * Must be the root of the filesystem
1781 */
1782 if ((vp->v_flag & VROOT) == 0) {
1783 vnode_put(vp);
1784 return (EINVAL);
1785 }
1786 mount_ref(mp, 0);
1787 vnode_put(vp);
1788 /* safedounmount consumes the mount ref */
1789 return (safedounmount(mp, uap->flags, ctx));
1790 }
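
/*
 * For reference, a hypothetical user-space caller of unmount(2) (not part
 * of xnu; the path is made up):
 *
 *	if (unmount("/Volumes/Backup", MNT_FORCE) == -1)
 *		perror("unmount");
 */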
1791
1792 int
1793 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1794 {
1795 mount_t mp;
1796
1797 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1798 if (mp == (mount_t)0) {
1799 return(ENOENT);
1800 }
1801 mount_ref(mp, 0);
1802 mount_iterdrop(mp);
1803 /* safedounmount consumes the mount ref */
1804 return(safedounmount(mp, flags, ctx));
1805 }
1806
1807
1808 /*
1809 * The mount struct comes with a mount ref which will be consumed.
1810 * Do the actual file system unmount and prevent some common foot shooting.
1811 */
1812 int
1813 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1814 {
1815 int error;
1816 proc_t p = vfs_context_proc(ctx);
1817
1818 /*
1819 * If the file system is not responding and MNT_NOBLOCK
1820 * is set and not a forced unmount then return EBUSY.
1821 */
1822 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1823 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1824 error = EBUSY;
1825 goto out;
1826 }
1827
1828 /*
1829 * Skip authorization if the mount is tagged as permissive and
1830 * this is not a forced-unmount attempt.
1831 */
1832 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1833 /*
1834 * Only root, or the user that did the original mount is
1835 * permitted to unmount this filesystem.
1836 */
1837 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1838 (error = suser(kauth_cred_get(), &p->p_acflag)))
1839 goto out;
1840 }
1841 /*
1842 * Don't allow unmounting the root file system.
1843 */
1844 if (mp->mnt_flag & MNT_ROOTFS) {
1845 error = EBUSY; /* the root is always busy */
1846 goto out;
1847 }
1848
1849 #ifdef CONFIG_IMGSRC_ACCESS
1850 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1851 error = EBUSY;
1852 goto out;
1853 }
1854 #endif /* CONFIG_IMGSRC_ACCESS */
1855
1856 return (dounmount(mp, flags, 1, ctx));
1857
1858 out:
1859 mount_drop(mp, 0);
1860 return(error);
1861 }
1862
1863 /*
1864 * Do the actual file system unmount.
1865 */
1866 int
1867 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1868 {
1869 vnode_t coveredvp = (vnode_t)0;
1870 int error;
1871 int needwakeup = 0;
1872 int forcedunmount = 0;
1873 int lflags = 0;
1874 struct vnode *devvp = NULLVP;
1875 #if CONFIG_TRIGGERS
1876 proc_t p = vfs_context_proc(ctx);
1877 int did_vflush = 0;
1878 int pflags_save = 0;
1879 #endif /* CONFIG_TRIGGERS */
1880
1881 #if CONFIG_FSE
1882 if (!(flags & MNT_FORCE)) {
1883 fsevent_unmount(mp, ctx); /* has to come first! */
1884 }
1885 #endif
1886
1887 mount_lock(mp);
1888
1889 /*
1890 * If already an unmount in progress just return EBUSY.
1891 * Even a forced unmount cannot override.
1892 */
1893 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1894 if (withref != 0)
1895 mount_drop(mp, 1);
1896 mount_unlock(mp);
1897 return (EBUSY);
1898 }
1899
1900 if (flags & MNT_FORCE) {
1901 forcedunmount = 1;
1902 mp->mnt_lflag |= MNT_LFORCE;
1903 }
1904
1905 #if CONFIG_TRIGGERS
1906 if (flags & MNT_NOBLOCK && p != kernproc)
1907 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1908 #endif
1909
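/*
 * Mark the unmount as in progress: MNTK_UNMOUNT and MNT_LUNMOUNT are set
 * below (MNT_LFORCE was set above for forced unmounts), and all three are
 * cleared again on the error paths further down if the unmount cannot
 * complete.
 */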
1910 mp->mnt_kern_flag |= MNTK_UNMOUNT;
1911 mp->mnt_lflag |= MNT_LUNMOUNT;
1912 mp->mnt_flag &=~ MNT_ASYNC;
1913 /*
1914 * anyone currently in the fast path that
1915 * trips over the cached rootvp will be
1916 * dumped out and forced into the slow path
1917 * to regenerate a new cached value
1918 */
1919 mp->mnt_realrootvp = NULLVP;
1920 mount_unlock(mp);
1921
1922 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
1923 /*
1924 * Force unmount any mounts in this filesystem.
1925 * If any unmounts fail - just leave them dangling.
1926 * Avoids recursion.
1927 */
1928 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
1929 }
1930
1931 /*
1932 * taking the name_cache_lock exclusively will
1933 * ensure that everyone is out of the fast path who
1934 * might be trying to use a now stale copy of
1935 * vp->v_mountedhere->mnt_realrootvp;
1936 * bumping mount_generation causes the cached values
1937 * to be invalidated
1938 */
1939 name_cache_lock();
1940 mount_generation++;
1941 name_cache_unlock();
1942
1943
1944 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1945 if (withref != 0)
1946 mount_drop(mp, 0);
1947 error = 0;
1948 if (forcedunmount == 0) {
1949 ubc_umount(mp); /* release cached vnodes */
1950 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1951 error = VFS_SYNC(mp, MNT_WAIT, ctx);
1952 if (error) {
1953 mount_lock(mp);
1954 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1955 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1956 mp->mnt_lflag &= ~MNT_LFORCE;
1957 goto out;
1958 }
1959 }
1960 }
1961
1962 IOBSDMountChange(mp, kIOMountChangeUnmount);
1963
1964 #if CONFIG_TRIGGERS
1965 vfs_nested_trigger_unmounts(mp, flags, ctx);
1966 did_vflush = 1;
1967 #endif
1968 if (forcedunmount)
1969 lflags |= FORCECLOSE;
1970 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
1971 if ((forcedunmount == 0) && error) {
1972 mount_lock(mp);
1973 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1974 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1975 mp->mnt_lflag &= ~MNT_LFORCE;
1976 goto out;
1977 }
1978
1979 /* make sure no one is in the mount iterations or lookup */
1980 mount_iterdrain(mp);
1981
1982 error = VFS_UNMOUNT(mp, flags, ctx);
1983 if (error) {
1984 mount_iterreset(mp);
1985 mount_lock(mp);
1986 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1987 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1988 mp->mnt_lflag &= ~MNT_LFORCE;
1989 goto out;
1990 }
1991
1992 /* increment the operations count */
1993 if (!error)
1994 OSAddAtomic(1, &vfs_nummntops);
1995
1996 if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
1997 /* hold an io reference and drop the usecount before close */
1998 devvp = mp->mnt_devvp;
1999 vnode_getalways(devvp);
2000 vnode_rele(devvp);
2001 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
2002 ctx);
2003 vnode_clearmountedon(devvp);
2004 vnode_put(devvp);
2005 }
2006 lck_rw_done(&mp->mnt_rwlock);
2007 mount_list_remove(mp);
2008 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2009
2010 /* mark the mount point hook in the vp but do not drop the ref yet */
2011 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2012 /*
2013 * The covered vnode needs special handling. Trying to get an
2014 * iocount must not block here as this may lead to deadlocks
2015 * if the Filesystem to which the covered vnode belongs is
2016 * undergoing forced unmounts. Since we hold a usecount, the
2017 * vnode cannot be reused (it can, however, still be terminated)
2018 */
2019 vnode_getalways(coveredvp);
2020 vnode_lock_spin(coveredvp);
2021
2022 mp->mnt_crossref++;
2023 coveredvp->v_mountedhere = (struct mount *)0;
2024 CLR(coveredvp->v_flag, VMOUNT);
2025
2026 vnode_unlock(coveredvp);
2027 vnode_put(coveredvp);
2028 }
2029
2030 mount_list_lock();
2031 mp->mnt_vtable->vfc_refcount--;
2032 mount_list_unlock();
2033
2034 cache_purgevfs(mp); /* remove cache entries for this file sys */
2035 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2036 mount_lock(mp);
2037 mp->mnt_lflag |= MNT_LDEAD;
2038
2039 if (mp->mnt_lflag & MNT_LWAIT) {
2040 /*
2041 * do the wakeup here
2042 * in case we block in mount_refdrain
2043 * which will drop the mount lock
2044 * and allow anyone blocked in vfs_busy
2045 * to wake up and see the LDEAD state
2046 */
2047 mp->mnt_lflag &= ~MNT_LWAIT;
2048 wakeup((caddr_t)mp);
2049 }
2050 mount_refdrain(mp);
2051 out:
2052 if (mp->mnt_lflag & MNT_LWAIT) {
2053 mp->mnt_lflag &= ~MNT_LWAIT;
2054 needwakeup = 1;
2055 }
2056
2057 #if CONFIG_TRIGGERS
2058 if (flags & MNT_NOBLOCK && p != kernproc) {
2059 // Restore P_NOREMOTEHANG bit to its previous value
2060 if ((pflags_save & P_NOREMOTEHANG) == 0)
2061 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2062 }
2063
2064 /*
2065 * Callback and context are set together under the mount lock, and
2066 * never cleared, so we're safe to examine them here, drop the lock,
2067 * and call out.
2068 */
2069 if (mp->mnt_triggercallback != NULL) {
2070 mount_unlock(mp);
2071 if (error == 0) {
2072 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2073 } else if (did_vflush) {
2074 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2075 }
2076 } else {
2077 mount_unlock(mp);
2078 }
2079 #else
2080 mount_unlock(mp);
2081 #endif /* CONFIG_TRIGGERS */
2082
2083 lck_rw_done(&mp->mnt_rwlock);
2084
2085 if (needwakeup)
2086 wakeup((caddr_t)mp);
2087
2088 if (!error) {
2089 if ((coveredvp != NULLVP)) {
2090 vnode_t pvp = NULLVP;
2091
2092 /*
2093 * The covered vnode needs special handling. Trying to
2094 * get an iocount must not block here as this may lead
2095 * to deadlocks if the Filesystem to which the covered
2096 * vnode belongs is undergoing forced unmounts. Since we
2097 * hold a usecount, the vnode cannot be reused
2098 * (it can, however, still be terminated).
2099 */
2100 vnode_getalways(coveredvp);
2101
2102 mount_dropcrossref(mp, coveredvp, 0);
2103 /*
2104 * We'll _try_ to detect if this really needs to be
2105 * done. The coveredvp can only be in termination (or
2106 * terminated) if the coveredvp's mount point is in a
2107 * forced unmount (or has been) since we still hold the
2108 * ref.
2109 */
2110 if (!vnode_isrecycled(coveredvp)) {
2111 pvp = vnode_getparent(coveredvp);
2112 #if CONFIG_TRIGGERS
2113 if (coveredvp->v_resolve) {
2114 vnode_trigger_rearm(coveredvp, ctx);
2115 }
2116 #endif
2117 }
2118
2119 vnode_rele(coveredvp);
2120 vnode_put(coveredvp);
2121 coveredvp = NULLVP;
2122
2123 if (pvp) {
2124 lock_vnode_and_post(pvp, NOTE_WRITE);
2125 vnode_put(pvp);
2126 }
2127 } else if (mp->mnt_flag & MNT_ROOTFS) {
2128 mount_lock_destroy(mp);
2129 #if CONFIG_MACF
2130 mac_mount_label_destroy(mp);
2131 #endif
2132 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2133 } else
2134 panic("dounmount: no coveredvp");
2135 }
2136 return (error);
2137 }
2138
2139 /*
2140 * Unmount any mounts in this filesystem.
2141 */
2142 void
2143 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2144 {
2145 mount_t smp;
2146 fsid_t *fsids, fsid;
2147 int fsids_sz;
2148 int count = 0, i, m = 0;
2149 vnode_t vp;
2150
2151 mount_list_lock();
2152
2153 // Get an array to hold the submounts' fsids.
2154 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2155 count++;
2156 fsids_sz = count * sizeof(fsid_t);
2157 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2158 if (fsids == NULL) {
2159 mount_list_unlock();
2160 goto out;
2161 }
2162 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2163
2164 /*
2165 * Fill the array with submount fsids.
2166 * Since mounts are always added to the tail of the mount list, the
2167 * list is always in mount order.
2168 * For each mount check if the mounted-on vnode belongs to a
2169 * mount that's already added to our array of mounts to be unmounted.
2170 */
2171 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2172 vp = smp->mnt_vnodecovered;
2173 if (vp == NULL)
2174 continue;
2175 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2176 for (i = 0; i <= m; i++) {
2177 if (fsids[i].val[0] == fsid.val[0] &&
2178 fsids[i].val[1] == fsid.val[1]) {
2179 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2180 break;
2181 }
2182 }
2183 }
2184 mount_list_unlock();
2185
2186 // Unmount the submounts in reverse order. Ignore errors.
2187 for (i = m; i > 0; i--) {
2188 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2189 if (smp) {
2190 mount_ref(smp, 0);
2191 mount_iterdrop(smp);
2192 (void) dounmount(smp, flags, 1, ctx);
2193 }
2194 }
2195 out:
2196 if (fsids)
2197 FREE(fsids, M_TEMP);
2198 }
2199
2200 void
2201 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2202 {
2203 vnode_lock(dp);
2204 mp->mnt_crossref--;
2205
2206 if (mp->mnt_crossref < 0)
2207 panic("mount cross refs -ve");
2208
2209 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2210
2211 if (need_put)
2212 vnode_put_locked(dp);
2213 vnode_unlock(dp);
2214
2215 mount_lock_destroy(mp);
2216 #if CONFIG_MACF
2217 mac_mount_label_destroy(mp);
2218 #endif
2219 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2220 return;
2221 }
2222 if (need_put)
2223 vnode_put_locked(dp);
2224 vnode_unlock(dp);
2225 }
2226
2227
2228 /*
2229 * Sync each mounted filesystem.
2230 */
2231 #if DIAGNOSTIC
2232 int syncprt = 0;
2233 #endif
2234
2235 int print_vmpage_stat=0;
2236 int sync_timeout = 60; // Sync time limit (sec)
2237
2238 static int
2239 sync_callback(mount_t mp, __unused void *arg)
2240 {
2241 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2242 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2243
2244 mp->mnt_flag &= ~MNT_ASYNC;
2245 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2246 if (asyncflag)
2247 mp->mnt_flag |= MNT_ASYNC;
2248 }
2249
2250 return (VFS_RETURNED);
2251 }
2252
2253 /* ARGSUSED */
2254 int
2255 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2256 {
2257 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2258
2259 if (print_vmpage_stat) {
2260 vm_countdirtypages();
2261 }
2262
2263 #if DIAGNOSTIC
2264 if (syncprt)
2265 vfs_bufstats();
2266 #endif /* DIAGNOSTIC */
2267 return 0;
2268 }
2269
2270 static void
2271 sync_thread(void *arg, __unused wait_result_t wr)
2272 {
2273 int *timeout = (int *) arg;
2274
2275 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2276
2277 if (timeout)
2278 wakeup((caddr_t) timeout);
2279 if (print_vmpage_stat) {
2280 vm_countdirtypages();
2281 }
2282
2283 #if DIAGNOSTIC
2284 if (syncprt)
2285 vfs_bufstats();
2286 #endif /* DIAGNOSTIC */
2287 }
2288
2289 /*
2290 * Sync in a separate thread so we can time out if it blocks.
2291 */
2292 static int
2293 sync_async(int timeout)
2294 {
2295 thread_t thd;
2296 int error;
2297 struct timespec ts = {timeout, 0};
2298
2299 lck_mtx_lock(sync_mtx_lck);
2300 if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
2301 printf("sync_thread failed\n");
2302 lck_mtx_unlock(sync_mtx_lck);
2303 return (0);
2304 }
2305
2306 error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2307 if (error) {
2308 printf("sync timed out: %d sec\n", timeout);
2309 }
2310 thread_deallocate(thd);
2311
2312 return (0);
2313 }
2314
2315 /*
2316 * An in-kernel sync for power management to call.
2317 */
2318 __private_extern__ int
2319 sync_internal(void)
2320 {
2321 (void) sync_async(sync_timeout);
2322
2323 return 0;
2324 } /* end of sync_internal call */
2325
2326 /*
2327 * Change filesystem quotas.
2328 */
2329 #if QUOTA
2330 int
2331 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2332 {
2333 struct mount *mp;
2334 int error, quota_cmd, quota_status;
2335 caddr_t datap;
2336 size_t fnamelen;
2337 struct nameidata nd;
2338 vfs_context_t ctx = vfs_context_current();
2339 struct dqblk my_dqblk;
2340
2341 AUDIT_ARG(uid, uap->uid);
2342 AUDIT_ARG(cmd, uap->cmd);
2343 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2344 uap->path, ctx);
2345 error = namei(&nd);
2346 if (error)
2347 return (error);
2348 mp = nd.ni_vp->v_mount;
2349 vnode_put(nd.ni_vp);
2350 nameidone(&nd);
2351
2352 /* copyin any data we will need for downstream code */
2353 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2354
2355 switch (quota_cmd) {
2356 case Q_QUOTAON:
2357 /* uap->arg specifies a file from which to take the quotas */
2358 fnamelen = MAXPATHLEN;
2359 datap = kalloc(MAXPATHLEN);
2360 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2361 break;
2362 case Q_GETQUOTA:
2363 /* uap->arg is a pointer to a dqblk structure. */
2364 datap = (caddr_t) &my_dqblk;
2365 break;
2366 case Q_SETQUOTA:
2367 case Q_SETUSE:
2368 /* uap->arg is a pointer to a dqblk structure. */
2369 datap = (caddr_t) &my_dqblk;
2370 if (proc_is64bit(p)) {
2371 struct user_dqblk my_dqblk64;
2372 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2373 if (error == 0) {
2374 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2375 }
2376 }
2377 else {
2378 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2379 }
2380 break;
2381 case Q_QUOTASTAT:
2382 /* uap->arg is a pointer to an integer */
2383 datap = (caddr_t) &quota_status;
2384 break;
2385 default:
2386 datap = NULL;
2387 break;
2388 } /* switch */
2389
2390 if (error == 0) {
2391 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2392 }
2393
2394 switch (quota_cmd) {
2395 case Q_QUOTAON:
2396 if (datap != NULL)
2397 kfree(datap, MAXPATHLEN);
2398 break;
2399 case Q_GETQUOTA:
2400 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2401 if (error == 0) {
2402 if (proc_is64bit(p)) {
2403 struct user_dqblk my_dqblk64 = {.dqb_bhardlimit = 0};
2404 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2405 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2406 }
2407 else {
2408 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2409 }
2410 }
2411 break;
2412 case Q_QUOTASTAT:
2413 /* uap->arg is a pointer to an integer */
2414 if (error == 0) {
2415 error = copyout(datap, uap->arg, sizeof(quota_status));
2416 }
2417 break;
2418 default:
2419 break;
2420 } /* switch */
2421
2422 return (error);
2423 }
2424 #else
2425 int
2426 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2427 {
2428 return (EOPNOTSUPP);
2429 }
2430 #endif /* QUOTA */
2431
2432 /*
2433 * Get filesystem statistics.
2434 *
2435 * Returns: 0 Success
2436 * namei:???
2437 * vfs_update_vfsstat:???
2438 * munge_statfs:EFAULT
2439 */
2440 /* ARGSUSED */
2441 int
2442 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2443 {
2444 struct mount *mp;
2445 struct vfsstatfs *sp;
2446 int error;
2447 struct nameidata nd;
2448 vfs_context_t ctx = vfs_context_current();
2449 vnode_t vp;
2450
2451 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2452 UIO_USERSPACE, uap->path, ctx);
2453 error = namei(&nd);
2454 if (error != 0)
2455 return (error);
2456 vp = nd.ni_vp;
2457 mp = vp->v_mount;
2458 sp = &mp->mnt_vfsstat;
2459 nameidone(&nd);
2460
2461 #if CONFIG_MACF
2462 error = mac_mount_check_stat(ctx, mp);
2463 if (error != 0)
2464 return (error);
2465 #endif
2466
2467 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2468 if (error != 0) {
2469 vnode_put(vp);
2470 return (error);
2471 }
2472
2473 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2474 vnode_put(vp);
2475 return (error);
2476 }
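/*
 * Illustrative sketch (editorial addition, not part of this file): a minimal
 * user-space caller of statfs(2), which is serviced by the statfs() handler
 * above via vfs_update_vfsstat() and munge_statfs(). The path "/tmp" is only
 * an example.
 */
#if 0   /* example only -- user-space code, not built with the kernel */
#include <stdio.h>
#include <sys/param.h>
#include <sys/mount.h>

int
main(void)
{
        struct statfs sfs;

        if (statfs("/tmp", &sfs) == -1) {
                perror("statfs");
                return (1);
        }
        printf("%s on %s: %llu of %llu blocks free (bsize %u)\n",
            sfs.f_mntfromname, sfs.f_mntonname,
            (unsigned long long)sfs.f_bfree,
            (unsigned long long)sfs.f_blocks,
            sfs.f_bsize);
        return (0);
}
#endif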
2477
2478 /*
2479 * Get filesystem statistics.
2480 */
2481 /* ARGSUSED */
2482 int
2483 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2484 {
2485 vnode_t vp;
2486 struct mount *mp;
2487 struct vfsstatfs *sp;
2488 int error;
2489
2490 AUDIT_ARG(fd, uap->fd);
2491
2492 if ( (error = file_vnode(uap->fd, &vp)) )
2493 return (error);
2494
2495 error = vnode_getwithref(vp);
2496 if (error) {
2497 file_drop(uap->fd);
2498 return (error);
2499 }
2500
2501 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2502
2503 mp = vp->v_mount;
2504 if (!mp) {
2505 error = EBADF;
2506 goto out;
2507 }
2508
2509 #if CONFIG_MACF
2510 error = mac_mount_check_stat(vfs_context_current(), mp);
2511 if (error != 0)
2512 goto out;
2513 #endif
2514
2515 sp = &mp->mnt_vfsstat;
2516 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2517 goto out;
2518 }
2519
2520 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2521
2522 out:
2523 file_drop(uap->fd);
2524 vnode_put(vp);
2525
2526 return (error);
2527 }
2528
2529 /*
2530 * Common routine to handle copying of statfs64 data to user space
2531 */
2532 static int
2533 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2534 {
2535 int error;
2536 struct statfs64 sfs;
2537
2538 bzero(&sfs, sizeof(sfs));
2539
2540 sfs.f_bsize = sfsp->f_bsize;
2541 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2542 sfs.f_blocks = sfsp->f_blocks;
2543 sfs.f_bfree = sfsp->f_bfree;
2544 sfs.f_bavail = sfsp->f_bavail;
2545 sfs.f_files = sfsp->f_files;
2546 sfs.f_ffree = sfsp->f_ffree;
2547 sfs.f_fsid = sfsp->f_fsid;
2548 sfs.f_owner = sfsp->f_owner;
2549 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2550 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2551 sfs.f_fssubtype = sfsp->f_fssubtype;
2552 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2553 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2554 } else {
2555 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2556 }
2557 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2558 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2559
2560 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2561
2562 return(error);
2563 }
2564
2565 /*
2566 * Get file system statistics in 64-bit mode
2567 */
2568 int
2569 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2570 {
2571 struct mount *mp;
2572 struct vfsstatfs *sp;
2573 int error;
2574 struct nameidata nd;
2575 vfs_context_t ctxp = vfs_context_current();
2576 vnode_t vp;
2577
2578 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2579 UIO_USERSPACE, uap->path, ctxp);
2580 error = namei(&nd);
2581 if (error != 0)
2582 return (error);
2583 vp = nd.ni_vp;
2584 mp = vp->v_mount;
2585 sp = &mp->mnt_vfsstat;
2586 nameidone(&nd);
2587
2588 #if CONFIG_MACF
2589 error = mac_mount_check_stat(ctxp, mp);
2590 if (error != 0)
2591 return (error);
2592 #endif
2593
2594 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2595 if (error != 0) {
2596 vnode_put(vp);
2597 return (error);
2598 }
2599
2600 error = statfs64_common(mp, sp, uap->buf);
2601 vnode_put(vp);
2602
2603 return (error);
2604 }
2605
2606 /*
2607 * Get file system statistics in 64-bit mode
2608 */
2609 int
2610 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2611 {
2612 struct vnode *vp;
2613 struct mount *mp;
2614 struct vfsstatfs *sp;
2615 int error;
2616
2617 AUDIT_ARG(fd, uap->fd);
2618
2619 if ( (error = file_vnode(uap->fd, &vp)) )
2620 return (error);
2621
2622 error = vnode_getwithref(vp);
2623 if (error) {
2624 file_drop(uap->fd);
2625 return (error);
2626 }
2627
2628 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2629
2630 mp = vp->v_mount;
2631 if (!mp) {
2632 error = EBADF;
2633 goto out;
2634 }
2635
2636 #if CONFIG_MACF
2637 error = mac_mount_check_stat(vfs_context_current(), mp);
2638 if (error != 0)
2639 goto out;
2640 #endif
2641
2642 sp = &mp->mnt_vfsstat;
2643 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2644 goto out;
2645 }
2646
2647 error = statfs64_common(mp, sp, uap->buf);
2648
2649 out:
2650 file_drop(uap->fd);
2651 vnode_put(vp);
2652
2653 return (error);
2654 }
2655
2656 struct getfsstat_struct {
2657 user_addr_t sfsp;
2658 user_addr_t *mp;
2659 int count;
2660 int maxcount;
2661 int flags;
2662 int error;
2663 };
2664
2665
2666 static int
2667 getfsstat_callback(mount_t mp, void * arg)
2668 {
2669
2670 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2671 struct vfsstatfs *sp;
2672 int error, my_size;
2673 vfs_context_t ctx = vfs_context_current();
2674
2675 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2676 #if CONFIG_MACF
2677 error = mac_mount_check_stat(ctx, mp);
2678 if (error != 0) {
2679 fstp->error = error;
2680 return(VFS_RETURNED_DONE);
2681 }
2682 #endif
2683 sp = &mp->mnt_vfsstat;
2684 /*
2685 * If MNT_NOWAIT is specified, do not refresh the
2686 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2687 */
2688 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2689 (error = vfs_update_vfsstat(mp, ctx,
2690 VFS_USER_EVENT))) {
2691 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2692 return(VFS_RETURNED);
2693 }
2694
2695 /*
2696 * Need to handle LP64 version of struct statfs
2697 */
2698 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2699 if (error) {
2700 fstp->error = error;
2701 return(VFS_RETURNED_DONE);
2702 }
2703 fstp->sfsp += my_size;
2704
2705 if (fstp->mp) {
2706 #if CONFIG_MACF
2707 error = mac_mount_label_get(mp, *fstp->mp);
2708 if (error) {
2709 fstp->error = error;
2710 return(VFS_RETURNED_DONE);
2711 }
2712 #endif
2713 fstp->mp++;
2714 }
2715 }
2716 fstp->count++;
2717 return(VFS_RETURNED);
2718 }
2719
2720 /*
2721 * Get statistics on all filesystems.
2722 */
2723 int
2724 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2725 {
2726 struct __mac_getfsstat_args muap;
2727
2728 muap.buf = uap->buf;
2729 muap.bufsize = uap->bufsize;
2730 muap.mac = USER_ADDR_NULL;
2731 muap.macsize = 0;
2732 muap.flags = uap->flags;
2733
2734 return (__mac_getfsstat(p, &muap, retval));
2735 }
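/*
 * Illustrative sketch (editorial addition, not part of this file): user-space
 * use of getfsstat(2), which lands in the getfsstat() handler above and is
 * forwarded to __mac_getfsstat() below. Passing MNT_NOWAIT returns the cached
 * vfsstat values, as described in getfsstat_callback(); a NULL buffer just
 * returns the number of mounted file systems.
 */
#if 0   /* example only -- user-space code, not built with the kernel */
#include <stdio.h>
#include <stdlib.h>
#include <sys/param.h>
#include <sys/ucred.h>
#include <sys/mount.h>

int
main(void)
{
        int i, n;
        struct statfs *buf;

        /* first call: NULL buffer -> number of mounted file systems */
        n = getfsstat(NULL, 0, MNT_NOWAIT);
        if (n == -1) {
                perror("getfsstat");
                return (1);
        }
        buf = calloc(n, sizeof(*buf));
        if (buf == NULL)
                return (1);
        n = getfsstat(buf, n * (int)sizeof(*buf), MNT_NOWAIT);
        for (i = 0; i < n; i++)
                printf("%-12s %s\n", buf[i].f_fstypename, buf[i].f_mntonname);
        free(buf);
        return (0);
}
#endif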
2736
2737 /*
2738 * __mac_getfsstat: Get MAC-related file system statistics
2739 *
2740 * Parameters: p (ignored)
2741 * uap User argument descriptor (see below)
2742 * retval Count of file system statistics (N stats)
2743 *
2744 * Indirect: uap->bufsize Buffer size
2745 * uap->macsize MAC info size
2746 * uap->buf Buffer where information will be returned
2747 * uap->mac MAC info
2748 * uap->flags File system flags
2749 *
2750 *
2751 * Returns: 0 Success
2752 * !0 Not success
2753 *
2754 */
2755 int
2756 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2757 {
2758 user_addr_t sfsp;
2759 user_addr_t *mp;
2760 size_t count, maxcount, bufsize, macsize;
2761 struct getfsstat_struct fst;
2762
2763 bufsize = (size_t) uap->bufsize;
2764 macsize = (size_t) uap->macsize;
2765
2766 if (IS_64BIT_PROCESS(p)) {
2767 maxcount = bufsize / sizeof(struct user64_statfs);
2768 }
2769 else {
2770 maxcount = bufsize / sizeof(struct user32_statfs);
2771 }
2772 sfsp = uap->buf;
2773 count = 0;
2774
2775 mp = NULL;
2776
2777 #if CONFIG_MACF
2778 if (uap->mac != USER_ADDR_NULL) {
2779 u_int32_t *mp0;
2780 int error;
2781 unsigned int i;
2782
2783 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2784 if (count != maxcount)
2785 return (EINVAL);
2786
2787 /* Copy in the array */
2788 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2789 if (mp0 == NULL) {
2790 return (ENOMEM);
2791 }
2792
2793 error = copyin(uap->mac, mp0, macsize);
2794 if (error) {
2795 FREE(mp0, M_MACTEMP);
2796 return (error);
2797 }
2798
2799 /* Normalize to an array of user_addr_t */
2800 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2801 if (mp == NULL) {
2802 FREE(mp0, M_MACTEMP);
2803 return (ENOMEM);
2804 }
2805
2806 for (i = 0; i < count; i++) {
2807 if (IS_64BIT_PROCESS(p))
2808 mp[i] = ((user_addr_t *)mp0)[i];
2809 else
2810 mp[i] = (user_addr_t)mp0[i];
2811 }
2812 FREE(mp0, M_MACTEMP);
2813 }
2814 #endif
2815
2816
2817 fst.sfsp = sfsp;
2818 fst.mp = mp;
2819 fst.flags = uap->flags;
2820 fst.count = 0;
2821 fst.error = 0;
2822 fst.maxcount = maxcount;
2823
2824
2825 vfs_iterate(0, getfsstat_callback, &fst);
2826
2827 if (mp)
2828 FREE(mp, M_MACTEMP);
2829
2830 if (fst.error ) {
2831 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2832 return(fst.error);
2833 }
2834
2835 if (fst.sfsp && fst.count > fst.maxcount)
2836 *retval = fst.maxcount;
2837 else
2838 *retval = fst.count;
2839 return (0);
2840 }
2841
2842 static int
2843 getfsstat64_callback(mount_t mp, void * arg)
2844 {
2845 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2846 struct vfsstatfs *sp;
2847 int error;
2848
2849 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2850 #if CONFIG_MACF
2851 error = mac_mount_check_stat(vfs_context_current(), mp);
2852 if (error != 0) {
2853 fstp->error = error;
2854 return(VFS_RETURNED_DONE);
2855 }
2856 #endif
2857 sp = &mp->mnt_vfsstat;
2858 /*
2859 * If MNT_NOWAIT is specified, do not refresh the fsstat
2860 * cache. MNT_WAIT overrides MNT_NOWAIT.
2861 *
2862 * We treat MNT_DWAIT as MNT_WAIT for all instances of
2863 * getfsstat, since the constants are out of the same
2864 * namespace.
2865 */
2866 if (((fstp->flags & MNT_NOWAIT) == 0 ||
2867 (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2868 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2869 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2870 return(VFS_RETURNED);
2871 }
2872
2873 error = statfs64_common(mp, sp, fstp->sfsp);
2874 if (error) {
2875 fstp->error = error;
2876 return(VFS_RETURNED_DONE);
2877 }
2878 fstp->sfsp += sizeof(struct statfs64);
2879 }
2880 fstp->count++;
2881 return(VFS_RETURNED);
2882 }
2883
2884 /*
2885 * Get statistics on all file systems in 64 bit mode.
2886 */
2887 int
2888 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2889 {
2890 user_addr_t sfsp;
2891 int count, maxcount;
2892 struct getfsstat_struct fst;
2893
2894 maxcount = uap->bufsize / sizeof(struct statfs64);
2895
2896 sfsp = uap->buf;
2897 count = 0;
2898
2899 fst.sfsp = sfsp;
2900 fst.flags = uap->flags;
2901 fst.count = 0;
2902 fst.error = 0;
2903 fst.maxcount = maxcount;
2904
2905 vfs_iterate(0, getfsstat64_callback, &fst);
2906
2907 if (fst.error ) {
2908 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2909 return(fst.error);
2910 }
2911
2912 if (fst.sfsp && fst.count > fst.maxcount)
2913 *retval = fst.maxcount;
2914 else
2915 *retval = fst.count;
2916
2917 return (0);
2918 }
2919
2920 /*
2921 * Gets the vnode associated with the file descriptor passed in
2922 * as input.
2923 *
2924 * INPUT
2925 * ctx - vfs context of caller
2926 * fd - file descriptor for which vnode is required.
2927 * vpp - Pointer to pointer to vnode to be returned.
2928 *
2929 * The vnode is returned with an iocount so any vnode obtained
2930 * by this call needs a vnode_put
2931 *
2932 */
2933 int
2934 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2935 {
2936 int error;
2937 vnode_t vp;
2938 struct fileproc *fp;
2939 proc_t p = vfs_context_proc(ctx);
2940
2941 *vpp = NULLVP;
2942
2943 error = fp_getfvp(p, fd, &fp, &vp);
2944 if (error)
2945 return (error);
2946
2947 error = vnode_getwithref(vp);
2948 if (error) {
2949 (void)fp_drop(p, fd, fp, 0);
2950 return (error);
2951 }
2952
2953 (void)fp_drop(p, fd, fp, 0);
2954 *vpp = vp;
2955 return (error);
2956 }
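/*
 * Illustrative sketch (editorial addition, not part of this file): minimal
 * in-kernel use of vnode_getfromfd() following the contract documented above.
 * The vnode comes back with an iocount, so every success path must be
 * balanced with vnode_put(), exactly as nameiat() does below. The helper name
 * example_fd_is_dir is hypothetical.
 */
#if 0   /* example only, not compiled */
static int
example_fd_is_dir(vfs_context_t ctx, int fd, int *is_dir)
{
        vnode_t vp;
        int error;

        error = vnode_getfromfd(ctx, fd, &vp);
        if (error)
                return (error);

        *is_dir = (vnode_vtype(vp) == VDIR);

        /* drop the iocount taken by vnode_getfromfd() */
        vnode_put(vp);
        return (0);
}
#endif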
2957
2958 /*
2959 * Wrapper function around namei to start lookup from a directory
2960 * specified by a file descriptor (dirfd).
2961 *
2962 * In addition to all the errors returned by namei, this call can
2963 * return ENOTDIR if the file descriptor does not refer to a directory,
2964 * and EBADF if the file descriptor is not valid.
2965 */
2966 int
2967 nameiat(struct nameidata *ndp, int dirfd)
2968 {
2969 if ((dirfd != AT_FDCWD) &&
2970 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
2971 !(ndp->ni_cnd.cn_flags & USEDVP)) {
2972 int error = 0;
2973 char c;
2974
2975 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
2976 error = copyin(ndp->ni_dirp, &c, sizeof(char));
2977 if (error)
2978 return (error);
2979 } else {
2980 c = *((char *)(ndp->ni_dirp));
2981 }
2982
2983 if (c != '/') {
2984 vnode_t dvp_at;
2985
2986 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
2987 &dvp_at);
2988 if (error)
2989 return (error);
2990
2991 if (vnode_vtype(dvp_at) != VDIR) {
2992 vnode_put(dvp_at);
2993 return (ENOTDIR);
2994 }
2995
2996 ndp->ni_dvp = dvp_at;
2997 ndp->ni_cnd.cn_flags |= USEDVP;
2998 error = namei(ndp);
2999 ndp->ni_cnd.cn_flags &= ~USEDVP;
3000 vnode_put(dvp_at);
3001 return (error);
3002 }
3003 }
3004
3005 return (namei(ndp));
3006 }
3007
3008 /*
3009 * Change current working directory to a given file descriptor.
3010 */
3011 /* ARGSUSED */
3012 static int
3013 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3014 {
3015 struct filedesc *fdp = p->p_fd;
3016 vnode_t vp;
3017 vnode_t tdp;
3018 vnode_t tvp;
3019 struct mount *mp;
3020 int error;
3021 vfs_context_t ctx = vfs_context_current();
3022
3023 AUDIT_ARG(fd, uap->fd);
3024 if (per_thread && uap->fd == -1) {
3025 /*
3026 * Switching back from per-thread to per process CWD; verify we
3027 * in fact have one before proceeding. The only success case
3028 * for this code path is to return 0 preemptively after zapping
3029 * the thread structure contents.
3030 */
3031 thread_t th = vfs_context_thread(ctx);
3032 if (th) {
3033 uthread_t uth = get_bsdthread_info(th);
3034 tvp = uth->uu_cdir;
3035 uth->uu_cdir = NULLVP;
3036 if (tvp != NULLVP) {
3037 vnode_rele(tvp);
3038 return (0);
3039 }
3040 }
3041 return (EBADF);
3042 }
3043
3044 if ( (error = file_vnode(uap->fd, &vp)) )
3045 return(error);
3046 if ( (error = vnode_getwithref(vp)) ) {
3047 file_drop(uap->fd);
3048 return(error);
3049 }
3050
3051 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3052
3053 if (vp->v_type != VDIR) {
3054 error = ENOTDIR;
3055 goto out;
3056 }
3057
3058 #if CONFIG_MACF
3059 error = mac_vnode_check_chdir(ctx, vp);
3060 if (error)
3061 goto out;
3062 #endif
3063 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3064 if (error)
3065 goto out;
3066
3067 while (!error && (mp = vp->v_mountedhere) != NULL) {
3068 if (vfs_busy(mp, LK_NOWAIT)) {
3069 error = EACCES;
3070 goto out;
3071 }
3072 error = VFS_ROOT(mp, &tdp, ctx);
3073 vfs_unbusy(mp);
3074 if (error)
3075 break;
3076 vnode_put(vp);
3077 vp = tdp;
3078 }
3079 if (error)
3080 goto out;
3081 if ( (error = vnode_ref(vp)) )
3082 goto out;
3083 vnode_put(vp);
3084
3085 if (per_thread) {
3086 thread_t th = vfs_context_thread(ctx);
3087 if (th) {
3088 uthread_t uth = get_bsdthread_info(th);
3089 tvp = uth->uu_cdir;
3090 uth->uu_cdir = vp;
3091 OSBitOrAtomic(P_THCWD, &p->p_flag);
3092 } else {
3093 vnode_rele(vp);
3094 return (ENOENT);
3095 }
3096 } else {
3097 proc_fdlock(p);
3098 tvp = fdp->fd_cdir;
3099 fdp->fd_cdir = vp;
3100 proc_fdunlock(p);
3101 }
3102
3103 if (tvp)
3104 vnode_rele(tvp);
3105 file_drop(uap->fd);
3106
3107 return (0);
3108 out:
3109 vnode_put(vp);
3110 file_drop(uap->fd);
3111
3112 return(error);
3113 }
3114
3115 int
3116 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3117 {
3118 return common_fchdir(p, uap, 0);
3119 }
3120
3121 int
3122 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3123 {
3124 return common_fchdir(p, (void *)uap, 1);
3125 }
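/*
 * Illustrative sketch (editorial addition, not part of this file): user-space
 * use of fchdir(2), which is serviced by common_fchdir() above with
 * per_thread == 0. The directory "/var/tmp" is only an example.
 */
#if 0   /* example only -- user-space code, not built with the kernel */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int dfd;

        dfd = open("/var/tmp", O_RDONLY);
        if (dfd == -1) {
                perror("open");
                return (1);
        }
        /* the descriptor must refer to a directory or fchdir() fails */
        if (fchdir(dfd) == -1) {
                perror("fchdir");
                close(dfd);
                return (1);
        }
        close(dfd);
        return (0);
}
#endif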
3126
3127 /*
3128 * Change current working directory (".").
3129 *
3130 * Returns: 0 Success
3131 * change_dir:ENOTDIR
3132 * change_dir:???
3133 * vnode_ref:ENOENT No such file or directory
3134 */
3135 /* ARGSUSED */
3136 static int
3137 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3138 {
3139 struct filedesc *fdp = p->p_fd;
3140 int error;
3141 struct nameidata nd;
3142 vnode_t tvp;
3143 vfs_context_t ctx = vfs_context_current();
3144
3145 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3146 UIO_USERSPACE, uap->path, ctx);
3147 error = change_dir(&nd, ctx);
3148 if (error)
3149 return (error);
3150 if ( (error = vnode_ref(nd.ni_vp)) ) {
3151 vnode_put(nd.ni_vp);
3152 return (error);
3153 }
3154 /*
3155 * drop the iocount we picked up in change_dir
3156 */
3157 vnode_put(nd.ni_vp);
3158
3159 if (per_thread) {
3160 thread_t th = vfs_context_thread(ctx);
3161 if (th) {
3162 uthread_t uth = get_bsdthread_info(th);
3163 tvp = uth->uu_cdir;
3164 uth->uu_cdir = nd.ni_vp;
3165 OSBitOrAtomic(P_THCWD, &p->p_flag);
3166 } else {
3167 vnode_rele(nd.ni_vp);
3168 return (ENOENT);
3169 }
3170 } else {
3171 proc_fdlock(p);
3172 tvp = fdp->fd_cdir;
3173 fdp->fd_cdir = nd.ni_vp;
3174 proc_fdunlock(p);
3175 }
3176
3177 if (tvp)
3178 vnode_rele(tvp);
3179
3180 return (0);
3181 }
3182
3183
3184 /*
3185 * chdir
3186 *
3187 * Change current working directory (".") for the entire process
3188 *
3189 * Parameters: p Process requesting the call
3190 * uap User argument descriptor (see below)
3191 * retval (ignored)
3192 *
3193 * Indirect parameters: uap->path Directory path
3194 *
3195 * Returns: 0 Success
3196 * common_chdir: ENOTDIR
3197 * common_chdir: ENOENT No such file or directory
3198 * common_chdir: ???
3199 *
3200 */
3201 int
3202 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3203 {
3204 return common_chdir(p, (void *)uap, 0);
3205 }
3206
3207 /*
3208 * __pthread_chdir
3209 *
3210 * Change current working directory (".") for a single thread
3211 *
3212 * Parameters: p Process requesting the call
3213 * uap User argument descriptor (see below)
3214 * retval (ignored)
3215 *
3216 * Indirect parameters: uap->path Directory path
3217 *
3218 * Returns: 0 Success
3219 * common_chdir: ENOTDIR
3220 * common_chdir: ENOENT No such file or directory
3221 * common_chdir: ???
3222 *
3223 */
3224 int
3225 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3226 {
3227 return common_chdir(p, (void *)uap, 1);
3228 }
3229
3230
3231 /*
3232 * Change notion of root (``/'') directory.
3233 */
3234 /* ARGSUSED */
3235 int
3236 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3237 {
3238 struct filedesc *fdp = p->p_fd;
3239 int error;
3240 struct nameidata nd;
3241 vnode_t tvp;
3242 vfs_context_t ctx = vfs_context_current();
3243
3244 if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3245 return (error);
3246
3247 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3248 UIO_USERSPACE, uap->path, ctx);
3249 error = change_dir(&nd, ctx);
3250 if (error)
3251 return (error);
3252
3253 #if CONFIG_MACF
3254 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3255 &nd.ni_cnd);
3256 if (error) {
3257 vnode_put(nd.ni_vp);
3258 return (error);
3259 }
3260 #endif
3261
3262 if ( (error = vnode_ref(nd.ni_vp)) ) {
3263 vnode_put(nd.ni_vp);
3264 return (error);
3265 }
3266 vnode_put(nd.ni_vp);
3267
3268 proc_fdlock(p);
3269 tvp = fdp->fd_rdir;
3270 fdp->fd_rdir = nd.ni_vp;
3271 fdp->fd_flags |= FD_CHROOT;
3272 proc_fdunlock(p);
3273
3274 if (tvp != NULL)
3275 vnode_rele(tvp);
3276
3277 return (0);
3278 }
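/*
 * Illustrative sketch (editorial addition, not part of this file): typical
 * privileged user-space use of chroot(2), handled by chroot() above. Note
 * that chroot(2) does not change the working directory, so a chdir("/") is
 * needed to avoid keeping a reference outside the new root. "/var/empty" is
 * just an example path.
 */
#if 0   /* example only -- user-space code, not built with the kernel */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        /* requires superuser privileges (see the suser() check above) */
        if (chroot("/var/empty") == -1) {
                perror("chroot");
                return (1);
        }
        if (chdir("/") == -1) {
                perror("chdir");
                return (1);
        }
        return (0);
}
#endif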
3279
3280 /*
3281 * Common routine for chroot and chdir.
3282 *
3283 * Returns: 0 Success
3284 * ENOTDIR Not a directory
3285 * namei:??? [anything namei can return]
3286 * vnode_authorize:??? [anything vnode_authorize can return]
3287 */
3288 static int
3289 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3290 {
3291 vnode_t vp;
3292 int error;
3293
3294 if ((error = namei(ndp)))
3295 return (error);
3296 nameidone(ndp);
3297 vp = ndp->ni_vp;
3298
3299 if (vp->v_type != VDIR) {
3300 vnode_put(vp);
3301 return (ENOTDIR);
3302 }
3303
3304 #if CONFIG_MACF
3305 error = mac_vnode_check_chdir(ctx, vp);
3306 if (error) {
3307 vnode_put(vp);
3308 return (error);
3309 }
3310 #endif
3311
3312 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3313 if (error) {
3314 vnode_put(vp);
3315 return (error);
3316 }
3317
3318 return (error);
3319 }
3320
3321 /*
3322 * Allocate the per-fd vnode data (for directories) associated with the file glob.
3323 */
3324 struct fd_vn_data *
3325 fg_vn_data_alloc(void)
3326 {
3327 struct fd_vn_data *fvdata;
3328
3329 /* Allocate per fd vnode data */
3330 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3331 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3332 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3333 return fvdata;
3334 }
3335
3336 /*
3337 * Free the vnode data (for directories) associated with the file glob.
3338 */
3339 void
3340 fg_vn_data_free(void *fgvndata)
3341 {
3342 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3343
3344 if (fvdata->fv_buf)
3345 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3346 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3347 FREE(fvdata, M_FD_VN_DATA);
3348 }
3349
3350 /*
3351 * Check permissions, allocate an open file structure,
3352 * and call the device open routine if any.
3353 *
3354 * Returns: 0 Success
3355 * EINVAL
3356 * EINTR
3357 * falloc:ENFILE
3358 * falloc:EMFILE
3359 * falloc:ENOMEM
3360 * vn_open_auth:???
3361 * dupfdopen:???
3362 * VNOP_ADVLOCK:???
3363 * vnode_setsize:???
3364 *
3365 * XXX Need to implement uid, gid
3366 */
3367 int
3368 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3369 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3370 int32_t *retval)
3371 {
3372 proc_t p = vfs_context_proc(ctx);
3373 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3374 struct fileproc *fp;
3375 vnode_t vp;
3376 int flags, oflags;
3377 int type, indx, error;
3378 struct flock lf;
3379 struct vfs_context context;
3380
3381 oflags = uflags;
3382
3383 if ((oflags & O_ACCMODE) == O_ACCMODE)
3384 return(EINVAL);
3385
3386 flags = FFLAGS(uflags);
3387 CLR(flags, FENCRYPTED);
3388 CLR(flags, FUNENCRYPTED);
3389
3390 AUDIT_ARG(fflags, oflags);
3391 AUDIT_ARG(mode, vap->va_mode);
3392
3393 if ((error = falloc_withalloc(p,
3394 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3395 return (error);
3396 }
3397 uu->uu_dupfd = -indx - 1;
3398
3399 if ((error = vn_open_auth(ndp, &flags, vap))) {
3400 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){ /* XXX from fdopen */
3401 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3402 fp_drop(p, indx, NULL, 0);
3403 *retval = indx;
3404 return (0);
3405 }
3406 }
3407 if (error == ERESTART)
3408 error = EINTR;
3409 fp_free(p, indx, fp);
3410 return (error);
3411 }
3412 uu->uu_dupfd = 0;
3413 vp = ndp->ni_vp;
3414
3415 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3416 fp->f_fglob->fg_ops = &vnops;
3417 fp->f_fglob->fg_data = (caddr_t)vp;
3418
3419 if (flags & (O_EXLOCK | O_SHLOCK)) {
3420 lf.l_whence = SEEK_SET;
3421 lf.l_start = 0;
3422 lf.l_len = 0;
3423 if (flags & O_EXLOCK)
3424 lf.l_type = F_WRLCK;
3425 else
3426 lf.l_type = F_RDLCK;
3427 type = F_FLOCK;
3428 if ((flags & FNONBLOCK) == 0)
3429 type |= F_WAIT;
3430 #if CONFIG_MACF
3431 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3432 F_SETLK, &lf);
3433 if (error)
3434 goto bad;
3435 #endif
3436 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3437 goto bad;
3438 fp->f_fglob->fg_flag |= FHASLOCK;
3439 }
3440
3441 #if DEVELOPMENT || DEBUG
3442 /*
3443 * XXX VSWAP: Check for entitlements or special flag here
3444 * so we can restrict access appropriately.
3445 */
3446 #else /* DEVELOPMENT || DEBUG */
3447
3448 if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
3449 /* block attempt to write/truncate swapfile */
3450 error = EPERM;
3451 goto bad;
3452 }
3453 #endif /* DEVELOPMENT || DEBUG */
3454
3455 /* try to truncate by setting the size attribute */
3456 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3457 goto bad;
3458
3459 /*
3460 * For directories we hold some additional information in the fd.
3461 */
3462 if (vnode_vtype(vp) == VDIR) {
3463 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3464 } else {
3465 fp->f_fglob->fg_vn_data = NULL;
3466 }
3467
3468 vnode_put(vp);
3469
3470 /*
3471 * The first terminal open (without O_NOCTTY) by a session leader
3472 * results in it being set as the controlling terminal.
3473 */
3474 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3475 !(flags & O_NOCTTY)) {
3476 int tmp = 0;
3477
3478 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3479 (caddr_t)&tmp, ctx);
3480 }
3481
3482 proc_fdlock(p);
3483 if (flags & O_CLOEXEC)
3484 *fdflags(p, indx) |= UF_EXCLOSE;
3485 if (flags & O_CLOFORK)
3486 *fdflags(p, indx) |= UF_FORKCLOSE;
3487 procfdtbl_releasefd(p, indx, NULL);
3488
3489 #if CONFIG_SECLUDED_MEMORY
3490 if (secluded_for_filecache &&
3491 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3492 vnode_vtype(vp) == VREG) {
3493 memory_object_control_t moc;
3494
3495 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3496
3497 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3498 /* nothing to do... */
3499 } else if (fp->f_fglob->fg_flag & FWRITE) {
3500 /* writable -> no longer eligible for secluded pages */
3501 memory_object_mark_eligible_for_secluded(moc,
3502 FALSE);
3503 } else if (secluded_for_filecache == 1) {
3504 char pathname[32] = { 0, };
3505 size_t copied;
3506 /* XXX FBDP: better way to detect /Applications/ ? */
3507 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3508 copyinstr(ndp->ni_dirp,
3509 pathname,
3510 sizeof (pathname),
3511 &copied);
3512 } else {
3513 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3514 pathname,
3515 sizeof (pathname),
3516 &copied);
3517 }
3518 pathname[sizeof (pathname) - 1] = '\0';
3519 if (strncmp(pathname,
3520 "/Applications/",
3521 strlen("/Applications/")) == 0 &&
3522 strncmp(pathname,
3523 "/Applications/Camera.app/",
3524 strlen("/Applications/Camera.app/")) != 0) {
3525 /*
3526 * not writable
3527 * AND from "/Applications/"
3528 * AND not from "/Applications/Camera.app/"
3529 * ==> eligible for secluded
3530 */
3531 memory_object_mark_eligible_for_secluded(moc,
3532 TRUE);
3533 }
3534 } else if (secluded_for_filecache == 2) {
3535 /* not implemented... */
3536 if (!strncmp(vp->v_name,
3537 DYLD_SHARED_CACHE_NAME,
3538 strlen(DYLD_SHARED_CACHE_NAME)) ||
3539 !strncmp(vp->v_name,
3540 "dyld",
3541 strlen(vp->v_name)) ||
3542 !strncmp(vp->v_name,
3543 "launchd",
3544 strlen(vp->v_name)) ||
3545 !strncmp(vp->v_name,
3546 "Camera",
3547 strlen(vp->v_name)) ||
3548 !strncmp(vp->v_name,
3549 "mediaserverd",
3550 strlen(vp->v_name))) {
3551 /*
3552 * This file matters when launching Camera:
3553 * do not store its contents in the secluded
3554 * pool that will be drained on Camera launch.
3555 */
3556 memory_object_mark_eligible_for_secluded(moc,
3557 FALSE);
3558 }
3559 }
3560 }
3561 #endif /* CONFIG_SECLUDED_MEMORY */
3562
3563 fp_drop(p, indx, fp, 1);
3564 proc_fdunlock(p);
3565
3566 *retval = indx;
3567
3568 return (0);
3569 bad:
3570 context = *vfs_context_current();
3571 context.vc_ucred = fp->f_fglob->fg_cred;
3572
3573 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3574 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3575 lf.l_whence = SEEK_SET;
3576 lf.l_start = 0;
3577 lf.l_len = 0;
3578 lf.l_type = F_UNLCK;
3579
3580 (void)VNOP_ADVLOCK(
3581 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3582 }
3583
3584 vn_close(vp, fp->f_fglob->fg_flag, &context);
3585 vnode_put(vp);
3586 fp_free(p, indx, fp);
3587
3588 return (error);
3589 }
3590
3591 /*
3592 * While most of the *at syscall handlers can call nameiat() which
3593 * is a wrapper around namei, the use of namei and initialisation
3594 * of nameidata are far removed and in different functions - namei
3595 * gets called in vn_open_auth for open1. So we'll just do here what
3596 * nameiat() does.
3597 */
3598 static int
3599 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3600 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3601 int dirfd)
3602 {
3603 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3604 int error;
3605 char c;
3606
3607 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3608 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3609 if (error)
3610 return (error);
3611 } else {
3612 c = *((char *)(ndp->ni_dirp));
3613 }
3614
3615 if (c != '/') {
3616 vnode_t dvp_at;
3617
3618 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3619 &dvp_at);
3620 if (error)
3621 return (error);
3622
3623 if (vnode_vtype(dvp_at) != VDIR) {
3624 vnode_put(dvp_at);
3625 return (ENOTDIR);
3626 }
3627
3628 ndp->ni_dvp = dvp_at;
3629 ndp->ni_cnd.cn_flags |= USEDVP;
3630 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3631 retval);
3632 vnode_put(dvp_at);
3633 return (error);
3634 }
3635 }
3636
3637 return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3638 }
3639
3640 /*
3641 * open_extended: open a file given a path name, with an extended argument list (including extended security (ACL)).
3642 *
3643 * Parameters: p Process requesting the open
3644 * uap User argument descriptor (see below)
3645 * retval Pointer to an area to receive the
3646 * return value from the system call
3647 *
3648 * Indirect: uap->path Path to open (same as 'open')
3649 * uap->flags Flags to open (same as 'open')
3650 * uap->uid UID to set, if creating
3651 * uap->gid GID to set, if creating
3652 * uap->mode File mode, if creating (same as 'open')
3653 * uap->xsecurity ACL to set, if creating
3654 *
3655 * Returns: 0 Success
3656 * !0 errno value
3657 *
3658 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3659 *
3660 * XXX: We should enumerate the possible errno values here, and where
3661 * in the code they originated.
3662 */
3663 int
3664 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3665 {
3666 struct filedesc *fdp = p->p_fd;
3667 int ciferror;
3668 kauth_filesec_t xsecdst;
3669 struct vnode_attr va;
3670 struct nameidata nd;
3671 int cmode;
3672
3673 AUDIT_ARG(owner, uap->uid, uap->gid);
3674
3675 xsecdst = NULL;
3676 if ((uap->xsecurity != USER_ADDR_NULL) &&
3677 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3678 return ciferror;
3679
3680 VATTR_INIT(&va);
3681 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3682 VATTR_SET(&va, va_mode, cmode);
3683 if (uap->uid != KAUTH_UID_NONE)
3684 VATTR_SET(&va, va_uid, uap->uid);
3685 if (uap->gid != KAUTH_GID_NONE)
3686 VATTR_SET(&va, va_gid, uap->gid);
3687 if (xsecdst != NULL)
3688 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3689
3690 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3691 uap->path, vfs_context_current());
3692
3693 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3694 fileproc_alloc_init, NULL, retval);
3695 if (xsecdst != NULL)
3696 kauth_filesec_free(xsecdst);
3697
3698 return ciferror;
3699 }
3700
3701 /*
3702 * Go through the data-protected atomically controlled open (2)
3703 *
3704 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3705 */
3706 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3707 int flags = uap->flags;
3708 int class = uap->class;
3709 int dpflags = uap->dpflags;
3710
3711 /*
3712 * Follow the same path as normal open(2)
3713 * Look up the item if it exists, and acquire the vnode.
3714 */
3715 struct filedesc *fdp = p->p_fd;
3716 struct vnode_attr va;
3717 struct nameidata nd;
3718 int cmode;
3719 int error;
3720
3721 VATTR_INIT(&va);
3722 /* Mask off all but regular access permissions */
3723 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3724 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3725
3726 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3727 uap->path, vfs_context_current());
3728
3729 /*
3730 * Initialize the extra fields in vnode_attr to pass down our
3731 * extra fields.
3732 * 1. target cprotect class.
3733 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3734 */
3735 if (flags & O_CREAT) {
3736 /* lower level kernel code validates that the class is valid before applying it. */
3737 if (class != PROTECTION_CLASS_DEFAULT) {
3738 /*
3739 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3740 * file behave the same as open (2)
3741 */
3742 VATTR_SET(&va, va_dataprotect_class, class);
3743 }
3744 }
3745
3746 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3747 if ( flags & (O_RDWR | O_WRONLY)) {
3748 /* Not allowed to write raw encrypted bytes */
3749 return EINVAL;
3750 }
3751 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3752 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3753 }
3754 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3755 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3756 }
3757 }
3758
3759 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3760 fileproc_alloc_init, NULL, retval);
3761
3762 return error;
3763 }
3764
3765 static int
3766 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3767 int fd, enum uio_seg segflg, int *retval)
3768 {
3769 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3770 struct vnode_attr va;
3771 struct nameidata nd;
3772 int cmode;
3773
3774 VATTR_INIT(&va);
3775 /* Mask off all but regular access permissions */
3776 cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3777 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3778
3779 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3780 segflg, path, ctx);
3781
3782 return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3783 retval, fd));
3784 }
3785
3786 int
3787 open(proc_t p, struct open_args *uap, int32_t *retval)
3788 {
3789 __pthread_testcancel(1);
3790 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3791 }
3792
3793 int
3794 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3795 int32_t *retval)
3796 {
3797 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3798 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3799 }
3800
3801 int
3802 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3803 int32_t *retval)
3804 {
3805 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3806 uap->mode, uap->fd, UIO_USERSPACE, retval));
3807 }
3808
3809 int
3810 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3811 {
3812 __pthread_testcancel(1);
3813 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3814 }
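/*
 * Illustrative sketch (editorial addition, not part of this file): user-space
 * use of openat(2), serviced by openat()/openat_internal() above. A relative
 * path is resolved against the directory descriptor (via the nameiat()-style
 * lookup in open1at()), while AT_FDCWD falls back to the current working
 * directory. The paths and mode are example values.
 */
#if 0   /* example only -- user-space code, not built with the kernel */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int dfd, fd;

        dfd = open("/var/tmp", O_RDONLY);
        if (dfd == -1) {
                perror("open");
                return (1);
        }
        /* "note.txt" is resolved relative to dfd, not to the cwd */
        fd = openat(dfd, "note.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd == -1) {
                perror("openat");
                close(dfd);
                return (1);
        }
        close(fd);
        close(dfd);
        return (0);
}
#endif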
3815
3816 /*
3817 * openbyid_np: open a file given a file system id and a file system object id
3818 * the HFS file system object id is an fsobj_id_t {uint32, uint32};
3819 * for file systems that don't support object ids it is a node id (uint64_t).
3820 *
3821 * Parameters: p Process requesting the open
3822 * uap User argument descriptor (see below)
3823 * retval Pointer to an area to receive the
3824 * return value from the system call
3825 *
3826 * Indirect: uap->path Path to open (same as 'open')
3827 *
3828 * uap->fsid id of target file system
3829 * uap->objid id of target file system object
3830 * uap->flags Flags to open (same as 'open')
3831 *
3832 * Returns: 0 Success
3833 * !0 errno value
3834 *
3835 *
3836 * XXX: We should enumerate the possible errno values here, and where
3837 * in the code they originated.
3838 */
3839 int
3840 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
3841 {
3842 fsid_t fsid;
3843 uint64_t objid;
3844 int error;
3845 char *buf = NULL;
3846 int buflen = MAXPATHLEN;
3847 int pathlen = 0;
3848 vfs_context_t ctx = vfs_context_current();
3849
3850 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
3851 return (error);
3852 }
3853
3854 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
3855 return (error);
3856 }
3857
3858 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
3859 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
3860 return (error);
3861 }
3862
3863 AUDIT_ARG(value32, fsid.val[0]);
3864 AUDIT_ARG(value64, objid);
3865
3866 /* resolve path from fsid, objid */
3867 do {
3868 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
3869 if (buf == NULL) {
3870 return (ENOMEM);
3871 }
3872
3873 error = fsgetpath_internal(
3874 ctx, fsid.val[0], objid,
3875 buflen, buf, &pathlen);
3876
3877 if (error) {
3878 FREE(buf, M_TEMP);
3879 buf = NULL;
3880 }
3881 } while (error == ENOSPC && (buflen += MAXPATHLEN));
3882
3883 if (error) {
3884 return error;
3885 }
3886
3887 buf[pathlen] = 0;
3888
3889 error = openat_internal(
3890 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
3891
3892 FREE(buf, M_TEMP);
3893
3894 return error;
3895 }
3896
3897
3898 /*
3899 * Create a special file.
3900 */
3901 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3902
3903 int
3904 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
3905 {
3906 struct vnode_attr va;
3907 vfs_context_t ctx = vfs_context_current();
3908 int error;
3909 struct nameidata nd;
3910 vnode_t vp, dvp;
3911
3912 VATTR_INIT(&va);
3913 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3914 VATTR_SET(&va, va_rdev, uap->dev);
3915
3916 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
3917 if ((uap->mode & S_IFMT) == S_IFIFO)
3918 return(mkfifo1(ctx, uap->path, &va));
3919
3920 AUDIT_ARG(mode, uap->mode);
3921 AUDIT_ARG(value32, uap->dev);
3922
3923 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
3924 return (error);
3925 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
3926 UIO_USERSPACE, uap->path, ctx);
3927 error = namei(&nd);
3928 if (error)
3929 return (error);
3930 dvp = nd.ni_dvp;
3931 vp = nd.ni_vp;
3932
3933 if (vp != NULL) {
3934 error = EEXIST;
3935 goto out;
3936 }
3937
3938 switch (uap->mode & S_IFMT) {
3939 case S_IFCHR:
3940 VATTR_SET(&va, va_type, VCHR);
3941 break;
3942 case S_IFBLK:
3943 VATTR_SET(&va, va_type, VBLK);
3944 break;
3945 default:
3946 error = EINVAL;
3947 goto out;
3948 }
3949
3950 #if CONFIG_MACF
3951 error = mac_vnode_check_create(ctx,
3952 nd.ni_dvp, &nd.ni_cnd, &va);
3953 if (error)
3954 goto out;
3955 #endif
3956
3957 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3958 goto out;
3959
3960 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
3961 goto out;
3962
3963 if (vp) {
3964 int update_flags = 0;
3965
3966 // Make sure the name & parent pointers are hooked up
3967 if (vp->v_name == NULL)
3968 update_flags |= VNODE_UPDATE_NAME;
3969 if (vp->v_parent == NULLVP)
3970 update_flags |= VNODE_UPDATE_PARENT;
3971
3972 if (update_flags)
3973 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3974
3975 #if CONFIG_FSE
3976 add_fsevent(FSE_CREATE_FILE, ctx,
3977 FSE_ARG_VNODE, vp,
3978 FSE_ARG_DONE);
3979 #endif
3980 }
3981
3982 out:
3983 /*
3984 * nameidone has to happen before we vnode_put(dvp)
3985 * since it may need to release the fs_nodelock on the dvp
3986 */
3987 nameidone(&nd);
3988
3989 if (vp)
3990 vnode_put(vp);
3991 vnode_put(dvp);
3992
3993 return (error);
3994 }
3995
3996 /*
3997 * Create a named pipe.
3998 *
3999 * Returns: 0 Success
4000 * EEXIST
4001 * namei:???
4002 * vnode_authorize:???
4003 * vn_create:???
4004 */
4005 static int
4006 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4007 {
4008 vnode_t vp, dvp;
4009 int error;
4010 struct nameidata nd;
4011
4012 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4013 UIO_USERSPACE, upath, ctx);
4014 error = namei(&nd);
4015 if (error)
4016 return (error);
4017 dvp = nd.ni_dvp;
4018 vp = nd.ni_vp;
4019
4020 /* check that this is a new file and authorize addition */
4021 if (vp != NULL) {
4022 error = EEXIST;
4023 goto out;
4024 }
4025 VATTR_SET(vap, va_type, VFIFO);
4026
4027 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
4028 goto out;
4029
4030 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4031 out:
4032 /*
4033 * nameidone has to happen before we vnode_put(dvp)
4034 * since it may need to release the fs_nodelock on the dvp
4035 */
4036 nameidone(&nd);
4037
4038 if (vp)
4039 vnode_put(vp);
4040 vnode_put(dvp);
4041
4042 return error;
4043 }
4044
4045
4046 /*
4047 * mkfifo_extended: Create a named pipe, with an extended argument list (including extended security (ACL)).
4048 *
4049 * Parameters: p Process requesting the open
4050 * uap User argument descriptor (see below)
4051 * retval (Ignored)
4052 *
4053 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4054 * uap->uid UID to set
4055 * uap->gid GID to set
4056 * uap->mode File mode to set (same as 'mkfifo')
4057 * uap->xsecurity ACL to set, if creating
4058 *
4059 * Returns: 0 Success
4060 * !0 errno value
4061 *
4062 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4063 *
4064 * XXX: We should enumerate the possible errno values here, and where
4065 * in the code they originated.
4066 */
4067 int
4068 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4069 {
4070 int ciferror;
4071 kauth_filesec_t xsecdst;
4072 struct vnode_attr va;
4073
4074 AUDIT_ARG(owner, uap->uid, uap->gid);
4075
4076 xsecdst = KAUTH_FILESEC_NONE;
4077 if (uap->xsecurity != USER_ADDR_NULL) {
4078 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
4079 return ciferror;
4080 }
4081
4082 VATTR_INIT(&va);
4083 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4084 if (uap->uid != KAUTH_UID_NONE)
4085 VATTR_SET(&va, va_uid, uap->uid);
4086 if (uap->gid != KAUTH_GID_NONE)
4087 VATTR_SET(&va, va_gid, uap->gid);
4088 if (xsecdst != KAUTH_FILESEC_NONE)
4089 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4090
4091 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4092
4093 if (xsecdst != KAUTH_FILESEC_NONE)
4094 kauth_filesec_free(xsecdst);
4095 return ciferror;
4096 }
4097
4098 /* ARGSUSED */
4099 int
4100 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4101 {
4102 struct vnode_attr va;
4103
4104 VATTR_INIT(&va);
4105 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4106
4107 return(mkfifo1(vfs_context_current(), uap->path, &va));
4108 }
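/*
 * Illustrative sketch (editorial addition, not part of this file): user-space
 * use of mkfifo(2), which reaches mkfifo()/mkfifo1() above. "/tmp/demo.fifo"
 * is only an example name; the requested mode is filtered through the
 * process umask, mirroring the fd_cmask handling above.
 */
#if 0   /* example only -- user-space code, not built with the kernel */
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

int
main(void)
{
        if (mkfifo("/tmp/demo.fifo", 0600) == -1) {
                perror("mkfifo");       /* EEXIST if it is already there */
                return (1);
        }
        return (0);
}
#endif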
4109
4110
4111 static char *
4112 my_strrchr(char *p, int ch)
4113 {
4114 char *save;
4115
4116 for (save = NULL;; ++p) {
4117 if (*p == ch)
4118 save = p;
4119 if (!*p)
4120 return(save);
4121 }
4122 /* NOTREACHED */
4123 }
4124
4125 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4126
4127 int
4128 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4129 {
4130 int ret, len = _len;
4131
4132 *truncated_path = 0;
4133 ret = vn_getpath(dvp, path, &len);
4134 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4135 if (leafname) {
4136 path[len-1] = '/';
4137 len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
4138 if (len > MAXPATHLEN) {
4139 char *ptr;
4140
4141 // the string got truncated!
4142 *truncated_path = 1;
4143 ptr = my_strrchr(path, '/');
4144 if (ptr) {
4145 *ptr = '\0'; // chop off the string at the last directory component
4146 }
4147 len = strlen(path) + 1;
4148 }
4149 }
4150 } else if (ret == 0) {
4151 *truncated_path = 1;
4152 } else if (ret != 0) {
4153 struct vnode *mydvp=dvp;
4154
4155 if (ret != ENOSPC) {
4156 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4157 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4158 }
4159 *truncated_path = 1;
4160
4161 do {
4162 if (mydvp->v_parent != NULL) {
4163 mydvp = mydvp->v_parent;
4164 } else if (mydvp->v_mount) {
4165 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4166 break;
4167 } else {
4168 // no parent and no mount point? only thing is to punt and say "/" changed
4169 strlcpy(path, "/", _len);
4170 len = 2;
4171 mydvp = NULL;
4172 }
4173
4174 if (mydvp == NULL) {
4175 break;
4176 }
4177
4178 len = _len;
4179 ret = vn_getpath(mydvp, path, &len);
4180 } while (ret == ENOSPC);
4181 }
4182
4183 return len;
4184 }
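
/*
 * Typical in-kernel usage of safe_getpath(), mirroring the callers later in
 * this file (a sketch only; error handling elided):
 *
 *	char *path = NULL;
 *	int len, truncated = 0;
 *
 *	GET_PATH(path);
 *	if (path != NULL) {
 *		len = safe_getpath(dvp, cnp->cn_nameptr, path, MAXPATHLEN, &truncated);
 *		// 'len' and 'truncated' are then handed to add_fsevent();
 *		// FSE_TRUNCATED_PATH is set when 'truncated' is non-zero.
 *		RELEASE_PATH(path);
 *	}
 */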
4185
4186
4187 /*
4188 * Make a hard file link.
4189 *
4190 * Returns: 0 Success
4191 * EPERM
4192 * EEXIST
4193 * EXDEV
4194 * namei:???
4195 * vnode_authorize:???
4196 * VNOP_LINK:???
4197 */
4198 /* ARGSUSED */
4199 static int
4200 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4201 user_addr_t link, int flag, enum uio_seg segflg)
4202 {
4203 vnode_t vp, dvp, lvp;
4204 struct nameidata nd;
4205 int follow;
4206 int error;
4207 #if CONFIG_FSE
4208 fse_info finfo;
4209 #endif
4210 int need_event, has_listeners;
4211 char *target_path = NULL;
4212 int truncated=0;
4213
4214 vp = dvp = lvp = NULLVP;
4215
4216 /* look up the object we are linking to */
4217 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4218 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4219 segflg, path, ctx);
4220
4221 error = nameiat(&nd, fd1);
4222 if (error)
4223 return (error);
4224 vp = nd.ni_vp;
4225
4226 nameidone(&nd);
4227
4228 /*
4229 * Normally, linking to directories is not supported.
4230 * However, some file systems may have limited support.
4231 */
4232 if (vp->v_type == VDIR) {
4233 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4234 error = EPERM; /* POSIX */
4235 goto out;
4236 }
4237
4238 /* Linking to a directory requires ownership. */
4239 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4240 struct vnode_attr dva;
4241
4242 VATTR_INIT(&dva);
4243 VATTR_WANTED(&dva, va_uid);
4244 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4245 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4246 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4247 error = EACCES;
4248 goto out;
4249 }
4250 }
4251 }
4252
4253 /* lookup the target node */
4254 #if CONFIG_TRIGGERS
4255 nd.ni_op = OP_LINK;
4256 #endif
4257 nd.ni_cnd.cn_nameiop = CREATE;
4258 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4259 nd.ni_dirp = link;
4260 error = nameiat(&nd, fd2);
4261 if (error != 0)
4262 goto out;
4263 dvp = nd.ni_dvp;
4264 lvp = nd.ni_vp;
4265
4266 #if CONFIG_MACF
4267 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4268 goto out2;
4269 #endif
4270
4271 /* nor may we link to anything that kauth doesn't want us to (eg. immutable items) */
4272 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4273 goto out2;
4274
4275 /* target node must not exist */
4276 if (lvp != NULLVP) {
4277 error = EEXIST;
4278 goto out2;
4279 }
4280 /* cannot link across mountpoints */
4281 if (vnode_mount(vp) != vnode_mount(dvp)) {
4282 error = EXDEV;
4283 goto out2;
4284 }
4285
4286 /* authorize creation of the target node */
4287 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4288 goto out2;
4289
4290 /* and finally make the link */
4291 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4292 if (error)
4293 goto out2;
4294
4295 #if CONFIG_MACF
4296 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4297 #endif
4298
4299 #if CONFIG_FSE
4300 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4301 #else
4302 need_event = 0;
4303 #endif
4304 has_listeners = kauth_authorize_fileop_has_listeners();
4305
4306 if (need_event || has_listeners) {
4307 char *link_to_path = NULL;
4308 int len, link_name_len;
4309
4310 /* build the path to the new link file */
4311 GET_PATH(target_path);
4312 if (target_path == NULL) {
4313 error = ENOMEM;
4314 goto out2;
4315 }
4316
4317 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4318
4319 if (has_listeners) {
4320 /* build the path to file we are linking to */
4321 GET_PATH(link_to_path);
4322 if (link_to_path == NULL) {
4323 error = ENOMEM;
4324 goto out2;
4325 }
4326
4327 link_name_len = MAXPATHLEN;
4328 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4329 /*
4330 * Call out to allow 3rd party notification of link creation.
4331 * Ignore result of kauth_authorize_fileop call.
4332 */
4333 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4334 (uintptr_t)link_to_path,
4335 (uintptr_t)target_path);
4336 }
4337 if (link_to_path != NULL) {
4338 RELEASE_PATH(link_to_path);
4339 }
4340 }
4341 #if CONFIG_FSE
4342 if (need_event) {
4343 /* construct fsevent */
4344 if (get_fse_info(vp, &finfo, ctx) == 0) {
4345 if (truncated) {
4346 finfo.mode |= FSE_TRUNCATED_PATH;
4347 }
4348
4349 // build the path to the destination of the link
4350 add_fsevent(FSE_CREATE_FILE, ctx,
4351 FSE_ARG_STRING, len, target_path,
4352 FSE_ARG_FINFO, &finfo,
4353 FSE_ARG_DONE);
4354 }
4355 if (vp->v_parent) {
4356 add_fsevent(FSE_STAT_CHANGED, ctx,
4357 FSE_ARG_VNODE, vp->v_parent,
4358 FSE_ARG_DONE);
4359 }
4360 }
4361 #endif
4362 }
4363 out2:
4364 /*
4365 * nameidone has to happen before we vnode_put(dvp)
4366 * since it may need to release the fs_nodelock on the dvp
4367 */
4368 nameidone(&nd);
4369 if (target_path != NULL) {
4370 RELEASE_PATH(target_path);
4371 }
4372 out:
4373 if (lvp)
4374 vnode_put(lvp);
4375 if (dvp)
4376 vnode_put(dvp);
4377 vnode_put(vp);
4378 return (error);
4379 }
4380
4381 int
4382 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4383 {
4384 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4385 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4386 }
4387
4388 int
4389 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4390 {
4391 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4392 return (EINVAL);
4393
4394 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4395 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4396 }
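
/*
 * Illustrative userspace sketch (not part of this file): linkat(2) as exposed
 * above.  With AT_SYMLINK_FOLLOW the source path is resolved through a final
 * symlink; without it the symlink itself becomes the link source.  The paths
 * below are hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (linkat(AT_FDCWD, "data/original.txt",
 *	    AT_FDCWD, "data/hardlink.txt", AT_SYMLINK_FOLLOW) == -1)
 *		perror("linkat");	// e.g. EEXIST, EXDEV (cross-mount), EPERM
 */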
4397
4398 /*
4399 * Make a symbolic link.
4400 *
4401 * We could add support for ACLs here too...
4402 */
4403 /* ARGSUSED */
4404 static int
4405 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4406 user_addr_t link, enum uio_seg segflg)
4407 {
4408 struct vnode_attr va;
4409 char *path;
4410 int error;
4411 struct nameidata nd;
4412 vnode_t vp, dvp;
4413 size_t dummy=0;
4414 proc_t p;
4415
4416 error = 0;
4417 if (UIO_SEG_IS_USER_SPACE(segflg)) {
4418 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4419 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4420 } else {
4421 path = (char *)path_data;
4422 }
4423 if (error)
4424 goto out;
4425 AUDIT_ARG(text, path); /* This is the link string */
4426
4427 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4428 segflg, link, ctx);
4429
4430 error = nameiat(&nd, fd);
4431 if (error)
4432 goto out;
4433 dvp = nd.ni_dvp;
4434 vp = nd.ni_vp;
4435
4436 p = vfs_context_proc(ctx);
4437 VATTR_INIT(&va);
4438 VATTR_SET(&va, va_type, VLNK);
4439 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4440
4441 #if CONFIG_MACF
4442 error = mac_vnode_check_create(ctx,
4443 dvp, &nd.ni_cnd, &va);
4444 #endif
4445 if (error != 0) {
4446 goto skipit;
4447 }
4448
4449 if (vp != NULL) {
4450 error = EEXIST;
4451 goto skipit;
4452 }
4453
4454 /* authorize */
4455 if (error == 0)
4456 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4457 /* get default ownership, etc. */
4458 if (error == 0)
4459 error = vnode_authattr_new(dvp, &va, 0, ctx);
4460 if (error == 0)
4461 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4462
4463 #if CONFIG_MACF
4464 if (error == 0 && vp)
4465 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4466 #endif
4467
4468 /* do fallback attribute handling */
4469 if (error == 0 && vp)
4470 error = vnode_setattr_fallback(vp, &va, ctx);
4471
4472 if (error == 0) {
4473 int update_flags = 0;
4474
4475 /*check if a new vnode was created, else try to get one*/
4476 if (vp == NULL) {
4477 nd.ni_cnd.cn_nameiop = LOOKUP;
4478 #if CONFIG_TRIGGERS
4479 nd.ni_op = OP_LOOKUP;
4480 #endif
4481 nd.ni_cnd.cn_flags = 0;
4482 error = nameiat(&nd, fd);
4483 vp = nd.ni_vp;
4484
4485 if (vp == NULL)
4486 goto skipit;
4487 }
4488
4489 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4490 /* call out to allow 3rd party notification of symlink creation.
4491 * Ignore result of kauth_authorize_fileop call.
4492 */
4493 if (kauth_authorize_fileop_has_listeners() &&
4494 namei(&nd) == 0) {
4495 char *new_link_path = NULL;
4496 int len;
4497
4498 /* build the path to the new link file */
4499 new_link_path = get_pathbuff();
4500 len = MAXPATHLEN;
4501 vn_getpath(dvp, new_link_path, &len);
4502 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4503 new_link_path[len - 1] = '/';
4504 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4505 }
4506
4507 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4508 (uintptr_t)path, (uintptr_t)new_link_path);
4509 if (new_link_path != NULL)
4510 release_pathbuff(new_link_path);
4511 }
4512 #endif
4513 // Make sure the name & parent pointers are hooked up
4514 if (vp->v_name == NULL)
4515 update_flags |= VNODE_UPDATE_NAME;
4516 if (vp->v_parent == NULLVP)
4517 update_flags |= VNODE_UPDATE_PARENT;
4518
4519 if (update_flags)
4520 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4521
4522 #if CONFIG_FSE
4523 add_fsevent(FSE_CREATE_FILE, ctx,
4524 FSE_ARG_VNODE, vp,
4525 FSE_ARG_DONE);
4526 #endif
4527 }
4528
4529 skipit:
4530 /*
4531 * nameidone has to happen before we vnode_put(dvp)
4532 * since it may need to release the fs_nodelock on the dvp
4533 */
4534 nameidone(&nd);
4535
4536 if (vp)
4537 vnode_put(vp);
4538 vnode_put(dvp);
4539 out:
4540 if (path && (path != (char *)path_data))
4541 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4542
4543 return (error);
4544 }
4545
4546 int
4547 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4548 {
4549 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4550 uap->link, UIO_USERSPACE));
4551 }
4552
4553 int
4554 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4555 __unused int32_t *retval)
4556 {
4557 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4558 uap->path2, UIO_USERSPACE));
4559 }
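
/*
 * Illustrative userspace sketch (not part of this file): symlinkat(2) as
 * exposed above.  Note the argument order: the link contents come first and
 * the name of the new symlink comes last.  The paths below are hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (symlinkat("../shared/config.ini", AT_FDCWD, "config.ini") == -1)
 *		perror("symlinkat");	// e.g. EEXIST if 'config.ini' already exists
 */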
4560
4561 /*
4562 * Delete a whiteout from the filesystem.
4563 * No longer supported.
4564 */
4565 int
4566 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4567 {
4568 return (ENOTSUP);
4569 }
4570
4571 /*
4572 * Delete a name from the filesystem.
4573 */
4574 /* ARGSUSED */
4575 static int
4576 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4577 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4578 {
4579 struct nameidata nd;
4580 vnode_t vp, dvp;
4581 int error;
4582 struct componentname *cnp;
4583 char *path = NULL;
4584 int len=0;
4585 #if CONFIG_FSE
4586 fse_info finfo;
4587 struct vnode_attr va;
4588 #endif
4589 int flags;
4590 int need_event;
4591 int has_listeners;
4592 int truncated_path;
4593 int batched;
4594 struct vnode_attr *vap;
4595 int do_retry;
4596 int retry_count = 0;
4597 int cn_flags;
4598
4599 cn_flags = LOCKPARENT;
4600 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4601 cn_flags |= AUDITVNPATH1;
4602 /* If a starting dvp is passed, it trumps any fd passed. */
4603 if (start_dvp)
4604 cn_flags |= USEDVP;
4605
4606 #if NAMEDRSRCFORK
4607 /* unlink or delete is allowed on rsrc forks and named streams */
4608 cn_flags |= CN_ALLOWRSRCFORK;
4609 #endif
4610
4611 retry:
4612 do_retry = 0;
4613 flags = 0;
4614 need_event = 0;
4615 has_listeners = 0;
4616 truncated_path = 0;
4617 vap = NULL;
4618
4619 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4620
4621 nd.ni_dvp = start_dvp;
4622 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4623 cnp = &nd.ni_cnd;
4624
4625 continue_lookup:
4626 error = nameiat(&nd, fd);
4627 if (error)
4628 return (error);
4629
4630 dvp = nd.ni_dvp;
4631 vp = nd.ni_vp;
4632
4633
4634 /* With Carbon delete semantics, busy files cannot be deleted */
4635 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4636 flags |= VNODE_REMOVE_NODELETEBUSY;
4637 }
4638
4639 /* Skip any potential upcalls if told to. */
4640 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4641 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4642 }
4643
4644 if (vp) {
4645 batched = vnode_compound_remove_available(vp);
4646 /*
4647 * The root of a mounted filesystem cannot be deleted.
4648 */
4649 if (vp->v_flag & VROOT) {
4650 error = EBUSY;
4651 }
4652
4653 #if DEVELOPMENT || DEBUG
4654 /*
4655 * XXX VSWAP: Check for entitlements or special flag here
4656 * so we can restrict access appropriately.
4657 */
4658 #else /* DEVELOPMENT || DEBUG */
4659
4660 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
4661 error = EPERM;
4662 goto out;
4663 }
4664 #endif /* DEVELOPMENT || DEBUG */
4665
4666 if (!batched) {
4667 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4668 if (error) {
4669 if (error == ENOENT) {
4670 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4671 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4672 do_retry = 1;
4673 retry_count++;
4674 }
4675 }
4676 goto out;
4677 }
4678 }
4679 } else {
4680 batched = 1;
4681
4682 if (!vnode_compound_remove_available(dvp)) {
4683 panic("No vp, but no compound remove?");
4684 }
4685 }
4686
4687 #if CONFIG_FSE
4688 need_event = need_fsevent(FSE_DELETE, dvp);
4689 if (need_event) {
4690 if (!batched) {
4691 if ((vp->v_flag & VISHARDLINK) == 0) {
4692 /* XXX need to get these data in batched VNOP */
4693 get_fse_info(vp, &finfo, ctx);
4694 }
4695 } else {
4696 error = vfs_get_notify_attributes(&va);
4697 if (error) {
4698 goto out;
4699 }
4700
4701 vap = &va;
4702 }
4703 }
4704 #endif
4705 has_listeners = kauth_authorize_fileop_has_listeners();
4706 if (need_event || has_listeners) {
4707 if (path == NULL) {
4708 GET_PATH(path);
4709 if (path == NULL) {
4710 error = ENOMEM;
4711 goto out;
4712 }
4713 }
4714 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4715 }
4716
4717 #if NAMEDRSRCFORK
4718 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4719 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4720 else
4721 #endif
4722 {
4723 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4724 vp = nd.ni_vp;
4725 if (error == EKEEPLOOKING) {
4726 if (!batched) {
4727 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4728 }
4729
4730 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4731 panic("EKEEPLOOKING, but continue flag not set?");
4732 }
4733
4734 if (vnode_isdir(vp)) {
4735 error = EISDIR;
4736 goto out;
4737 }
4738 goto continue_lookup;
4739 } else if (error == ENOENT && batched) {
4740 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4741 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4742 /*
4743 * For compound VNOPs, the authorization callback may
4744 * return ENOENT in case of racing hardlink lookups
4745 * hitting the name cache; redrive the lookup.
4746 */
4747 do_retry = 1;
4748 retry_count += 1;
4749 goto out;
4750 }
4751 }
4752 }
4753
4754 /*
4755 * Call out to allow 3rd party notification of delete.
4756 * Ignore result of kauth_authorize_fileop call.
4757 */
4758 if (!error) {
4759 if (has_listeners) {
4760 kauth_authorize_fileop(vfs_context_ucred(ctx),
4761 KAUTH_FILEOP_DELETE,
4762 (uintptr_t)vp,
4763 (uintptr_t)path);
4764 }
4765
4766 if (vp->v_flag & VISHARDLINK) {
4767 //
4768 // if a hardlink gets deleted we want to blow away the
4769 // v_parent link because the path that got us to this
4770 // instance of the link is no longer valid. this will
4771 // force the next call to get the path to ask the file
4772 // system instead of just following the v_parent link.
4773 //
4774 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4775 }
4776
4777 #if CONFIG_FSE
4778 if (need_event) {
4779 if (vp->v_flag & VISHARDLINK) {
4780 get_fse_info(vp, &finfo, ctx);
4781 } else if (vap) {
4782 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4783 }
4784 if (truncated_path) {
4785 finfo.mode |= FSE_TRUNCATED_PATH;
4786 }
4787 add_fsevent(FSE_DELETE, ctx,
4788 FSE_ARG_STRING, len, path,
4789 FSE_ARG_FINFO, &finfo,
4790 FSE_ARG_DONE);
4791 }
4792 #endif
4793 }
4794
4795 out:
4796 if (path != NULL)
4797 RELEASE_PATH(path);
4798
4799 #if NAMEDRSRCFORK
4800 /* recycle the deleted rsrc fork vnode to force a reclaim, which
4801 * will cause its shadow file to go away if necessary.
4802 */
4803 if (vp && (vnode_isnamedstream(vp)) &&
4804 (vp->v_parent != NULLVP) &&
4805 vnode_isshadow(vp)) {
4806 vnode_recycle(vp);
4807 }
4808 #endif
4809 /*
4810 * nameidone has to happen before we vnode_put(dvp)
4811 * since it may need to release the fs_nodelock on the dvp
4812 */
4813 nameidone(&nd);
4814 vnode_put(dvp);
4815 if (vp) {
4816 vnode_put(vp);
4817 }
4818
4819 if (do_retry) {
4820 goto retry;
4821 }
4822
4823 return (error);
4824 }
4825
4826 int
4827 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4828 enum uio_seg segflg, int unlink_flags)
4829 {
4830 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4831 unlink_flags));
4832 }
4833
4834 /*
4835 * Delete a name from the filesystem using Carbon semantics.
4836 */
4837 int
4838 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4839 {
4840 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4841 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4842 }
4843
4844 /*
4845 * Delete a name from the filesystem using POSIX semantics.
4846 */
4847 int
4848 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4849 {
4850 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4851 uap->path, UIO_USERSPACE, 0));
4852 }
4853
4854 int
4855 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4856 {
4857 if (uap->flag & ~AT_REMOVEDIR)
4858 return (EINVAL);
4859
4860 if (uap->flag & AT_REMOVEDIR)
4861 return (rmdirat_internal(vfs_context_current(), uap->fd,
4862 uap->path, UIO_USERSPACE));
4863 else
4864 return (unlinkat_internal(vfs_context_current(), uap->fd,
4865 NULLVP, uap->path, UIO_USERSPACE, 0));
4866 }
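
/*
 * Illustrative userspace sketch (not part of this file): unlinkat(2) as
 * exposed above.  AT_REMOVEDIR routes the call to rmdir semantics; without it
 * a directory argument is rejected.  The path below is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (unlinkat(AT_FDCWD, "tmp/scratch", AT_REMOVEDIR) == -1)
 *		perror("unlinkat");	// e.g. ENOTEMPTY, EBUSY, EINVAL (bad flag)
 */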
4867
4868 /*
4869 * Reposition read/write file offset.
4870 */
4871 int
4872 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4873 {
4874 struct fileproc *fp;
4875 vnode_t vp;
4876 struct vfs_context *ctx;
4877 off_t offset = uap->offset, file_size;
4878 int error;
4879
4880 if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4881 if (error == ENOTSUP)
4882 return (ESPIPE);
4883 return (error);
4884 }
4885 if (vnode_isfifo(vp)) {
4886 file_drop(uap->fd);
4887 return(ESPIPE);
4888 }
4889
4890
4891 ctx = vfs_context_current();
4892 #if CONFIG_MACF
4893 if (uap->whence == L_INCR && uap->offset == 0)
4894 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4895 fp->f_fglob);
4896 else
4897 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4898 fp->f_fglob);
4899 if (error) {
4900 file_drop(uap->fd);
4901 return (error);
4902 }
4903 #endif
4904 if ( (error = vnode_getwithref(vp)) ) {
4905 file_drop(uap->fd);
4906 return(error);
4907 }
4908
4909 switch (uap->whence) {
4910 case L_INCR:
4911 offset += fp->f_fglob->fg_offset;
4912 break;
4913 case L_XTND:
4914 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
4915 break;
4916 offset += file_size;
4917 break;
4918 case L_SET:
4919 break;
4920 case SEEK_HOLE:
4921 error = VNOP_IOCTL(vp, FSCTL_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
4922 break;
4923 case SEEK_DATA:
4924 error = VNOP_IOCTL(vp, FSCTL_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
4925 break;
4926 default:
4927 error = EINVAL;
4928 }
4929 if (error == 0) {
4930 if (uap->offset > 0 && offset < 0) {
4931 /* Incremented/relative move past max size */
4932 error = EOVERFLOW;
4933 } else {
4934 /*
4935 * Allow negative offsets on character devices, per
4936 * POSIX 1003.1-2001. Most likely for writing disk
4937 * labels.
4938 */
4939 if (offset < 0 && vp->v_type != VCHR) {
4940 /* Decremented/relative move before start */
4941 error = EINVAL;
4942 } else {
4943 /* Success */
4944 fp->f_fglob->fg_offset = offset;
4945 *retval = fp->f_fglob->fg_offset;
4946 }
4947 }
4948 }
4949
4950 /*
4951 * An lseek can affect whether data is "available to read." Use
4952 * hint of NOTE_NONE so no EVFILT_VNODE events fire
4953 */
4954 post_event_if_success(vp, error, NOTE_NONE);
4955 (void)vnode_put(vp);
4956 file_drop(uap->fd);
4957 return (error);
4958 }
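
/*
 * Illustrative userspace sketch (not part of this file): probing for the next
 * data region with lseek(2), which the SEEK_DATA/SEEK_HOLE cases above service
 * via FSCTL_FIOSEEKDATA/FSCTL_FIOSEEKHOLE.  'fd' and 'start_offset' are
 * assumed to be an open descriptor and a starting offset; support depends on
 * the underlying filesystem.
 *
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	off_t next_data = lseek(fd, start_offset, SEEK_DATA);
 *	if (next_data == -1)
 *		perror("lseek(SEEK_DATA)");	// e.g. ENXIO when no further data exists
 */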
4959
4960
4961 /*
4962 * Check access permissions.
4963 *
4964 * Returns: 0 Success
4965 * vnode_authorize:???
4966 */
4967 static int
4968 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4969 {
4970 kauth_action_t action;
4971 int error;
4972
4973 /*
4974 * If just the regular access bits, convert them to something
4975 * that vnode_authorize will understand.
4976 */
4977 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4978 action = 0;
4979 if (uflags & R_OK)
4980 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
4981 if (uflags & W_OK) {
4982 if (vnode_isdir(vp)) {
4983 action |= KAUTH_VNODE_ADD_FILE |
4984 KAUTH_VNODE_ADD_SUBDIRECTORY;
4985 /* might want delete rights here too */
4986 } else {
4987 action |= KAUTH_VNODE_WRITE_DATA;
4988 }
4989 }
4990 if (uflags & X_OK) {
4991 if (vnode_isdir(vp)) {
4992 action |= KAUTH_VNODE_SEARCH;
4993 } else {
4994 action |= KAUTH_VNODE_EXECUTE;
4995 }
4996 }
4997 } else {
4998 /* take advantage of the definition of uflags: the extended bits encode the kauth action, shifted left by 8 */
4999 action = uflags >> 8;
5000 }
5001
5002 #if CONFIG_MACF
5003 error = mac_vnode_check_access(ctx, vp, uflags);
5004 if (error)
5005 return (error);
5006 #endif /* MAC */
5007
5008 /* action == 0 means only check for existence */
5009 if (action != 0) {
5010 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5011 } else {
5012 error = 0;
5013 }
5014
5015 return(error);
5016 }
5017
5018
5019
5020 /*
5021 * access_extended: Check access permissions in bulk.
5022 *
5023 * Description: uap->entries Pointer to an array of accessx
5024 * descriptor structs, plus one or
5025 * more NULL terminated strings (see
5026 * "Notes" section below).
5027 * uap->size Size of the area pointed to by
5028 * uap->entries.
5029 * uap->results Pointer to the results array.
5030 *
5031 * Returns: 0 Success
5032 * ENOMEM Insufficient memory
5033 * EINVAL Invalid arguments
5034 * namei:EFAULT Bad address
5035 * namei:ENAMETOOLONG Filename too long
5036 * namei:ENOENT No such file or directory
5037 * namei:ELOOP Too many levels of symbolic links
5038 * namei:EBADF Bad file descriptor
5039 * namei:ENOTDIR Not a directory
5040 * namei:???
5041 * access1:
5042 *
5043 * Implicit returns:
5044 * uap->results Array contents modified
5045 *
5046 * Notes: The uap->entries are structured as an arbitrary length array
5047 * of accessx descriptors, followed by one or more NULL terminated
5048 * strings
5049 *
5050 * struct accessx_descriptor[0]
5051 * ...
5052 * struct accessx_descriptor[n]
5053 * char name_data[0];
5054 *
5055 * We determine the entry count by walking the buffer containing
5056 * the uap->entries argument descriptor. For each descriptor we
5057 * see, the valid values for the offset ad_name_offset will be
5058 * in the byte range:
5059 *
5060 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5061 * to
5062 * [ uap->entries + uap->size - 2 ]
5063 *
5064 * since we must have at least one string, and the string must
5065 * be at least one character plus the NULL terminator in length.
5066 *
5067 * XXX: Need to support the check-as uid argument
5068 */
5069 int
5070 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5071 {
5072 struct accessx_descriptor *input = NULL;
5073 errno_t *result = NULL;
5074 errno_t error = 0;
5075 int wantdelete = 0;
5076 unsigned int desc_max, desc_actual, i, j;
5077 struct vfs_context context;
5078 struct nameidata nd;
5079 int niopts;
5080 vnode_t vp = NULL;
5081 vnode_t dvp = NULL;
5082 #define ACCESSX_MAX_DESCR_ON_STACK 10
5083 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5084
5085 context.vc_ucred = NULL;
5086
5087 /*
5088 * Validate parameters; if valid, copy the descriptor array and string
5089 * arguments into local memory. Before proceeding, the following
5090 * conditions must have been met:
5091 *
5092 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5093 * o There must be sufficient room in the request for at least one
5094 * descriptor and a one byte NUL terminated string.
5095 * o The allocation of local storage must not fail.
5096 */
5097 if (uap->size > ACCESSX_MAX_TABLESIZE)
5098 return(ENOMEM);
5099 if (uap->size < (sizeof(struct accessx_descriptor) + 2))
5100 return(EINVAL);
5101 if (uap->size <= sizeof (stack_input)) {
5102 input = stack_input;
5103 } else {
5104 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5105 if (input == NULL) {
5106 error = ENOMEM;
5107 goto out;
5108 }
5109 }
5110 error = copyin(uap->entries, input, uap->size);
5111 if (error)
5112 goto out;
5113
5114 AUDIT_ARG(opaque, input, uap->size);
5115
5116 /*
5117 * Force NUL termination of the copyin buffer to avoid namei() running
5118 * off the end. If the caller passes us bogus data, they may get a
5119 * bogus result.
5120 */
5121 ((char *)input)[uap->size - 1] = 0;
5122
5123 /*
5124 * Access is defined as checking against the process' real identity,
5125 * even if operations are checking the effective identity. This
5126 * requires that we use a local vfs context.
5127 */
5128 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5129 context.vc_thread = current_thread();
5130
5131 /*
5132 * Find out how many entries we have, so we can allocate the result
5133 * array by walking the list and adjusting the count downward by the
5134 * earliest string offset we see.
5135 */
5136 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5137 desc_actual = desc_max;
5138 for (i = 0; i < desc_actual; i++) {
5139 /*
5140 * Take the offset to the name string for this entry and
5141 * convert to an input array index, which would be one off
5142 * the end of the array if this entry was the lowest-addressed
5143 * name string.
5144 */
5145 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5146
5147 /*
5148 * An offset greater than the max allowable offset is an error.
5149 * It is also an error for any valid entry to point
5150 * to a location prior to the end of the current entry, if
5151 * it's not a reference to the string of the previous entry.
5152 */
5153 if (j > desc_max || (j != 0 && j <= i)) {
5154 error = EINVAL;
5155 goto out;
5156 }
5157
5158 /* Also do not let ad_name_offset point to something beyond the size of the input */
5159 if (input[i].ad_name_offset >= uap->size) {
5160 error = EINVAL;
5161 goto out;
5162 }
5163
5164 /*
5165 * An offset of 0 means use the previous descriptor's offset;
5166 * this is used to chain multiple requests for the same file
5167 * to avoid multiple lookups.
5168 */
5169 if (j == 0) {
5170 /* This is not valid for the first entry */
5171 if (i == 0) {
5172 error = EINVAL;
5173 goto out;
5174 }
5175 continue;
5176 }
5177
5178 /*
5179 * If the offset of the string for this descriptor is before
5180 * what we believe is the current actual last descriptor,
5181 * then we need to adjust our estimate downward; this permits
5182 * the string table following the last descriptor to be out
5183 * of order relative to the descriptor list.
5184 */
5185 if (j < desc_actual)
5186 desc_actual = j;
5187 }
5188
5189 /*
5190 * We limit the actual number of descriptors we are willing to process
5191 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5192 * requested exceeds this limit, the request fails with ENOMEM.
5193 */
5194 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5195 error = ENOMEM;
5196 goto out;
5197 }
5198 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
5199 if (result == NULL) {
5200 error = ENOMEM;
5201 goto out;
5202 }
5203
5204 /*
5205 * Do the work by iterating over the descriptor entries we know to
5206 * at least appear to contain valid data.
5207 */
5208 error = 0;
5209 for (i = 0; i < desc_actual; i++) {
5210 /*
5211 * If the ad_name_offset is 0, then we use the previous
5212 * results to make the check; otherwise, we are looking up
5213 * a new file name.
5214 */
5215 if (input[i].ad_name_offset != 0) {
5216 /* discard old vnodes */
5217 if (vp) {
5218 vnode_put(vp);
5219 vp = NULL;
5220 }
5221 if (dvp) {
5222 vnode_put(dvp);
5223 dvp = NULL;
5224 }
5225
5226 /*
5227 * Scan forward in the descriptor list to see if we
5228 * need the parent vnode. We will need it if we are
5229 * deleting, since we must have rights to remove
5230 * entries in the parent directory, as well as the
5231 * rights to delete the object itself.
5232 */
5233 wantdelete = input[i].ad_flags & _DELETE_OK;
5234 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5235 if (input[j].ad_flags & _DELETE_OK)
5236 wantdelete = 1;
5237
5238 niopts = FOLLOW | AUDITVNPATH1;
5239
5240 /* need parent for vnode_authorize for deletion test */
5241 if (wantdelete)
5242 niopts |= WANTPARENT;
5243
5244 /* do the lookup */
5245 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5246 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5247 &context);
5248 error = namei(&nd);
5249 if (!error) {
5250 vp = nd.ni_vp;
5251 if (wantdelete)
5252 dvp = nd.ni_dvp;
5253 }
5254 nameidone(&nd);
5255 }
5256
5257 /*
5258 * Handle lookup errors.
5259 */
5260 switch(error) {
5261 case ENOENT:
5262 case EACCES:
5263 case EPERM:
5264 case ENOTDIR:
5265 result[i] = error;
5266 break;
5267 case 0:
5268 /* run this access check */
5269 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5270 break;
5271 default:
5272 /* fatal lookup error */
5273
5274 goto out;
5275 }
5276 }
5277
5278 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5279
5280 /* copy out results */
5281 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5282
5283 out:
5284 if (input && input != stack_input)
5285 FREE(input, M_TEMP);
5286 if (result)
5287 FREE(result, M_TEMP);
5288 if (vp)
5289 vnode_put(vp);
5290 if (dvp)
5291 vnode_put(dvp);
5292 if (IS_VALID_CRED(context.vc_ucred))
5293 kauth_cred_unref(&context.vc_ucred);
5294 return(error);
5295 }
5296
5297
5298 /*
5299 * Returns: 0 Success
5300 * namei:EFAULT Bad address
5301 * namei:ENAMETOOLONG Filename too long
5302 * namei:ENOENT No such file or directory
5303 * namei:ELOOP Too many levels of symbolic links
5304 * namei:EBADF Bad file descriptor
5305 * namei:ENOTDIR Not a directory
5306 * namei:???
5307 * access1:
5308 */
5309 static int
5310 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5311 int flag, enum uio_seg segflg)
5312 {
5313 int error;
5314 struct nameidata nd;
5315 int niopts;
5316 struct vfs_context context;
5317 #if NAMEDRSRCFORK
5318 int is_namedstream = 0;
5319 #endif
5320
5321 /*
5322 * Unless the AT_EACCESS option is used, Access is defined as checking
5323 * against the process' real identity, even if operations are checking
5324 * the effective identity. So we need to tweak the credential
5325 * in the context for that case.
5326 */
5327 if (!(flag & AT_EACCESS))
5328 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5329 else
5330 context.vc_ucred = ctx->vc_ucred;
5331 context.vc_thread = ctx->vc_thread;
5332
5333
5334 niopts = FOLLOW | AUDITVNPATH1;
5335 /* need parent for vnode_authorize for deletion test */
5336 if (amode & _DELETE_OK)
5337 niopts |= WANTPARENT;
5338 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5339 path, &context);
5340
5341 #if NAMEDRSRCFORK
5342 /* access(F_OK) calls are allowed for resource forks. */
5343 if (amode == F_OK)
5344 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5345 #endif
5346 error = nameiat(&nd, fd);
5347 if (error)
5348 goto out;
5349
5350 #if NAMEDRSRCFORK
5351 /* Grab reference on the shadow stream file vnode to
5352 * force an inactive on release which will mark it
5353 * for recycle.
5354 */
5355 if (vnode_isnamedstream(nd.ni_vp) &&
5356 (nd.ni_vp->v_parent != NULLVP) &&
5357 vnode_isshadow(nd.ni_vp)) {
5358 is_namedstream = 1;
5359 vnode_ref(nd.ni_vp);
5360 }
5361 #endif
5362
5363 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5364
5365 #if NAMEDRSRCFORK
5366 if (is_namedstream) {
5367 vnode_rele(nd.ni_vp);
5368 }
5369 #endif
5370
5371 vnode_put(nd.ni_vp);
5372 if (amode & _DELETE_OK)
5373 vnode_put(nd.ni_dvp);
5374 nameidone(&nd);
5375
5376 out:
5377 if (!(flag & AT_EACCESS))
5378 kauth_cred_unref(&context.vc_ucred);
5379 return (error);
5380 }
5381
5382 int
5383 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5384 {
5385 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5386 uap->path, uap->flags, 0, UIO_USERSPACE));
5387 }
5388
5389 int
5390 faccessat(__unused proc_t p, struct faccessat_args *uap,
5391 __unused int32_t *retval)
5392 {
5393 if (uap->flag & ~AT_EACCESS)
5394 return (EINVAL);
5395
5396 return (faccessat_internal(vfs_context_current(), uap->fd,
5397 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5398 }
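
/*
 * Illustrative userspace sketch (not part of this file): faccessat(2) as
 * exposed above.  With AT_EACCESS the check is made with the effective rather
 * than the real credentials (see the credential handling in
 * faccessat_internal()).  The path below is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (faccessat(AT_FDCWD, "etc/example.conf", R_OK | W_OK, AT_EACCESS) == -1)
 *		perror("faccessat");	// e.g. EACCES, ENOENT
 */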
5399
5400 /*
5401 * Returns: 0 Success
5402 * EFAULT
5403 * copyout:EFAULT
5404 * namei:???
5405 * vn_stat:???
5406 */
5407 static int
5408 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5409 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5410 enum uio_seg segflg, int fd, int flag)
5411 {
5412 struct nameidata nd;
5413 int follow;
5414 union {
5415 struct stat sb;
5416 struct stat64 sb64;
5417 } source;
5418 union {
5419 struct user64_stat user64_sb;
5420 struct user32_stat user32_sb;
5421 struct user64_stat64 user64_sb64;
5422 struct user32_stat64 user32_sb64;
5423 } dest;
5424 caddr_t sbp;
5425 int error, my_size;
5426 kauth_filesec_t fsec;
5427 size_t xsecurity_bufsize;
5428 void * statptr;
5429
5430 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5431 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5432 segflg, path, ctx);
5433
5434 #if NAMEDRSRCFORK
5435 int is_namedstream = 0;
5436 /* stat calls are allowed for resource forks. */
5437 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5438 #endif
5439 error = nameiat(&nd, fd);
5440 if (error)
5441 return (error);
5442 fsec = KAUTH_FILESEC_NONE;
5443
5444 statptr = (void *)&source;
5445
5446 #if NAMEDRSRCFORK
5447 /* Grab reference on the shadow stream file vnode to
5448 * force an inactive on release which will mark it
5449 * for recycle.
5450 */
5451 if (vnode_isnamedstream(nd.ni_vp) &&
5452 (nd.ni_vp->v_parent != NULLVP) &&
5453 vnode_isshadow(nd.ni_vp)) {
5454 is_namedstream = 1;
5455 vnode_ref(nd.ni_vp);
5456 }
5457 #endif
5458
5459 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5460
5461 #if NAMEDRSRCFORK
5462 if (is_namedstream) {
5463 vnode_rele(nd.ni_vp);
5464 }
5465 #endif
5466 vnode_put(nd.ni_vp);
5467 nameidone(&nd);
5468
5469 if (error)
5470 return (error);
5471 /* Zap spare fields */
5472 if (isstat64 != 0) {
5473 source.sb64.st_lspare = 0;
5474 source.sb64.st_qspare[0] = 0LL;
5475 source.sb64.st_qspare[1] = 0LL;
5476 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5477 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5478 my_size = sizeof(dest.user64_sb64);
5479 sbp = (caddr_t)&dest.user64_sb64;
5480 } else {
5481 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5482 my_size = sizeof(dest.user32_sb64);
5483 sbp = (caddr_t)&dest.user32_sb64;
5484 }
5485 /*
5486 * Check if we raced (post lookup) against the last unlink of a file.
5487 */
5488 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5489 source.sb64.st_nlink = 1;
5490 }
5491 } else {
5492 source.sb.st_lspare = 0;
5493 source.sb.st_qspare[0] = 0LL;
5494 source.sb.st_qspare[1] = 0LL;
5495 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5496 munge_user64_stat(&source.sb, &dest.user64_sb);
5497 my_size = sizeof(dest.user64_sb);
5498 sbp = (caddr_t)&dest.user64_sb;
5499 } else {
5500 munge_user32_stat(&source.sb, &dest.user32_sb);
5501 my_size = sizeof(dest.user32_sb);
5502 sbp = (caddr_t)&dest.user32_sb;
5503 }
5504
5505 /*
5506 * Check if we raced (post lookup) against the last unlink of a file.
5507 */
5508 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5509 source.sb.st_nlink = 1;
5510 }
5511 }
5512 if ((error = copyout(sbp, ub, my_size)) != 0)
5513 goto out;
5514
5515 /* caller wants extended security information? */
5516 if (xsecurity != USER_ADDR_NULL) {
5517
5518 /* did we get any? */
5519 if (fsec == KAUTH_FILESEC_NONE) {
5520 if (susize(xsecurity_size, 0) != 0) {
5521 error = EFAULT;
5522 goto out;
5523 }
5524 } else {
5525 /* find the user buffer size */
5526 xsecurity_bufsize = fusize(xsecurity_size);
5527
5528 /* copy out the actual data size */
5529 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5530 error = EFAULT;
5531 goto out;
5532 }
5533
5534 /* if the caller supplied enough room, copy out to it */
5535 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5536 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5537 }
5538 }
5539 out:
5540 if (fsec != KAUTH_FILESEC_NONE)
5541 kauth_filesec_free(fsec);
5542 return (error);
5543 }
5544
5545 /*
5546 * stat_extended: Get file status; with extended security (ACL).
5547 *
5548 * Parameters: p (ignored)
5549 * uap User argument descriptor (see below)
5550 * retval (ignored)
5551 *
5552 * Indirect: uap->path Path of file to get status from
5553 * uap->ub User buffer (holds file status info)
5554 * uap->xsecurity ACL to get (extended security)
5555 * uap->xsecurity_size Size of ACL
5556 *
5557 * Returns: 0 Success
5558 * !0 errno value
5559 *
5560 */
5561 int
5562 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5563 __unused int32_t *retval)
5564 {
5565 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5566 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5567 0));
5568 }
5569
5570 /*
5571 * Returns: 0 Success
5572 * fstatat_internal:??? [see fstatat_internal() in this file]
5573 */
5574 int
5575 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5576 {
5577 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5578 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5579 }
5580
5581 int
5582 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5583 {
5584 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5585 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5586 }
5587
5588 /*
5589 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5590 *
5591 * Parameters: p (ignored)
5592 * uap User argument descriptor (see below)
5593 * retval (ignored)
5594 *
5595 * Indirect: uap->path Path of file to get status from
5596 * uap->ub User buffer (holds file status info)
5597 * uap->xsecurity ACL to get (extended security)
5598 * uap->xsecurity_size Size of ACL
5599 *
5600 * Returns: 0 Success
5601 * !0 errno value
5602 *
5603 */
5604 int
5605 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5606 {
5607 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5608 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5609 0));
5610 }
5611
5612 /*
5613 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5614 *
5615 * Parameters: p (ignored)
5616 * uap User argument descriptor (see below)
5617 * retval (ignored)
5618 *
5619 * Indirect: uap->path Path of file to get status from
5620 * uap->ub User buffer (holds file status info)
5621 * uap->xsecurity ACL to get (extended security)
5622 * uap->xsecurity_size Size of ACL
5623 *
5624 * Returns: 0 Success
5625 * !0 errno value
5626 *
5627 */
5628 int
5629 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5630 {
5631 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5632 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5633 AT_SYMLINK_NOFOLLOW));
5634 }
5635
5636 /*
5637 * Get file status; this version does not follow links.
5638 */
5639 int
5640 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5641 {
5642 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5643 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5644 }
5645
5646 int
5647 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5648 {
5649 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5650 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5651 }
5652
5653 /*
5654 * lstat64_extended: Get file status; can handle large inode numbers; does not
5655 * follow links; with extended security (ACL).
5656 *
5657 * Parameters: p (ignored)
5658 * uap User argument descriptor (see below)
5659 * retval (ignored)
5660 *
5661 * Indirect: uap->path Path of file to get status from
5662 * uap->ub User buffer (holds file status info)
5663 * uap->xsecurity ACL to get (extended security)
5664 * uap->xsecurity_size Size of ACL
5665 *
5666 * Returns: 0 Success
5667 * !0 errno value
5668 *
5669 */
5670 int
5671 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5672 {
5673 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5674 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5675 AT_SYMLINK_NOFOLLOW));
5676 }
5677
5678 int
5679 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5680 {
5681 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5682 return (EINVAL);
5683
5684 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5685 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5686 }
5687
5688 int
5689 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5690 __unused int32_t *retval)
5691 {
5692 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5693 return (EINVAL);
5694
5695 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5696 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5697 }
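
/*
 * Illustrative userspace sketch (not part of this file): fstatat(2) as exposed
 * above.  AT_SYMLINK_NOFOLLOW gives lstat semantics, i.e. a final symlink is
 * stat'ed itself rather than followed.  The path below is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/stat.h>
 *
 *	struct stat sb;
 *
 *	if (fstatat(AT_FDCWD, "var/log/example.log", &sb, AT_SYMLINK_NOFOLLOW) == -1)
 *		perror("fstatat");	// e.g. ENOENT, ENOTDIR
 */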
5698
5699 /*
5700 * Get configurable pathname variables.
5701 *
5702 * Returns: 0 Success
5703 * namei:???
5704 * vn_pathconf:???
5705 *
5706 * Notes: Global implementation constants are intended to be
5707 * implemented in this function directly; all other constants
5708 * are per-FS implementation, and therefore must be handled in
5709 * each respective FS, instead.
5710 *
5711 * XXX We implement some things globally right now that should actually be
5712 * XXX per-FS; we will need to deal with this at some point.
5713 */
5714 /* ARGSUSED */
5715 int
5716 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5717 {
5718 int error;
5719 struct nameidata nd;
5720 vfs_context_t ctx = vfs_context_current();
5721
5722 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5723 UIO_USERSPACE, uap->path, ctx);
5724 error = namei(&nd);
5725 if (error)
5726 return (error);
5727
5728 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5729
5730 vnode_put(nd.ni_vp);
5731 nameidone(&nd);
5732 return (error);
5733 }
5734
5735 /*
5736 * Return target name of a symbolic link.
5737 */
5738 /* ARGSUSED */
5739 static int
5740 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5741 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5742 int *retval)
5743 {
5744 vnode_t vp;
5745 uio_t auio;
5746 int error;
5747 struct nameidata nd;
5748 char uio_buf[ UIO_SIZEOF(1) ];
5749
5750 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5751 seg, path, ctx);
5752
5753 error = nameiat(&nd, fd);
5754 if (error)
5755 return (error);
5756 vp = nd.ni_vp;
5757
5758 nameidone(&nd);
5759
5760 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5761 &uio_buf[0], sizeof(uio_buf));
5762 uio_addiov(auio, buf, bufsize);
5763 if (vp->v_type != VLNK) {
5764 error = EINVAL;
5765 } else {
5766 #if CONFIG_MACF
5767 error = mac_vnode_check_readlink(ctx, vp);
5768 #endif
5769 if (error == 0)
5770 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5771 ctx);
5772 if (error == 0)
5773 error = VNOP_READLINK(vp, auio, ctx);
5774 }
5775 vnode_put(vp);
5776
5777 *retval = bufsize - (int)uio_resid(auio);
5778 return (error);
5779 }
5780
5781 int
5782 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5783 {
5784 enum uio_seg procseg;
5785
5786 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5787 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5788 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5789 uap->count, procseg, retval));
5790 }
5791
5792 int
5793 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5794 {
5795 enum uio_seg procseg;
5796
5797 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5798 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5799 procseg, uap->buf, uap->bufsize, procseg, retval));
5800 }
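
/*
 * Illustrative userspace sketch (not part of this file): readlink(2) as
 * exposed above.  The return value is the number of bytes placed in the
 * buffer (computed from the residual uio above); the result is not
 * NUL-terminated, so the caller terminates it.  The path below is
 * hypothetical.
 *
 *	#include <stdio.h>
 *	#include <sys/param.h>
 *	#include <unistd.h>
 *
 *	char target[MAXPATHLEN];
 *	ssize_t n = readlink("config.ini", target, sizeof(target) - 1);
 *
 *	if (n >= 0)
 *		target[n] = '\0';
 *	else
 *		perror("readlink");	// e.g. EINVAL if the path is not a symlink
 */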
5801
5802 /*
5803 * Change file flags.
5804 *
5805 * NOTE: this will vnode_put() `vp'
5806 */
5807 static int
5808 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5809 {
5810 struct vnode_attr va;
5811 kauth_action_t action;
5812 int error;
5813
5814 VATTR_INIT(&va);
5815 VATTR_SET(&va, va_flags, flags);
5816
5817 #if CONFIG_MACF
5818 error = mac_vnode_check_setflags(ctx, vp, flags);
5819 if (error)
5820 goto out;
5821 #endif
5822
5823 /* request authorisation, disregard immutability */
5824 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5825 goto out;
5826 /*
5827 * Request that the auth layer disregard those file flags it's allowed to when
5828 * authorizing this operation; we need to do this in order to be able to
5829 * clear immutable flags.
5830 */
5831 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5832 goto out;
5833 error = vnode_setattr(vp, &va, ctx);
5834
5835 #if CONFIG_MACF
5836 if (error == 0)
5837 mac_vnode_notify_setflags(ctx, vp, flags);
5838 #endif
5839
5840 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5841 error = ENOTSUP;
5842 }
5843 out:
5844 vnode_put(vp);
5845 return(error);
5846 }
5847
5848 /*
5849 * Change flags of a file given a path name.
5850 */
5851 /* ARGSUSED */
5852 int
5853 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5854 {
5855 vnode_t vp;
5856 vfs_context_t ctx = vfs_context_current();
5857 int error;
5858 struct nameidata nd;
5859
5860 AUDIT_ARG(fflags, uap->flags);
5861 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5862 UIO_USERSPACE, uap->path, ctx);
5863 error = namei(&nd);
5864 if (error)
5865 return (error);
5866 vp = nd.ni_vp;
5867 nameidone(&nd);
5868
5869 /* we don't vnode_put() here because chflags1 does internally */
5870 error = chflags1(vp, uap->flags, ctx);
5871
5872 return(error);
5873 }
5874
5875 /*
5876 * Change flags of a file given a file descriptor.
5877 */
5878 /* ARGSUSED */
5879 int
5880 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5881 {
5882 vnode_t vp;
5883 int error;
5884
5885 AUDIT_ARG(fd, uap->fd);
5886 AUDIT_ARG(fflags, uap->flags);
5887 if ( (error = file_vnode(uap->fd, &vp)) )
5888 return (error);
5889
5890 if ((error = vnode_getwithref(vp))) {
5891 file_drop(uap->fd);
5892 return(error);
5893 }
5894
5895 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5896
5897 /* we don't vnode_put() here because chflags1 does internally */
5898 error = chflags1(vp, uap->flags, vfs_context_current());
5899
5900 file_drop(uap->fd);
5901 return (error);
5902 }
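
/*
 * Illustrative userspace sketch (not part of this file): chflags(2) as exposed
 * above.  Clearing flags such as UF_IMMUTABLE relies on the
 * KAUTH_VNODE_NOIMMUTABLE authorization request made in chflags1().  The path
 * below is hypothetical.
 *
 *	#include <stdio.h>
 *	#include <sys/stat.h>
 *
 *	if (chflags("important.dat", UF_IMMUTABLE) == -1)	// set user immutable
 *		perror("chflags");
 *	if (chflags("important.dat", 0) == -1)			// clear all flags
 *		perror("chflags");
 */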
5903
5904 /*
5905 * Change security information on a filesystem object.
5906 *
5907 * Returns: 0 Success
5908 * EPERM Operation not permitted
5909 * vnode_authattr:??? [anything vnode_authattr can return]
5910 * vnode_authorize:??? [anything vnode_authorize can return]
5911 * vnode_setattr:??? [anything vnode_setattr can return]
5912 *
5913 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
5914 * translated to EPERM before being returned.
5915 */
5916 static int
5917 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5918 {
5919 kauth_action_t action;
5920 int error;
5921
5922 AUDIT_ARG(mode, vap->va_mode);
5923 /* XXX audit new args */
5924
5925 #if NAMEDSTREAMS
5926 /* chmod calls are not allowed for resource forks. */
5927 if (vp->v_flag & VISNAMEDSTREAM) {
5928 return (EPERM);
5929 }
5930 #endif
5931
5932 #if CONFIG_MACF
5933 if (VATTR_IS_ACTIVE(vap, va_mode) &&
5934 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5935 return (error);
5936
5937 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
5938 if ((error = mac_vnode_check_setowner(ctx, vp,
5939 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
5940 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
5941 return (error);
5942 }
5943
5944 if (VATTR_IS_ACTIVE(vap, va_acl) &&
5945 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
5946 return (error);
5947 #endif
5948
5949 /* make sure that the caller is allowed to set this security information */
5950 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5951 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5952 if (error == EACCES)
5953 error = EPERM;
5954 return(error);
5955 }
5956
5957 if ((error = vnode_setattr(vp, vap, ctx)) != 0)
5958 return (error);
5959
5960 #if CONFIG_MACF
5961 if (VATTR_IS_ACTIVE(vap, va_mode))
5962 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
5963
5964 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
5965 mac_vnode_notify_setowner(ctx, vp,
5966 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
5967 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
5968
5969 if (VATTR_IS_ACTIVE(vap, va_acl))
5970 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
5971 #endif
5972
5973 return (error);
5974 }
5975
5976
5977 /*
5978 * Change mode of a file given a path name.
5979 *
5980 * Returns: 0 Success
5981 * namei:??? [anything namei can return]
5982 * chmod_vnode:??? [anything chmod_vnode can return]
5983 */
5984 static int
5985 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5986 int fd, int flag, enum uio_seg segflg)
5987 {
5988 struct nameidata nd;
5989 int follow, error;
5990
5991 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5992 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5993 segflg, path, ctx);
5994 if ((error = nameiat(&nd, fd)))
5995 return (error);
5996 error = chmod_vnode(ctx, nd.ni_vp, vap);
5997 vnode_put(nd.ni_vp);
5998 nameidone(&nd);
5999 return(error);
6000 }
6001
6002 /*
6003 * chmod_extended: Change the mode of a file given a path name; with extended
6004 * argument list (including extended security (ACL)).
6005 *
6006 * Parameters: p Process requesting the open
6007 * uap User argument descriptor (see below)
6008 * retval (ignored)
6009 *
6010 * Indirect: uap->path Path to object (same as 'chmod')
6011 * uap->uid UID to set
6012 * uap->gid GID to set
6013 * uap->mode File mode to set (same as 'chmod')
6014 * uap->xsecurity ACL to set (or delete)
6015 *
6016 * Returns: 0 Success
6017 * !0 errno value
6018 *
6019 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6020 *
6021 * XXX: We should enumerate the possible errno values here, and where
6022 * in the code they originated.
6023 */
6024 int
6025 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6026 {
6027 int error;
6028 struct vnode_attr va;
6029 kauth_filesec_t xsecdst;
6030
6031 AUDIT_ARG(owner, uap->uid, uap->gid);
6032
6033 VATTR_INIT(&va);
6034 if (uap->mode != -1)
6035 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6036 if (uap->uid != KAUTH_UID_NONE)
6037 VATTR_SET(&va, va_uid, uap->uid);
6038 if (uap->gid != KAUTH_GID_NONE)
6039 VATTR_SET(&va, va_gid, uap->gid);
6040
6041 xsecdst = NULL;
6042 switch(uap->xsecurity) {
6043 /* explicit remove request */
6044 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6045 VATTR_SET(&va, va_acl, NULL);
6046 break;
6047 /* not being set */
6048 case USER_ADDR_NULL:
6049 break;
6050 default:
6051 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6052 return(error);
6053 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6054 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6055 }
6056
6057 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6058 UIO_USERSPACE);
6059
6060 if (xsecdst != NULL)
6061 kauth_filesec_free(xsecdst);
6062 return(error);
6063 }
6064
6065 /*
6066 * Returns: 0 Success
6067 * chmodat:??? [anything chmodat can return]
6068 */
6069 static int
6070 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6071 int flag, enum uio_seg segflg)
6072 {
6073 struct vnode_attr va;
6074
6075 VATTR_INIT(&va);
6076 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6077
6078 return (chmodat(ctx, path, &va, fd, flag, segflg));
6079 }
6080
6081 int
6082 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6083 {
6084 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6085 AT_FDCWD, 0, UIO_USERSPACE));
6086 }
6087
6088 int
6089 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6090 {
6091 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6092 return (EINVAL);
6093
6094 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6095 uap->fd, uap->flag, UIO_USERSPACE));
6096 }
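
/*
 * Illustrative userspace sketch (not part of this file): fchmodat(2) as
 * exposed above.  AT_SYMLINK_NOFOLLOW applies the mode change to a final
 * symlink itself rather than to its target.  The path below is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/stat.h>
 *
 *	if (fchmodat(AT_FDCWD, "scripts/deploy.sh", 0755, 0) == -1)
 *		perror("fchmodat");	// e.g. EPERM, ENOENT
 */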
6097
6098 /*
6099 * Change mode of a file given a file descriptor.
6100 */
6101 static int
6102 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6103 {
6104 vnode_t vp;
6105 int error;
6106
6107 AUDIT_ARG(fd, fd);
6108
6109 if ((error = file_vnode(fd, &vp)) != 0)
6110 return (error);
6111 if ((error = vnode_getwithref(vp)) != 0) {
6112 file_drop(fd);
6113 return(error);
6114 }
6115 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6116
6117 error = chmod_vnode(vfs_context_current(), vp, vap);
6118 (void)vnode_put(vp);
6119 file_drop(fd);
6120
6121 return (error);
6122 }
6123
6124 /*
6125 * fchmod_extended: Change mode of a file given a file descriptor; with
6126 * extended argument list (including extended security (ACL)).
6127 *
6128 * Parameters: p Process requesting to change file mode
6129 * uap User argument descriptor (see below)
6130 * retval (ignored)
6131 *
6132 * Indirect: uap->mode File mode to set (same as 'chmod')
6133 * uap->uid UID to set
6134 * uap->gid GID to set
6135 * uap->xsecurity ACL to set (or delete)
6136 * uap->fd File descriptor of file to change mode
6137 *
6138 * Returns: 0 Success
6139 * !0 errno value
6140 *
6141 */
6142 int
6143 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6144 {
6145 int error;
6146 struct vnode_attr va;
6147 kauth_filesec_t xsecdst;
6148
6149 AUDIT_ARG(owner, uap->uid, uap->gid);
6150
6151 VATTR_INIT(&va);
6152 if (uap->mode != -1)
6153 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6154 if (uap->uid != KAUTH_UID_NONE)
6155 VATTR_SET(&va, va_uid, uap->uid);
6156 if (uap->gid != KAUTH_GID_NONE)
6157 VATTR_SET(&va, va_gid, uap->gid);
6158
6159 xsecdst = NULL;
6160 switch(uap->xsecurity) {
6161 case USER_ADDR_NULL:
6162 VATTR_SET(&va, va_acl, NULL);
6163 break;
6164 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6165 VATTR_SET(&va, va_acl, NULL);
6166 break;
6167 /* not being set */
6168 case CAST_USER_ADDR_T(-1):
6169 break;
6170 default:
6171 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6172 return(error);
6173 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6174 }
6175
6176 error = fchmod1(p, uap->fd, &va);
6177
6178
6179 switch(uap->xsecurity) {
6180 case USER_ADDR_NULL:
6181 case CAST_USER_ADDR_T(-1):
6182 break;
6183 default:
6184 if (xsecdst != NULL)
6185 kauth_filesec_free(xsecdst);
6186 }
6187 return(error);
6188 }
6189
6190 int
6191 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6192 {
6193 struct vnode_attr va;
6194
6195 VATTR_INIT(&va);
6196 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6197
6198 return(fchmod1(p, uap->fd, &va));
6199 }
6200
6201
6202 /*
6203 * Set ownership given a path name.
6204 */
6205 /* ARGSUSED */
6206 static int
6207 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6208 gid_t gid, int flag, enum uio_seg segflg)
6209 {
6210 vnode_t vp;
6211 struct vnode_attr va;
6212 int error;
6213 struct nameidata nd;
6214 int follow;
6215 kauth_action_t action;
6216
6217 AUDIT_ARG(owner, uid, gid);
6218
6219 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6220 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6221 path, ctx);
6222 error = nameiat(&nd, fd);
6223 if (error)
6224 return (error);
6225 vp = nd.ni_vp;
6226
6227 nameidone(&nd);
6228
6229 VATTR_INIT(&va);
6230 if (uid != (uid_t)VNOVAL)
6231 VATTR_SET(&va, va_uid, uid);
6232 if (gid != (gid_t)VNOVAL)
6233 VATTR_SET(&va, va_gid, gid);
6234
6235 #if CONFIG_MACF
6236 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6237 if (error)
6238 goto out;
6239 #endif
6240
6241 /* preflight and authorize attribute changes */
6242 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6243 goto out;
6244 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6245 goto out;
6246 error = vnode_setattr(vp, &va, ctx);
6247
6248 #if CONFIG_MACF
6249 if (error == 0)
6250 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6251 #endif
6252
6253 out:
6254 /*
6255 * EACCES is only allowed from namei(); permissions failure should
6256 * return EPERM, so we need to translate the error code.
6257 */
6258 if (error == EACCES)
6259 error = EPERM;
6260
6261 vnode_put(vp);
6262 return (error);
6263 }
6264
6265 int
6266 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6267 {
6268 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6269 uap->uid, uap->gid, 0, UIO_USERSPACE));
6270 }
6271
6272 int
6273 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6274 {
6275 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6276 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6277 }
6278
6279 int
6280 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6281 {
6282 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6283 return (EINVAL);
6284
6285 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6286 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6287 }
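/*
 * Not part of the original xnu source: a userspace sketch of fchownat(2).
 * Passing (uid_t)-1 or (gid_t)-1 corresponds to the VNOVAL checks above, so
 * that field is left unchanged; AT_SYMLINK_NOFOLLOW gives lchown()-like
 * behaviour. The path and group id below are hypothetical.
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
        /* Change only the group; (uid_t)-1 leaves the owner untouched. */
        if (fchownat(AT_FDCWD, "build/output.log", (uid_t)-1, (gid_t)20, 0) == -1)
                perror("fchownat");

        /* Operate on a symlink itself rather than on its target. */
        if (fchownat(AT_FDCWD, "build/latest", (uid_t)-1, (gid_t)20,
            AT_SYMLINK_NOFOLLOW) == -1)
                perror("fchownat(AT_SYMLINK_NOFOLLOW)");
        return 0;
}
#endif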
6288
6289 /*
6290 * Set ownership given a file descriptor.
6291 */
6292 /* ARGSUSED */
6293 int
6294 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6295 {
6296 struct vnode_attr va;
6297 vfs_context_t ctx = vfs_context_current();
6298 vnode_t vp;
6299 int error;
6300 kauth_action_t action;
6301
6302 AUDIT_ARG(owner, uap->uid, uap->gid);
6303 AUDIT_ARG(fd, uap->fd);
6304
6305 if ( (error = file_vnode(uap->fd, &vp)) )
6306 return (error);
6307
6308 if ( (error = vnode_getwithref(vp)) ) {
6309 file_drop(uap->fd);
6310 return(error);
6311 }
6312 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6313
6314 VATTR_INIT(&va);
6315 if (uap->uid != VNOVAL)
6316 VATTR_SET(&va, va_uid, uap->uid);
6317 if (uap->gid != VNOVAL)
6318 VATTR_SET(&va, va_gid, uap->gid);
6319
6320 #if NAMEDSTREAMS
6321 /* chown calls are not allowed for resource forks. */
6322 if (vp->v_flag & VISNAMEDSTREAM) {
6323 error = EPERM;
6324 goto out;
6325 }
6326 #endif
6327
6328 #if CONFIG_MACF
6329 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6330 if (error)
6331 goto out;
6332 #endif
6333
6334 /* preflight and authorize attribute changes */
6335 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6336 goto out;
6337 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6338 if (error == EACCES)
6339 error = EPERM;
6340 goto out;
6341 }
6342 error = vnode_setattr(vp, &va, ctx);
6343
6344 #if CONFIG_MACF
6345 if (error == 0)
6346 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
6347 #endif
6348
6349 out:
6350 (void)vnode_put(vp);
6351 file_drop(uap->fd);
6352 return (error);
6353 }
6354
6355 static int
6356 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6357 {
6358 int error;
6359
6360 if (usrtvp == USER_ADDR_NULL) {
6361 struct timeval old_tv;
6362 /* XXX Y2038 bug because of microtime argument */
6363 microtime(&old_tv);
6364 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6365 tsp[1] = tsp[0];
6366 } else {
6367 if (IS_64BIT_PROCESS(current_proc())) {
6368 struct user64_timeval tv[2];
6369 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6370 if (error)
6371 return (error);
6372 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6373 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6374 } else {
6375 struct user32_timeval tv[2];
6376 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6377 if (error)
6378 return (error);
6379 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6380 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6381 }
6382 }
6383 return 0;
6384 }
6385
6386 static int
6387 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6388 int nullflag)
6389 {
6390 int error;
6391 struct vnode_attr va;
6392 kauth_action_t action;
6393
6394 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6395
6396 VATTR_INIT(&va);
6397 VATTR_SET(&va, va_access_time, ts[0]);
6398 VATTR_SET(&va, va_modify_time, ts[1]);
6399 if (nullflag)
6400 va.va_vaflags |= VA_UTIMES_NULL;
6401
6402 #if NAMEDSTREAMS
6403 /* utimes calls are not allowed for resource forks. */
6404 if (vp->v_flag & VISNAMEDSTREAM) {
6405 error = EPERM;
6406 goto out;
6407 }
6408 #endif
6409
6410 #if CONFIG_MACF
6411 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6412 if (error)
6413 goto out;
6414 #endif
6415 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6416 if (!nullflag && error == EACCES)
6417 error = EPERM;
6418 goto out;
6419 }
6420
6421 /* since we may not need to auth anything, check here */
6422 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6423 if (!nullflag && error == EACCES)
6424 error = EPERM;
6425 goto out;
6426 }
6427 error = vnode_setattr(vp, &va, ctx);
6428
6429 #if CONFIG_MACF
6430 if (error == 0)
6431 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
6432 #endif
6433
6434 out:
6435 return error;
6436 }
6437
6438 /*
6439 * Set the access and modification times of a file.
6440 */
6441 /* ARGSUSED */
6442 int
6443 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6444 {
6445 struct timespec ts[2];
6446 user_addr_t usrtvp;
6447 int error;
6448 struct nameidata nd;
6449 vfs_context_t ctx = vfs_context_current();
6450
6451 /*
6452 * AUDIT: Needed to change the order of operations to do the
6453 * name lookup first because auditing wants the path.
6454 */
6455 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6456 UIO_USERSPACE, uap->path, ctx);
6457 error = namei(&nd);
6458 if (error)
6459 return (error);
6460 nameidone(&nd);
6461
6462 /*
6463 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
6464 * the current time instead.
6465 */
6466 usrtvp = uap->tptr;
6467 if ((error = getutimes(usrtvp, ts)) != 0)
6468 goto out;
6469
6470 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6471
6472 out:
6473 vnode_put(nd.ni_vp);
6474 return (error);
6475 }
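/*
 * Not part of the original xnu source: getutimes() above substitutes the
 * current time when the caller passes a NULL pointer, which is what makes
 * utimes(path, NULL) behave like "touch". A minimal userspace sketch with a
 * hypothetical file name:
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <sys/time.h>
#include <stdio.h>

int
main(void)
{
        /* Explicit times: tv[0] is the access time, tv[1] the modification time. */
        struct timeval tv[2] = {
                { .tv_sec = 1000000000, .tv_usec = 0 },
                { .tv_sec = 1000000000, .tv_usec = 0 },
        };

        /* NULL times: atime and mtime are both set to "now" (VA_UTIMES_NULL). */
        if (utimes("notes.txt", NULL) == -1)
                perror("utimes(NULL)");

        if (utimes("notes.txt", tv) == -1)
                perror("utimes(tv)");
        return 0;
}
#endif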
6476
6477 /*
6478 * Set the access and modification times of a file.
6479 */
6480 /* ARGSUSED */
6481 int
6482 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6483 {
6484 struct timespec ts[2];
6485 vnode_t vp;
6486 user_addr_t usrtvp;
6487 int error;
6488
6489 AUDIT_ARG(fd, uap->fd);
6490 usrtvp = uap->tptr;
6491 if ((error = getutimes(usrtvp, ts)) != 0)
6492 return (error);
6493 if ((error = file_vnode(uap->fd, &vp)) != 0)
6494 return (error);
6495 if((error = vnode_getwithref(vp))) {
6496 file_drop(uap->fd);
6497 return(error);
6498 }
6499
6500 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6501 vnode_put(vp);
6502 file_drop(uap->fd);
6503 return(error);
6504 }
6505
6506 /*
6507 * Truncate a file given its path name.
6508 */
6509 /* ARGSUSED */
6510 int
6511 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6512 {
6513 vnode_t vp;
6514 struct vnode_attr va;
6515 vfs_context_t ctx = vfs_context_current();
6516 int error;
6517 struct nameidata nd;
6518 kauth_action_t action;
6519
6520 if (uap->length < 0)
6521 return(EINVAL);
6522 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6523 UIO_USERSPACE, uap->path, ctx);
6524 if ((error = namei(&nd)))
6525 return (error);
6526 vp = nd.ni_vp;
6527
6528 nameidone(&nd);
6529
6530 VATTR_INIT(&va);
6531 VATTR_SET(&va, va_data_size, uap->length);
6532
6533 #if CONFIG_MACF
6534 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6535 if (error)
6536 goto out;
6537 #endif
6538
6539 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6540 goto out;
6541 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6542 goto out;
6543 error = vnode_setattr(vp, &va, ctx);
6544
6545 #if CONFIG_MACF
6546 if (error == 0)
6547 mac_vnode_notify_truncate(ctx, NOCRED, vp);
6548 #endif
6549
6550 out:
6551 vnode_put(vp);
6552 return (error);
6553 }
6554
6555 /*
6556 * Truncate a file given a file descriptor.
6557 */
6558 /* ARGSUSED */
6559 int
6560 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6561 {
6562 vfs_context_t ctx = vfs_context_current();
6563 struct vnode_attr va;
6564 vnode_t vp;
6565 struct fileproc *fp;
6566 int error ;
6567 int fd = uap->fd;
6568
6569 AUDIT_ARG(fd, uap->fd);
6570 if (uap->length < 0)
6571 return(EINVAL);
6572
6573 if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6574 return(error);
6575 }
6576
6577 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6578 case DTYPE_PSXSHM:
6579 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6580 goto out;
6581 case DTYPE_VNODE:
6582 break;
6583 default:
6584 error = EINVAL;
6585 goto out;
6586 }
6587
6588 vp = (vnode_t)fp->f_fglob->fg_data;
6589
6590 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6591 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6592 error = EINVAL;
6593 goto out;
6594 }
6595
6596 if ((error = vnode_getwithref(vp)) != 0) {
6597 goto out;
6598 }
6599
6600 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6601
6602 #if CONFIG_MACF
6603 error = mac_vnode_check_truncate(ctx,
6604 fp->f_fglob->fg_cred, vp);
6605 if (error) {
6606 (void)vnode_put(vp);
6607 goto out;
6608 }
6609 #endif
6610 VATTR_INIT(&va);
6611 VATTR_SET(&va, va_data_size, uap->length);
6612 error = vnode_setattr(vp, &va, ctx);
6613
6614 #if CONFIG_MACF
6615 if (error == 0)
6616 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
6617 #endif
6618
6619 (void)vnode_put(vp);
6620 out:
6621 file_drop(fd);
6622 return (error);
6623 }
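/*
 * Not part of the original xnu source: as the FWRITE check above shows,
 * ftruncate(2) needs a descriptor that was opened for writing, otherwise it
 * fails with EINVAL, and negative lengths are rejected up front. A minimal
 * userspace sketch with a hypothetical file name:
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
        int fd, rdonly;

        fd = open("scratch.dat", O_RDWR | O_CREAT, 0644);
        if (fd == -1) {
                perror("open");
                return 1;
        }
        if (ftruncate(fd, 4096) == -1)          /* grow or shrink to 4 KiB */
                perror("ftruncate");

        rdonly = open("scratch.dat", O_RDONLY);
        if (rdonly != -1) {
                if (ftruncate(rdonly, 0) == -1) /* expect EINVAL: no FWRITE */
                        perror("ftruncate on O_RDONLY fd");
                close(rdonly);
        }
        close(fd);
        return 0;
}
#endif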
6624
6625
6626 /*
6627 * Sync an open file with synchronized I/O _file_ integrity completion
6628 */
6629 /* ARGSUSED */
6630 int
6631 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6632 {
6633 __pthread_testcancel(1);
6634 return(fsync_common(p, uap, MNT_WAIT));
6635 }
6636
6637
6638 /*
6639 * Sync an open file with synchronized I/O _file_ integrity completion
6640 *
6641 * Notes: This is a legacy support function that does not test for
6642 * thread cancellation points.
6643 */
6644 /* ARGSUSED */
6645 int
6646 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6647 {
6648 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6649 }
6650
6651
6652 /*
6653 * Sync an open file with synchronized I/O _data_ integrity completion
6654 */
6655 /* ARGSUSED */
6656 int
6657 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6658 {
6659 __pthread_testcancel(1);
6660 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6661 }
6662
6663
6664 /*
6665 * fsync_common
6666 *
6667 * Common fsync code to support both synchronized I/O file integrity completion
6668 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6669 *
6670 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6671 * will only guarantee that the file data contents are retrievable. If
6672 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
6673 * additionally requires that metadata unnecessary for retrieving the file
6674 * data contents, such as atime, mtime, ctime, etc., also be committed to
6675 * stable storage.
6676 *
6677 * Parameters: p The process
6678 * uap->fd The descriptor to synchronize
6679 * flags The data integrity flags
6680 *
6681 * Returns: int Success
6682 * fp_getfvp:EBADF Bad file descriptor
6683 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6684 * VNOP_FSYNC:??? unspecified
6685 *
6686 * Notes: We use struct fsync_args because it is a short name, and all
6687 * caller argument structures are otherwise identical.
6688 */
6689 static int
6690 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6691 {
6692 vnode_t vp;
6693 struct fileproc *fp;
6694 vfs_context_t ctx = vfs_context_current();
6695 int error;
6696
6697 AUDIT_ARG(fd, uap->fd);
6698
6699 if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6700 return (error);
6701 if ( (error = vnode_getwithref(vp)) ) {
6702 file_drop(uap->fd);
6703 return(error);
6704 }
6705
6706 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6707
6708 error = VNOP_FSYNC(vp, flags, ctx);
6709
6710 #if NAMEDRSRCFORK
6711 /* Sync resource fork shadow file if necessary. */
6712 if ((error == 0) &&
6713 (vp->v_flag & VISNAMEDSTREAM) &&
6714 (vp->v_parent != NULLVP) &&
6715 vnode_isshadow(vp) &&
6716 (fp->f_flags & FP_WRITTEN)) {
6717 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6718 }
6719 #endif
6720
6721 (void)vnode_put(vp);
6722 file_drop(uap->fd);
6723 return (error);
6724 }
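/*
 * Not part of the original xnu source: fsync(2) reaches fsync_common() with
 * MNT_WAIT (file integrity) and fdatasync(2) with MNT_DWAIT (data integrity
 * only). On macOS, fcntl(F_FULLFSYNC) is the documented way to additionally
 * ask the storage device to flush its own cache; that detail comes from the
 * fsync(2) man page, not from this file. A minimal userspace sketch with a
 * hypothetical file name:
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
        int fd = open("journal.bin", O_WRONLY | O_CREAT | O_APPEND, 0644);
        if (fd == -1) {
                perror("open");
                return 1;
        }
        (void)write(fd, "record\n", 7);

        if (fdatasync(fd) == -1)                /* data integrity (MNT_DWAIT) */
                perror("fdatasync");
        if (fsync(fd) == -1)                    /* file integrity (MNT_WAIT)  */
                perror("fsync");
        if (fcntl(fd, F_FULLFSYNC) == -1)       /* flush the device cache too */
                perror("fcntl(F_FULLFSYNC)");

        close(fd);
        return 0;
}
#endif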
6725
6726 /*
6727 * Duplicate files. Source must be a file, target must be a file or
6728 * must not exist.
6729 *
6730 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6731 * perform inheritance correctly.
6732 */
6733 /* ARGSUSED */
6734 int
6735 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6736 {
6737 vnode_t tvp, fvp, tdvp, sdvp;
6738 struct nameidata fromnd, tond;
6739 int error;
6740 vfs_context_t ctx = vfs_context_current();
6741 #if CONFIG_MACF
6742 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
6743 struct vnode_attr va;
6744 #endif
6745
6746 /* Check that the flags are valid. */
6747
6748 if (uap->flags & ~CPF_MASK) {
6749 return(EINVAL);
6750 }
6751
6752 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6753 UIO_USERSPACE, uap->from, ctx);
6754 if ((error = namei(&fromnd)))
6755 return (error);
6756 fvp = fromnd.ni_vp;
6757
6758 NDINIT(&tond, CREATE, OP_LINK,
6759 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6760 UIO_USERSPACE, uap->to, ctx);
6761 if ((error = namei(&tond))) {
6762 goto out1;
6763 }
6764 tdvp = tond.ni_dvp;
6765 tvp = tond.ni_vp;
6766
6767 if (tvp != NULL) {
6768 if (!(uap->flags & CPF_OVERWRITE)) {
6769 error = EEXIST;
6770 goto out;
6771 }
6772 }
6773
6774 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6775 error = EISDIR;
6776 goto out;
6777 }
6778
6779 /* This calls existing MAC hooks for open */
6780 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
6781 NULL))) {
6782 goto out;
6783 }
6784
6785 if (tvp) {
6786 /*
6787 * See unlinkat_internal for an explanation of the potential
6788 * ENOENT from the MAC hook but the gist is that the MAC hook
6789 * can fail because vn_getpath isn't able to return the full
6790 * path. We choose to ignore this failure.
6791 */
6792 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
6793 if (error && error != ENOENT)
6794 goto out;
6795 error = 0;
6796 }
6797
6798 #if CONFIG_MACF
6799 VATTR_INIT(&va);
6800 VATTR_SET(&va, va_type, fvp->v_type);
6801 /* Mask off all but regular access permissions */
6802 VATTR_SET(&va, va_mode,
6803 ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
6804 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
6805 if (error)
6806 goto out;
6807 #endif /* CONFIG_MACF */
6808
6809 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6810 goto out;
6811
6812 if (fvp == tdvp)
6813 error = EINVAL;
6814 /*
6815 * If source is the same as the destination (that is the
6816 * same inode number) then there is nothing to do.
6817 * (fixed to have POSIX semantics - CSM 3/2/98)
6818 */
6819 if (fvp == tvp)
6820 error = -1;
6821 if (!error)
6822 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6823 out:
6824 sdvp = tond.ni_startdir;
6825 /*
6826 * nameidone has to happen before we vnode_put(tdvp)
6827 * since it may need to release the fs_nodelock on the tdvp
6828 */
6829 nameidone(&tond);
6830
6831 if (tvp)
6832 vnode_put(tvp);
6833 vnode_put(tdvp);
6834 vnode_put(sdvp);
6835 out1:
6836 vnode_put(fvp);
6837
6838 nameidone(&fromnd);
6839
6840 if (error == -1)
6841 return (0);
6842 return (error);
6843 }
6844
6845 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
6846
6847 /*
6848 * Helper function for doing clones. The caller is expected to provide an
6849 * iocounted source vnode and release it.
6850 */
6851 static int
6852 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
6853 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
6854 {
6855 vnode_t tvp, tdvp;
6856 struct nameidata tond;
6857 int error;
6858 int follow;
6859 boolean_t free_src_acl;
6860 boolean_t attr_cleanup;
6861 enum vtype v_type;
6862 kauth_action_t action;
6863 struct componentname *cnp;
6864 uint32_t defaulted;
6865 struct vnode_attr va;
6866 struct vnode_attr nva;
6867
6868 v_type = vnode_vtype(fvp);
6869 switch (v_type) {
6870 case VLNK:
6871 /* FALLTHRU */
6872 case VREG:
6873 action = KAUTH_VNODE_ADD_FILE;
6874 break;
6875 case VDIR:
6876 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
6877 fvp->v_mountedhere) {
6878 return (EINVAL);
6879 }
6880 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
6881 break;
6882 default:
6883 return (EINVAL);
6884 }
6885
6886 AUDIT_ARG(fd2, dst_dirfd);
6887 AUDIT_ARG(value32, flags);
6888
6889 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6890 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
6891 UIO_USERSPACE, dst, ctx);
6892 if ((error = nameiat(&tond, dst_dirfd)))
6893 return (error);
6894 cnp = &tond.ni_cnd;
6895 tdvp = tond.ni_dvp;
6896 tvp = tond.ni_vp;
6897
6898 free_src_acl = FALSE;
6899 attr_cleanup = FALSE;
6900
6901 if (tvp != NULL) {
6902 error = EEXIST;
6903 goto out;
6904 }
6905
6906 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
6907 error = EXDEV;
6908 goto out;
6909 }
6910
6911 #if CONFIG_MACF
6912 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
6913 goto out;
6914 #endif
6915 if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
6916 goto out;
6917
6918 action = KAUTH_VNODE_GENERIC_READ_BITS;
6919 if (data_read_authorised)
6920 action &= ~KAUTH_VNODE_READ_DATA;
6921 if ((error = vnode_authorize(fvp, NULL, action, ctx)))
6922 goto out;
6923
6924 /*
6925 * certain attributes may need to be changed from the source; we ask for
6926 * those here.
6927 */
6928 VATTR_INIT(&va);
6929 VATTR_WANTED(&va, va_uid);
6930 VATTR_WANTED(&va, va_gid);
6931 VATTR_WANTED(&va, va_mode);
6932 VATTR_WANTED(&va, va_flags);
6933 VATTR_WANTED(&va, va_acl);
6934
6935 if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
6936 goto out;
6937
6938 VATTR_INIT(&nva);
6939 VATTR_SET(&nva, va_type, v_type);
6940 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
6941 VATTR_SET(&nva, va_acl, va.va_acl);
6942 free_src_acl = TRUE;
6943 }
6944
6945 /* Handle ACL inheritance, initialize vap. */
6946 if (v_type == VLNK) {
6947 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
6948 } else {
6949 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
6950 if (error)
6951 goto out;
6952 attr_cleanup = TRUE;
6953 }
6954
6955 /*
6956 * We've got initial values for all security parameters,
6957 * If we are superuser, then we can change owners to be the
6958 * same as the source. Both superuser and the owner have default
6959 * WRITE_SECURITY privileges so all other fields can be taken
6960 * from source as well.
6961 */
6962 if (vfs_context_issuser(ctx)) {
6963 if (VATTR_IS_SUPPORTED(&va, va_uid))
6964 VATTR_SET(&nva, va_uid, va.va_uid);
6965 if (VATTR_IS_SUPPORTED(&va, va_gid))
6966 VATTR_SET(&nva, va_gid, va.va_gid);
6967 }
6968 if (VATTR_IS_SUPPORTED(&va, va_mode))
6969 VATTR_SET(&nva, va_mode, va.va_mode);
6970 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
6971 VATTR_SET(&nva, va_flags,
6972 ((va.va_flags & ~SF_RESTRICTED) | /* Turn off from source */
6973 (nva.va_flags & SF_RESTRICTED)));
6974 }
6975
6976 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva,
6977 VNODE_CLONEFILE_DEFAULT, ctx);
6978
6979 if (!error && tvp) {
6980 int update_flags = 0;
6981 #if CONFIG_FSE
6982 int fsevent;
6983 #endif /* CONFIG_FSE */
6984
6985 #if CONFIG_MACF
6986 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
6987 VNODE_LABEL_CREATE, ctx);
6988 #endif
6989 /*
6990 * If some of the requested attributes weren't handled by the
6991 * VNOP, use our fallback code.
6992 */
6993 if (!VATTR_ALL_SUPPORTED(&nva))
6994 (void)vnode_setattr_fallback(tvp, &nva, ctx);
6995
6996 // Make sure the name & parent pointers are hooked up
6997 if (tvp->v_name == NULL)
6998 update_flags |= VNODE_UPDATE_NAME;
6999 if (tvp->v_parent == NULLVP)
7000 update_flags |= VNODE_UPDATE_PARENT;
7001
7002 if (update_flags) {
7003 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7004 cnp->cn_namelen, cnp->cn_hash, update_flags);
7005 }
7006
7007 #if CONFIG_FSE
7008 switch (vnode_vtype(tvp)) {
7009 case VLNK:
7010 /* FALLTHRU */
7011 case VREG:
7012 fsevent = FSE_CREATE_FILE;
7013 break;
7014 case VDIR:
7015 fsevent = FSE_CREATE_DIR;
7016 break;
7017 default:
7018 goto out;
7019 }
7020
7021 if (need_fsevent(fsevent, tvp)) {
7022 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7023 FSE_ARG_DONE);
7024 }
7025 #endif /* CONFIG_FSE */
7026 }
7027
7028 out:
7029 if (attr_cleanup)
7030 vn_attribute_cleanup(&nva, defaulted);
7031 if (free_src_acl && va.va_acl)
7032 kauth_acl_free(va.va_acl);
7033 nameidone(&tond);
7034 if (tvp)
7035 vnode_put(tvp);
7036 vnode_put(tdvp);
7037 return (error);
7038 }
7039
7040 /*
7041 * clone files or directories, target must not exist.
7042 */
7043 /* ARGSUSED */
7044 int
7045 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7046 __unused int32_t *retval)
7047 {
7048 vnode_t fvp;
7049 struct nameidata fromnd;
7050 int follow;
7051 int error;
7052 vfs_context_t ctx = vfs_context_current();
7053
7054 /* Check that the flags are valid. */
7055 if (uap->flags & ~CLONE_NOFOLLOW)
7056 return (EINVAL);
7057
7058 AUDIT_ARG(fd, uap->src_dirfd);
7059
7060 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7061 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7062 UIO_USERSPACE, uap->src, ctx);
7063 if ((error = nameiat(&fromnd, uap->src_dirfd)))
7064 return (error);
7065
7066 fvp = fromnd.ni_vp;
7067 nameidone(&fromnd);
7068
7069 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7070 uap->flags, ctx);
7071
7072 vnode_put(fvp);
7073 return (error);
7074 }
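/*
 * Not part of the original xnu source: a userspace sketch of clonefileat(2).
 * Per clonefile_internal() above, the destination must not already exist
 * (EEXIST), source and destination must be on the same mount (EXDEV), and
 * CLONE_NOFOLLOW clones a symlink itself. The <sys/clonefile.h> header and
 * prototype are quoted from the macOS SDK as I recall them, so treat them
 * as an assumption; the paths are hypothetical.
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <sys/clonefile.h>      /* clonefileat(), CLONE_NOFOLLOW (assumed) */
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
        /* Clone src to a not-yet-existing dst on the same volume. */
        if (clonefileat(AT_FDCWD, "assets/big.mov",
            AT_FDCWD, "assets/big-copy.mov", 0) == -1) {
                if (errno == EEXIST)
                        fprintf(stderr, "destination already exists\n");
                else if (errno == EXDEV)
                        fprintf(stderr, "source and destination on different volumes\n");
                else if (errno == ENOTSUP)
                        fprintf(stderr, "filesystem cannot clone files\n");
                else
                        perror("clonefileat");
        }
        return 0;
}
#endif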
7075
7076 int
7077 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7078 __unused int32_t *retval)
7079 {
7080 vnode_t fvp;
7081 struct fileproc *fp;
7082 int error;
7083 vfs_context_t ctx = vfs_context_current();
7084
7085 AUDIT_ARG(fd, uap->src_fd);
7086 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7087 if (error)
7088 return (error);
7089
7090 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7091 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7092 error = EBADF;
7093 goto out;
7094 }
7095
7096 if ((error = vnode_getwithref(fvp)))
7097 goto out;
7098
7099 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7100
7101 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7102 uap->flags, ctx);
7103
7104 vnode_put(fvp);
7105 out:
7106 file_drop(uap->src_fd);
7107 return (error);
7108 }
7109
7110 /*
7111 * Rename files. Source and destination must either both be directories,
7112 * or both not be directories. If target is a directory, it must be empty.
7113 */
7114 /* ARGSUSED */
7115 static int
7116 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7117 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7118 {
7119 if (flags & ~VFS_RENAME_FLAGS_MASK)
7120 return EINVAL;
7121
7122 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
7123 return EINVAL;
7124
7125 vnode_t tvp, tdvp;
7126 vnode_t fvp, fdvp;
7127 struct nameidata *fromnd, *tond;
7128 int error;
7129 int do_retry;
7130 int retry_count;
7131 int mntrename;
7132 int need_event;
7133 const char *oname = NULL;
7134 char *from_name = NULL, *to_name = NULL;
7135 int from_len=0, to_len=0;
7136 int holding_mntlock;
7137 mount_t locked_mp = NULL;
7138 vnode_t oparent = NULLVP;
7139 #if CONFIG_FSE
7140 fse_info from_finfo, to_finfo;
7141 #endif
7142 int from_truncated=0, to_truncated;
7143 int batched = 0;
7144 struct vnode_attr *fvap, *tvap;
7145 int continuing = 0;
7146 /* carving out a chunk for structs that are too big to be on stack. */
7147 struct {
7148 struct nameidata from_node, to_node;
7149 struct vnode_attr fv_attr, tv_attr;
7150 } * __rename_data;
7151 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7152 fromnd = &__rename_data->from_node;
7153 tond = &__rename_data->to_node;
7154
7155 holding_mntlock = 0;
7156 do_retry = 0;
7157 retry_count = 0;
7158 retry:
7159 fvp = tvp = NULL;
7160 fdvp = tdvp = NULL;
7161 fvap = tvap = NULL;
7162 mntrename = FALSE;
7163
7164 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7165 segflg, from, ctx);
7166 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7167
7168 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7169 segflg, to, ctx);
7170 tond->ni_flag = NAMEI_COMPOUNDRENAME;
7171
7172 continue_lookup:
7173 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7174 if ( (error = nameiat(fromnd, fromfd)) )
7175 goto out1;
7176 fdvp = fromnd->ni_dvp;
7177 fvp = fromnd->ni_vp;
7178
7179 if (fvp && fvp->v_type == VDIR)
7180 tond->ni_cnd.cn_flags |= WILLBEDIR;
7181 }
7182
7183 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7184 if ( (error = nameiat(tond, tofd)) ) {
7185 /*
7186 * Translate error code for rename("dir1", "dir2/.").
7187 */
7188 if (error == EISDIR && fvp->v_type == VDIR)
7189 error = EINVAL;
7190 goto out1;
7191 }
7192 tdvp = tond->ni_dvp;
7193 tvp = tond->ni_vp;
7194 }
7195
7196 #if DEVELOPMENT || DEBUG
7197 /*
7198 * XXX VSWAP: Check for entitlements or special flag here
7199 * so we can restrict access appropriately.
7200 */
7201 #else /* DEVELOPMENT || DEBUG */
7202
7203 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
7204 error = EPERM;
7205 goto out1;
7206 }
7207
7208 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
7209 error = EPERM;
7210 goto out1;
7211 }
7212 #endif /* DEVELOPMENT || DEBUG */
7213
7214 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
7215 error = ENOENT;
7216 goto out1;
7217 }
7218
7219 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
7220 error = EEXIST;
7221 goto out1;
7222 }
7223
7224 batched = vnode_compound_rename_available(fdvp);
7225 if (!fvp) {
7226 /*
7227 * Claim: this check will never reject a valid rename.
7228 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
7229 * Suppose fdvp and tdvp are not on the same mount.
7230 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
7231 * then you can't move it to within another dir on the same mountpoint.
7232 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
7233 *
7234 * If this check passes, then we are safe to pass these vnodes to the same FS.
7235 */
7236 if (fdvp->v_mount != tdvp->v_mount) {
7237 error = EXDEV;
7238 goto out1;
7239 }
7240 goto skipped_lookup;
7241 }
7242
7243 if (!batched) {
7244 error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL);
7245 if (error) {
7246 if (error == ENOENT) {
7247 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7248 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7249 /*
7250 * We encountered a race where after doing the namei, tvp stops
7251 * being valid. If so, simply re-drive the rename call from the
7252 * top.
7253 */
7254 do_retry = 1;
7255 retry_count += 1;
7256 }
7257 }
7258 goto out1;
7259 }
7260 }
7261
7262 /*
7263 * If the source and destination are the same (i.e. they're
7264 * links to the same vnode) and the target file system is
7265 * case sensitive, then there is nothing to do.
7266 *
7267 * XXX Come back to this.
7268 */
7269 if (fvp == tvp) {
7270 int pathconf_val;
7271
7272 /*
7273 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
7274 * then assume that this file system is case sensitive.
7275 */
7276 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
7277 pathconf_val != 0) {
7278 goto out1;
7279 }
7280 }
7281
7282 /*
7283 * Allow the renaming of mount points.
7284 * - target must not exist
7285 * - target must reside in the same directory as source
7286 * - union mounts cannot be renamed
7287 * - "/" cannot be renamed
7288 *
7289 * XXX Handle this in VFS after a continued lookup (if we missed
7290 * in the cache to start off)
7291 *
7292 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
7293 * we'll skip past here. The file system is responsible for
7294 * checking that @tvp is not a descendant of @fvp and vice versa
7295 * so it should always return EINVAL if either @tvp or @fvp is the
7296 * root of a volume.
7297 */
7298 if ((fvp->v_flag & VROOT) &&
7299 (fvp->v_type == VDIR) &&
7300 (tvp == NULL) &&
7301 (fvp->v_mountedhere == NULL) &&
7302 (fdvp == tdvp) &&
7303 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
7304 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
7305 vnode_t coveredvp;
7306
7307 /* switch fvp to the covered vnode */
7308 coveredvp = fvp->v_mount->mnt_vnodecovered;
7309 if ( (vnode_getwithref(coveredvp)) ) {
7310 error = ENOENT;
7311 goto out1;
7312 }
7313 vnode_put(fvp);
7314
7315 fvp = coveredvp;
7316 mntrename = TRUE;
7317 }
7318 /*
7319 * Check for cross-device rename.
7320 */
7321 if ((fvp->v_mount != tdvp->v_mount) ||
7322 (tvp && (fvp->v_mount != tvp->v_mount))) {
7323 error = EXDEV;
7324 goto out1;
7325 }
7326
7327 /*
7328 * If source is the same as the destination (that is the
7329 * same inode number) then there is nothing to do...
7330 * EXCEPT if the underlying file system supports case
7331 * insensitivity and is case preserving. In this case
7332 * the file system needs to handle the special case of
7333 * getting the same vnode as target (fvp) and source (tvp).
7334 *
7335 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
7336 * and _PC_CASE_PRESERVING can have this exception, and they need to
7337 * handle the special case of getting the same vnode as target and
7338 * source. NOTE: Then the target is unlocked going into vnop_rename,
7339 * so not to cause locking problems. There is a single reference on tvp.
7340 *
7341 * NOTE - that fvp == tvp also occurs if they are hard linked and
7342 * that correct behaviour then is just to return success without doing
7343 * anything.
7344 *
7345 * XXX filesystem should take care of this itself, perhaps...
7346 */
7347 if (fvp == tvp && fdvp == tdvp) {
7348 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
7349 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
7350 fromnd->ni_cnd.cn_namelen)) {
7351 goto out1;
7352 }
7353 }
7354
7355 if (holding_mntlock && fvp->v_mount != locked_mp) {
7356 /*
7357 * we're holding a reference and lock
7358 * on locked_mp, but it no longer matches
7359 * what we want to do... so drop our hold
7360 */
7361 mount_unlock_renames(locked_mp);
7362 mount_drop(locked_mp, 0);
7363 holding_mntlock = 0;
7364 }
7365 if (tdvp != fdvp && fvp->v_type == VDIR) {
7366 /*
7367 * serialize renames that re-shape
7368 * the tree... if holding_mntlock is
7369 * set, then we're ready to go...
7370 * otherwise we
7371 * first need to drop the iocounts
7372 * we picked up, second take the
7373 * lock to serialize the access,
7374 * then finally start the lookup
7375 * process over with the lock held
7376 */
7377 if (!holding_mntlock) {
7378 /*
7379 * need to grab a reference on
7380 * the mount point before we
7381 * drop all the iocounts... once
7382 * the iocounts are gone, the mount
7383 * could follow
7384 */
7385 locked_mp = fvp->v_mount;
7386 mount_ref(locked_mp, 0);
7387
7388 /*
7389 * nameidone has to happen before we vnode_put(tvp)
7390 * since it may need to release the fs_nodelock on the tvp
7391 */
7392 nameidone(tond);
7393
7394 if (tvp)
7395 vnode_put(tvp);
7396 vnode_put(tdvp);
7397
7398 /*
7399 * nameidone has to happen before we vnode_put(fdvp)
7400 * since it may need to release the fs_nodelock on the fvp
7401 */
7402 nameidone(fromnd);
7403
7404 vnode_put(fvp);
7405 vnode_put(fdvp);
7406
7407 mount_lock_renames(locked_mp);
7408 holding_mntlock = 1;
7409
7410 goto retry;
7411 }
7412 } else {
7413 /*
7414 * when we dropped the iocounts to take
7415 * the lock, we allowed the identity of
7416 * the various vnodes to change... if they did,
7417 * we may no longer be dealing with a rename
7418 * that reshapes the tree... once we're holding
7419 * the iocounts, the vnodes can't change type
7420 * so we're free to drop the lock at this point
7421 * and continue on
7422 */
7423 if (holding_mntlock) {
7424 mount_unlock_renames(locked_mp);
7425 mount_drop(locked_mp, 0);
7426 holding_mntlock = 0;
7427 }
7428 }
7429
7430 // save these off so we can later verify that fvp is the same
7431 oname = fvp->v_name;
7432 oparent = fvp->v_parent;
7433
7434 skipped_lookup:
7435 #if CONFIG_FSE
7436 need_event = need_fsevent(FSE_RENAME, fdvp);
7437 if (need_event) {
7438 if (fvp) {
7439 get_fse_info(fvp, &from_finfo, ctx);
7440 } else {
7441 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
7442 if (error) {
7443 goto out1;
7444 }
7445
7446 fvap = &__rename_data->fv_attr;
7447 }
7448
7449 if (tvp) {
7450 get_fse_info(tvp, &to_finfo, ctx);
7451 } else if (batched) {
7452 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
7453 if (error) {
7454 goto out1;
7455 }
7456
7457 tvap = &__rename_data->tv_attr;
7458 }
7459 }
7460 #else
7461 need_event = 0;
7462 #endif /* CONFIG_FSE */
7463
7464 if (need_event || kauth_authorize_fileop_has_listeners()) {
7465 if (from_name == NULL) {
7466 GET_PATH(from_name);
7467 if (from_name == NULL) {
7468 error = ENOMEM;
7469 goto out1;
7470 }
7471 }
7472
7473 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
7474
7475 if (to_name == NULL) {
7476 GET_PATH(to_name);
7477 if (to_name == NULL) {
7478 error = ENOMEM;
7479 goto out1;
7480 }
7481 }
7482
7483 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
7484 }
7485 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
7486 tdvp, &tvp, &tond->ni_cnd, tvap,
7487 flags, ctx);
7488
7489 if (holding_mntlock) {
7490 /*
7491 * we can drop our serialization
7492 * lock now
7493 */
7494 mount_unlock_renames(locked_mp);
7495 mount_drop(locked_mp, 0);
7496 holding_mntlock = 0;
7497 }
7498 if (error) {
7499 if (error == EKEEPLOOKING) {
7500 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7501 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7502 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
7503 }
7504 }
7505
7506 fromnd->ni_vp = fvp;
7507 tond->ni_vp = tvp;
7508
7509 goto continue_lookup;
7510 }
7511
7512 /*
7513 * We may encounter a race in the VNOP where the destination didn't
7514 * exist when we did the namei, but it does by the time we go and
7515 * try to create the entry. In this case, we should re-drive this rename
7516 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
7517 * but other filesystems susceptible to this race could return it, too.
7518 */
7519 if (error == ERECYCLE) {
7520 do_retry = 1;
7521 }
7522
7523 /*
7524 * For compound VNOPs, the authorization callback may return
7525 * ENOENT in case of racing hardlink lookups hitting the name
7526 * cache; redrive the lookup.
7527 */
7528 if (batched && error == ENOENT) {
7529 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7530 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7531 do_retry = 1;
7532 retry_count += 1;
7533 }
7534 }
7535
7536 goto out1;
7537 }
7538
7539 /* call out to allow 3rd party notification of rename.
7540 * Ignore result of kauth_authorize_fileop call.
7541 */
7542 kauth_authorize_fileop(vfs_context_ucred(ctx),
7543 KAUTH_FILEOP_RENAME,
7544 (uintptr_t)from_name, (uintptr_t)to_name);
7545 if (flags & VFS_RENAME_SWAP) {
7546 kauth_authorize_fileop(vfs_context_ucred(ctx),
7547 KAUTH_FILEOP_RENAME,
7548 (uintptr_t)to_name, (uintptr_t)from_name);
7549 }
7550
7551 #if CONFIG_FSE
7552 if (from_name != NULL && to_name != NULL) {
7553 if (from_truncated || to_truncated) {
7554 // set it here since only the from_finfo gets reported up to user space
7555 from_finfo.mode |= FSE_TRUNCATED_PATH;
7556 }
7557
7558 if (tvap && tvp) {
7559 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
7560 }
7561 if (fvap) {
7562 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
7563 }
7564
7565 if (tvp) {
7566 add_fsevent(FSE_RENAME, ctx,
7567 FSE_ARG_STRING, from_len, from_name,
7568 FSE_ARG_FINFO, &from_finfo,
7569 FSE_ARG_STRING, to_len, to_name,
7570 FSE_ARG_FINFO, &to_finfo,
7571 FSE_ARG_DONE);
7572 if (flags & VFS_RENAME_SWAP) {
7573 /*
7574 * Strictly speaking, swap is the equivalent of
7575 * *three* renames. FSEvents clients should only take
7576 * the events as a hint, so we only bother reporting
7577 * two.
7578 */
7579 add_fsevent(FSE_RENAME, ctx,
7580 FSE_ARG_STRING, to_len, to_name,
7581 FSE_ARG_FINFO, &to_finfo,
7582 FSE_ARG_STRING, from_len, from_name,
7583 FSE_ARG_FINFO, &from_finfo,
7584 FSE_ARG_DONE);
7585 }
7586 } else {
7587 add_fsevent(FSE_RENAME, ctx,
7588 FSE_ARG_STRING, from_len, from_name,
7589 FSE_ARG_FINFO, &from_finfo,
7590 FSE_ARG_STRING, to_len, to_name,
7591 FSE_ARG_DONE);
7592 }
7593 }
7594 #endif /* CONFIG_FSE */
7595
7596 /*
7597 * update filesystem's mount point data
7598 */
7599 if (mntrename) {
7600 char *cp, *pathend, *mpname;
7601 char * tobuf;
7602 struct mount *mp;
7603 int maxlen;
7604 size_t len = 0;
7605
7606 mp = fvp->v_mountedhere;
7607
7608 if (vfs_busy(mp, LK_NOWAIT)) {
7609 error = EBUSY;
7610 goto out1;
7611 }
7612 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7613
7614 if (UIO_SEG_IS_USER_SPACE(segflg))
7615 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7616 else
7617 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7618 if (!error) {
7619 /* find current mount point prefix */
7620 pathend = &mp->mnt_vfsstat.f_mntonname[0];
7621 for (cp = pathend; *cp != '\0'; ++cp) {
7622 if (*cp == '/')
7623 pathend = cp + 1;
7624 }
7625 /* find last component of target name */
7626 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7627 if (*cp == '/')
7628 mpname = cp + 1;
7629 }
7630 /* append name to prefix */
7631 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7632 bzero(pathend, maxlen);
7633 strlcpy(pathend, mpname, maxlen);
7634 }
7635 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7636
7637 vfs_unbusy(mp);
7638 }
7639 /*
7640 * fix up name & parent pointers. note that we first
7641 * check that fvp has the same name/parent pointers it
7642 * had before the rename call... this is a 'weak' check
7643 * at best...
7644 *
7645 * XXX oparent and oname may not be set in the compound vnop case
7646 */
7647 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7648 int update_flags;
7649
7650 update_flags = VNODE_UPDATE_NAME;
7651
7652 if (fdvp != tdvp)
7653 update_flags |= VNODE_UPDATE_PARENT;
7654
7655 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7656 }
7657 out1:
7658 if (to_name != NULL) {
7659 RELEASE_PATH(to_name);
7660 to_name = NULL;
7661 }
7662 if (from_name != NULL) {
7663 RELEASE_PATH(from_name);
7664 from_name = NULL;
7665 }
7666 if (holding_mntlock) {
7667 mount_unlock_renames(locked_mp);
7668 mount_drop(locked_mp, 0);
7669 holding_mntlock = 0;
7670 }
7671 if (tdvp) {
7672 /*
7673 * nameidone has to happen before we vnode_put(tdvp)
7674 * since it may need to release the fs_nodelock on the tdvp
7675 */
7676 nameidone(tond);
7677
7678 if (tvp)
7679 vnode_put(tvp);
7680 vnode_put(tdvp);
7681 }
7682 if (fdvp) {
7683 /*
7684 * nameidone has to happen before we vnode_put(fdvp)
7685 * since it may need to release the fs_nodelock on the fdvp
7686 */
7687 nameidone(fromnd);
7688
7689 if (fvp)
7690 vnode_put(fvp);
7691 vnode_put(fdvp);
7692 }
7693
7694 /*
7695 * If things changed after we did the namei, then we will re-drive
7696 * this rename call from the top.
7697 */
7698 if (do_retry) {
7699 do_retry = 0;
7700 goto retry;
7701 }
7702
7703 FREE(__rename_data, M_TEMP);
7704 return (error);
7705 }
7706
7707 int
7708 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7709 {
7710 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7711 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7712 }
7713
7714 int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7715 {
7716 return renameat_internal(
7717 vfs_context_current(),
7718 uap->fromfd, uap->from,
7719 uap->tofd, uap->to,
7720 UIO_USERSPACE, uap->flags);
7721 }
7722
7723 int
7724 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7725 {
7726 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7727 uap->tofd, uap->to, UIO_USERSPACE, 0));
7728 }
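/*
 * Not part of the original xnu source: renameatx_np() is the userspace
 * entry point for the flags handled by renameat_internal() above. As the
 * checks there show, the swap and exclusive flags are mutually exclusive
 * (EINVAL), a swap needs both names to exist (ENOENT otherwise), and the
 * exclusive form fails with EEXIST if the target exists. The libc prototype
 * and the RENAME_SWAP/RENAME_EXCL names (from <stdio.h> on macOS) are
 * quoted from memory, so treat them as an assumption; file names are
 * hypothetical.
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <stdio.h>      /* renameatx_np(), RENAME_SWAP, RENAME_EXCL (assumed) */
#include <fcntl.h>
#include <errno.h>

int
main(void)
{
        /* Atomically swap two existing files. */
        if (renameatx_np(AT_FDCWD, "config.plist",
            AT_FDCWD, "config.plist.new", RENAME_SWAP) == -1)
                perror("renameatx_np(RENAME_SWAP)");

        /* Rename, but refuse to clobber an existing target. */
        if (renameatx_np(AT_FDCWD, "staging.db",
            AT_FDCWD, "live.db", RENAME_EXCL) == -1 && errno == EEXIST)
                fprintf(stderr, "live.db already exists\n");
        return 0;
}
#endif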
7729
7730 /*
7731 * Make a directory file.
7732 *
7733 * Returns: 0 Success
7734 * EEXIST
7735 * namei:???
7736 * vnode_authorize:???
7737 * vn_create:???
7738 */
7739 /* ARGSUSED */
7740 static int
7741 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7742 enum uio_seg segflg)
7743 {
7744 vnode_t vp, dvp;
7745 int error;
7746 int update_flags = 0;
7747 int batched;
7748 struct nameidata nd;
7749
7750 AUDIT_ARG(mode, vap->va_mode);
7751 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7752 path, ctx);
7753 nd.ni_cnd.cn_flags |= WILLBEDIR;
7754 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7755
7756 continue_lookup:
7757 error = nameiat(&nd, fd);
7758 if (error)
7759 return (error);
7760 dvp = nd.ni_dvp;
7761 vp = nd.ni_vp;
7762
7763 if (vp != NULL) {
7764 error = EEXIST;
7765 goto out;
7766 }
7767
7768 batched = vnode_compound_mkdir_available(dvp);
7769
7770 VATTR_SET(vap, va_type, VDIR);
7771
7772 /*
7773 * XXX
7774 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7775 * only get EXISTS or EISDIR for existing path components, and not that it could see
7776 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7777 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
7778 */
7779 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7780 if (error == EACCES || error == EPERM) {
7781 int error2;
7782
7783 nameidone(&nd);
7784 vnode_put(dvp);
7785 dvp = NULLVP;
7786
7787 /*
7788 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7789 * rather than EACCES if the target exists.
7790 */
7791 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7792 path, ctx);
7793 error2 = nameiat(&nd, fd);
7794 if (error2) {
7795 goto out;
7796 } else {
7797 vp = nd.ni_vp;
7798 error = EEXIST;
7799 goto out;
7800 }
7801 }
7802
7803 goto out;
7804 }
7805
7806 /*
7807 * make the directory
7808 */
7809 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7810 if (error == EKEEPLOOKING) {
7811 nd.ni_vp = vp;
7812 goto continue_lookup;
7813 }
7814
7815 goto out;
7816 }
7817
7818 // Make sure the name & parent pointers are hooked up
7819 if (vp->v_name == NULL)
7820 update_flags |= VNODE_UPDATE_NAME;
7821 if (vp->v_parent == NULLVP)
7822 update_flags |= VNODE_UPDATE_PARENT;
7823
7824 if (update_flags)
7825 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
7826
7827 #if CONFIG_FSE
7828 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
7829 #endif
7830
7831 out:
7832 /*
7833 * nameidone has to happen before we vnode_put(dvp)
7834 * since it may need to release the fs_nodelock on the dvp
7835 */
7836 nameidone(&nd);
7837
7838 if (vp)
7839 vnode_put(vp);
7840 if (dvp)
7841 vnode_put(dvp);
7842
7843 return (error);
7844 }
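/*
 * Not part of the original xnu source: the XXX comment in mkdir1at() above
 * is about keeping the errno contract that "mkdir -p" style callers rely
 * on: an existing path component must surface as EEXIST, not EACCES. A
 * minimal userspace sketch of that idiom, with hypothetical paths:
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
        /* mkdir -p style: an already existing directory is not an error. */
        if (mkdirat(AT_FDCWD, "var/cache", 0755) == -1 && errno != EEXIST) {
                perror("mkdirat(var/cache)");
                return 1;
        }
        if (mkdirat(AT_FDCWD, "var/cache/app", 0755) == -1 && errno != EEXIST) {
                perror("mkdirat(var/cache/app)");
                return 1;
        }
        return 0;       /* the requested mode is still masked by the umask */
}
#endif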
7845
7846 /*
7847 * mkdir_extended: Create a directory; with extended security (ACL).
7848 *
7849 * Parameters: p Process requesting to create the directory
7850 * uap User argument descriptor (see below)
7851 * retval (ignored)
7852 *
7853 * Indirect: uap->path Path of directory to create
7854 * uap->mode Access permissions to set
7855 * uap->xsecurity ACL to set
7856 *
7857 * Returns: 0 Success
7858 * !0 Not success
7859 *
7860 */
7861 int
7862 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7863 {
7864 int ciferror;
7865 kauth_filesec_t xsecdst;
7866 struct vnode_attr va;
7867
7868 AUDIT_ARG(owner, uap->uid, uap->gid);
7869
7870 xsecdst = NULL;
7871 if ((uap->xsecurity != USER_ADDR_NULL) &&
7872 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7873 return ciferror;
7874
7875 VATTR_INIT(&va);
7876 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7877 if (xsecdst != NULL)
7878 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7879
7880 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7881 UIO_USERSPACE);
7882 if (xsecdst != NULL)
7883 kauth_filesec_free(xsecdst);
7884 return ciferror;
7885 }
7886
7887 int
7888 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7889 {
7890 struct vnode_attr va;
7891
7892 VATTR_INIT(&va);
7893 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7894
7895 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7896 UIO_USERSPACE));
7897 }
7898
7899 int
7900 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7901 {
7902 struct vnode_attr va;
7903
7904 VATTR_INIT(&va);
7905 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7906
7907 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7908 UIO_USERSPACE));
7909 }
7910
7911 static int
7912 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
7913 enum uio_seg segflg)
7914 {
7915 vnode_t vp, dvp;
7916 int error;
7917 struct nameidata nd;
7918 char *path = NULL;
7919 int len=0;
7920 int has_listeners = 0;
7921 int need_event = 0;
7922 int truncated = 0;
7923 #if CONFIG_FSE
7924 struct vnode_attr va;
7925 #endif /* CONFIG_FSE */
7926 struct vnode_attr *vap = NULL;
7927 int restart_count = 0;
7928 int batched;
7929
7930 int restart_flag;
7931
7932 /*
7933 * This loop exists to restart rmdir in the unlikely case that two
7934 * processes are simultaneously trying to remove the same directory
7935 * containing orphaned appleDouble files.
7936 */
7937 do {
7938 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
7939 segflg, dirpath, ctx);
7940 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
7941 continue_lookup:
7942 restart_flag = 0;
7943 vap = NULL;
7944
7945 error = nameiat(&nd, fd);
7946 if (error)
7947 return (error);
7948
7949 dvp = nd.ni_dvp;
7950 vp = nd.ni_vp;
7951
7952 if (vp) {
7953 batched = vnode_compound_rmdir_available(vp);
7954
7955 if (vp->v_flag & VROOT) {
7956 /*
7957 * The root of a mounted filesystem cannot be deleted.
7958 */
7959 error = EBUSY;
7960 goto out;
7961 }
7962
7963 #if DEVELOPMENT || DEBUG
7964 /*
7965 * XXX VSWAP: Check for entitlements or special flag here
7966 * so we can restrict access appropriately.
7967 */
7968 #else /* DEVELOPMENT || DEBUG */
7969
7970 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
7971 error = EPERM;
7972 goto out;
7973 }
7974 #endif /* DEVELOPMENT || DEBUG */
7975
7976 /*
7977 * Removed a check here; we used to abort if vp's vid
7978 * was not the same as what we'd seen the last time around.
7979 * I do not think that check was valid, because if we retry
7980 * and all dirents are gone, the directory could legitimately
7981 * be recycled but still be present in a situation where we would
7982 * have had permission to delete. Therefore, we won't make
7983 * an effort to preserve that check now that we may not have a
7984 * vp here.
7985 */
7986
7987 if (!batched) {
7988 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
7989 if (error) {
7990 if (error == ENOENT) {
7991 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7992 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7993 restart_flag = 1;
7994 restart_count += 1;
7995 }
7996 }
7997 goto out;
7998 }
7999 }
8000 } else {
8001 batched = 1;
8002
8003 if (!vnode_compound_rmdir_available(dvp)) {
8004 panic("No error, but no compound rmdir?");
8005 }
8006 }
8007
8008 #if CONFIG_FSE
8009 fse_info finfo;
8010
8011 need_event = need_fsevent(FSE_DELETE, dvp);
8012 if (need_event) {
8013 if (!batched) {
8014 get_fse_info(vp, &finfo, ctx);
8015 } else {
8016 error = vfs_get_notify_attributes(&va);
8017 if (error) {
8018 goto out;
8019 }
8020
8021 vap = &va;
8022 }
8023 }
8024 #endif
8025 has_listeners = kauth_authorize_fileop_has_listeners();
8026 if (need_event || has_listeners) {
8027 if (path == NULL) {
8028 GET_PATH(path);
8029 if (path == NULL) {
8030 error = ENOMEM;
8031 goto out;
8032 }
8033 }
8034
8035 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
8036 #if CONFIG_FSE
8037 if (truncated) {
8038 finfo.mode |= FSE_TRUNCATED_PATH;
8039 }
8040 #endif
8041 }
8042
8043 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8044 nd.ni_vp = vp;
8045 if (vp == NULLVP) {
8046 /* Couldn't find a vnode */
8047 goto out;
8048 }
8049
8050 if (error == EKEEPLOOKING) {
8051 goto continue_lookup;
8052 } else if (batched && error == ENOENT) {
8053 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8054 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8055 /*
8056 * For compound VNOPs, the authorization callback
8057 * may return ENOENT in case of racing hard link lookups;
8058 * redrive the lookup.
8059 */
8060 restart_flag = 1;
8061 restart_count += 1;
8062 goto out;
8063 }
8064 }
8065 #if CONFIG_APPLEDOUBLE
8066 /*
8067 * Special case to remove orphaned AppleDouble
8068 * files. I don't like putting this in the kernel,
8069 * but carbon does not like putting this in carbon either,
8070 * so here we are.
8071 */
8072 if (error == ENOTEMPTY) {
8073 error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
8074 if (error == EBUSY) {
8075 goto out;
8076 }
8077
8078
8079 /*
8080 * Assuming everything went well, we will try the RMDIR again
8081 */
8082 if (!error)
8083 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8084 }
8085 #endif /* CONFIG_APPLEDOUBLE */
8086 /*
8087 * Call out to allow 3rd party notification of delete.
8088 * Ignore result of kauth_authorize_fileop call.
8089 */
8090 if (!error) {
8091 if (has_listeners) {
8092 kauth_authorize_fileop(vfs_context_ucred(ctx),
8093 KAUTH_FILEOP_DELETE,
8094 (uintptr_t)vp,
8095 (uintptr_t)path);
8096 }
8097
8098 if (vp->v_flag & VISHARDLINK) {
8099 // see the comment in unlink1() about why we update
8100 // the parent of a hard link when it is removed
8101 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
8102 }
8103
8104 #if CONFIG_FSE
8105 if (need_event) {
8106 if (vap) {
8107 vnode_get_fse_info_from_vap(vp, &finfo, vap);
8108 }
8109 add_fsevent(FSE_DELETE, ctx,
8110 FSE_ARG_STRING, len, path,
8111 FSE_ARG_FINFO, &finfo,
8112 FSE_ARG_DONE);
8113 }
8114 #endif
8115 }
8116
8117 out:
8118 if (path != NULL) {
8119 RELEASE_PATH(path);
8120 path = NULL;
8121 }
8122 /*
8123 * nameidone has to happen before we vnode_put(dvp)
8124 * since it may need to release the fs_nodelock on the dvp
8125 */
8126 nameidone(&nd);
8127 vnode_put(dvp);
8128
8129 if (vp)
8130 vnode_put(vp);
8131
8132 if (restart_flag == 0) {
8133 wakeup_one((caddr_t)vp);
8134 return (error);
8135 }
8136 tsleep(vp, PVFS, "rm AD", 1);
8137
8138 } while (restart_flag != 0);
8139
8140 return (error);
8141
8142 }
8143
8144 /*
8145 * Remove a directory file.
8146 */
8147 /* ARGSUSED */
8148 int
8149 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8150 {
8151 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8152 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8153 }
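/*
 * Not part of the original xnu source: rmdirat_internal() above retries
 * when orphaned AppleDouble ("._*") entries are all that remain, but a
 * directory with real contents still fails with ENOTEMPTY, and the root of
 * a mounted filesystem fails with EBUSY. A minimal userspace sketch with a
 * hypothetical path:
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
        if (rmdir("var/cache/app") == -1) {
                if (errno == ENOTEMPTY)
                        fprintf(stderr, "directory still has entries\n");
                else if (errno == EBUSY)
                        fprintf(stderr, "directory is the root of a mount\n");
                else
                        perror("rmdir");
        }
        return 0;
}
#endif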
8154
8155 /* Get direntry length padded to 8 byte alignment */
8156 #define DIRENT64_LEN(namlen) \
8157 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
8158
8159 errno_t
8160 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8161 int *numdirent, vfs_context_t ctxp)
8162 {
8163 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8164 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8165 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
8166 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8167 } else {
8168 size_t bufsize;
8169 void * bufptr;
8170 uio_t auio;
8171 struct direntry *entry64;
8172 struct dirent *dep;
8173 int bytesread;
8174 int error;
8175
8176 /*
8177 * Our kernel buffer needs to be smaller since re-packing
8178 * will expand each dirent. The worst case (when the name
8179 * length is 3) corresponds to a struct direntry size of 32
8180 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8181 * (4-byte aligned). So having a buffer that is 3/8 the size
8182 * will prevent us from reading more than we can pack.
8183 *
8184 * Since this buffer is wired memory, we will limit the
8185 * buffer size to a maximum of 32K. We would really like to
8186 * use 32K in the MIN(), but we use magic number 87371 to
8187 * prevent uio_resid() * 3 / 8 from overflowing.
8188 */
8189 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8190 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8191 if (bufptr == NULL) {
8192 return ENOMEM;
8193 }
8194
8195 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8196 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8197 auio->uio_offset = uio->uio_offset;
8198
8199 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8200
8201 dep = (struct dirent *)bufptr;
8202 bytesread = bufsize - uio_resid(auio);
8203
8204 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8205 M_TEMP, M_WAITOK);
8206 /*
8207 * Convert all the entries and copy them out to user's buffer.
8208 */
8209 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8210 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
8211
8212 bzero(entry64, enbufsize);
8213 /* Convert a dirent to a dirent64. */
8214 entry64->d_ino = dep->d_ino;
8215 entry64->d_seekoff = 0;
8216 entry64->d_reclen = enbufsize;
8217 entry64->d_namlen = dep->d_namlen;
8218 entry64->d_type = dep->d_type;
8219 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8220
8221 /* Move to next entry. */
8222 dep = (struct dirent *)((char *)dep + dep->d_reclen);
8223
8224 /* Copy entry64 to user's buffer. */
8225 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8226 }
8227
8228 /* Update the real offset using the offset we got from VNOP_READDIR. */
8229 if (error == 0) {
8230 uio->uio_offset = auio->uio_offset;
8231 }
8232 uio_free(auio);
8233 FREE(bufptr, M_TEMP);
8234 FREE(entry64, M_TEMP);
8235 return (error);
8236 }
8237 }
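/*
 * Not part of the original xnu source: a host-side check of the 3/8 sizing
 * argument above. The smallest repackable struct dirent is 12 bytes and the
 * corresponding struct direntry is 32 bytes after 8-byte alignment, so the
 * output can grow by at most 32/12; staging 3/8 of the caller's residual
 * therefore always repacks into the user buffer, and the MIN() cap of 87371
 * keeps the wired buffer at 3 * 87371 / 8 = 32764 bytes, just under 32K.
 * The sketch below only reproduces that arithmetic.
 */
#if 0   /* illustrative only; not compiled as part of this file */
#include <stdio.h>

int
main(void)
{
        unsigned resid = 87371u;                /* the MIN() cap used above   */
        unsigned bufsize = 3u * resid / 8u;     /* kernel staging buffer size */
        /* Worst case: every 12-byte dirent repacks into a 32-byte direntry. */
        unsigned worst_out = bufsize / 12u * 32u;

        printf("bufsize=%u worst_out=%u user_resid=%u\n",
            bufsize, worst_out, resid);         /* 32764 87360 87371 */
        return 0;
}
#endif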
8238
8239 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
8240
8241 /*
8242 * Read a block of directory entries in a file system independent format.
8243 */
8244 static int
8245 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
8246 off_t *offset, int flags)
8247 {
8248 vnode_t vp;
8249 struct vfs_context context = *vfs_context_current(); /* local copy */
8250 struct fileproc *fp;
8251 uio_t auio;
8252 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8253 off_t loff;
8254 int error, eofflag, numdirent;
8255 char uio_buf[ UIO_SIZEOF(1) ];
8256
8257 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
8258 if (error) {
8259 return (error);
8260 }
8261 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8262 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8263 error = EBADF;
8264 goto out;
8265 }
8266
8267 if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
8268 bufsize = GETDIRENTRIES_MAXBUFSIZE;
8269
8270 #if CONFIG_MACF
8271 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
8272 if (error)
8273 goto out;
8274 #endif
8275 if ( (error = vnode_getwithref(vp)) ) {
8276 goto out;
8277 }
8278 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8279
8280 unionread:
8281 if (vp->v_type != VDIR) {
8282 (void)vnode_put(vp);
8283 error = EINVAL;
8284 goto out;
8285 }
8286
8287 #if CONFIG_MACF
8288 error = mac_vnode_check_readdir(&context, vp);
8289 if (error != 0) {
8290 (void)vnode_put(vp);
8291 goto out;
8292 }
8293 #endif /* MAC */
8294
8295 loff = fp->f_fglob->fg_offset;
8296 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8297 uio_addiov(auio, bufp, bufsize);
8298
8299 if (flags & VNODE_READDIR_EXTENDED) {
8300 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
8301 fp->f_fglob->fg_offset = uio_offset(auio);
8302 } else {
8303 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
8304 fp->f_fglob->fg_offset = uio_offset(auio);
8305 }
8306 if (error) {
8307 (void)vnode_put(vp);
8308 goto out;
8309 }
8310
8311 if ((user_ssize_t)bufsize == uio_resid(auio)){
8312 if (union_dircheckp) {
8313 error = union_dircheckp(&vp, fp, &context);
8314 if (error == -1)
8315 goto unionread;
8316 if (error) {
8317 (void)vnode_put(vp);
8318 goto out;
8319 }
8320 }
8321
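/*
 * Nothing was returned and this is a union mount: drop down to the
 * covered (lower) directory vnode, reset the offset, and retry the
 * read so the lower layer's entries are visible to the caller.
 */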
8322 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
8323 struct vnode *tvp = vp;
8324 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
8325 vnode_ref(vp);
8326 fp->f_fglob->fg_data = (caddr_t) vp;
8327 fp->f_fglob->fg_offset = 0;
8328 vnode_rele(tvp);
8329 vnode_put(tvp);
8330 goto unionread;
8331 }
8332 vp = tvp;
8333 }
8334 }
8335
8336 vnode_put(vp);
8337 if (offset) {
8338 *offset = loff;
8339 }
8340
8341 *bytesread = bufsize - uio_resid(auio);
8342 out:
8343 file_drop(fd);
8344 return (error);
8345 }
8346
8347
8348 int
8349 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8350 {
8351 off_t offset;
8352 ssize_t bytesread;
8353 int error;
8354
8355 AUDIT_ARG(fd, uap->fd);
8356 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8357
8358 if (error == 0) {
8359 if (proc_is64bit(p)) {
8360 user64_long_t base = (user64_long_t)offset;
8361 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8362 } else {
8363 user32_long_t base = (user32_long_t)offset;
8364 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8365 }
8366 *retval = bytesread;
8367 }
8368 return (error);
8369 }
8370
8371 int
8372 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8373 {
8374 off_t offset;
8375 ssize_t bytesread;
8376 int error;
8377
8378 AUDIT_ARG(fd, uap->fd);
8379 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8380
8381 if (error == 0) {
8382 *retval = bytesread;
8383 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8384 }
8385 return (error);
8386 }
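/*
 * Editor's illustration (not part of this file): from userland these entry
 * points are normally reached through opendir(3)/readdir(3), whose Libc
 * implementation fills its directory buffer via the getdirentries64 path
 * above.  A minimal sketch; the helper name is hypothetical.
 */
#include <dirent.h>
#include <stdio.h>

static int
list_directory(const char *path)
{
	DIR *dirp = opendir(path);
	struct dirent *dp;

	if (dirp == NULL)
		return -1;
	while ((dp = readdir(dirp)) != NULL)
		printf("%llu\t%s\n", (unsigned long long)dp->d_ino, dp->d_name);
	return closedir(dirp);
}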
8387
8388
8389 /*
8390 * Set the mode mask for creation of filesystem nodes.
8391 * XXX implement xsecurity
8392 */
8393 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8394 static int
8395 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8396 {
8397 struct filedesc *fdp;
8398
8399 AUDIT_ARG(mask, newmask);
8400 proc_fdlock(p);
8401 fdp = p->p_fd;
8402 *retval = fdp->fd_cmask;
8403 fdp->fd_cmask = newmask & ALLPERMS;
8404 proc_fdunlock(p);
8405 return (0);
8406 }
8407
8408 /*
8409 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8410 *
8411 * Parameters: p Process requesting to set the umask
8412 * uap User argument descriptor (see below)
8413 * retval umask of the process (parameter p)
8414 *
8415 * Indirect: uap->newmask umask to set
8416 * uap->xsecurity ACL to set
8417 *
8418 * Returns: 0 Success
8419 * !0 Not success
8420 *
8421 */
8422 int
8423 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8424 {
8425 int ciferror;
8426 kauth_filesec_t xsecdst;
8427
8428 xsecdst = KAUTH_FILESEC_NONE;
8429 if (uap->xsecurity != USER_ADDR_NULL) {
8430 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8431 return ciferror;
8432 } else {
8433 xsecdst = KAUTH_FILESEC_NONE;
8434 }
8435
8436 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8437
8438 if (xsecdst != KAUTH_FILESEC_NONE)
8439 kauth_filesec_free(xsecdst);
8440 return ciferror;
8441 }
8442
8443 int
8444 umask(proc_t p, struct umask_args *uap, int32_t *retval)
8445 {
8446 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8447 }
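/*
 * Editor's illustration (not part of this file): the effect of the cmask set
 * above on subsequent file creation.  A minimal userland sketch; the path and
 * helper name are hypothetical.
 */
#include <fcntl.h>
#include <sys/stat.h>

static int
create_masked(const char *path)
{
	mode_t old = umask(S_IWGRP | S_IWOTH);		/* i.e. 022 */

	/* The requested 0666 is filtered through the process cmask to 0644. */
	int fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0666);

	(void)umask(old);				/* restore the previous mask */
	return fd;
}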
8448
8449 /*
8450 * Void all references to file by ripping underlying filesystem
8451 * away from vnode.
8452 */
8453 /* ARGSUSED */
8454 int
8455 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
8456 {
8457 vnode_t vp;
8458 struct vnode_attr va;
8459 vfs_context_t ctx = vfs_context_current();
8460 int error;
8461 struct nameidata nd;
8462
8463 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
8464 uap->path, ctx);
8465 error = namei(&nd);
8466 if (error)
8467 return (error);
8468 vp = nd.ni_vp;
8469
8470 nameidone(&nd);
8471
8472 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
8473 error = ENOTSUP;
8474 goto out;
8475 }
8476
8477 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
8478 error = EBUSY;
8479 goto out;
8480 }
8481
8482 #if CONFIG_MACF
8483 error = mac_vnode_check_revoke(ctx, vp);
8484 if (error)
8485 goto out;
8486 #endif
8487
8488 VATTR_INIT(&va);
8489 VATTR_WANTED(&va, va_uid);
8490 if ((error = vnode_getattr(vp, &va, ctx)))
8491 goto out;
8492 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
8493 (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
8494 goto out;
8495 if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
8496 VNOP_REVOKE(vp, REVOKEALL, ctx);
8497 out:
8498 vnode_put(vp);
8499 return (error);
8500 }
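/*
 * Editor's illustration (not part of this file): revoke(2) is typically used
 * on terminal devices to force-close every descriptor referring to them,
 * matching the character/block-device check above.  The device path is a
 * hypothetical example; the wrapper is assumed to be the one declared in
 * <unistd.h>.
 */
#include <stdio.h>
#include <unistd.h>

static void
revoke_tty(void)
{
	if (revoke("/dev/ttys001") == -1)
		perror("revoke");	/* e.g. EPERM unless owner or superuser */
}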
8501
8502
8503 /*
8504 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
8505 * The following system calls are designed to support features
8506 * which are specific to the HFS & HFS Plus volume formats
8507 */
8508
8509
8510 /*
8511 * Obtain attribute information on objects in a directory while enumerating
8512 * the directory.
8513 */
8514 /* ARGSUSED */
8515 int
8516 getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
8517 {
8518 vnode_t vp;
8519 struct fileproc *fp;
8520 uio_t auio = NULL;
8521 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8522 uint32_t count, savecount;
8523 uint32_t newstate;
8524 int error, eofflag;
8525 uint32_t loff;
8526 struct attrlist attributelist;
8527 vfs_context_t ctx = vfs_context_current();
8528 int fd = uap->fd;
8529 char uio_buf[ UIO_SIZEOF(1) ];
8530 kauth_action_t action;
8531
8532 AUDIT_ARG(fd, fd);
8533
8534 /* Get the attributes into kernel space */
8535 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
8536 return(error);
8537 }
8538 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
8539 return(error);
8540 }
8541 savecount = count;
8542 if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
8543 return (error);
8544 }
8545 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8546 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8547 error = EBADF;
8548 goto out;
8549 }
8550
8551
8552 #if CONFIG_MACF
8553 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
8554 fp->f_fglob);
8555 if (error)
8556 goto out;
8557 #endif
8558
8559
8560 if ( (error = vnode_getwithref(vp)) )
8561 goto out;
8562
8563 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8564
8565 unionread:
8566 if (vp->v_type != VDIR) {
8567 (void)vnode_put(vp);
8568 error = EINVAL;
8569 goto out;
8570 }
8571
8572 #if CONFIG_MACF
8573 error = mac_vnode_check_readdir(ctx, vp);
8574 if (error != 0) {
8575 (void)vnode_put(vp);
8576 goto out;
8577 }
8578 #endif /* MAC */
8579
8580 /* set up the uio structure which will contain the users return buffer */
8581 loff = fp->f_fglob->fg_offset;
8582 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8583 uio_addiov(auio, uap->buffer, uap->buffersize);
8584
8585 /*
8586 * If the only item requested is file names, we can let that past with
8587 * just LIST_DIRECTORY. If they want any other attributes, that means
8588 * they need SEARCH as well.
8589 */
8590 action = KAUTH_VNODE_LIST_DIRECTORY;
8591 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
8592 attributelist.fileattr || attributelist.dirattr)
8593 action |= KAUTH_VNODE_SEARCH;
8594
8595 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
8596
8597 /* Believe it or not, uap->options only has 32 bits of valid
8598 * info, so truncate before extending again */
8599
8600 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
8601 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
8602 }
8603
8604 if (error) {
8605 (void) vnode_put(vp);
8606 goto out;
8607 }
8608
8609 /*
8610 * If we've got the last entry of a directory in a union mount
8611 * then reset the eofflag and pretend there's still more to come.
8612 * The next call will again set eofflag and the buffer will be empty,
8613 * so traverse to the underlying directory and do the directory
8614 * read there.
8615 */
8616 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
8617 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
8618 eofflag = 0;
8619 } else { // Empty buffer
8620 struct vnode *tvp = vp;
8621 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
8622 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
8623 fp->f_fglob->fg_data = (caddr_t) vp;
8624 fp->f_fglob->fg_offset = 0; // reset index for new dir
8625 count = savecount;
8626 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
8627 vnode_put(tvp);
8628 goto unionread;
8629 }
8630 vp = tvp;
8631 }
8632 }
8633
8634 (void)vnode_put(vp);
8635
8636 if (error)
8637 goto out;
8638 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
8639
8640 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8641 goto out;
8642 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8643 goto out;
8644 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8645 goto out;
8646
8647 *retval = eofflag; /* similar to getdirentries */
8648 error = 0;
8649 out:
8650 file_drop(fd);
8651 return (error); /* return error earlier, a retval of 0 or 1 now */
8652
8653 } /* end of getdirentriesattr system call */
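/*
 * Editor's illustration (not part of this file): the kind of attribute list a
 * caller copies in for this path.  getdirentriesattr() is deprecated in
 * userland, so this sketch uses the getattrlistbulk(2) replacement with the
 * same attrlist setup, assuming its documented signature; the packed result
 * buffer is left unparsed for brevity.
 */
#include <string.h>
#include <sys/attr.h>
#include <unistd.h>

static int
count_entries(int dirfd)
{
	struct attrlist al;
	char attrbuf[32 * 1024];
	int total = 0, n;

	memset(&al, 0, sizeof(al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.commonattr  = ATTR_CMN_RETURNED_ATTRS | ATTR_CMN_NAME | ATTR_CMN_OBJTYPE;

	/*
	 * Each call returns the number of packed entries placed in attrbuf,
	 * 0 at end-of-directory, or -1 on error.
	 */
	while ((n = getattrlistbulk(dirfd, &al, attrbuf, sizeof(attrbuf), 0)) > 0)
		total += n;

	return (n < 0) ? -1 : total;
}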
8654
8655 /*
8656 * Exchange data between two files
8657 */
8658
8659 /* ARGSUSED */
8660 int
8661 exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8662 {
8663
8664 struct nameidata fnd, snd;
8665 vfs_context_t ctx = vfs_context_current();
8666 vnode_t fvp;
8667 vnode_t svp;
8668 int error;
8669 u_int32_t nameiflags;
8670 char *fpath = NULL;
8671 char *spath = NULL;
8672 int flen=0, slen=0;
8673 int from_truncated=0, to_truncated=0;
8674 #if CONFIG_FSE
8675 fse_info f_finfo, s_finfo;
8676 #endif
8677
8678 nameiflags = 0;
8679 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8680
8681 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8682 UIO_USERSPACE, uap->path1, ctx);
8683
8684 error = namei(&fnd);
8685 if (error)
8686 goto out2;
8687
8688 nameidone(&fnd);
8689 fvp = fnd.ni_vp;
8690
8691 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8692 UIO_USERSPACE, uap->path2, ctx);
8693
8694 error = namei(&snd);
8695 if (error) {
8696 vnode_put(fvp);
8697 goto out2;
8698 }
8699 nameidone(&snd);
8700 svp = snd.ni_vp;
8701
8702 /*
8703 * if the files are the same, return an inval error
8704 */
8705 if (svp == fvp) {
8706 error = EINVAL;
8707 goto out;
8708 }
8709
8710 /*
8711 * if the files are on different volumes, return an error
8712 */
8713 if (svp->v_mount != fvp->v_mount) {
8714 error = EXDEV;
8715 goto out;
8716 }
8717
8718 /* If they're not files, return an error */
8719 if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8720 error = EINVAL;
8721 goto out;
8722 }
8723
8724 #if CONFIG_MACF
8725 error = mac_vnode_check_exchangedata(ctx,
8726 fvp, svp);
8727 if (error)
8728 goto out;
8729 #endif
8730 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8731 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8732 goto out;
8733
8734 if (
8735 #if CONFIG_FSE
8736 need_fsevent(FSE_EXCHANGE, fvp) ||
8737 #endif
8738 kauth_authorize_fileop_has_listeners()) {
8739 GET_PATH(fpath);
8740 GET_PATH(spath);
8741 if (fpath == NULL || spath == NULL) {
8742 error = ENOMEM;
8743 goto out;
8744 }
8745
8746 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8747 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8748
8749 #if CONFIG_FSE
8750 get_fse_info(fvp, &f_finfo, ctx);
8751 get_fse_info(svp, &s_finfo, ctx);
8752 if (from_truncated || to_truncated) {
8753 // set it here since only the f_finfo gets reported up to user space
8754 f_finfo.mode |= FSE_TRUNCATED_PATH;
8755 }
8756 #endif
8757 }
8758 /* Ok, make the call */
8759 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8760
8761 if (error == 0) {
8762 const char *tmpname;
8763
8764 if (fpath != NULL && spath != NULL) {
8765 /* call out to allow 3rd party notification of exchangedata.
8766 * Ignore result of kauth_authorize_fileop call.
8767 */
8768 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8769 (uintptr_t)fpath, (uintptr_t)spath);
8770 }
8771 name_cache_lock();
8772
8773 tmpname = fvp->v_name;
8774 fvp->v_name = svp->v_name;
8775 svp->v_name = tmpname;
8776
8777 if (fvp->v_parent != svp->v_parent) {
8778 vnode_t tmp;
8779
8780 tmp = fvp->v_parent;
8781 fvp->v_parent = svp->v_parent;
8782 svp->v_parent = tmp;
8783 }
8784 name_cache_unlock();
8785
8786 #if CONFIG_FSE
8787 if (fpath != NULL && spath != NULL) {
8788 add_fsevent(FSE_EXCHANGE, ctx,
8789 FSE_ARG_STRING, flen, fpath,
8790 FSE_ARG_FINFO, &f_finfo,
8791 FSE_ARG_STRING, slen, spath,
8792 FSE_ARG_FINFO, &s_finfo,
8793 FSE_ARG_DONE);
8794 }
8795 #endif
8796 }
8797
8798 out:
8799 if (fpath != NULL)
8800 RELEASE_PATH(fpath);
8801 if (spath != NULL)
8802 RELEASE_PATH(spath);
8803 vnode_put(svp);
8804 vnode_put(fvp);
8805 out2:
8806 return (error);
8807 }
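/*
 * Editor's illustration (not part of this file): the userland wrapper for the
 * call above, assuming the exchangedata(2) declaration in <unistd.h>.  The two
 * paths must name regular files on the same volume, mirroring the EINVAL and
 * EXDEV checks above; the paths here are hypothetical.
 */
#include <stdio.h>
#include <sys/attr.h>	/* FSOPT_NOFOLLOW */
#include <unistd.h>

static void
swap_files(void)
{
	if (exchangedata("/tmp/a.dat", "/tmp/b.dat", FSOPT_NOFOLLOW) == -1)
		perror("exchangedata");
}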
8808
8809 /*
8810 * Return (in MB) the amount of freespace on the given vnode's volume.
8811 */
8812 uint32_t freespace_mb(vnode_t vp);
8813
8814 uint32_t
8815 freespace_mb(vnode_t vp)
8816 {
8817 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8818 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8819 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
8820 }
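/*
 * Editor's illustration (not part of this file): a userland analogue of
 * freespace_mb() above using statfs(2), i.e. the same available-blocks times
 * block-size product shifted down by 20 bits.  The helper name is
 * hypothetical.
 */
#include <stdint.h>
#include <sys/param.h>
#include <sys/mount.h>

static uint64_t
free_mb(const char *path)
{
	struct statfs sfs;

	if (statfs(path, &sfs) != 0)
		return 0;
	return ((uint64_t)sfs.f_bavail * sfs.f_bsize) >> 20;
}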
8821
8822 #if CONFIG_SEARCHFS
8823
8824 /* ARGSUSED */
8825
8826 int
8827 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8828 {
8829 vnode_t vp, tvp;
8830 int i, error=0;
8831 int fserror = 0;
8832 struct nameidata nd;
8833 struct user64_fssearchblock searchblock;
8834 struct searchstate *state;
8835 struct attrlist *returnattrs;
8836 struct timeval timelimit;
8837 void *searchparams1,*searchparams2;
8838 uio_t auio = NULL;
8839 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8840 uint32_t nummatches;
8841 int mallocsize;
8842 uint32_t nameiflags;
8843 vfs_context_t ctx = vfs_context_current();
8844 char uio_buf[ UIO_SIZEOF(1) ];
8845
8846 /* Start by copying in fsearchblock parameter list */
8847 if (IS_64BIT_PROCESS(p)) {
8848 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8849 timelimit.tv_sec = searchblock.timelimit.tv_sec;
8850 timelimit.tv_usec = searchblock.timelimit.tv_usec;
8851 }
8852 else {
8853 struct user32_fssearchblock tmp_searchblock;
8854
8855 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8856 // munge into 64-bit version
8857 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8858 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8859 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8860 searchblock.maxmatches = tmp_searchblock.maxmatches;
8861 /*
8862 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8863 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8864 */
8865 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8866 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8867 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8868 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8869 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8870 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8871 searchblock.searchattrs = tmp_searchblock.searchattrs;
8872 }
8873 if (error)
8874 return(error);
8875
8876 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8877 */
8878 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8879 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8880 return(EINVAL);
8881
8882 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8883 /* It all has to go into local memory and it's not that big so we might as well put it all together. */
8884 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8885 /* block. */
8886 /* */
8887 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
8888 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
8889 /* assumes the size is still 556 bytes it will continue to work */
8890
8891 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8892 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8893
8894 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8895
8896 /* Now set up the various pointers to the correct place in our newly allocated memory */
8897
8898 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8899 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8900 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8901
8902 /* Now copy in the stuff given our local variables. */
8903
8904 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8905 goto freeandexit;
8906
8907 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8908 goto freeandexit;
8909
8910 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8911 goto freeandexit;
8912
8913 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8914 goto freeandexit;
8915
8916 /*
8917 * When searching a union mount, we need to set the
8918 * start flag at the first call on each layer to
8919 * reset state for the new volume.
8920 */
8921 if (uap->options & SRCHFS_START)
8922 state->ss_union_layer = 0;
8923 else
8924 uap->options |= state->ss_union_flags;
8925 state->ss_union_flags = 0;
8926
8927 /*
8928 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8929 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8930 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8931 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8932 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8933 */
8934
8935 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8936 attrreference_t* string_ref;
8937 u_int32_t* start_length;
8938 user64_size_t param_length;
8939
8940 /* validate searchparams1 */
8941 param_length = searchblock.sizeofsearchparams1;
8942 /* skip the word that specifies length of the buffer */
8943 start_length= (u_int32_t*) searchparams1;
8944 start_length= start_length+1;
8945 string_ref= (attrreference_t*) start_length;
8946
8947 /* ensure no negative offsets or too big offsets */
8948 if (string_ref->attr_dataoffset < 0 ) {
8949 error = EINVAL;
8950 goto freeandexit;
8951 }
8952 if (string_ref->attr_length > MAXPATHLEN) {
8953 error = EINVAL;
8954 goto freeandexit;
8955 }
8956
8957 /* Check for pointer overflow in the string ref */
8958 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8959 error = EINVAL;
8960 goto freeandexit;
8961 }
8962
8963 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8964 error = EINVAL;
8965 goto freeandexit;
8966 }
8967 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8968 error = EINVAL;
8969 goto freeandexit;
8970 }
8971 }
8972
8973 /* set up the uio structure which will contain the users return buffer */
8974 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8975 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8976
8977 nameiflags = 0;
8978 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8979 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8980 UIO_USERSPACE, uap->path, ctx);
8981
8982 error = namei(&nd);
8983 if (error)
8984 goto freeandexit;
8985 vp = nd.ni_vp;
8986 nameidone(&nd);
8987
8988 /*
8989 * Switch to the root vnode for the volume
8990 */
8991 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8992 vnode_put(vp);
8993 if (error)
8994 goto freeandexit;
8995 vp = tvp;
8996
8997 /*
8998 * If it's a union mount, the path lookup takes
8999 * us to the top layer. But we may need to descend
9000 * to a lower layer. For non-union mounts the layer
9001 * is always zero.
9002 */
9003 for (i = 0; i < (int) state->ss_union_layer; i++) {
9004 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
9005 break;
9006 tvp = vp;
9007 vp = vp->v_mount->mnt_vnodecovered;
9008 if (vp == NULL) {
9009 vnode_put(tvp);
9010 error = ENOENT;
9011 goto freeandexit;
9012 }
9013 error = vnode_getwithref(vp);
9014 vnode_put(tvp);
9015 if (error)
9016 goto freeandexit;
9017 }
9018
9019 #if CONFIG_MACF
9020 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
9021 if (error) {
9022 vnode_put(vp);
9023 goto freeandexit;
9024 }
9025 #endif
9026
9027
9028 /*
9029 * If searchblock.maxmatches == 0, then skip the search. This has happened
9030 * before and sometimes the underlying code doesn't deal with it well.
9031 */
9032 if (searchblock.maxmatches == 0) {
9033 nummatches = 0;
9034 goto saveandexit;
9035 }
9036
9037 /*
9038 * All right, we have everything we need, so let's make that call.
9039 *
9040 * We keep special track of the return value from the file system:
9041 * EAGAIN is an acceptable error condition that shouldn't keep us
9042 * from copying out any results...
9043 */
9044
9045 fserror = VNOP_SEARCHFS(vp,
9046 searchparams1,
9047 searchparams2,
9048 &searchblock.searchattrs,
9049 (u_long)searchblock.maxmatches,
9050 &timelimit,
9051 returnattrs,
9052 &nummatches,
9053 (u_long)uap->scriptcode,
9054 (u_long)uap->options,
9055 auio,
9056 (struct searchstate *) &state->ss_fsstate,
9057 ctx);
9058
9059 /*
9060 * If it's a union mount we need to be called again
9061 * to search the mounted-on filesystem.
9062 */
9063 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
9064 state->ss_union_flags = SRCHFS_START;
9065 state->ss_union_layer++; // search next layer down
9066 fserror = EAGAIN;
9067 }
9068
9069 saveandexit:
9070
9071 vnode_put(vp);
9072
9073 /* Now copy out the stuff that needs copying out: the number of matches and the
9074 search state. Everything else was already put into the return buffer by the VNOP call. */
9075
9076 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
9077 goto freeandexit;
9078
9079 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
9080 goto freeandexit;
9081
9082 error = fserror;
9083
9084 freeandexit:
9085
9086 FREE(searchparams1,M_TEMP);
9087
9088 return(error);
9089
9090
9091 } /* end of searchfs system call */
9092
9093 #else /* CONFIG_SEARCHFS */
9094
9095 int
9096 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9097 {
9098 return (ENOTSUP);
9099 }
9100
9101 #endif /* CONFIG_SEARCHFS */
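/*
 * Editor's illustration (not part of this file): a hedged userland sketch of
 * driving searchfs(2) to count items matching a name.  The wrapper signature
 * follows the searchfs(2) man page, and the packed layout of the search
 * parameters (a length word, an attrreference_t, then the string) follows the
 * validation code above.  Buffer sizes, the helper name and the decision not
 * to parse the returned attribute buffer are assumptions made for brevity.
 */
#include <errno.h>
#include <string.h>
#include <sys/attr.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

struct packed_name {
	u_int32_t	size;		/* length word skipped by the kernel */
	attrreference_t	ref;		/* offset/length of the name below */
	char		name[256];
};

static int
count_matches_by_name(const char *volpath, const char *name, unsigned long *nmatches)
{
	struct fssearchblock	sb;
	struct searchstate	ss;
	struct attrlist		returnattrs;
	struct packed_name	lower, upper;
	char			resultbuf[16 * 1024];
	unsigned int		options;
	int			err;

	memset(&sb, 0, sizeof(sb));
	memset(&ss, 0, sizeof(ss));
	memset(&returnattrs, 0, sizeof(returnattrs));
	memset(&lower, 0, sizeof(lower));
	memset(&upper, 0, sizeof(upper));

	/* Return the name of each match (results are not parsed here). */
	returnattrs.bitmapcount = ATTR_BIT_MAP_COUNT;
	returnattrs.commonattr  = ATTR_CMN_NAME;

	/* Match on name only; searchparams2 carries an empty reference. */
	lower.size = sizeof(lower) - sizeof(lower.size);
	lower.ref.attr_dataoffset = sizeof(attrreference_t);
	lower.ref.attr_length = (u_int32_t)strlen(name) + 1;
	strlcpy(lower.name, name, sizeof(lower.name));

	upper.size = sizeof(upper) - sizeof(upper.size);
	upper.ref.attr_dataoffset = sizeof(attrreference_t);
	upper.ref.attr_length = 0;

	sb.searchattrs.bitmapcount = ATTR_BIT_MAP_COUNT;
	sb.searchattrs.commonattr  = ATTR_CMN_NAME;
	sb.returnattrs = &returnattrs;
	sb.returnbuffer = resultbuf;
	sb.returnbuffersize = sizeof(resultbuf);
	sb.maxmatches = 100;
	sb.timelimit.tv_sec = 1;
	sb.timelimit.tv_usec = 0;
	sb.searchparams1 = &lower;
	sb.sizeofsearchparams1 = sizeof(lower);
	sb.searchparams2 = &upper;
	sb.sizeofsearchparams2 = sizeof(upper);

	*nmatches = 0;
	options = SRCHFS_START | SRCHFS_MATCHFILES | SRCHFS_MATCHDIRS;
	do {
		unsigned long n = 0;

		err = searchfs(volpath, &sb, &n, 0, options, &ss);
		*nmatches += n;
		options &= ~SRCHFS_START;	/* continue where we left off */
	} while (err == -1 && errno == EAGAIN);

	return (err == -1) ? errno : 0;
}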
9102
9103
9104 lck_grp_attr_t * nspace_group_attr;
9105 lck_attr_t * nspace_lock_attr;
9106 lck_grp_t * nspace_mutex_group;
9107
9108 lck_mtx_t nspace_handler_lock;
9109 lck_mtx_t nspace_handler_exclusion_lock;
9110
9111 time_t snapshot_timestamp=0;
9112 int nspace_allow_virtual_devs=0;
9113
9114 void nspace_handler_init(void);
9115
9116 typedef struct nspace_item_info {
9117 struct vnode *vp;
9118 void *arg;
9119 uint64_t op;
9120 uint32_t vid;
9121 uint32_t flags;
9122 uint32_t token;
9123 uint32_t refcount;
9124 } nspace_item_info;
9125
9126 #define MAX_NSPACE_ITEMS 128
9127 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
9128 uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address
9129 uint32_t nspace_token_id=0;
9130 uint32_t nspace_handler_timeout = 15; // seconds
9131
9132 #define NSPACE_ITEM_NEW 0x0001
9133 #define NSPACE_ITEM_PROCESSING 0x0002
9134 #define NSPACE_ITEM_DEAD 0x0004
9135 #define NSPACE_ITEM_CANCELLED 0x0008
9136 #define NSPACE_ITEM_DONE 0x0010
9137 #define NSPACE_ITEM_RESET_TIMER 0x0020
9138
9139 #define NSPACE_ITEM_NSPACE_EVENT 0x0040
9140 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
9141
9142 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
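/*
 * Item lifecycle, as implemented by resolve_nspace_item_ext() and
 * wait_for_namespace_event() below: a waiting thread claims a slot, marks it
 * NEW and sleeps; the handler daemon moves it to PROCESSING and is handed a
 * file descriptor for the vnode; a later fsctl (UNBLOCK or CANCEL) marks it
 * DONE or CANCELLED and wakes the waiter; RESET_TIMER (set via the UPDATE
 * fsctl) makes a timed-out waiter go back to sleep for another interval.
 */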
9143
9144 //#pragma optimization_level 0
9145
9146 typedef enum {
9147 NSPACE_HANDLER_NSPACE = 0,
9148 NSPACE_HANDLER_SNAPSHOT = 1,
9149
9150 NSPACE_HANDLER_COUNT,
9151 } nspace_type_t;
9152
9153 typedef struct {
9154 uint64_t handler_tid;
9155 struct proc *handler_proc;
9156 int handler_busy;
9157 } nspace_handler_t;
9158
9159 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
9160
9161 /* namespace fsctl functions */
9162 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
9163 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
9164 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
9165 static nspace_type_t nspace_type_for_op(uint64_t op);
9166 static int nspace_is_special_process(struct proc *proc);
9167 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
9168 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
9169 static int validate_namespace_args (int is64bit, int size);
9170 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9171
9172
9173 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9174 {
9175 switch(nspace_type) {
9176 case NSPACE_HANDLER_NSPACE:
9177 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9178 case NSPACE_HANDLER_SNAPSHOT:
9179 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9180 default:
9181 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9182 return 0;
9183 }
9184 }
9185
9186 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9187 {
9188 switch(nspace_type) {
9189 case NSPACE_HANDLER_NSPACE:
9190 return NSPACE_ITEM_NSPACE_EVENT;
9191 case NSPACE_HANDLER_SNAPSHOT:
9192 return NSPACE_ITEM_SNAPSHOT_EVENT;
9193 default:
9194 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9195 return 0;
9196 }
9197 }
9198
9199 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9200 {
9201 switch(nspace_type) {
9202 case NSPACE_HANDLER_NSPACE:
9203 return FREAD | FWRITE | O_EVTONLY;
9204 case NSPACE_HANDLER_SNAPSHOT:
9205 return FREAD | O_EVTONLY;
9206 default:
9207 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9208 return 0;
9209 }
9210 }
9211
9212 static inline nspace_type_t nspace_type_for_op(uint64_t op)
9213 {
9214 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9215 case NAMESPACE_HANDLER_NSPACE_EVENT:
9216 return NSPACE_HANDLER_NSPACE;
9217 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9218 return NSPACE_HANDLER_SNAPSHOT;
9219 default:
9220 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9221 return NSPACE_HANDLER_NSPACE;
9222 }
9223 }
9224
9225 static inline int nspace_is_special_process(struct proc *proc)
9226 {
9227 int i;
9228 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9229 if (proc == nspace_handlers[i].handler_proc)
9230 return 1;
9231 }
9232 return 0;
9233 }
9234
9235 void
9236 nspace_handler_init(void)
9237 {
9238 nspace_lock_attr = lck_attr_alloc_init();
9239 nspace_group_attr = lck_grp_attr_alloc_init();
9240 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9241 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9242 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9243 memset(&nspace_items[0], 0, sizeof(nspace_items));
9244 }
9245
9246 void
9247 nspace_proc_exit(struct proc *p)
9248 {
9249 int i, event_mask = 0;
9250
9251 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9252 if (p == nspace_handlers[i].handler_proc) {
9253 event_mask |= nspace_item_flags_for_type(i);
9254 nspace_handlers[i].handler_tid = 0;
9255 nspace_handlers[i].handler_proc = NULL;
9256 }
9257 }
9258
9259 if (event_mask == 0) {
9260 return;
9261 }
9262
9263 lck_mtx_lock(&nspace_handler_lock);
9264 if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
9265 // if this process was the snapshot handler, zero snapshot_timestamp
9266 snapshot_timestamp = 0;
9267 }
9268
9269 //
9270 // unblock anyone that's waiting for the handler that died
9271 //
9272 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9273 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
9274
9275 if ( nspace_items[i].flags & event_mask ) {
9276
9277 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9278 vnode_lock_spin(nspace_items[i].vp);
9279 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9280 vnode_unlock(nspace_items[i].vp);
9281 }
9282 nspace_items[i].vp = NULL;
9283 nspace_items[i].vid = 0;
9284 nspace_items[i].flags = NSPACE_ITEM_DONE;
9285 nspace_items[i].token = 0;
9286
9287 wakeup((caddr_t)&(nspace_items[i].vp));
9288 }
9289 }
9290 }
9291
9292 wakeup((caddr_t)&nspace_item_idx);
9293 lck_mtx_unlock(&nspace_handler_lock);
9294 }
9295
9296
9297 int
9298 resolve_nspace_item(struct vnode *vp, uint64_t op)
9299 {
9300 return resolve_nspace_item_ext(vp, op, NULL);
9301 }
9302
9303 int
9304 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
9305 {
9306 int i, error, keep_waiting;
9307 struct timespec ts;
9308 nspace_type_t nspace_type = nspace_type_for_op(op);
9309
9310 // only allow namespace events on regular files, directories and symlinks.
9311 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
9312 return 0;
9313 }
9314
9315 //
9316 // if this is a snapshot event and the vnode is on a
9317 // disk image just pretend nothing happened since any
9318 // change to the disk image will cause the disk image
9319 // itself to get backed up and this avoids multi-way
9320 // deadlocks between the snapshot handler and the ever
9321 // popular diskimages-helper process. the variable
9322 // nspace_allow_virtual_devs allows this behavior to
9323 // be overridden (for use by the Mobile TimeMachine
9324 // testing infrastructure which uses disk images)
9325 //
9326 if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
9327 && (vp->v_mount != NULL)
9328 && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
9329 && !nspace_allow_virtual_devs) {
9330
9331 return 0;
9332 }
9333
9334 // if (thread_tid(current_thread()) == namespace_handler_tid) {
9335 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9336 return 0;
9337 }
9338
9339 if (nspace_is_special_process(current_proc())) {
9340 return EDEADLK;
9341 }
9342
9343 lck_mtx_lock(&nspace_handler_lock);
9344
9345 retry:
9346 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9347 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
9348 break;
9349 }
9350 }
9351
9352 if (i >= MAX_NSPACE_ITEMS) {
9353 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9354 if (nspace_items[i].flags == 0) {
9355 break;
9356 }
9357 }
9358 } else {
9359 nspace_items[i].refcount++;
9360 }
9361
9362 if (i >= MAX_NSPACE_ITEMS) {
9363 ts.tv_sec = nspace_handler_timeout;
9364 ts.tv_nsec = 0;
9365
9366 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
9367 if (error == 0) {
9368 // an entry got freed up, go see if we can get a slot
9369 goto retry;
9370 } else {
9371 lck_mtx_unlock(&nspace_handler_lock);
9372 return error;
9373 }
9374 }
9375
9376 //
9377 // if it didn't already exist, add it. if it did exist
9378 // we'll get woken up when someone does a wakeup() on
9379 // the slot in the nspace_items table.
9380 //
9381 if (vp != nspace_items[i].vp) {
9382 nspace_items[i].vp = vp;
9383 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user
9384 nspace_items[i].op = op;
9385 nspace_items[i].vid = vnode_vid(vp);
9386 nspace_items[i].flags = NSPACE_ITEM_NEW;
9387 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
9388 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
9389 if (arg) {
9390 vnode_lock_spin(vp);
9391 vp->v_flag |= VNEEDSSNAPSHOT;
9392 vnode_unlock(vp);
9393 }
9394 }
9395
9396 nspace_items[i].token = 0;
9397 nspace_items[i].refcount = 1;
9398
9399 wakeup((caddr_t)&nspace_item_idx);
9400 }
9401
9402 //
9403 // Now go to sleep until the handler does a wakeup on this
9404 // slot in the nspace_items table (or we timeout).
9405 //
9406 keep_waiting = 1;
9407 while(keep_waiting) {
9408 ts.tv_sec = nspace_handler_timeout;
9409 ts.tv_nsec = 0;
9410 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
9411
9412 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
9413 error = 0;
9414 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
9415 error = nspace_items[i].token;
9416 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
9417 if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
9418 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
9419 continue;
9420 } else {
9421 error = ETIMEDOUT;
9422 }
9423 } else if (error == 0) {
9424 // hmmm, why did we get woken up?
9425 printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
9426 nspace_items[i].token);
9427 }
9428
9429 if (--nspace_items[i].refcount == 0) {
9430 nspace_items[i].vp = NULL; // clear this so that no one will match on it again
9431 nspace_items[i].arg = NULL;
9432 nspace_items[i].token = 0; // clear this so that the handler will not find it anymore
9433 nspace_items[i].flags = 0; // this clears it for re-use
9434 }
9435 wakeup(&nspace_token_id);
9436 keep_waiting = 0;
9437 }
9438
9439 lck_mtx_unlock(&nspace_handler_lock);
9440
9441 return error;
9442 }
9443
9444 int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9445 {
9446 int snapshot_error = 0;
9447
9448 if (vp == NULL) {
9449 return 0;
9450 }
9451
9452 /* Swap files are special; skip them */
9453 if (vnode_isswap(vp)) {
9454 return 0;
9455 }
9456
9457 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9458 // the change time is within this epoch
9459 int error;
9460
9461 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9462 if (error == EDEADLK) {
9463 snapshot_error = 0;
9464 } else if (error) {
9465 if (error == EAGAIN) {
9466 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9467 } else if (error == EINTR) {
9468 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9469 snapshot_error = EINTR;
9470 }
9471 }
9472 }
9473
9474 return snapshot_error;
9475 }
9476
9477 int
9478 get_nspace_item_status(struct vnode *vp, int32_t *status)
9479 {
9480 int i;
9481
9482 lck_mtx_lock(&nspace_handler_lock);
9483 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9484 if (nspace_items[i].vp == vp) {
9485 break;
9486 }
9487 }
9488
9489 if (i >= MAX_NSPACE_ITEMS) {
9490 lck_mtx_unlock(&nspace_handler_lock);
9491 return ENOENT;
9492 }
9493
9494 *status = nspace_items[i].flags;
9495 lck_mtx_unlock(&nspace_handler_lock);
9496 return 0;
9497 }
9498
9499
9500 #if 0
9501 static int
9502 build_volfs_path(struct vnode *vp, char *path, int *len)
9503 {
9504 struct vnode_attr va;
9505 int ret;
9506
9507 VATTR_INIT(&va);
9508 VATTR_WANTED(&va, va_fsid);
9509 VATTR_WANTED(&va, va_fileid);
9510
9511 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
9512 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
9513 ret = -1;
9514 } else {
9515 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
9516 ret = 0;
9517 }
9518
9519 return ret;
9520 }
9521 #endif
9522
9523 //
9524 // Note: this function does NOT check permissions on all of the
9525 // parent directories leading to this vnode. It should only be
9526 // called on behalf of a root process. Otherwise a process may
9527 // get access to a file because the file itself is readable even
9528 // though its parent directories would prevent access.
9529 //
9530 static int
9531 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
9532 {
9533 int error, action;
9534
9535 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9536 return error;
9537 }
9538
9539 #if CONFIG_MACF
9540 error = mac_vnode_check_open(ctx, vp, fmode);
9541 if (error)
9542 return error;
9543 #endif
9544
9545 /* compute action to be authorized */
9546 action = 0;
9547 if (fmode & FREAD) {
9548 action |= KAUTH_VNODE_READ_DATA;
9549 }
9550 if (fmode & (FWRITE | O_TRUNC)) {
9551 /*
9552 * If we are writing, appending, and not truncating,
9553 * indicate that we are appending so that if the
9554 * UF_APPEND or SF_APPEND bits are set, we do not deny
9555 * the open.
9556 */
9557 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
9558 action |= KAUTH_VNODE_APPEND_DATA;
9559 } else {
9560 action |= KAUTH_VNODE_WRITE_DATA;
9561 }
9562 }
9563
9564 if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
9565 return error;
9566
9567
9568 //
9569 // if the vnode is tagged VOPENEVT and the current process
9570 // has the P_CHECKOPENEVT flag set, then we OR in the O_EVTONLY
9571 // flag to the open mode so that this open won't count against
9572 // the vnode when carbon delete() does a vnode_isinuse() to see
9573 // if a file is currently in use. this allows spotlight
9574 // importers to not interfere with carbon apps that depend on
9575 // the no-delete-if-busy semantics of carbon delete().
9576 //
9577 if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
9578 fmode |= O_EVTONLY;
9579 }
9580
9581 if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
9582 return error;
9583 }
9584 if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
9585 VNOP_CLOSE(vp, fmode, ctx);
9586 return error;
9587 }
9588
9589 /* Call out to allow 3rd party notification of open.
9590 * Ignore result of kauth_authorize_fileop call.
9591 */
9592 #if CONFIG_MACF
9593 mac_vnode_notify_open(ctx, vp, fmode);
9594 #endif
9595 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
9596 (uintptr_t)vp, 0);
9597
9598
9599 return 0;
9600 }
9601
9602 static int
9603 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
9604 {
9605 int i;
9606 int error = 0;
9607 int unblock = 0;
9608 task_t curtask;
9609
9610 lck_mtx_lock(&nspace_handler_exclusion_lock);
9611 if (nspace_handlers[nspace_type].handler_busy) {
9612 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9613 return EBUSY;
9614 }
9615
9616 nspace_handlers[nspace_type].handler_busy = 1;
9617 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9618
9619 /*
9620 * Any process that gets here will be one of the namespace handlers.
9621 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
9622 * as that can cause deadlocks, because the namespace handler may prevent
9623 * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE
9624 * process.
9625 */
9626 curtask = current_task();
9627 bsd_set_dependency_capable (curtask);
9628
9629 lck_mtx_lock(&nspace_handler_lock);
9630 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9631 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
9632 nspace_handlers[nspace_type].handler_proc = current_proc();
9633 }
9634
9635 if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9636 (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9637 error = EINVAL;
9638 }
9639
9640 while (error == 0) {
9641
9642 /* Try to find matching namespace item */
9643 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9644 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9645 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9646 break;
9647 }
9648 }
9649 }
9650
9651 if (i >= MAX_NSPACE_ITEMS) {
9652 /* Nothing is there yet. Wait for wake up and retry */
9653 error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9654 if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9655 /* Prevent infinite loop if snapshot handler exited */
9656 error = EINVAL;
9657 break;
9658 }
9659 continue;
9660 }
9661
9662 nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
9663 nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
9664 nspace_items[i].token = ++nspace_token_id;
9665
9666 assert(nspace_items[i].vp);
9667 struct fileproc *fp;
9668 int32_t indx;
9669 int32_t fmode;
9670 struct proc *p = current_proc();
9671 vfs_context_t ctx = vfs_context_current();
9672 struct vnode_attr va;
9673 bool vn_get_successful = false;
9674 bool vn_open_successful = false;
9675 bool fp_alloc_successful = false;
9676
9677 /*
9678 * Use vnode pointer to acquire a file descriptor for
9679 * hand-off to userland
9680 */
9681 fmode = nspace_open_flags_for_type(nspace_type);
9682 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9683 if (error) goto cleanup;
9684 vn_get_successful = true;
9685
9686 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9687 if (error) goto cleanup;
9688 vn_open_successful = true;
9689
9690 error = falloc(p, &fp, &indx, ctx);
9691 if (error) goto cleanup;
9692 fp_alloc_successful = true;
9693
9694 fp->f_fglob->fg_flag = fmode;
9695 fp->f_fglob->fg_ops = &vnops;
9696 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9697
9698 proc_fdlock(p);
9699 procfdtbl_releasefd(p, indx, NULL);
9700 fp_drop(p, indx, fp, 1);
9701 proc_fdunlock(p);
9702
9703 /*
9704 * All variants of the namespace handler struct support these three fields:
9705 * token, flags, and the FD pointer
9706 */
9707 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9708 if (error) goto cleanup;
9709 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9710 if (error) goto cleanup;
9711 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9712 if (error) goto cleanup;
9713
9714 /*
9715 * Handle optional fields:
9716 * the extended version supports an info ptr (offset, length), and the
9717 *
9718 * namedata version supports a unique per-link object ID
9719 *
9720 */
9721 if (nhd->infoptr) {
9722 uio_t uio = (uio_t)nspace_items[i].arg;
9723 uint64_t u_offset, u_length;
9724
9725 if (uio) {
9726 u_offset = uio_offset(uio);
9727 u_length = uio_resid(uio);
9728 } else {
9729 u_offset = 0;
9730 u_length = 0;
9731 }
9732 error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9733 if (error) goto cleanup;
9734 error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
9735 if (error) goto cleanup;
9736 }
9737
9738 if (nhd->objid) {
9739 VATTR_INIT(&va);
9740 VATTR_WANTED(&va, va_linkid);
9741 error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9742 if (error) goto cleanup;
9743
9744 uint64_t linkid = 0;
9745 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9746 linkid = (uint64_t)va.va_linkid;
9747 }
9748 error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
9749 }
9750 cleanup:
9751 if (error) {
9752 if (fp_alloc_successful) fp_free(p, indx, fp);
9753 if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
9754 unblock = 1;
9755 }
9756
9757 if (vn_get_successful) vnode_put(nspace_items[i].vp);
9758
9759 break;
9760 }
9761
9762 if (unblock) {
9763 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9764 vnode_lock_spin(nspace_items[i].vp);
9765 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9766 vnode_unlock(nspace_items[i].vp);
9767 }
9768 nspace_items[i].vp = NULL;
9769 nspace_items[i].vid = 0;
9770 nspace_items[i].flags = NSPACE_ITEM_DONE;
9771 nspace_items[i].token = 0;
9772
9773 wakeup((caddr_t)&(nspace_items[i].vp));
9774 }
9775
9776 if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9777 // just go through every snapshot event and unblock it immediately.
9778 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9779 for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
9780 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9781 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9782 nspace_items[i].vp = NULL;
9783 nspace_items[i].vid = 0;
9784 nspace_items[i].flags = NSPACE_ITEM_DONE;
9785 nspace_items[i].token = 0;
9786
9787 wakeup((caddr_t)&(nspace_items[i].vp));
9788 }
9789 }
9790 }
9791 }
9792 }
9793
9794 lck_mtx_unlock(&nspace_handler_lock);
9795
9796 lck_mtx_lock(&nspace_handler_exclusion_lock);
9797 nspace_handlers[nspace_type].handler_busy = 0;
9798 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9799
9800 return error;
9801 }
9802
9803 static inline int validate_namespace_args (int is64bit, int size) {
9804
9805 if (is64bit) {
9806 /* Must be one of these */
9807 if (size == sizeof(user64_namespace_handler_info)) {
9808 goto sizeok;
9809 }
9810 if (size == sizeof(user64_namespace_handler_info_ext)) {
9811 goto sizeok;
9812 }
9813 if (size == sizeof(user64_namespace_handler_data)) {
9814 goto sizeok;
9815 }
9816 return EINVAL;
9817 }
9818 else {
9819 /* 32 bit -- must be one of these */
9820 if (size == sizeof(user32_namespace_handler_info)) {
9821 goto sizeok;
9822 }
9823 if (size == sizeof(user32_namespace_handler_info_ext)) {
9824 goto sizeok;
9825 }
9826 if (size == sizeof(user32_namespace_handler_data)) {
9827 goto sizeok;
9828 }
9829 return EINVAL;
9830 }
9831
9832 sizeok:
9833
9834 return 0;
9835
9836 }
9837
9838 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9839 {
9840 int error = 0;
9841 namespace_handler_data nhd;
9842
9843 bzero (&nhd, sizeof(namespace_handler_data));
9844
9845 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9846 return error;
9847 }
9848
9849 error = validate_namespace_args (is64bit, size);
9850 if (error) {
9851 return error;
9852 }
9853
9854 /* Copy in the userland pointers into our kernel-only struct */
9855
9856 if (is64bit) {
9857 /* 64 bit userland structures */
9858 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9859 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9860 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
9861
9862 /* If the size is greater than the standard info struct, add in extra fields */
9863 if (size > (sizeof(user64_namespace_handler_info))) {
9864 if (size >= (sizeof(user64_namespace_handler_info_ext))) {
9865 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
9866 }
9867 if (size == (sizeof(user64_namespace_handler_data))) {
9868 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
9869 }
9870 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9871 }
9872 }
9873 else {
9874 /* 32 bit userland structures */
9875 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
9876 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
9877 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
9878
9879 if (size > (sizeof(user32_namespace_handler_info))) {
9880 if (size >= (sizeof(user32_namespace_handler_info_ext))) {
9881 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
9882 }
9883 if (size == (sizeof(user32_namespace_handler_data))) {
9884 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
9885 }
9886 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9887 }
9888 }
9889
9890 return wait_for_namespace_event(&nhd, nspace_type);
9891 }
9892
9893 /*
9894 * Make a filesystem-specific control call:
9895 */
9896 /* ARGSUSED */
9897 static int
9898 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
9899 {
9900 int error=0;
9901 boolean_t is64bit;
9902 u_int size;
9903 #define STK_PARAMS 128
9904 char stkbuf[STK_PARAMS] = {0};
9905 caddr_t data, memp;
9906 vnode_t vp = *arg_vp;
9907
9908 size = IOCPARM_LEN(cmd);
9909 if (size > IOCPARM_MAX) return (EINVAL);
9910
9911 is64bit = proc_is64bit(p);
9912
9913 memp = NULL;
9914
9915
9916 /*
9917 * ensure the buffer is large enough for underlying calls
9918 */
9919 #ifndef HFSIOC_GETPATH
9920 typedef char pn_t[MAXPATHLEN];
9921 #define HFSIOC_GETPATH _IOWR('h', 13, pn_t)
9922 #endif
9923
9924 #ifndef HFS_GETPATH
9925 #define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH)
9926 #endif
9927 if (IOCBASECMD(cmd) == HFS_GETPATH) {
9928 /* Round up to MAXPATHLEN regardless of user input */
9929 size = MAXPATHLEN;
9930 }
9931 else if (vp->v_tag == VT_CIFS) {
9932 /*
9933 * XXX Until fsctl's length encoding can be
9934 * XXX fixed properly.
9935 */
9936 if (IOCBASECMD(cmd) == _IOWR('z', 19, 0) && size < 1432) {
9937 size = 1432; /* sizeof(struct UniqueSMBShareID) */
9938 } else if (IOCBASECMD(cmd) == _IOWR('z', 28, 0) && size < 308) {
9939 size = 308; /* sizeof(struct smbDebugTestPB) */
9940 }
9941 }
9942
9943 if (size > sizeof (stkbuf)) {
9944 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
9945 data = memp;
9946 } else {
9947 data = &stkbuf[0];
9948 };
9949
9950 if (cmd & IOC_IN) {
9951 if (size) {
9952 error = copyin(udata, data, size);
9953 if (error) {
9954 if (memp) {
9955 kfree (memp, size);
9956 }
9957 return error;
9958 }
9959 } else {
9960 if (is64bit) {
9961 *(user_addr_t *)data = udata;
9962 }
9963 else {
9964 *(uint32_t *)data = (uint32_t)udata;
9965 }
9966 };
9967 } else if ((cmd & IOC_OUT) && size) {
9968 /*
9969 * Zero the buffer so the user always
9970 * gets back something deterministic.
9971 */
9972 bzero(data, size);
9973 } else if (cmd & IOC_VOID) {
9974 if (is64bit) {
9975 *(user_addr_t *)data = udata;
9976 }
9977 else {
9978 *(uint32_t *)data = (uint32_t)udata;
9979 }
9980 }
9981
9982 /* Check to see if it's a generic command */
9983 switch (IOCBASECMD(cmd)) {
9984
9985 case FSCTL_SYNC_VOLUME: {
9986 mount_t mp = vp->v_mount;
9987 int arg = *(uint32_t*)data;
9988
9989 /* record vid of vp so we can drop it below. */
9990 uint32_t vvid = vp->v_id;
9991
9992 /*
9993 * Then grab mount_iterref so that we can release the vnode.
9994 * Without this, a thread may call vnode_iterate_prepare then
9995 * get into a deadlock because we've never released the root vp
9996 */
9997 error = mount_iterref (mp, 0);
9998 if (error) {
9999 break;
10000 }
10001 vnode_put(vp);
10002
10003 /* issue the sync for this volume */
10004 (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
10005
10006 /*
10007 * Then release the mount_iterref once we're done syncing; it's not
10008 * needed for the VNOP_IOCTL below
10009 */
10010 mount_iterdrop(mp);
10011
10012 if (arg & FSCTL_SYNC_FULLSYNC) {
10013 /* re-obtain vnode iocount on the root vp, if possible */
10014 error = vnode_getwithvid (vp, vvid);
10015 if (error == 0) {
10016 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
10017 vnode_put (vp);
10018 }
10019 }
10020 /* mark the argument VP as having been released */
10021 *arg_vp = NULL;
10022 }
10023 break;
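/*
 * Editor's illustration (not part of this file): from userland the case
 * above is reached through the fsctl(2) wrapper, e.g.
 *
 *	#include <sys/fsctl.h>
 *	uint32_t arg = FSCTL_SYNC_WAIT;		// block until the sync completes
 *	if (fsctl("/Volumes/Data", FSCTL_SYNC_VOLUME, &arg, 0) == -1)
 *		perror("fsctl");
 *
 * The wrapper signature is assumed from fsctl(2) -- int fsctl(const char *,
 * unsigned long, void *, unsigned int) -- and the mount-point path is
 * hypothetical.
 */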
10024
10025 case FSCTL_ROUTEFS_SETROUTEID: {
10026 #if ROUTEFS
10027 char routepath[MAXPATHLEN];
10028 size_t len = 0;
10029
10030 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10031 break;
10032 }
10033 bzero(routepath, MAXPATHLEN);
10034 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
10035 if (error) {
10036 break;
10037 }
10038 error = routefs_kernel_mount(routepath);
10039 if (error) {
10040 break;
10041 }
10042 #endif
10043 }
10044 break;
10045
10046 case FSCTL_SET_PACKAGE_EXTS: {
10047 user_addr_t ext_strings;
10048 uint32_t num_entries;
10049 uint32_t max_width;
10050
10051 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
10052 break;
10053
10054 if ( (is64bit && size != sizeof(user64_package_ext_info))
10055 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
10056
10057 // either you're 64-bit and passed a 64-bit struct or
10058 // you're 32-bit and passed a 32-bit struct. Otherwise
10059 // it's not ok.
10060 error = EINVAL;
10061 break;
10062 }
10063
10064 if (is64bit) {
10065 ext_strings = ((user64_package_ext_info *)data)->strings;
10066 num_entries = ((user64_package_ext_info *)data)->num_entries;
10067 max_width = ((user64_package_ext_info *)data)->max_width;
10068 } else {
10069 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10070 num_entries = ((user32_package_ext_info *)data)->num_entries;
10071 max_width = ((user32_package_ext_info *)data)->max_width;
10072 }
10073 error = set_package_extensions_table(ext_strings, num_entries, max_width);
10074 }
10075 break;
10076
10077 /* namespace handlers */
10078 case FSCTL_NAMESPACE_HANDLER_GET: {
10079 error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10080 }
10081 break;
10082
10083 /* Snapshot handlers */
10084 case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
10085 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10086 }
10087 break;
10088
10089 case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
10090 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10091 }
10092 break;
10093
10094 case FSCTL_NAMESPACE_HANDLER_UPDATE: {
10095 uint32_t token, val;
10096 int i;
10097
10098 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10099 break;
10100 }
10101
10102 if (!nspace_is_special_process(p)) {
10103 error = EINVAL;
10104 break;
10105 }
10106
10107 token = ((uint32_t *)data)[0];
10108 val = ((uint32_t *)data)[1];
10109
10110 lck_mtx_lock(&nspace_handler_lock);
10111
10112 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10113 if (nspace_items[i].token == token) {
10114 break; /* exit for loop, not case stmt */
10115 }
10116 }
10117
10118 if (i >= MAX_NSPACE_ITEMS) {
10119 error = ENOENT;
10120 } else {
10121 //
10122 // if this bit is set, when resolve_nspace_item() times out
10123 // it will loop and go back to sleep.
10124 //
10125 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10126 }
10127
10128 lck_mtx_unlock(&nspace_handler_lock);
10129
10130 if (error) {
10131 printf("nspace-handler-update: did not find token %u\n", token);
10132 }
10133 }
10134 break;
10135
10136 case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
10137 uint32_t token, val;
10138 int i;
10139
10140 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10141 break;
10142 }
10143
10144 if (!nspace_is_special_process(p)) {
10145 error = EINVAL;
10146 break;
10147 }
10148
10149 token = ((uint32_t *)data)[0];
10150 val = ((uint32_t *)data)[1];
10151
10152 lck_mtx_lock(&nspace_handler_lock);
10153
10154 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10155 if (nspace_items[i].token == token) {
10156 break; /* exit for loop, not case statement */
10157 }
10158 }
10159
10160 if (i >= MAX_NSPACE_ITEMS) {
10161 printf("nspace-handler-unblock: did not find token %u\n", token);
10162 error = ENOENT;
10163 } else {
10164 if (val == 0 && nspace_items[i].vp) {
10165 vnode_lock_spin(nspace_items[i].vp);
10166 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10167 vnode_unlock(nspace_items[i].vp);
10168 }
10169
10170 nspace_items[i].vp = NULL;
10171 nspace_items[i].arg = NULL;
10172 nspace_items[i].op = 0;
10173 nspace_items[i].vid = 0;
10174 nspace_items[i].flags = NSPACE_ITEM_DONE;
10175 nspace_items[i].token = 0;
10176
10177 wakeup((caddr_t)&(nspace_items[i].vp));
10178 }
10179
10180 lck_mtx_unlock(&nspace_handler_lock);
10181 }
10182 break;
10183
10184 case FSCTL_NAMESPACE_HANDLER_CANCEL: {
10185 uint32_t token, val;
10186 int i;
10187
10188 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10189 break;
10190 }
10191
10192 if (!nspace_is_special_process(p)) {
10193 error = EINVAL;
10194 break;
10195 }
10196
10197 token = ((uint32_t *)data)[0];
10198 val = ((uint32_t *)data)[1];
10199
10200 lck_mtx_lock(&nspace_handler_lock);
10201
10202 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10203 if (nspace_items[i].token == token) {
10204 break; /* exit for loop, not case stmt */
10205 }
10206 }
10207
10208 if (i >= MAX_NSPACE_ITEMS) {
10209 printf("nspace-handler-cancel: did not find token %u\n", token);
10210 error = ENOENT;
10211 } else {
10212 if (nspace_items[i].vp) {
10213 vnode_lock_spin(nspace_items[i].vp);
10214 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10215 vnode_unlock(nspace_items[i].vp);
10216 }
10217
10218 nspace_items[i].vp = NULL;
10219 nspace_items[i].arg = NULL;
10220 nspace_items[i].vid = 0;
10221 nspace_items[i].token = val;
10222 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10223 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10224
10225 wakeup((caddr_t)&(nspace_items[i].vp));
10226 }
10227
10228 lck_mtx_unlock(&nspace_handler_lock);
10229 }
10230 break;
10231
10232 case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10233 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10234 break;
10235 }
10236
10237 // we explicitly do not do the namespace_handler_proc check here
10238
10239 lck_mtx_lock(&nspace_handler_lock);
10240 snapshot_timestamp = ((uint32_t *)data)[0];
10241 wakeup(&nspace_item_idx);
10242 lck_mtx_unlock(&nspace_handler_lock);
10243 printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10244
10245 }
10246 break;
10247
10248 case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10249 {
10250 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10251 break;
10252 }
10253
10254 lck_mtx_lock(&nspace_handler_lock);
10255 nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10256 lck_mtx_unlock(&nspace_handler_lock);
10257 printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10258 nspace_allow_virtual_devs ? "" : " NOT");
10259 error = 0;
10260
10261 }
10262 break;
10263
10264 case FSCTL_SET_FSTYPENAME_OVERRIDE:
10265 {
10266 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10267 break;
10268 }
10269 if (vp->v_mount) {
10270 mount_lock(vp->v_mount);
10271 if (data[0] != 0) {
10272 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10273 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10274 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10275 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10276 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10277 }
10278 } else {
10279 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10280 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10281 }
10282 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10283 vp->v_mount->fstypename_override[0] = '\0';
10284 }
10285 mount_unlock(vp->v_mount);
10286 }
10287 }
10288 break;
10289
10290 default: {
10291 /* Invoke the filesystem-specific code */
10292 error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
10293 }
10294
10295 } /* end switch stmt */
10296
10297 /*
10298 * if no errors, copy any data to user. Size was
10299 * already set and checked above.
10300 */
10301 if (error == 0 && (cmd & IOC_OUT) && size)
10302 error = copyout(data, udata, size);
10303
10304 if (memp) {
10305 kfree(memp, size);
10306 }
10307
10308 return error;
10309 }
10310
10311 /* ARGSUSED */
10312 int
10313 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10314 {
10315 int error;
10316 struct nameidata nd;
10317 u_long nameiflags;
10318 vnode_t vp = NULL;
10319 vfs_context_t ctx = vfs_context_current();
10320
10321 AUDIT_ARG(cmd, uap->cmd);
10322 AUDIT_ARG(value32, uap->options);
10323 /* Get the vnode for the file we are getting info on: */
10324 nameiflags = 0;
10325 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
10326 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10327 UIO_USERSPACE, uap->path, ctx);
10328 if ((error = namei(&nd))) goto done;
10329 vp = nd.ni_vp;
10330 nameidone(&nd);
10331
10332 #if CONFIG_MACF
10333 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
10334 if (error) {
10335 goto done;
10336 }
10337 #endif
10338
10339 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10340
10341 done:
10342 if (vp)
10343 vnode_put(vp);
10344 return error;
10345 }
10346 /* ARGSUSED */
10347 int
10348 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
10349 {
10350 int error;
10351 vnode_t vp = NULL;
10352 vfs_context_t ctx = vfs_context_current();
10353 int fd = -1;
10354
10355 AUDIT_ARG(fd, uap->fd);
10356 AUDIT_ARG(cmd, uap->cmd);
10357 AUDIT_ARG(value32, uap->options);
10358
10359 /* Get the vnode for the file we are getting info on: */
10360 if ((error = file_vnode(uap->fd, &vp)))
10361 return error;
10362 fd = uap->fd;
10363 if ((error = vnode_getwithref(vp))) {
10364 file_drop(fd);
10365 return error;
10366 }
10367
10368 #if CONFIG_MACF
10369 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
10370 file_drop(fd);
10371 vnode_put(vp);
10372 return error;
10373 }
10374 #endif
10375
10376 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10377
10378 file_drop(fd);
10379
10380 /* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
10381 if (vp) {
10382 vnode_put(vp);
10383 }
10384
10385 return error;
10386 }
10387 /* end of fsctl system call */
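/*
 * Illustrative sketch, assuming the fsctl()/ffsctl() user-space wrappers
 * conventionally declared in <sys/fsctl.h>; the volume path and the example
 * command below are hypothetical placeholders, shown only to make the call
 * shape of the two entry points above concrete.
 *
 *	#include <stdio.h>
 *	#include <sys/fsctl.h>
 *
 *	#define EXAMPLE_FSIOC _IOW('z', 1, uint32_t)	// hypothetical command
 *	uint32_t arg = 0;
 *	if (fsctl("/Volumes/Example", EXAMPLE_FSIOC, &arg, FSOPT_NOFOLLOW) == -1)
 *		perror("fsctl");
 */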
10388
10389 /*
10390 * Retrieve the data of an extended attribute.
10391 */
10392 int
10393 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
10394 {
10395 vnode_t vp;
10396 struct nameidata nd;
10397 char attrname[XATTR_MAXNAMELEN+1];
10398 vfs_context_t ctx = vfs_context_current();
10399 uio_t auio = NULL;
10400 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10401 size_t attrsize = 0;
10402 size_t namelen;
10403 u_int32_t nameiflags;
10404 int error;
10405 char uio_buf[ UIO_SIZEOF(1) ];
10406
10407 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10408 return (EINVAL);
10409
10410 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10411 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
10412 if ((error = namei(&nd))) {
10413 return (error);
10414 }
10415 vp = nd.ni_vp;
10416 nameidone(&nd);
10417
10418 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10419 goto out;
10420 }
10421 if (xattr_protected(attrname)) {
10422 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
10423 error = EPERM;
10424 goto out;
10425 }
10426 }
10427 /*
10428 * the specific check for 0xffffffff is a hack to preserve
10429 * binary compatibility in K64 with applications that discovered
10430 * that passing in a buf pointer and a size of -1 resulted in
10431 * just the size of the indicated extended attribute being returned.
10432 * this isn't part of the documented behavior, but because of the
10433 * original implementation's check for "uap->size > 0", this behavior
10434 * was allowed. In K32 that check turned into a signed comparison
10435 * even though uap->size is unsigned... in K64, we blow by that
10436 * check because uap->size is unsigned and doesn't get sign smeared
10437 * in the munger for a 32 bit user app. we also need to add a
10438 * check to limit the maximum size of the buffer being passed in...
10439 * unfortunately, the underlying filesystems seem to just malloc
10440 * the requested size even if the actual extended attribute is tiny.
10441 * because that malloc is for kernel wired memory, we have to put a
10442 * sane limit on it.
10443 *
10444 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
10445 * U64 running on K64 will yield -1 (64 bits wide)
10446 * U32/U64 running on K32 will yield -1 (32 bits wide)
10447 */
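/*
 * Illustrative sketch of the size-probe behaviour discussed above, assuming
 * the getxattr(2) wrapper from <sys/xattr.h>; the path and attribute name
 * are hypothetical. Passing a NULL value buffer is the documented way to ask
 * for an attribute's size (it takes the attrsize-only branch below), while
 * the size == -1 case exists only for the legacy callers described above.
 *
 *	#include <sys/xattr.h>
 *
 *	ssize_t len = getxattr("/tmp/example", "com.example.attr",
 *	    NULL, 0, 0, XATTR_NOFOLLOW);
 */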
10448 if (uap->size == 0xffffffff || uap->size == (size_t)-1)
10449 goto no_uio;
10450
10451 if (uap->value) {
10452 if (uap->size > (size_t)XATTR_MAXSIZE)
10453 uap->size = XATTR_MAXSIZE;
10454
10455 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10456 &uio_buf[0], sizeof(uio_buf));
10457 uio_addiov(auio, uap->value, uap->size);
10458 }
10459 no_uio:
10460 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
10461 out:
10462 vnode_put(vp);
10463
10464 if (auio) {
10465 *retval = uap->size - uio_resid(auio);
10466 } else {
10467 *retval = (user_ssize_t)attrsize;
10468 }
10469
10470 return (error);
10471 }
10472
10473 /*
10474 * Retrieve the data of an extended attribute.
10475 */
10476 int
10477 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10478 {
10479 vnode_t vp;
10480 char attrname[XATTR_MAXNAMELEN+1];
10481 uio_t auio = NULL;
10482 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10483 size_t attrsize = 0;
10484 size_t namelen;
10485 int error;
10486 char uio_buf[ UIO_SIZEOF(1) ];
10487
10488 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10489 return (EINVAL);
10490
10491 if ( (error = file_vnode(uap->fd, &vp)) ) {
10492 return (error);
10493 }
10494 if ( (error = vnode_getwithref(vp)) ) {
10495 file_drop(uap->fd);
10496 return(error);
10497 }
10498 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10499 goto out;
10500 }
10501 if (xattr_protected(attrname)) {
10502 error = EPERM;
10503 goto out;
10504 }
10505 if (uap->value && uap->size > 0) {
10506 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10507 &uio_buf[0], sizeof(uio_buf));
10508 uio_addiov(auio, uap->value, uap->size);
10509 }
10510
10511 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10512 out:
10513 (void)vnode_put(vp);
10514 file_drop(uap->fd);
10515
10516 if (auio) {
10517 *retval = uap->size - uio_resid(auio);
10518 } else {
10519 *retval = (user_ssize_t)attrsize;
10520 }
10521 return (error);
10522 }
10523
10524 /*
10525 * Set the data of an extended attribute.
10526 */
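/*
 * Illustrative sketch, assuming the setxattr(2) wrapper from <sys/xattr.h>;
 * the path, attribute name and value are hypothetical. A successful call
 * takes the vn_setxattr() path below and, with CONFIG_FSE, posts an
 * FSE_XATTR_MODIFIED event.
 *
 *	#include <string.h>
 *	#include <sys/xattr.h>
 *
 *	const char *val = "example-value";
 *	int rc = setxattr("/tmp/example", "com.example.attr",
 *	    val, strlen(val), 0, 0);
 */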
10527 int
10528 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10529 {
10530 vnode_t vp;
10531 struct nameidata nd;
10532 char attrname[XATTR_MAXNAMELEN+1];
10533 vfs_context_t ctx = vfs_context_current();
10534 uio_t auio = NULL;
10535 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10536 size_t namelen;
10537 u_int32_t nameiflags;
10538 int error;
10539 char uio_buf[ UIO_SIZEOF(1) ];
10540
10541 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10542 return (EINVAL);
10543
10544 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10545 if (error == EPERM) {
10546 /* if the string won't fit in attrname, copyinstr emits EPERM */
10547 return (ENAMETOOLONG);
10548 }
10549 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10550 return error;
10551 }
10552 if (xattr_protected(attrname))
10553 return(EPERM);
10554 if (uap->size != 0 && uap->value == 0) {
10555 return (EINVAL);
10556 }
10557
10558 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10559 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10560 if ((error = namei(&nd))) {
10561 return (error);
10562 }
10563 vp = nd.ni_vp;
10564 nameidone(&nd);
10565
10566 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10567 &uio_buf[0], sizeof(uio_buf));
10568 uio_addiov(auio, uap->value, uap->size);
10569
10570 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10571 #if CONFIG_FSE
10572 if (error == 0) {
10573 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10574 FSE_ARG_VNODE, vp,
10575 FSE_ARG_DONE);
10576 }
10577 #endif
10578 vnode_put(vp);
10579 *retval = 0;
10580 return (error);
10581 }
10582
10583 /*
10584 * Set the data of an extended attribute.
10585 */
10586 int
10587 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
10588 {
10589 vnode_t vp;
10590 char attrname[XATTR_MAXNAMELEN+1];
10591 uio_t auio = NULL;
10592 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10593 size_t namelen;
10594 int error;
10595 char uio_buf[ UIO_SIZEOF(1) ];
10596 #if CONFIG_FSE
10597 vfs_context_t ctx = vfs_context_current();
10598 #endif
10599
10600 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10601 return (EINVAL);
10602
10603 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10604 if (error == EPERM) {
10605 /* if the string won't fit in attrname, copyinstr emits EPERM */
10606 return (ENAMETOOLONG);
10607 }
10608 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10609 return error;
10610 }
10611 if (xattr_protected(attrname))
10612 return(EPERM);
10613 if (uap->size != 0 && uap->value == 0) {
10614 return (EINVAL);
10615 }
10616 if ( (error = file_vnode(uap->fd, &vp)) ) {
10617 return (error);
10618 }
10619 if ( (error = vnode_getwithref(vp)) ) {
10620 file_drop(uap->fd);
10621 return(error);
10622 }
10623 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10624 &uio_buf[0], sizeof(uio_buf));
10625 uio_addiov(auio, uap->value, uap->size);
10626
10627 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
10628 #if CONFIG_FSE
10629 if (error == 0) {
10630 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10631 FSE_ARG_VNODE, vp,
10632 FSE_ARG_DONE);
10633 }
10634 #endif
10635 vnode_put(vp);
10636 file_drop(uap->fd);
10637 *retval = 0;
10638 return (error);
10639 }
10640
10641 /*
10642 * Remove an extended attribute.
10643 * XXX Code duplication here.
10644 */
10645 int
10646 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10647 {
10648 vnode_t vp;
10649 struct nameidata nd;
10650 char attrname[XATTR_MAXNAMELEN+1];
10651 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10652 vfs_context_t ctx = vfs_context_current();
10653 size_t namelen;
10654 u_int32_t nameiflags;
10655 int error;
10656
10657 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10658 return (EINVAL);
10659
10660 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10661 if (error != 0) {
10662 return (error);
10663 }
10664 if (xattr_protected(attrname))
10665 return(EPERM);
10666 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10667 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10668 if ((error = namei(&nd))) {
10669 return (error);
10670 }
10671 vp = nd.ni_vp;
10672 nameidone(&nd);
10673
10674 error = vn_removexattr(vp, attrname, uap->options, ctx);
10675 #if CONFIG_FSE
10676 if (error == 0) {
10677 add_fsevent(FSE_XATTR_REMOVED, ctx,
10678 FSE_ARG_VNODE, vp,
10679 FSE_ARG_DONE);
10680 }
10681 #endif
10682 vnode_put(vp);
10683 *retval = 0;
10684 return (error);
10685 }
10686
10687 /*
10688 * Remove an extended attribute.
10689 * XXX Code duplication here.
10690 */
10691 int
10692 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10693 {
10694 vnode_t vp;
10695 char attrname[XATTR_MAXNAMELEN+1];
10696 size_t namelen;
10697 int error;
10698 #if CONFIG_FSE
10699 vfs_context_t ctx = vfs_context_current();
10700 #endif
10701
10702 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10703 return (EINVAL);
10704
10705 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10706 if (error != 0) {
10707 return (error);
10708 }
10709 if (xattr_protected(attrname))
10710 return(EPERM);
10711 if ( (error = file_vnode(uap->fd, &vp)) ) {
10712 return (error);
10713 }
10714 if ( (error = vnode_getwithref(vp)) ) {
10715 file_drop(uap->fd);
10716 return(error);
10717 }
10718
10719 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10720 #if CONFIG_FSE
10721 if (error == 0) {
10722 add_fsevent(FSE_XATTR_REMOVED, ctx,
10723 FSE_ARG_VNODE, vp,
10724 FSE_ARG_DONE);
10725 }
10726 #endif
10727 vnode_put(vp);
10728 file_drop(uap->fd);
10729 *retval = 0;
10730 return (error);
10731 }
10732
10733 /*
10734 * Retrieve the list of extended attribute names.
10735 * XXX Code duplication here.
10736 */
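/*
 * Illustrative sketch of the usual two-call pattern, assuming the
 * listxattr(2) wrapper from <sys/xattr.h>; the path is hypothetical. A NULL
 * buffer maps to the attrsize-only branch below; the second call fills the
 * caller's buffer with NUL-separated attribute names.
 *
 *	#include <stdlib.h>
 *	#include <sys/xattr.h>
 *
 *	ssize_t need = listxattr("/tmp/example", NULL, 0, XATTR_NOFOLLOW);
 *	char *names = (need > 0) ? malloc((size_t)need) : NULL;
 *	if (names != NULL)
 *		(void)listxattr("/tmp/example", names, (size_t)need, XATTR_NOFOLLOW);
 */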
10737 int
10738 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10739 {
10740 vnode_t vp;
10741 struct nameidata nd;
10742 vfs_context_t ctx = vfs_context_current();
10743 uio_t auio = NULL;
10744 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10745 size_t attrsize = 0;
10746 u_int32_t nameiflags;
10747 int error;
10748 char uio_buf[ UIO_SIZEOF(1) ];
10749
10750 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10751 return (EINVAL);
10752
10753 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10754 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10755 if ((error = namei(&nd))) {
10756 return (error);
10757 }
10758 vp = nd.ni_vp;
10759 nameidone(&nd);
10760 if (uap->namebuf != 0 && uap->bufsize > 0) {
10761 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10762 &uio_buf[0], sizeof(uio_buf));
10763 uio_addiov(auio, uap->namebuf, uap->bufsize);
10764 }
10765
10766 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10767
10768 vnode_put(vp);
10769 if (auio) {
10770 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10771 } else {
10772 *retval = (user_ssize_t)attrsize;
10773 }
10774 return (error);
10775 }
10776
10777 /*
10778 * Retrieve the list of extended attribute names.
10779 * XXX Code duplication here.
10780 */
10781 int
10782 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10783 {
10784 vnode_t vp;
10785 uio_t auio = NULL;
10786 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10787 size_t attrsize = 0;
10788 int error;
10789 char uio_buf[ UIO_SIZEOF(1) ];
10790
10791 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10792 return (EINVAL);
10793
10794 if ( (error = file_vnode(uap->fd, &vp)) ) {
10795 return (error);
10796 }
10797 if ( (error = vnode_getwithref(vp)) ) {
10798 file_drop(uap->fd);
10799 return(error);
10800 }
10801 if (uap->namebuf != 0 && uap->bufsize > 0) {
10802 auio = uio_createwithbuffer(1, 0, spacetype,
10803 UIO_READ, &uio_buf[0], sizeof(uio_buf));
10804 uio_addiov(auio, uap->namebuf, uap->bufsize);
10805 }
10806
10807 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10808
10809 vnode_put(vp);
10810 file_drop(uap->fd);
10811 if (auio) {
10812 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10813 } else {
10814 *retval = (user_ssize_t)attrsize;
10815 }
10816 return (error);
10817 }
10818
10819 static int fsgetpath_internal(
10820 vfs_context_t ctx, int volfs_id, uint64_t objid,
10821 vm_size_t bufsize, caddr_t buf, int *pathlen)
10822 {
10823 int error;
10824 struct mount *mp = NULL;
10825 vnode_t vp;
10826 int length;
10827 int bpflags;
10828 /* maximum number of times to retry build_path */
10829 unsigned int retries = 0x10;
10830
10831 if (bufsize > PAGE_SIZE) {
10832 return (EINVAL);
10833 }
10834
10835 if (buf == NULL) {
10836 return (ENOMEM);
10837 }
10838
10839 retry:
10840 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
10841 error = ENOTSUP; /* unexpected failure */
10842 return ENOTSUP;
10843 }
10844
10845 unionget:
10846 if (objid == 2) {
10847 error = VFS_ROOT(mp, &vp, ctx);
10848 } else {
10849 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
10850 }
10851
10852 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
10853 /*
10854 * If the fileid isn't found and we're in a union
10855 * mount volume, then see if the fileid is in the
10856 * mounted-on volume.
10857 */
10858 struct mount *tmp = mp;
10859 mp = vnode_mount(tmp->mnt_vnodecovered);
10860 vfs_unbusy(tmp);
10861 if (vfs_busy(mp, LK_NOWAIT) == 0)
10862 goto unionget;
10863 } else {
10864 vfs_unbusy(mp);
10865 }
10866
10867 if (error) {
10868 return error;
10869 }
10870
10871 #if CONFIG_MACF
10872 error = mac_vnode_check_fsgetpath(ctx, vp);
10873 if (error) {
10874 vnode_put(vp);
10875 return error;
10876 }
10877 #endif
10878
10879 /* Obtain the absolute path to this vnode. */
10880 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
10881 bpflags |= BUILDPATH_CHECK_MOVED;
10882 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
10883 vnode_put(vp);
10884
10885 if (error) {
10886 /* there was a race building the path, try a few more times */
10887 if (error == EAGAIN) {
10888 --retries;
10889 if (retries > 0)
10890 goto retry;
10891
10892 error = ENOENT;
10893 }
10894 goto out;
10895 }
10896
10897 AUDIT_ARG(text, buf);
10898
10899 if (kdebug_enable) {
10900 long dbg_parms[NUMPARMS];
10901 int dbg_namelen;
10902
10903 dbg_namelen = (int)sizeof(dbg_parms);
10904
10905 if (length < dbg_namelen) {
10906 memcpy((char *)dbg_parms, buf, length);
10907 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
10908
10909 dbg_namelen = length;
10910 } else {
10911 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
10912 }
10913
10914 kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
10915 }
10916
10917 *pathlen = (user_ssize_t)length; /* may be superseded by error */
10918
10919 out:
10920 return (error);
10921 }
10922
10923 /*
10924 * Obtain the full pathname of a file system object by id.
10925 *
10926 * This is a private SPI used by the File Manager.
10927 */
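/*
 * Illustrative sketch of a caller of this SPI, assuming a user-space
 * fsgetpath() wrapper with the (buf, bufsize, fsid, objid) calling
 * convention; the fsid value is hypothetical. Object id 2 maps to the
 * volume root via the VFS_ROOT() case in fsgetpath_internal() above, and
 * the buffer size is capped at PAGE_SIZE by the check below.
 *
 *	char path[1024];
 *	fsid_t fsid = { { 0, 0 } };	// hypothetical volume fsid
 *	ssize_t len = fsgetpath(path, sizeof(path), &fsid, 2ULL);
 */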
10928 __private_extern__
10929 int
10930 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10931 {
10932 vfs_context_t ctx = vfs_context_current();
10933 fsid_t fsid;
10934 char *realpath;
10935 int length;
10936 int error;
10937
10938 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10939 return (error);
10940 }
10941 AUDIT_ARG(value32, fsid.val[0]);
10942 AUDIT_ARG(value64, uap->objid);
10943 /* Restrict output buffer size for now. */
10944
10945 if (uap->bufsize > PAGE_SIZE) {
10946 return (EINVAL);
10947 }
10948 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10949 if (realpath == NULL) {
10950 return (ENOMEM);
10951 }
10952
10953 error = fsgetpath_internal(
10954 ctx, fsid.val[0], uap->objid,
10955 uap->bufsize, realpath, &length);
10956
10957 if (error) {
10958 goto out;
10959 }
10960
10961 error = copyout((caddr_t)realpath, uap->buf, length);
10962
10963 *retval = (user_ssize_t)length; /* may be superseded by error */
10964 out:
10965 if (realpath) {
10966 FREE(realpath, M_TEMP);
10967 }
10968 return (error);
10969 }
10970
10971 /*
10972 * Common routine to handle various flavors of statfs data heading out
10973 * to user space.
10974 *
10975 * Returns: 0 Success
10976 * EFAULT
10977 */
10978 static int
10979 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
10980 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
10981 boolean_t partial_copy)
10982 {
10983 int error;
10984 int my_size, copy_size;
10985
10986 if (is_64_bit) {
10987 struct user64_statfs sfs;
10988 my_size = copy_size = sizeof(sfs);
10989 bzero(&sfs, my_size);
10990 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10991 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10992 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10993 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
10994 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
10995 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
10996 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
10997 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
10998 sfs.f_files = (user64_long_t)sfsp->f_files;
10999 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
11000 sfs.f_fsid = sfsp->f_fsid;
11001 sfs.f_owner = sfsp->f_owner;
11002 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11003 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11004 } else {
11005 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11006 }
11007 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11008 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11009
11010 if (partial_copy) {
11011 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11012 }
11013 error = copyout((caddr_t)&sfs, bufp, copy_size);
11014 }
11015 else {
11016 struct user32_statfs sfs;
11017
11018 my_size = copy_size = sizeof(sfs);
11019 bzero(&sfs, my_size);
11020
11021 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11022 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11023 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11024
11025 /*
11026 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
11027 * have to fudge the numbers here in that case. We inflate the blocksize in order
11028 * to reflect the filesystem size as best we can.
11029 */
11030 if ((sfsp->f_blocks > INT_MAX)
11031 /* Hack for 4061702. I think the real fix is for Carbon to
11032 * look for some volume capability and not depend on hidden
11033 * semantics agreed between a FS and carbon.
11034 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
11035 * for Carbon to set bNoVolumeSizes volume attribute.
11036 * Without this the webdavfs files cannot be copied onto
11037 * disk as they look huge. This change should not affect
11038 * XSAN as they should not be setting these to -1.
11039 */
11040 && (sfsp->f_blocks != 0xffffffffffffffffULL)
11041 && (sfsp->f_bfree != 0xffffffffffffffffULL)
11042 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
11043 int shift;
11044
11045 /*
11046 * Work out how far we have to shift the block count down to make it fit.
11047 * Note that it's possible to have to shift so far that the resulting
11048 * blocksize would be unreportably large. At that point, we will clip
11049 * any values that don't fit.
11050 *
11051 * For safety's sake, we also ensure that f_iosize is never reported as
11052 * being smaller than f_bsize.
11053 */
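/*
 * Worked example: with f_bsize = 512 and f_blocks = 2^35 (a 16 TiB volume)
 * the block count overflows a 32-bit field, so the loop below stops at
 * shift = 5; the reported f_bsize becomes 512 << 5 = 16384 and f_blocks
 * becomes 2^30, which still multiplies out to the true 16 TiB capacity.
 */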
11054 for (shift = 0; shift < 32; shift++) {
11055 if ((sfsp->f_blocks >> shift) <= INT_MAX)
11056 break;
11057 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
11058 break;
11059 }
11060 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
11061 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
11062 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
11063 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
11064 #undef __SHIFT_OR_CLIP
11065 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
11066 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
11067 } else {
11068 /* filesystem is small enough to be reported honestly */
11069 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
11070 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
11071 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
11072 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
11073 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
11074 }
11075 sfs.f_files = (user32_long_t)sfsp->f_files;
11076 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
11077 sfs.f_fsid = sfsp->f_fsid;
11078 sfs.f_owner = sfsp->f_owner;
11079 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11080 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11081 } else {
11082 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11083 }
11084 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11085 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11086
11087 if (partial_copy) {
11088 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11089 }
11090 error = copyout((caddr_t)&sfs, bufp, copy_size);
11091 }
11092
11093 if (sizep != NULL) {
11094 *sizep = my_size;
11095 }
11096 return(error);
11097 }
11098
11099 /*
11100 * copy stat structure into user_stat structure.
11101 */
11102 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11103 {
11104 bzero(usbp, sizeof(*usbp));
11105
11106 usbp->st_dev = sbp->st_dev;
11107 usbp->st_ino = sbp->st_ino;
11108 usbp->st_mode = sbp->st_mode;
11109 usbp->st_nlink = sbp->st_nlink;
11110 usbp->st_uid = sbp->st_uid;
11111 usbp->st_gid = sbp->st_gid;
11112 usbp->st_rdev = sbp->st_rdev;
11113 #ifndef _POSIX_C_SOURCE
11114 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11115 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11116 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11117 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11118 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11119 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11120 #else
11121 usbp->st_atime = sbp->st_atime;
11122 usbp->st_atimensec = sbp->st_atimensec;
11123 usbp->st_mtime = sbp->st_mtime;
11124 usbp->st_mtimensec = sbp->st_mtimensec;
11125 usbp->st_ctime = sbp->st_ctime;
11126 usbp->st_ctimensec = sbp->st_ctimensec;
11127 #endif
11128 usbp->st_size = sbp->st_size;
11129 usbp->st_blocks = sbp->st_blocks;
11130 usbp->st_blksize = sbp->st_blksize;
11131 usbp->st_flags = sbp->st_flags;
11132 usbp->st_gen = sbp->st_gen;
11133 usbp->st_lspare = sbp->st_lspare;
11134 usbp->st_qspare[0] = sbp->st_qspare[0];
11135 usbp->st_qspare[1] = sbp->st_qspare[1];
11136 }
11137
11138 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
11139 {
11140 bzero(usbp, sizeof(*usbp));
11141
11142 usbp->st_dev = sbp->st_dev;
11143 usbp->st_ino = sbp->st_ino;
11144 usbp->st_mode = sbp->st_mode;
11145 usbp->st_nlink = sbp->st_nlink;
11146 usbp->st_uid = sbp->st_uid;
11147 usbp->st_gid = sbp->st_gid;
11148 usbp->st_rdev = sbp->st_rdev;
11149 #ifndef _POSIX_C_SOURCE
11150 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11151 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11152 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11153 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11154 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11155 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11156 #else
11157 usbp->st_atime = sbp->st_atime;
11158 usbp->st_atimensec = sbp->st_atimensec;
11159 usbp->st_mtime = sbp->st_mtime;
11160 usbp->st_mtimensec = sbp->st_mtimensec;
11161 usbp->st_ctime = sbp->st_ctime;
11162 usbp->st_ctimensec = sbp->st_ctimensec;
11163 #endif
11164 usbp->st_size = sbp->st_size;
11165 usbp->st_blocks = sbp->st_blocks;
11166 usbp->st_blksize = sbp->st_blksize;
11167 usbp->st_flags = sbp->st_flags;
11168 usbp->st_gen = sbp->st_gen;
11169 usbp->st_lspare = sbp->st_lspare;
11170 usbp->st_qspare[0] = sbp->st_qspare[0];
11171 usbp->st_qspare[1] = sbp->st_qspare[1];
11172 }
11173
11174 /*
11175 * copy stat64 structure into user_stat64 structure.
11176 */
11177 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11178 {
11179 bzero(usbp, sizeof(*usbp));
11180
11181 usbp->st_dev = sbp->st_dev;
11182 usbp->st_ino = sbp->st_ino;
11183 usbp->st_mode = sbp->st_mode;
11184 usbp->st_nlink = sbp->st_nlink;
11185 usbp->st_uid = sbp->st_uid;
11186 usbp->st_gid = sbp->st_gid;
11187 usbp->st_rdev = sbp->st_rdev;
11188 #ifndef _POSIX_C_SOURCE
11189 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11190 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11191 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11192 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11193 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11194 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11195 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11196 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11197 #else
11198 usbp->st_atime = sbp->st_atime;
11199 usbp->st_atimensec = sbp->st_atimensec;
11200 usbp->st_mtime = sbp->st_mtime;
11201 usbp->st_mtimensec = sbp->st_mtimensec;
11202 usbp->st_ctime = sbp->st_ctime;
11203 usbp->st_ctimensec = sbp->st_ctimensec;
11204 usbp->st_birthtime = sbp->st_birthtime;
11205 usbp->st_birthtimensec = sbp->st_birthtimensec;
11206 #endif
11207 usbp->st_size = sbp->st_size;
11208 usbp->st_blocks = sbp->st_blocks;
11209 usbp->st_blksize = sbp->st_blksize;
11210 usbp->st_flags = sbp->st_flags;
11211 usbp->st_gen = sbp->st_gen;
11212 usbp->st_lspare = sbp->st_lspare;
11213 usbp->st_qspare[0] = sbp->st_qspare[0];
11214 usbp->st_qspare[1] = sbp->st_qspare[1];
11215 }
11216
11217 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
11218 {
11219 bzero(usbp, sizeof(*usbp));
11220
11221 usbp->st_dev = sbp->st_dev;
11222 usbp->st_ino = sbp->st_ino;
11223 usbp->st_mode = sbp->st_mode;
11224 usbp->st_nlink = sbp->st_nlink;
11225 usbp->st_uid = sbp->st_uid;
11226 usbp->st_gid = sbp->st_gid;
11227 usbp->st_rdev = sbp->st_rdev;
11228 #ifndef _POSIX_C_SOURCE
11229 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11230 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11231 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11232 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11233 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11234 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11235 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11236 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11237 #else
11238 usbp->st_atime = sbp->st_atime;
11239 usbp->st_atimensec = sbp->st_atimensec;
11240 usbp->st_mtime = sbp->st_mtime;
11241 usbp->st_mtimensec = sbp->st_mtimensec;
11242 usbp->st_ctime = sbp->st_ctime;
11243 usbp->st_ctimensec = sbp->st_ctimensec;
11244 usbp->st_birthtime = sbp->st_birthtime;
11245 usbp->st_birthtimensec = sbp->st_birthtimensec;
11246 #endif
11247 usbp->st_size = sbp->st_size;
11248 usbp->st_blocks = sbp->st_blocks;
11249 usbp->st_blksize = sbp->st_blksize;
11250 usbp->st_flags = sbp->st_flags;
11251 usbp->st_gen = sbp->st_gen;
11252 usbp->st_lspare = sbp->st_lspare;
11253 usbp->st_qspare[0] = sbp->st_qspare[0];
11254 usbp->st_qspare[1] = sbp->st_qspare[1];
11255 }
11256
11257 /*
11258 * Purge buffer cache for simulating cold starts
11259 */
11260 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11261 {
11262 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11263
11264 return VNODE_RETURNED;
11265 }
11266
11267 static int vfs_purge_callback(mount_t mp, __unused void * arg)
11268 {
11269 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11270
11271 return VFS_RETURNED;
11272 }
11273
11274 int
11275 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11276 {
11277 if (!kauth_cred_issuser(kauth_cred_get()))
11278 return EPERM;
11279
11280 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11281
11282 return 0;
11283 }
11284
11285 /*
11286 * gets the vnode associated with the (unnamed) snapshot directory
11287 * for a Filesystem. The snapshot directory vnode is returned with
11288 * an iocount on it.
11289 */
11290 int
11291 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11292 {
11293 return (VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx));
11294 }
11295
11296 /*
11297 * Get the snapshot vnode.
11298 *
11299 * If successful, the call returns with an iocount on *rvpp and *sdvpp, and
11300 * needs nameidone() on ndp.
11301 *
11302 * If the snapshot vnode exists it is returned in ndp->ni_vp.
11303 *
11304 * If it returns with an error, *rvpp and *sdvpp are NULL and nameidone() is
11305 * not needed.
11306 */
11307 static int
11308 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
11309 user_addr_t name, struct nameidata *ndp, int32_t op,
11310 #if !CONFIG_TRIGGERS
11311 __unused
11312 #endif
11313 enum path_operation pathop,
11314 vfs_context_t ctx)
11315 {
11316 int error, i;
11317 caddr_t name_buf;
11318 size_t name_len;
11319 struct vfs_attr vfa;
11320
11321 *sdvpp = NULLVP;
11322 *rvpp = NULLVP;
11323
11324 error = vnode_getfromfd(ctx, dirfd, rvpp);
11325 if (error)
11326 return (error);
11327
11328 if (!vnode_isvroot(*rvpp)) {
11329 error = EINVAL;
11330 goto out;
11331 }
11332
11333 /* Make sure the filesystem supports snapshots */
11334 VFSATTR_INIT(&vfa);
11335 VFSATTR_WANTED(&vfa, f_capabilities);
11336 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
11337 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
11338 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
11339 VOL_CAP_INT_SNAPSHOT)) ||
11340 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
11341 VOL_CAP_INT_SNAPSHOT))) {
11342 error = ENOTSUP;
11343 goto out;
11344 }
11345
11346 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
11347 if (error)
11348 goto out;
11349
11350 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11351 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11352 if (error)
11353 goto out1;
11354
11355 /*
11356 * Some sanity checks: the name can't be empty, "." or "..", or have slashes.
11357 * (the length returned by copyinstr includes the terminating NUL)
11358 */
11359 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
11360 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
11361 error = EINVAL;
11362 goto out1;
11363 }
11364 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
11365 if (i < (int)name_len) {
11366 error = EINVAL;
11367 goto out1;
11368 }
11369
11370 #if CONFIG_MACF
11371 if (op == CREATE) {
11372 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
11373 name_buf);
11374 } else if (op == DELETE) {
11375 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
11376 name_buf);
11377 }
11378 if (error)
11379 goto out1;
11380 #endif
11381
11382 /* Check if the snapshot already exists ... */
11383 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
11384 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
11385 ndp->ni_dvp = *sdvpp;
11386
11387 error = namei(ndp);
11388 out1:
11389 FREE(name_buf, M_TEMP);
11390 out:
11391 if (error) {
11392 if (*sdvpp) {
11393 vnode_put(*sdvpp);
11394 *sdvpp = NULLVP;
11395 }
11396 if (*rvpp) {
11397 vnode_put(*rvpp);
11398 *rvpp = NULLVP;
11399 }
11400 }
11401 return (error);
11402 }
11403
11404 /*
11405 * create a filesystem snapshot (for supporting filesystems)
11406 *
11407 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
11408 * We get to the (unnamed) snapshot directory vnode and create the vnode
11409 * for the snapshot in it.
11410 *
11411 * Restrictions:
11412 *
11413 * a) Passed in name for snapshot cannot have slashes.
11414 * b) name can't be "." or ".."
11415 *
11416 * Since this requires superuser privileges, vnode_authorize calls are not
11417 * made.
11418 */
11419 static int
11420 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
11421 vfs_context_t ctx)
11422 {
11423 vnode_t rvp, snapdvp;
11424 int error;
11425 struct nameidata namend;
11426
11427 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
11428 OP_LINK, ctx);
11429 if (error)
11430 return (error);
11431
11432 if (namend.ni_vp) {
11433 vnode_put(namend.ni_vp);
11434 error = EEXIST;
11435 } else {
11436 struct vnode_attr va;
11437 vnode_t vp = NULLVP;
11438
11439 VATTR_INIT(&va);
11440 VATTR_SET(&va, va_type, VREG);
11441 VATTR_SET(&va, va_mode, 0);
11442
11443 error = vn_create(snapdvp, &vp, &namend, &va,
11444 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
11445 if (!error && vp)
11446 vnode_put(vp);
11447 }
11448
11449 nameidone(&namend);
11450 vnode_put(snapdvp);
11451 vnode_put(rvp);
11452 return (error);
11453 }
11454
11455 /*
11456 * Delete a Filesystem snapshot
11457 *
11458 * get the vnode for the unnamed snapshot directory and the snapshot and
11459 * delete the snapshot.
11460 */
11461 static int
11462 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11463 vfs_context_t ctx)
11464 {
11465 vnode_t rvp, snapdvp;
11466 int error;
11467 struct nameidata namend;
11468
11469 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11470 OP_UNLINK, ctx);
11471 if (error)
11472 goto out;
11473
11474 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11475 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11476
11477 vnode_put(namend.ni_vp);
11478 nameidone(&namend);
11479 vnode_put(snapdvp);
11480 vnode_put(rvp);
11481 out:
11482 return (error);
11483 }
11484
11485 /*
11486 * Revert a filesystem to a snapshot
11487 *
11488 * Marks the filesystem to revert to the given snapshot on next mount.
11489 */
11490 static int
11491 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
11492 vfs_context_t ctx)
11493 {
11494 int error;
11495 vnode_t rvp;
11496 mount_t mp;
11497 struct fs_snapshot_revert_args revert_data;
11498 struct componentname cnp;
11499 caddr_t name_buf;
11500 size_t name_len;
11501
11502 error = vnode_getfromfd(ctx, dirfd, &rvp);
11503 if (error) {
11504 return (error);
11505 }
11506 mp = vnode_mount(rvp);
11507
11508 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11509 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11510 if (error) {
11511 FREE(name_buf, M_TEMP);
11512 vnode_put(rvp);
11513 return (error);
11514 }
11515
11516 #if CONFIG_MACF
11517 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
11518 if (error) {
11519 FREE(name_buf, M_TEMP);
11520 vnode_put(rvp);
11521 return (error);
11522 }
11523 #endif
11524
11525 /*
11526 * Grab mount_iterref so that we can release the vnode,
11527 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
11528 */
11529 error = mount_iterref (mp, 0);
11530 vnode_put(rvp);
11531 if (error) {
11532 FREE(name_buf, M_TEMP);
11533 return (error);
11534 }
11535
11536 memset(&cnp, 0, sizeof(cnp));
11537 cnp.cn_pnbuf = (char *)name_buf;
11538 cnp.cn_nameiop = LOOKUP;
11539 cnp.cn_flags = ISLASTCN | HASBUF;
11540 cnp.cn_pnlen = MAXPATHLEN;
11541 cnp.cn_nameptr = cnp.cn_pnbuf;
11542 cnp.cn_namelen = (int)name_len;
11543 revert_data.sr_cnp = &cnp;
11544
11545 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
11546 mount_iterdrop(mp);
11547 FREE(name_buf, M_TEMP);
11548
11549 if (error) {
11550 /* If there was any error, try again using VNOP_IOCTL */
11551
11552 vnode_t snapdvp;
11553 struct nameidata namend;
11554
11555 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
11556 OP_LOOKUP, ctx);
11557 if (error) {
11558 return (error);
11559 }
11560
11561
11562 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
11563 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
11564 #endif
11565
11566 #ifndef APFS_REVERT_TO_SNAPSHOT
11567 #define APFS_REVERT_TO_SNAPSHOT IOCBASECMD(APFSIOC_REVERT_TO_SNAPSHOT)
11568 #endif
11569
11570 error = VNOP_IOCTL(namend.ni_vp, APFS_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
11571 0, ctx);
11572
11573 vnode_put(namend.ni_vp);
11574 nameidone(&namend);
11575 vnode_put(snapdvp);
11576 vnode_put(rvp);
11577 }
11578
11579 return (error);
11580 }
11581
11582 /*
11583 * rename a Filesystem snapshot
11584 *
11585 * get the vnode for the unnamed snapshot directory and the snapshot and
11586 * rename the snapshot. This is a very specialised (and simple) case of
11587 * rename(2) (which has to deal with a lot more complications). It differs
11588 * slightly from rename(2) in that EEXIST is returned if the new name exists.
11589 */
11590 static int
11591 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
11592 __unused uint32_t flags, vfs_context_t ctx)
11593 {
11594 vnode_t rvp, snapdvp;
11595 int error, i;
11596 caddr_t newname_buf;
11597 size_t name_len;
11598 vnode_t fvp;
11599 struct nameidata *fromnd, *tond;
11600 /* carving out a chunk for structs that are too big to be on the stack. */
11601 struct {
11602 struct nameidata from_node;
11603 struct nameidata to_node;
11604 } * __rename_data;
11605
11606 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
11607 fromnd = &__rename_data->from_node;
11608 tond = &__rename_data->to_node;
11609
11610 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
11611 OP_UNLINK, ctx);
11612 if (error)
11613 goto out;
11614 fvp = fromnd->ni_vp;
11615
11616 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11617 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
11618 if (error)
11619 goto out1;
11620
11621 /*
11622 * Some sanity checks: the new name can't be empty, "." or "..", or have
11623 * slashes.
11624 * (the length returned by copyinstr includes the terminating NUL)
11625 *
11626 * The FS rename VNOP is supposed to handle this, but we'll catch it
11627 * here ourselves.
11628 */
11629 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
11630 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
11631 error = EINVAL;
11632 goto out1;
11633 }
11634 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
11635 if (i < (int)name_len) {
11636 error = EINVAL;
11637 goto out1;
11638 }
11639
11640 #if CONFIG_MACF
11641 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
11642 newname_buf);
11643 if (error)
11644 goto out1;
11645 #endif
11646
11647 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
11648 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
11649 tond->ni_dvp = snapdvp;
11650
11651 error = namei(tond);
11652 if (error) {
11653 goto out2;
11654 } else if (tond->ni_vp) {
11655 /*
11656 * snapshot rename behaves differently than rename(2) - if the
11657 * new name exists, EEXIST is returned.
11658 */
11659 vnode_put(tond->ni_vp);
11660 error = EEXIST;
11661 goto out2;
11662 }
11663
11664 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
11665 &tond->ni_cnd, ctx);
11666
11667 out2:
11668 nameidone(tond);
11669 out1:
11670 FREE(newname_buf, M_TEMP);
11671 vnode_put(fvp);
11672 vnode_put(snapdvp);
11673 vnode_put(rvp);
11674 nameidone(fromnd);
11675 out:
11676 FREE(__rename_data, M_TEMP);
11677 return (error);
11678 }
11679
11680 /*
11681 * Mount a Filesystem snapshot
11682 *
11683 * get the vnode for the unnamed snapshot directory and the snapshot and
11684 * mount the snapshot.
11685 */
11686 static int
11687 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
11688 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
11689 {
11690 vnode_t rvp, snapdvp, snapvp, vp, pvp;
11691 int error;
11692 struct nameidata *snapndp, *dirndp;
11693 /* carving out a chunk for structs that are too big to be on the stack. */
11694 struct {
11695 struct nameidata snapnd;
11696 struct nameidata dirnd;
11697 } * __snapshot_mount_data;
11698
11699 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
11700 M_TEMP, M_WAITOK);
11701 snapndp = &__snapshot_mount_data->snapnd;
11702 dirndp = &__snapshot_mount_data->dirnd;
11703
11704 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
11705 OP_LOOKUP, ctx);
11706 if (error)
11707 goto out;
11708
11709 snapvp = snapndp->ni_vp;
11710 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
11711 error = EIO;
11712 goto out1;
11713 }
11714
11715 /* Get the vnode to be covered */
11716 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
11717 UIO_USERSPACE, directory, ctx);
11718 error = namei(dirndp);
11719 if (error)
11720 goto out1;
11721
11722 vp = dirndp->ni_vp;
11723 pvp = dirndp->ni_dvp;
11724
11725 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
11726 error = EINVAL;
11727 } else {
11728 mount_t mp = vnode_mount(rvp);
11729 struct fs_snapshot_mount_args smnt_data;
11730
11731 smnt_data.sm_mp = mp;
11732 smnt_data.sm_cnp = &snapndp->ni_cnd;
11733 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
11734 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), 0,
11735 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
11736 }
11737
11738 vnode_put(vp);
11739 vnode_put(pvp);
11740 nameidone(dirndp);
11741 out1:
11742 vnode_put(snapvp);
11743 vnode_put(snapdvp);
11744 vnode_put(rvp);
11745 nameidone(snapndp);
11746 out:
11747 FREE(__snapshot_mount_data, M_TEMP);
11748 return (error);
11749 }
11750
11751 /*
11752 * Root from a snapshot of the filesystem
11753 *
11754 * Marks the filesystem to root from the given snapshot on next boot.
11755 */
11756 static int
11757 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
11758 vfs_context_t ctx)
11759 {
11760 int error;
11761 vnode_t rvp;
11762 mount_t mp;
11763 struct fs_snapshot_root_args root_data;
11764 struct componentname cnp;
11765 caddr_t name_buf;
11766 size_t name_len;
11767
11768 error = vnode_getfromfd(ctx, dirfd, &rvp);
11769 if (error) {
11770 return (error);
11771 }
11772 mp = vnode_mount(rvp);
11773
11774 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11775 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11776 if (error) {
11777 FREE(name_buf, M_TEMP);
11778 vnode_put(rvp);
11779 return (error);
11780 }
11781
11782 // XXX MAC checks ?
11783
11784 /*
11785 * Grab mount_iterref so that we can release the vnode,
11786 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
11787 */
11788 error = mount_iterref (mp, 0);
11789 vnode_put(rvp);
11790 if (error) {
11791 FREE(name_buf, M_TEMP);
11792 return (error);
11793 }
11794
11795 memset(&cnp, 0, sizeof(cnp));
11796 cnp.cn_pnbuf = (char *)name_buf;
11797 cnp.cn_nameiop = LOOKUP;
11798 cnp.cn_flags = ISLASTCN | HASBUF;
11799 cnp.cn_pnlen = MAXPATHLEN;
11800 cnp.cn_nameptr = cnp.cn_pnbuf;
11801 cnp.cn_namelen = (int)name_len;
11802 root_data.sr_cnp = &cnp;
11803
11804 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
11805
11806 mount_iterdrop(mp);
11807 FREE(name_buf, M_TEMP);
11808
11809 return (error);
11810 }
11811
11812 /*
11813 * FS snapshot operations dispatcher
11814 */
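/*
 * Illustrative sketch, assuming the fs_snapshot_create()-style wrappers
 * published in <sys/snapshot.h> funnel into this dispatcher; the directory
 * fd and snapshot name are hypothetical. The caller needs the
 * PRIV_VFS_SNAPSHOT privilege checked below.
 *
 *	#include <fcntl.h>
 *	#include <sys/snapshot.h>
 *
 *	int dirfd = open("/", O_RDONLY);	// root of the target volume
 *	int rc = fs_snapshot_create(dirfd, "com.example.snap-1", 0);
 */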
11815 int
11816 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
11817 __unused int32_t *retval)
11818 {
11819 int error;
11820 vfs_context_t ctx = vfs_context_current();
11821
11822 AUDIT_ARG(fd, uap->dirfd);
11823 AUDIT_ARG(value32, uap->op);
11824
11825 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
11826 if (error)
11827 return (error);
11828
11829 switch (uap->op) {
11830 case SNAPSHOT_OP_CREATE:
11831 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
11832 break;
11833 case SNAPSHOT_OP_DELETE:
11834 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
11835 break;
11836 case SNAPSHOT_OP_RENAME:
11837 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
11838 uap->flags, ctx);
11839 break;
11840 case SNAPSHOT_OP_MOUNT:
11841 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
11842 uap->data, uap->flags, ctx);
11843 break;
11844 case SNAPSHOT_OP_REVERT:
11845 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
11846 break;
11847 case SNAPSHOT_OP_ROOT:
11848 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
11849 break;
11850 default:
11851 error = ENOSYS;
11852 }
11853
11854 return (error);
11855 }