]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_syscalls.c
xnu-3789.31.2.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_syscalls.c
1 /*
2 * Copyright (c) 1995-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/xattr.h>
98 #include <sys/fcntl.h>
99 #include <sys/fsctl.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/disk.h>
102 #include <sys/content_protection.h>
103 #include <sys/clonefile.h>
104 #include <sys/snapshot.h>
105 #include <sys/priv.h>
106 #include <machine/cons.h>
107 #include <machine/limits.h>
108 #include <miscfs/specfs/specdev.h>
109
110 #include <security/audit/audit.h>
111 #include <bsm/audit_kevents.h>
112
113 #include <mach/mach_types.h>
114 #include <kern/kern_types.h>
115 #include <kern/kalloc.h>
116 #include <kern/task.h>
117
118 #include <vm/vm_pageout.h>
119 #include <vm/vm_protos.h>
120
121 #include <libkern/OSAtomic.h>
122 #include <pexpert/pexpert.h>
123 #include <IOKit/IOBSD.h>
124
125 #if ROUTEFS
126 #include <miscfs/routefs/routefs.h>
127 #endif /* ROUTEFS */
128
129 #if CONFIG_MACF
130 #include <security/mac.h>
131 #include <security/mac_framework.h>
132 #endif
133
134 #if CONFIG_FSE
135 #define GET_PATH(x) \
136 (x) = get_pathbuff();
137 #define RELEASE_PATH(x) \
138 release_pathbuff(x);
139 #else
140 #define GET_PATH(x) \
141 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
142 #define RELEASE_PATH(x) \
143 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
144 #endif /* CONFIG_FSE */
145
146 /* struct for checkdirs iteration */
147 struct cdirargs {
148 vnode_t olddp;
149 vnode_t newdp;
150 };
151 /* callback for checkdirs iteration */
152 static int checkdirs_callback(proc_t p, void * arg);
153
154 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
155 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
156 void enablequotas(struct mount *mp, vfs_context_t ctx);
157 static int getfsstat_callback(mount_t mp, void * arg);
158 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
159 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
160 static int sync_callback(mount_t, void *);
161 static void sync_thread(void *, __unused wait_result_t);
162 static int sync_async(int);
163 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
164 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
165 boolean_t partial_copy);
166 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
167 user_addr_t bufp);
168 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
169 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
170 struct componentname *cnp, user_addr_t fsmountargs,
171 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
172 vfs_context_t ctx);
173 void vfs_notify_mount(vnode_t pdvp);
174
175 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
176
177 struct fd_vn_data * fg_vn_data_alloc(void);
178
179 /*
180 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
181 * Concurrent lookups (or lookups by ids) on hard links can cause the
182 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
183 * does) to return ENOENT as the path cannot be returned from the name cache
184 * alone. We have no option but to retry and hope to get one namei->reverse path
185 * generation done without an intervening lookup, lookup by id on the hard link
186 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
187 * which currently are the MAC hooks for rename, unlink and rmdir.
188 */
189 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
190
191 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
192
193 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
194
195 #ifdef CONFIG_IMGSRC_ACCESS
196 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
197 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
198 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
199 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
200 static void mount_end_update(mount_t mp);
201 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
202 #endif /* CONFIG_IMGSRC_ACCESS */
203
204 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
205
206 __private_extern__
207 int sync_internal(void);
208
209 __private_extern__
210 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
211
212 extern lck_grp_t *fd_vn_lck_grp;
213 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
214 extern lck_attr_t *fd_vn_lck_attr;
215
216 /*
217 * incremented each time a mount or unmount operation occurs
218 * used to invalidate the cached value of the rootvp in the
219 * mount structure utilized by cache_lookup_path
220 */
221 uint32_t mount_generation = 0;
222
223 /* counts number of mount and unmount operations */
224 unsigned int vfs_nummntops=0;
225
226 extern const struct fileops vnops;
227 #if CONFIG_APPLEDOUBLE
228 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
229 #endif /* CONFIG_APPLEDOUBLE */
230
231 /*
232 * Virtual File System System Calls
233 */
234
235 #if NFSCLIENT || DEVFS || ROUTEFS
236 /*
237 * Private in-kernel mounting spi (NFS only, not exported)
238 */
239 __private_extern__
240 boolean_t
241 vfs_iskernelmount(mount_t mp)
242 {
243 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
244 }
245
/*
 * kernel_mount:
 *   Private in-kernel mounting SPI (NFS/devfs/routefs only, not exported).
 *
 * Parameters:
 *   fstype         filesystem type name, handed to mount_common()
 *   pvp            parent of the vnode to be covered, or NULLVP to look it up
 *   vp             vnode to be covered, or NULLVP to look it up via 'path'
 *   path           kernel-space path of the mount point
 *   data           filesystem-specific mount arguments (kernel address,
 *                  cast to user_addr_t for mount_common())
 *   datalen        unused
 *   syscall_flags  generic MNT_* mount flags
 *   kern_flags     internal KERNEL_MOUNT_* flags
 *   ctx            caller's vfs context
 *
 * Returns: 0 on success, otherwise an errno from namei() or mount_common().
 *
 * NOTE(review): when the caller supplies vp, 'path' is installed directly
 * as the componentname path buffer (no copy), so it must remain valid for
 * the duration of the call.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	       UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error)
			return (error);
		/* namei() returned iocounts on both vnodes; dropped below */
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/* caller supplied the vnodes; fake up just enough of the
		 * componentname (path buffer + length) for mount_common() */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	                     syscall_flags, kern_flags, NULL, TRUE, ctx);

	if (did_namei) {
		/* release the iocounts and path buffer acquired by namei() */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return (error);
}
#endif /* NFSCLIENT || DEVFS || ROUTEFS */
288
289 /*
290 * Mount a file system.
291 */
292 /* ARGSUSED */
293 int
294 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
295 {
296 struct __mac_mount_args muap;
297
298 muap.type = uap->type;
299 muap.path = uap->path;
300 muap.flags = uap->flags;
301 muap.data = uap->data;
302 muap.mac_p = USER_ADDR_NULL;
303 return (__mac_mount(p, &muap, retval));
304 }
305
/*
 * vfs_notify_mount:
 *   Notify interested parties that a mount has occurred.
 *
 * Parameters:
 *   pdvp    parent directory of the newly covered vnode
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	/* system-wide VQ_MOUNT vfs event (no specific mount attached) */
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	/* post NOTE_WRITE on the parent dir so kevent watchers see the change */
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
312
313 /*
314 * __mac_mount:
315 * Mount a file system taking into account MAC label behavior.
316 * See mount(2) man page for more information
317 *
318 * Parameters: p Process requesting the mount
319 * uap User argument descriptor (see below)
320 * retval (ignored)
321 *
322 * Indirect: uap->type Filesystem type
323 * uap->path Path to mount
324 * uap->data Mount arguments
325 * uap->mac_p MAC info
326 * uap->flags Mount flags
327 *
328 *
329 * Returns: 0 Success
330 * !0 Not success
331 */
/*
 * Set to TRUE when something attempts to mount the root filesystem
 * read/write; consulted by the codesign validation bitmap support
 * (see the CHECK_CS_VALIDATION_BITMAP block below).
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;		/* parent of the vnode to be covered */
	vnode_t vp = NULL;		/* vnode to be covered by the mount */
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy=0;
	char *labelstr = NULL;		/* MAC label string copied in from user space */
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error)
		return (error);

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	       UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	/* namei() took iocounts on vp and pvp; both are dropped at 'out' */
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* user_mac layout differs between 32- and 64-bit callers */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error)
			goto out;
		/* label must fit the cap and hold at least one char + NUL */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if SECURE_KERNEL
	if (flags & MNT_UNION) {
		/* No union mounts on release kernels */
		error = EPERM;
		goto out;
	}
#endif

	/* special handling when the target is the root of the root filesystem */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		}
		else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0 ) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	                     labelstr, FALSE, ctx);

out:

#if CONFIG_MACF
	if (labelstr)
		FREE(labelstr, M_MACTEMP);
#endif /* CONFIG_MACF */

	/* drop the iocounts taken by namei(), then release its path buffer */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return (error);
}
485
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (ie it's vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (ie path) of covered vnode
 *  flags	generic mount flags
 *  fsmountargs	file system specific data
 *  internal_flags	KERNEL_MOUNT_* flags for in-kernel callers
 *  labelstr	optional MAC label
 *  kernelmount	TRUE for mounts initiated from inside the kernel
 *  ctx		caller's context
 */
500 static int
501 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
502 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
503 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
504 {
505 #if !CONFIG_MACF
506 #pragma unused(labelstr)
507 #endif
508 struct vnode *devvp = NULLVP;
509 struct vnode *device_vnode = NULLVP;
510 #if CONFIG_MACF
511 struct vnode *rvp;
512 #endif
513 struct mount *mp;
514 struct vfstable *vfsp = (struct vfstable *)0;
515 struct proc *p = vfs_context_proc(ctx);
516 int error, flag = 0;
517 user_addr_t devpath = USER_ADDR_NULL;
518 int ronly = 0;
519 int mntalloc = 0;
520 boolean_t vfsp_ref = FALSE;
521 boolean_t is_rwlock_locked = FALSE;
522 boolean_t did_rele = FALSE;
523 boolean_t have_usecount = FALSE;
524
525 /*
526 * Process an update for an existing mount
527 */
528 if (flags & MNT_UPDATE) {
529 if ((vp->v_flag & VROOT) == 0) {
530 error = EINVAL;
531 goto out1;
532 }
533 mp = vp->v_mount;
534
535 /* unmount in progress return error */
536 mount_lock_spin(mp);
537 if (mp->mnt_lflag & MNT_LUNMOUNT) {
538 mount_unlock(mp);
539 error = EBUSY;
540 goto out1;
541 }
542 mount_unlock(mp);
543 lck_rw_lock_exclusive(&mp->mnt_rwlock);
544 is_rwlock_locked = TRUE;
545 /*
546 * We only allow the filesystem to be reloaded if it
547 * is currently mounted read-only.
548 */
549 if ((flags & MNT_RELOAD) &&
550 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
551 error = ENOTSUP;
552 goto out1;
553 }
554
555 /*
556 * If content protection is enabled, update mounts are not
557 * allowed to turn it off.
558 */
559 if ((mp->mnt_flag & MNT_CPROTECT) &&
560 ((flags & MNT_CPROTECT) == 0)) {
561 error = EINVAL;
562 goto out1;
563 }
564
565 #ifdef CONFIG_IMGSRC_ACCESS
566 /* Can't downgrade the backer of the root FS */
567 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
568 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
569 error = ENOTSUP;
570 goto out1;
571 }
572 #endif /* CONFIG_IMGSRC_ACCESS */
573
574 /*
575 * Only root, or the user that did the original mount is
576 * permitted to update it.
577 */
578 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
579 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
580 goto out1;
581 }
582 #if CONFIG_MACF
583 error = mac_mount_check_remount(ctx, mp);
584 if (error != 0) {
585 goto out1;
586 }
587 #endif
588 /*
589 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
590 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
591 */
592 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
593 flags |= MNT_NOSUID | MNT_NODEV;
594 if (mp->mnt_flag & MNT_NOEXEC)
595 flags |= MNT_NOEXEC;
596 }
597 flag = mp->mnt_flag;
598
599
600
601 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
602
603 vfsp = mp->mnt_vtable;
604 goto update;
605 }
606 /*
607 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
608 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
609 */
610 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
611 flags |= MNT_NOSUID | MNT_NODEV;
612 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
613 flags |= MNT_NOEXEC;
614 }
615
616 /* XXXAUDIT: Should we capture the type on the error path as well? */
617 AUDIT_ARG(text, fstypename);
618 mount_list_lock();
619 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
620 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
621 vfsp->vfc_refcount++;
622 vfsp_ref = TRUE;
623 break;
624 }
625 mount_list_unlock();
626 if (vfsp == NULL) {
627 error = ENODEV;
628 goto out1;
629 }
630
631 /*
632 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
633 */
634 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
635 error = EINVAL; /* unsupported request */
636 goto out1;
637 }
638
639 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
640 if (error != 0) {
641 goto out1;
642 }
643
644 /*
645 * Allocate and initialize the filesystem (mount_t)
646 */
647 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
648 M_MOUNT, M_WAITOK);
649 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
650 mntalloc = 1;
651
652 /* Initialize the default IO constraints */
653 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
654 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
655 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
656 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
657 mp->mnt_devblocksize = DEV_BSIZE;
658 mp->mnt_alignmentmask = PAGE_MASK;
659 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
660 mp->mnt_ioscale = 1;
661 mp->mnt_ioflags = 0;
662 mp->mnt_realrootvp = NULLVP;
663 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
664
665 TAILQ_INIT(&mp->mnt_vnodelist);
666 TAILQ_INIT(&mp->mnt_workerqueue);
667 TAILQ_INIT(&mp->mnt_newvnodes);
668 mount_lock_init(mp);
669 lck_rw_lock_exclusive(&mp->mnt_rwlock);
670 is_rwlock_locked = TRUE;
671 mp->mnt_op = vfsp->vfc_vfsops;
672 mp->mnt_vtable = vfsp;
673 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
674 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
675 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
676 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
677 mp->mnt_vnodecovered = vp;
678 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
679 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
680 mp->mnt_devbsdunit = 0;
681
682 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
683 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
684
685 #if NFSCLIENT || DEVFS || ROUTEFS
686 if (kernelmount)
687 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
688 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
689 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
#endif /* NFSCLIENT || DEVFS || ROUTEFS */
691
692 update:
693 /*
694 * Set the mount level flags.
695 */
696 if (flags & MNT_RDONLY)
697 mp->mnt_flag |= MNT_RDONLY;
698 else if (mp->mnt_flag & MNT_RDONLY) {
699 // disallow read/write upgrades of file systems that
700 // had the TYPENAME_OVERRIDE feature set.
701 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
702 error = EPERM;
703 goto out1;
704 }
705 mp->mnt_kern_flag |= MNTK_WANTRDWR;
706 }
707 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
708 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
709 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
710 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
711 MNT_QUARANTINE | MNT_CPROTECT);
712 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
713 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
714 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
715 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
716 MNT_QUARANTINE | MNT_CPROTECT);
717
718 #if CONFIG_MACF
719 if (flags & MNT_MULTILABEL) {
720 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
721 error = EINVAL;
722 goto out1;
723 }
724 mp->mnt_flag |= MNT_MULTILABEL;
725 }
726 #endif
727 /*
728 * Process device path for local file systems if requested
729 */
730 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
731 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
732 if (vfs_context_is64bit(ctx)) {
733 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
734 goto out1;
735 fsmountargs += sizeof(devpath);
736 } else {
737 user32_addr_t tmp;
738 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
739 goto out1;
740 /* munge into LP64 addr */
741 devpath = CAST_USER_ADDR_T(tmp);
742 fsmountargs += sizeof(tmp);
743 }
744
745 /* Lookup device and authorize access to it */
746 if ((devpath)) {
747 struct nameidata nd;
748
749 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
750 if ( (error = namei(&nd)) )
751 goto out1;
752
753 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
754 devvp = nd.ni_vp;
755
756 nameidone(&nd);
757
758 if (devvp->v_type != VBLK) {
759 error = ENOTBLK;
760 goto out2;
761 }
762 if (major(devvp->v_rdev) >= nblkdev) {
763 error = ENXIO;
764 goto out2;
765 }
766 /*
767 * If mount by non-root, then verify that user has necessary
768 * permissions on the device.
769 */
770 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
771 mode_t accessmode = KAUTH_VNODE_READ_DATA;
772
773 if ((mp->mnt_flag & MNT_RDONLY) == 0)
774 accessmode |= KAUTH_VNODE_WRITE_DATA;
775 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
776 goto out2;
777 }
778 }
779 /* On first mount, preflight and open device */
780 if (devpath && ((flags & MNT_UPDATE) == 0)) {
781 if ( (error = vnode_ref(devvp)) )
782 goto out2;
783 /*
784 * Disallow multiple mounts of the same device.
785 * Disallow mounting of a device that is currently in use
786 * (except for root, which might share swap device for miniroot).
787 * Flush out any old buffers remaining from a previous use.
788 */
789 if ( (error = vfs_mountedon(devvp)) )
790 goto out3;
791
792 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
793 error = EBUSY;
794 goto out3;
795 }
796 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
797 error = ENOTBLK;
798 goto out3;
799 }
800 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
801 goto out3;
802
803 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
804 #if CONFIG_MACF
805 error = mac_vnode_check_open(ctx,
806 devvp,
807 ronly ? FREAD : FREAD|FWRITE);
808 if (error)
809 goto out3;
810 #endif /* MAC */
811 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
812 goto out3;
813
814 mp->mnt_devvp = devvp;
815 device_vnode = devvp;
816
817 } else if ((mp->mnt_flag & MNT_RDONLY) &&
818 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
819 (device_vnode = mp->mnt_devvp)) {
820 dev_t dev;
821 int maj;
822 /*
823 * If upgrade to read-write by non-root, then verify
824 * that user has necessary permissions on the device.
825 */
826 vnode_getalways(device_vnode);
827
828 if (suser(vfs_context_ucred(ctx), NULL) &&
829 (error = vnode_authorize(device_vnode, NULL,
830 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
831 ctx)) != 0) {
832 vnode_put(device_vnode);
833 goto out2;
834 }
835
836 /* Tell the device that we're upgrading */
837 dev = (dev_t)device_vnode->v_rdev;
838 maj = major(dev);
839
840 if ((u_int)maj >= (u_int)nblkdev)
841 panic("Volume mounted on a device with invalid major number.");
842
843 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
844 vnode_put(device_vnode);
845 device_vnode = NULLVP;
846 if (error != 0) {
847 goto out2;
848 }
849 }
850 }
851 #if CONFIG_MACF
852 if ((flags & MNT_UPDATE) == 0) {
853 mac_mount_label_init(mp);
854 mac_mount_label_associate(ctx, mp);
855 }
856 if (labelstr) {
857 if ((flags & MNT_UPDATE) != 0) {
858 error = mac_mount_check_label_update(ctx, mp);
859 if (error != 0)
860 goto out3;
861 }
862 }
863 #endif
864 /*
865 * Mount the filesystem.
866 */
867 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
868 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
869 (caddr_t)fsmountargs, 0, ctx);
870 } else {
871 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
872 }
873
874 if (flags & MNT_UPDATE) {
875 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
876 mp->mnt_flag &= ~MNT_RDONLY;
877 mp->mnt_flag &=~
878 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
879 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
880 if (error)
881 mp->mnt_flag = flag; /* restore flag value */
882 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
883 lck_rw_done(&mp->mnt_rwlock);
884 is_rwlock_locked = FALSE;
885 if (!error)
886 enablequotas(mp, ctx);
887 goto exit;
888 }
889
890 /*
891 * Put the new filesystem on the mount list after root.
892 */
893 if (error == 0) {
894 struct vfs_attr vfsattr;
895 #if CONFIG_MACF
896 if (vfs_flags(mp) & MNT_MULTILABEL) {
897 error = VFS_ROOT(mp, &rvp, ctx);
898 if (error) {
899 printf("%s() VFS_ROOT returned %d\n", __func__, error);
900 goto out3;
901 }
902 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
903 /*
904 * drop reference provided by VFS_ROOT
905 */
906 vnode_put(rvp);
907
908 if (error)
909 goto out3;
910 }
911 #endif /* MAC */
912
913 vnode_lock_spin(vp);
914 CLR(vp->v_flag, VMOUNT);
915 vp->v_mountedhere = mp;
916 vnode_unlock(vp);
917
918 /*
919 * taking the name_cache_lock exclusively will
920 * insure that everyone is out of the fast path who
921 * might be trying to use a now stale copy of
922 * vp->v_mountedhere->mnt_realrootvp
923 * bumping mount_generation causes the cached values
924 * to be invalidated
925 */
926 name_cache_lock();
927 mount_generation++;
928 name_cache_unlock();
929
930 error = vnode_ref(vp);
931 if (error != 0) {
932 goto out4;
933 }
934
935 have_usecount = TRUE;
936
937 error = checkdirs(vp, ctx);
938 if (error != 0) {
939 /* Unmount the filesystem as cdir/rdirs cannot be updated */
940 goto out4;
941 }
942 /*
943 * there is no cleanup code here so I have made it void
944 * we need to revisit this
945 */
946 (void)VFS_START(mp, 0, ctx);
947
948 if (mount_list_add(mp) != 0) {
949 /*
950 * The system is shutting down trying to umount
951 * everything, so fail with a plausible errno.
952 */
953 error = EBUSY;
954 goto out4;
955 }
956 lck_rw_done(&mp->mnt_rwlock);
957 is_rwlock_locked = FALSE;
958
959 /* Check if this mounted file system supports EAs or named streams. */
960 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
961 VFSATTR_INIT(&vfsattr);
962 VFSATTR_WANTED(&vfsattr, f_capabilities);
963 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
964 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
965 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
966 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
967 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
968 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
969 }
970 #if NAMEDSTREAMS
971 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
972 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
973 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
974 }
975 #endif
976 /* Check if this file system supports path from id lookups. */
977 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
978 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
979 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
980 } else if (mp->mnt_flag & MNT_DOVOLFS) {
981 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
982 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
983 }
984
985 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
986 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
987 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
988 }
989 }
990 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
991 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
992 }
993 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
994 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
995 }
996 /* increment the operations count */
997 OSAddAtomic(1, &vfs_nummntops);
998 enablequotas(mp, ctx);
999
1000 if (device_vnode) {
1001 device_vnode->v_specflags |= SI_MOUNTEDON;
1002
1003 /*
1004 * cache the IO attributes for the underlying physical media...
1005 * an error return indicates the underlying driver doesn't
1006 * support all the queries necessary... however, reasonable
1007 * defaults will have been set, so no reason to bail or care
1008 */
1009 vfs_init_io_attributes(device_vnode, mp);
1010 }
1011
1012 /* Now that mount is setup, notify the listeners */
1013 vfs_notify_mount(pvp);
1014 IOBSDMountChange(mp, kIOMountChangeMount);
1015
1016 } else {
1017 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1018 if (mp->mnt_vnodelist.tqh_first != NULL) {
1019 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1020 mp->mnt_vtable->vfc_name, error);
1021 }
1022
1023 vnode_lock_spin(vp);
1024 CLR(vp->v_flag, VMOUNT);
1025 vnode_unlock(vp);
1026 mount_list_lock();
1027 mp->mnt_vtable->vfc_refcount--;
1028 mount_list_unlock();
1029
1030 if (device_vnode ) {
1031 vnode_rele(device_vnode);
1032 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1033 }
1034 lck_rw_done(&mp->mnt_rwlock);
1035 is_rwlock_locked = FALSE;
1036
1037 /*
1038 * if we get here, we have a mount structure that needs to be freed,
1039 * but since the coveredvp hasn't yet been updated to point at it,
1040 * no need to worry about other threads holding a crossref on this mp
1041 * so it's ok to just free it
1042 */
1043 mount_lock_destroy(mp);
1044 #if CONFIG_MACF
1045 mac_mount_label_destroy(mp);
1046 #endif
1047 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1048 }
1049 exit:
1050 /*
1051 * drop I/O count on the device vp if there was one
1052 */
1053 if (devpath && devvp)
1054 vnode_put(devvp);
1055
1056 return(error);
1057
1058 /* Error condition exits */
1059 out4:
1060 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1061
1062 /*
1063 * If the mount has been placed on the covered vp,
1064 * it may have been discovered by now, so we have
1065 * to treat this just like an unmount
1066 */
1067 mount_lock_spin(mp);
1068 mp->mnt_lflag |= MNT_LDEAD;
1069 mount_unlock(mp);
1070
1071 if (device_vnode != NULLVP) {
1072 vnode_rele(device_vnode);
1073 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1074 ctx);
1075 did_rele = TRUE;
1076 }
1077
1078 vnode_lock_spin(vp);
1079
1080 mp->mnt_crossref++;
1081 vp->v_mountedhere = (mount_t) 0;
1082
1083 vnode_unlock(vp);
1084
1085 if (have_usecount) {
1086 vnode_rele(vp);
1087 }
1088 out3:
1089 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1090 vnode_rele(devvp);
1091 out2:
1092 if (devpath && devvp)
1093 vnode_put(devvp);
1094 out1:
1095 /* Release mnt_rwlock only when it was taken */
1096 if (is_rwlock_locked == TRUE) {
1097 lck_rw_done(&mp->mnt_rwlock);
1098 }
1099
1100 if (mntalloc) {
1101 if (mp->mnt_crossref)
1102 mount_dropcrossref(mp, vp, 0);
1103 else {
1104 mount_lock_destroy(mp);
1105 #if CONFIG_MACF
1106 mac_mount_label_destroy(mp);
1107 #endif
1108 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1109 }
1110 }
1111 if (vfsp_ref) {
1112 mount_list_lock();
1113 vfsp->vfc_refcount--;
1114 mount_list_unlock();
1115 }
1116
1117 return(error);
1118 }
1119
1120 /*
1121 * Flush in-core data, check for competing mount attempts,
1122 * and set VMOUNT
1123 */
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT on the covered vnode.
 *
 * On success VMOUNT is set on 'vp'; the caller is responsible for
 * clearing it again if the mount subsequently fails.
 * Errors: EPERM (ownership/MAC denial), ENOTDIR, EBUSY (already
 * covered), or an error from VNOP_FSYNC()/buf_invalidateblks().
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data and drop cached buffers before covering the vnode. */
	if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
		goto out;

	if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
		goto out;

	/* Only directories may be mounted on. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* Reject if a mount already covers this vnode. */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0)
		goto out;
#endif

	/* Mark the vnode as having a mount in progress. */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1178
1179 #if CONFIG_IMGSRC_ACCESS
1180
#if DEBUG
/* Debug builds: route imageboot-relocation tracing to the kernel log. */
#define IMGSRC_DEBUG(args...) printf(args)
#else
/* Release builds: compile the tracing away entirely. */
#define IMGSRC_DEBUG(args...) do { } while(0)
#endif
1186
1187 static int
1188 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1189 {
1190 struct nameidata nd;
1191 vnode_t vp, realdevvp;
1192 mode_t accessmode;
1193 int error;
1194
1195 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1196 if ( (error = namei(&nd)) ) {
1197 IMGSRC_DEBUG("namei() failed with %d\n", error);
1198 return error;
1199 }
1200
1201 vp = nd.ni_vp;
1202
1203 if (!vnode_isblk(vp)) {
1204 IMGSRC_DEBUG("Not block device.\n");
1205 error = ENOTBLK;
1206 goto out;
1207 }
1208
1209 realdevvp = mp->mnt_devvp;
1210 if (realdevvp == NULLVP) {
1211 IMGSRC_DEBUG("No device backs the mount.\n");
1212 error = ENXIO;
1213 goto out;
1214 }
1215
1216 error = vnode_getwithref(realdevvp);
1217 if (error != 0) {
1218 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1219 goto out;
1220 }
1221
1222 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1223 IMGSRC_DEBUG("Wrong dev_t.\n");
1224 error = ENXIO;
1225 goto out1;
1226 }
1227
1228 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1229
1230 /*
1231 * If mount by non-root, then verify that user has necessary
1232 * permissions on the device.
1233 */
1234 if (!vfs_context_issuser(ctx)) {
1235 accessmode = KAUTH_VNODE_READ_DATA;
1236 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1237 accessmode |= KAUTH_VNODE_WRITE_DATA;
1238 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1239 IMGSRC_DEBUG("Access denied.\n");
1240 goto out1;
1241 }
1242 }
1243
1244 *devvpp = vp;
1245
1246 out1:
1247 vnode_put(realdevvp);
1248 out:
1249 nameidone(&nd);
1250 if (error) {
1251 vnode_put(vp);
1252 }
1253
1254 return error;
1255 }
1256
/*
 * Clear VMOUNT, set v_mountedhere and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	/* Publish the mount on the covered vnode; drop the in-progress mark. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the life of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/*
		 * NOTE(review): only mnt_vnodecovered is undone here;
		 * vp->v_mountedhere appears to remain pointing at mp on
		 * this error path -- confirm the caller's cleanup
		 * (undo_place_on_covered_vp or equivalent) covers it.
		 */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1303
/*
 * Reverse place_mount_and_checkdirs(): drop the covered vnode's
 * usecount and detach the mount from it on both sides.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1314
1315 static int
1316 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1317 {
1318 int error;
1319
1320 /* unmount in progress return error */
1321 mount_lock_spin(mp);
1322 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1323 mount_unlock(mp);
1324 return EBUSY;
1325 }
1326 mount_unlock(mp);
1327 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1328
1329 /*
1330 * We only allow the filesystem to be reloaded if it
1331 * is currently mounted read-only.
1332 */
1333 if ((flags & MNT_RELOAD) &&
1334 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1335 error = ENOTSUP;
1336 goto out;
1337 }
1338
1339 /*
1340 * Only root, or the user that did the original mount is
1341 * permitted to update it.
1342 */
1343 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1344 (!vfs_context_issuser(ctx))) {
1345 error = EPERM;
1346 goto out;
1347 }
1348 #if CONFIG_MACF
1349 error = mac_mount_check_remount(ctx, mp);
1350 if (error != 0) {
1351 goto out;
1352 }
1353 #endif
1354
1355 out:
1356 if (error) {
1357 lck_rw_done(&mp->mnt_rwlock);
1358 }
1359
1360 return error;
1361 }
1362
/* Release the exclusive mnt_rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1368
1369 static int
1370 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1371 {
1372 vnode_t vp;
1373
1374 if (height >= MAX_IMAGEBOOT_NESTING) {
1375 return EINVAL;
1376 }
1377
1378 vp = imgsrc_rootvnodes[height];
1379 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1380 *rvpp = vp;
1381 return 0;
1382 } else {
1383 return ENOENT;
1384 }
1385 }
1386
1387 static int
1388 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1389 const char *fsname, vfs_context_t ctx,
1390 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1391 {
1392 int error;
1393 mount_t mp;
1394 boolean_t placed = FALSE;
1395 vnode_t devvp = NULLVP;
1396 struct vfstable *vfsp;
1397 user_addr_t devpath;
1398 char *old_mntonname;
1399 vnode_t rvp;
1400 uint32_t height;
1401 uint32_t flags;
1402
1403 /* If we didn't imageboot, nothing to move */
1404 if (imgsrc_rootvnodes[0] == NULLVP) {
1405 return EINVAL;
1406 }
1407
1408 /* Only root can do this */
1409 if (!vfs_context_issuser(ctx)) {
1410 return EPERM;
1411 }
1412
1413 IMGSRC_DEBUG("looking for root vnode.\n");
1414
1415 /*
1416 * Get root vnode of filesystem we're moving.
1417 */
1418 if (by_index) {
1419 if (is64bit) {
1420 struct user64_mnt_imgsrc_args mia64;
1421 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1422 if (error != 0) {
1423 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1424 return error;
1425 }
1426
1427 height = mia64.mi_height;
1428 flags = mia64.mi_flags;
1429 devpath = mia64.mi_devpath;
1430 } else {
1431 struct user32_mnt_imgsrc_args mia32;
1432 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1433 if (error != 0) {
1434 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1435 return error;
1436 }
1437
1438 height = mia32.mi_height;
1439 flags = mia32.mi_flags;
1440 devpath = mia32.mi_devpath;
1441 }
1442 } else {
1443 /*
1444 * For binary compatibility--assumes one level of nesting.
1445 */
1446 if (is64bit) {
1447 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1448 return error;
1449 } else {
1450 user32_addr_t tmp;
1451 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1452 return error;
1453
1454 /* munge into LP64 addr */
1455 devpath = CAST_USER_ADDR_T(tmp);
1456 }
1457
1458 height = 0;
1459 flags = 0;
1460 }
1461
1462 if (flags != 0) {
1463 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1464 return EINVAL;
1465 }
1466
1467 error = get_imgsrc_rootvnode(height, &rvp);
1468 if (error != 0) {
1469 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1470 return error;
1471 }
1472
1473 IMGSRC_DEBUG("got root vnode.\n");
1474
1475 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1476
1477 /* Can only move once */
1478 mp = vnode_mount(rvp);
1479 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1480 IMGSRC_DEBUG("Already moved.\n");
1481 error = EBUSY;
1482 goto out0;
1483 }
1484
1485 IMGSRC_DEBUG("Starting updated.\n");
1486
1487 /* Get exclusive rwlock on mount, authorize update on mp */
1488 error = mount_begin_update(mp , ctx, 0);
1489 if (error != 0) {
1490 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1491 goto out0;
1492 }
1493
1494 /*
1495 * It can only be moved once. Flag is set under the rwlock,
1496 * so we're now safe to proceed.
1497 */
1498 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1499 IMGSRC_DEBUG("Already moved [2]\n");
1500 goto out1;
1501 }
1502
1503
1504 IMGSRC_DEBUG("Preparing coveredvp.\n");
1505
1506 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1507 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1508 if (error != 0) {
1509 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1510 goto out1;
1511 }
1512
1513 IMGSRC_DEBUG("Covered vp OK.\n");
1514
1515 /* Sanity check the name caller has provided */
1516 vfsp = mp->mnt_vtable;
1517 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1518 IMGSRC_DEBUG("Wrong fs name.\n");
1519 error = EINVAL;
1520 goto out2;
1521 }
1522
1523 /* Check the device vnode and update mount-from name, for local filesystems */
1524 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1525 IMGSRC_DEBUG("Local, doing device validation.\n");
1526
1527 if (devpath != USER_ADDR_NULL) {
1528 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1529 if (error) {
1530 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1531 goto out2;
1532 }
1533
1534 vnode_put(devvp);
1535 }
1536 }
1537
1538 /*
1539 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1540 * and increment the name cache's mount generation
1541 */
1542
1543 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1544 error = place_mount_and_checkdirs(mp, vp, ctx);
1545 if (error != 0) {
1546 goto out2;
1547 }
1548
1549 placed = TRUE;
1550
1551 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1552 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1553
1554 /* Forbid future moves */
1555 mount_lock(mp);
1556 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1557 mount_unlock(mp);
1558
1559 /* Finally, add to mount list, completely ready to go */
1560 if (mount_list_add(mp) != 0) {
1561 /*
1562 * The system is shutting down trying to umount
1563 * everything, so fail with a plausible errno.
1564 */
1565 error = EBUSY;
1566 goto out3;
1567 }
1568
1569 mount_end_update(mp);
1570 vnode_put(rvp);
1571 FREE(old_mntonname, M_TEMP);
1572
1573 vfs_notify_mount(pvp);
1574
1575 return 0;
1576 out3:
1577 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1578
1579 mount_lock(mp);
1580 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1581 mount_unlock(mp);
1582
1583 out2:
1584 /*
1585 * Placing the mp on the vnode clears VMOUNT,
1586 * so cleanup is different after that point
1587 */
1588 if (placed) {
1589 /* Rele the vp, clear VMOUNT and v_mountedhere */
1590 undo_place_on_covered_vp(mp, vp);
1591 } else {
1592 vnode_lock_spin(vp);
1593 CLR(vp->v_flag, VMOUNT);
1594 vnode_unlock(vp);
1595 }
1596 out1:
1597 mount_end_update(mp);
1598
1599 out0:
1600 vnode_put(rvp);
1601 FREE(old_mntonname, M_TEMP);
1602 return error;
1603 }
1604
1605 #endif /* CONFIG_IMGSRC_ACCESS */
1606
/*
 * Enable disk quotas on 'mp' for each quota type whose trigger file
 * exists in the mount's root directory.  HFS-only; errors are ignored
 * so quota setup never interferes with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type=0; type < MAXQUOTAS; type++) {
		/* Look for the per-type trigger ("ops") file; skip type if absent. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		       CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0)
			continue; 	    /* option file to trigger quotas is not present */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file present: turn quotas on using the actual quota file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
1639
1640
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is 'olddp' (the vnode just covered by a new mount),
 * swap it for 'newdp' (the new mount's root), transferring the
 * usecount accordingly.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs * )arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t tvp;
	vnode_t fdp_cvp;
	vnode_t fdp_rvp;
	int cdir_changed = 0;
	int rdir_changed = 0;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/* Snapshot cdir/rdir under the fd lock. */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == (struct filedesc *)0) {
		proc_fdunlock(p);
		return(PROC_RETURNED);
	}
	fdp_cvp = fdp->fd_cdir;
	fdp_rvp = fdp->fd_rdir;
	proc_fdunlock(p);

	/*
	 * NOTE(review): tvp below re-reads fdp->fd_cdir / fd_rdir after
	 * proc_fdunlock(), outside the lock -- presumably safe because
	 * the covered vnode cannot go away mid-mount, but confirm this
	 * cannot race a concurrent chdir()/chroot().
	 */
	if (fdp_cvp == olddp) {
		vnode_ref(newdp);
		tvp = fdp->fd_cdir;
		fdp_cvp = newdp;
		cdir_changed = 1;
		vnode_rele(tvp);
	}
	if (fdp_rvp == olddp) {
		vnode_ref(newdp);
		tvp = fdp->fd_rdir;
		fdp_rvp = newdp;
		rdir_changed = 1;
		vnode_rele(tvp);
	}
	/* Publish the replacements back under the fd lock. */
	if (cdir_changed || rdir_changed) {
		proc_fdlock(p);
		fdp->fd_cdir = fdp_cvp;
		fdp->fd_rdir = fdp_rvp;
		proc_fdunlock(p);
	}
	return(PROC_RETURNED);
}
1692
1693
1694
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only the mount's own reference exists: no process can be using olddp. */
	if (olddp->v_usecount == 1)
		return(0);
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return(err);
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* The system root itself may have been mounted over. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return(0);
}
1734
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return (error);
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return (EINVAL);
	}
	/* Trade our iocount on vp for a mount ref before handing off. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return (safedounmount(mp, uap->flags, ctx));
}
1779
/*
 * Unmount the filesystem identified by 'fsid'.
 * Returns ENOENT if no mounted filesystem matches.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return(ENOENT);
	}
	/* Swap the iteration reference for a mount ref. */
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return(safedounmount(mp, flags, ctx));
}
1794
1795
1796 /*
1797 * The mount struct comes with a mount ref which will be consumed.
1798 * Do the actual file system unmount, prevent some common foot shooting.
1799 */
1800 int
1801 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1802 {
1803 int error;
1804 proc_t p = vfs_context_proc(ctx);
1805
1806 /*
1807 * If the file system is not responding and MNT_NOBLOCK
1808 * is set and not a forced unmount then return EBUSY.
1809 */
1810 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1811 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1812 error = EBUSY;
1813 goto out;
1814 }
1815
1816 /*
1817 * Skip authorization if the mount is tagged as permissive and
1818 * this is not a forced-unmount attempt.
1819 */
1820 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1821 /*
1822 * Only root, or the user that did the original mount is
1823 * permitted to unmount this filesystem.
1824 */
1825 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1826 (error = suser(kauth_cred_get(), &p->p_acflag)))
1827 goto out;
1828 }
1829 /*
1830 * Don't allow unmounting the root file system.
1831 */
1832 if (mp->mnt_flag & MNT_ROOTFS) {
1833 error = EBUSY; /* the root is always busy */
1834 goto out;
1835 }
1836
1837 #ifdef CONFIG_IMGSRC_ACCESS
1838 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1839 error = EBUSY;
1840 goto out;
1841 }
1842 #endif /* CONFIG_IMGSRC_ACCESS */
1843
1844 return (dounmount(mp, flags, 1, ctx));
1845
1846 out:
1847 mount_drop(mp, 0);
1848 return(error);
1849 }
1850
/*
 * Do the actual file system unmount.
 *
 * 'withref' non-zero means the caller passed in a mount ref which is
 * consumed here.  Returns EBUSY if an unmount is already in progress,
 * or the error from VFS_SYNC/vflush/VFS_UNMOUNT; on error the
 * MNTK_UNMOUNT / MNT_LUNMOUNT / MNT_LFORCE state is rolled back.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (withref != 0)
			mount_drop(mp, 1);
		mount_unlock(mp);
		return (EBUSY);
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep this process from hanging on unresponsive remote filesystems. */
	if (flags & MNT_NOBLOCK && p != kernproc)
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &=~ MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0)
		mount_drop(mp, 0);
#if CONFIG_FSE
	fsevent_unmount(mp);  /* has to come first! */
#endif
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp);	/* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: roll back the unmount-in-progress state. */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount)
		lflags |= FORCECLOSE;
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain: roll back and report the error. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error)
		OSAddAtomic(1, &vfs_nummntops);

	if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
		       ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
	        /*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);
out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0)
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup)
		wakeup((caddr_t)mp);

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* Let watchers of the parent directory see the change. */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
		} else
			panic("dounmount: no coveredvp");
	}
	return (error);
}
2123
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of every mount nested (transitively) under 'mp',
 * then unmounts them in reverse (deepest-first) order, ignoring
 * individual failures.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t	smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
		count++;
	fsids_sz = count * sizeof(fsid_t);
	/* M_NOWAIT: we hold mount_list_lock and must not block here. */
	MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;	// Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL)
			continue;
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;	// Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			/* dounmount consumes the mount ref taken above. */
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	if (fsids)
		FREE(fsids, M_TEMP);
}
2184
/*
 * Drop one crossref on 'mp' taken against covered vnode 'dp'.
 * When the last crossref goes away and the mount is no longer attached
 * to dp, the mount structure itself is destroyed and freed.
 * 'need_put' additionally drops an iocount on dp (while still locked).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0)
		panic("mount cross refs -ve");

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {

		if (need_put)
			vnode_put_locked(dp);
		vnode_unlock(dp);

		/* Last reference and detached: tear the mount down. */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
		return;
	}
	if (need_put)
		vnode_put_locked(dp);
	vnode_unlock(dp);
}
2211
2212
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;		/* when set, dump vfs_bufstats() after each sync */
#endif

int print_vmpage_stat=0;	/* when set, dump dirty-page counts after each sync */
int sync_timeout = 60;  // Sync time limit (sec)
2222
2223 static int
2224 sync_callback(mount_t mp, __unused void *arg)
2225 {
2226 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2227 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2228
2229 mp->mnt_flag &= ~MNT_ASYNC;
2230 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2231 if (asyncflag)
2232 mp->mnt_flag |= MNT_ASYNC;
2233 }
2234
2235 return (VFS_RETURNED);
2236 }
2237
2238 /* ARGSUSED */
2239 int
2240 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2241 {
2242 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2243
2244 if (print_vmpage_stat) {
2245 vm_countdirtypages();
2246 }
2247
2248 #if DIAGNOSTIC
2249 if (syncprt)
2250 vfs_bufstats();
2251 #endif /* DIAGNOSTIC */
2252 return 0;
2253 }
2254
/*
 * Body of the timed-sync helper thread started by sync_async(): flush all
 * mounts, then wake the waiter (if any) sleeping on the timeout address.
 */
static void
sync_thread(void *arg, __unused wait_result_t wr)
{
	int *timeout = (int *) arg;	/* wakeup channel shared with sync_async() */

	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	/* Wake the sync_async() waiter before any optional reporting. */
	if (timeout)
		wakeup((caddr_t) timeout);
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
}
2273
2274 /*
2275 * Sync in a separate thread so we can time out if it blocks.
2276 */
static int
sync_async(int timeout)
{
	thread_t thd;
	int error;
	struct timespec ts = {timeout, 0};	/* overall time limit, seconds */

	lck_mtx_lock(sync_mtx_lck);
	/* The thread receives &timeout both as its argument and as the wakeup channel. */
	if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
		printf("sync_thread failed\n");
		lck_mtx_unlock(sync_mtx_lck);
		return (0);
	}

	/* PDROP releases sync_mtx_lck; PCATCH allows signal interruption. */
	error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		/*
		 * NOTE(review): on timeout, sync_thread may still be running and
		 * will later wakeup() on &timeout, which is this function's stack
		 * slot — apparently benign (spurious wakeup on a dead channel),
		 * but worth confirming against later xnu revisions.
		 */
		printf("sync timed out: %d sec\n", timeout);
	}
	thread_deallocate(thd);

	/* Always reports success; a timeout is logged but not propagated. */
	return (0);
}
2299
2300 /*
2301 * An in-kernel sync for power management to call.
2302 */
__private_extern__ int
sync_internal(void)
{
	/* Bounded sync: give up waiting after sync_timeout seconds. */
	(void) sync_async(sync_timeout);

	return 0;
} /* end of sync_internal call */
2310
2311 /*
2312 * Change filesystem quotas.
2313 */
2314 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status;
	caddr_t datap;		/* kernel-side buffer handed to VFS_QUOTACTL */
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk;

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	       uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	/* Only the mount is needed; drop the vnode iocount right away. */
	mp = nd.ni_vp->v_mount;
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = kalloc(MAXPATHLEN);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge to the kernel layout. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		}
		else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* error is still 0 from namei unless a copyin above failed. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-processing: free Q_QUOTAON's path buffer, copy results out. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL)
			kfree(datap, MAXPATHLEN);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64 = {.dqb_bhardlimit = 0};
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
			}
			else {
				error = copyout(datap, uap->arg, sizeof (struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	return (error);
}
2409 #else
/* Quota support compiled out: the syscall is present but unsupported. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return (EOPNOTSUPP);
}
2415 #endif /* QUOTA */
2416
2417 /*
2418 * Get filesystem statistics.
2419 *
2420 * Returns: 0 Success
2421 * namei:???
2422 * vfs_update_vfsstat:???
2423 * munge_statfs:EFAULT
2424 */
2425 /* ARGSUSED */
2426 int
2427 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2428 {
2429 struct mount *mp;
2430 struct vfsstatfs *sp;
2431 int error;
2432 struct nameidata nd;
2433 vfs_context_t ctx = vfs_context_current();
2434 vnode_t vp;
2435
2436 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2437 UIO_USERSPACE, uap->path, ctx);
2438 error = namei(&nd);
2439 if (error != 0)
2440 return (error);
2441 vp = nd.ni_vp;
2442 mp = vp->v_mount;
2443 sp = &mp->mnt_vfsstat;
2444 nameidone(&nd);
2445
2446 #if CONFIG_MACF
2447 error = mac_mount_check_stat(ctx, mp);
2448 if (error != 0)
2449 return (error);
2450 #endif
2451
2452 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2453 if (error != 0) {
2454 vnode_put(vp);
2455 return (error);
2456 }
2457
2458 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2459 vnode_put(vp);
2460 return (error);
2461 }
2462
2463 /*
2464 * Get filesystem statistics.
2465 */
2466 /* ARGSUSED */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the fd to a vnode, then take an iocount on it. */
	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return (error);
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* Vnode with no mount (e.g. dead) — treat as a bad descriptor. */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0)
		goto out;
#endif

	/* Refresh the cached statistics before copying them out. */
	sp = &mp->mnt_vfsstat;
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	/* Release the fd reference and the vnode iocount on all paths. */
	file_drop(uap->fd);
	vnode_put(vp);

	return (error);
}
2513
2514 /*
2515 * Common routine to handle copying of statfs64 data to user space
2516 */
2517 static int
2518 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2519 {
2520 int error;
2521 struct statfs64 sfs;
2522
2523 bzero(&sfs, sizeof(sfs));
2524
2525 sfs.f_bsize = sfsp->f_bsize;
2526 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2527 sfs.f_blocks = sfsp->f_blocks;
2528 sfs.f_bfree = sfsp->f_bfree;
2529 sfs.f_bavail = sfsp->f_bavail;
2530 sfs.f_files = sfsp->f_files;
2531 sfs.f_ffree = sfsp->f_ffree;
2532 sfs.f_fsid = sfsp->f_fsid;
2533 sfs.f_owner = sfsp->f_owner;
2534 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2535 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2536 sfs.f_fssubtype = sfsp->f_fssubtype;
2537 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2538 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2539 } else {
2540 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2541 }
2542 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2543 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2544
2545 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2546
2547 return(error);
2548 }
2549
2550 /*
2551 * Get file system statistics in 64-bit mode
2552 */
2553 int
2554 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2555 {
2556 struct mount *mp;
2557 struct vfsstatfs *sp;
2558 int error;
2559 struct nameidata nd;
2560 vfs_context_t ctxp = vfs_context_current();
2561 vnode_t vp;
2562
2563 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2564 UIO_USERSPACE, uap->path, ctxp);
2565 error = namei(&nd);
2566 if (error != 0)
2567 return (error);
2568 vp = nd.ni_vp;
2569 mp = vp->v_mount;
2570 sp = &mp->mnt_vfsstat;
2571 nameidone(&nd);
2572
2573 #if CONFIG_MACF
2574 error = mac_mount_check_stat(ctxp, mp);
2575 if (error != 0)
2576 return (error);
2577 #endif
2578
2579 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2580 if (error != 0) {
2581 vnode_put(vp);
2582 return (error);
2583 }
2584
2585 error = statfs64_common(mp, sp, uap->buf);
2586 vnode_put(vp);
2587
2588 return (error);
2589 }
2590
2591 /*
2592 * Get file system statistics in 64-bit mode
2593 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the fd to a vnode, then take an iocount on it. */
	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return (error);
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* Vnode with no mount (e.g. dead) — treat as a bad descriptor. */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0)
		goto out;
#endif

	/* Refresh the cached statistics before copying them out. */
	sp = &mp->mnt_vfsstat;
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = statfs64_common(mp, sp, uap->buf);

out:
	/* Release the fd reference and the vnode iocount on all paths. */
	file_drop(uap->fd);
	vnode_put(vp);

	return (error);
}
2640
/* Shared iteration state for the getfsstat family of callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;	/* user buffer cursor; advanced per entry written */
	user_addr_t *mp;	/* optional array of user MAC-label buffers (or NULL) */
	int count;		/* mounts visited so far */
	int maxcount;		/* capacity of the user buffer, in entries */
	int flags;		/* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;		/* first error hit by the callback, if any */
};
2649
2650
/*
 * Per-mount worker for getfsstat(2)/__mac_getfsstat(2): copy one mount's
 * statistics (and optionally its MAC label) into the user buffer tracked
 * by the getfsstat_struct in arg.  Counts every mount even when the
 * buffer is full, so the caller can report the total.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{

	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
			(error = vfs_update_vfsstat(mp, ctx,
			    VFS_USER_EVENT))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip this mount but keep iterating. */
			return(VFS_RETURNED);
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
		/* my_size reflects the 32/64-bit entry size actually written. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return(VFS_RETURNED_DONE);
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return(VFS_RETURNED);
}
2704
2705 /*
2706 * Get statistics on all filesystems.
2707 */
2708 int
2709 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2710 {
2711 struct __mac_getfsstat_args muap;
2712
2713 muap.buf = uap->buf;
2714 muap.bufsize = uap->bufsize;
2715 muap.mac = USER_ADDR_NULL;
2716 muap.macsize = 0;
2717 muap.flags = uap->flags;
2718
2719 return (__mac_getfsstat(p, &muap, retval));
2720 }
2721
2722 /*
2723 * __mac_getfsstat: Get MAC-related file system statistics
2724 *
2725 * Parameters: p (ignored)
2726 * uap User argument descriptor (see below)
2727 * retval Count of file system statistics (N stats)
2728 *
2729 * Indirect: uap->bufsize Buffer size
2730 * uap->macsize MAC info size
2731 * uap->buf Buffer where information will be returned
2732 * uap->mac MAC info
2733 * uap->flags File system flags
2734 *
2735 *
2736 * Returns: 0 Success
2737 * !0 Not success
2738 *
2739 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;	/* normalized array of user label addresses, or NULL */
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size depends on the caller's pointer width. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	}
	else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The label array must have exactly one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount)
			return (EINVAL);

		/* Copy in the array */
		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
		if (mp0 == NULL) {
			return (ENOMEM);
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			FREE(mp0, M_MACTEMP);
			return (error);
		}

		/* Normalize to an array of user_addr_t */
		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
		if (mp == NULL) {
			FREE(mp0, M_MACTEMP);
			return (ENOMEM);
		}

		/* Widen 32-bit user pointers; copy 64-bit ones through. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p))
				mp[i] = ((user_addr_t *)mp0)[i];
			else
				mp[i] = (user_addr_t)mp0[i];
		}
		FREE(mp0, M_MACTEMP);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;


	vfs_iterate(0, getfsstat_callback, &fst);

	if (mp)
		FREE(mp, M_MACTEMP);

	if (fst.error ) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return(fst.error);
	}

	/*
	 * Report the number of entries written, or the total number of
	 * mounts when more exist than fit in the caller's buffer.
	 */
	if (fst.sfsp && fst.count > fst.maxcount)
		*retval = fst.maxcount;
	else
		*retval = fst.count;
	return (0);
}
2826
/*
 * Per-mount worker for getfsstat64(2): copy one mount's statistics in the
 * fixed 64-bit layout into the user buffer tracked by arg.  Counts every
 * mount even when the buffer is full, so the caller can report the total.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if (((fstp->flags & MNT_NOWAIT) == 0 ||
		     (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip this mount but keep iterating. */
			return(VFS_RETURNED);
		}

		error = statfs64_common(mp, sp, fstp->sfsp);
		if (error) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
		fstp->sfsp += sizeof(struct statfs64);
	}
	fstp->count++;
	return(VFS_RETURNED);
}
2868
2869 /*
2870 * Get statistics on all file systems in 64 bit mode.
2871 */
2872 int
2873 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2874 {
2875 user_addr_t sfsp;
2876 int count, maxcount;
2877 struct getfsstat_struct fst;
2878
2879 maxcount = uap->bufsize / sizeof(struct statfs64);
2880
2881 sfsp = uap->buf;
2882 count = 0;
2883
2884 fst.sfsp = sfsp;
2885 fst.flags = uap->flags;
2886 fst.count = 0;
2887 fst.error = 0;
2888 fst.maxcount = maxcount;
2889
2890 vfs_iterate(0, getfsstat64_callback, &fst);
2891
2892 if (fst.error ) {
2893 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2894 return(fst.error);
2895 }
2896
2897 if (fst.sfsp && fst.count > fst.maxcount)
2898 *retval = fst.maxcount;
2899 else
2900 *retval = fst.count;
2901
2902 return (0);
2903 }
2904
2905 /*
2906 * gets the associated vnode with the file descriptor passed.
2907 * as input
2908 *
2909 * INPUT
2910 * ctx - vfs context of caller
2911 * fd - file descriptor for which vnode is required.
2912 * vpp - Pointer to pointer to vnode to be returned.
2913 *
2914 * The vnode is returned with an iocount so any vnode obtained
2915 * by this call needs a vnode_put
2916 *
2917 */
2918 int
2919 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2920 {
2921 int error;
2922 vnode_t vp;
2923 struct fileproc *fp;
2924 proc_t p = vfs_context_proc(ctx);
2925
2926 *vpp = NULLVP;
2927
2928 error = fp_getfvp(p, fd, &fp, &vp);
2929 if (error)
2930 return (error);
2931
2932 error = vnode_getwithref(vp);
2933 if (error) {
2934 (void)fp_drop(p, fd, fp, 0);
2935 return (error);
2936 }
2937
2938 (void)fp_drop(p, fd, fp, 0);
2939 *vpp = vp;
2940 return (error);
2941 }
2942
2943 /*
2944 * Wrapper function around namei to start lookup from a directory
2945 * specified by a file descriptor ni_dirfd.
2946 *
2947 * In addition to all the errors returned by namei, this call can
2948 * return ENOTDIR if the file descriptor does not refer to a directory.
2949 * and EBADF if the file descriptor is not valid.
2950 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only use dirfd for a fresh lookup of a relative path; skip it for
	 * AT_FDCWD, continued lookups, and callers that already supplied a
	 * starting directory via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error)
				return (error);
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			                        &dvp_at);
			if (error)
				return (error);

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return (ENOTDIR);
			}

			/* Start the lookup at dirfd's vnode; USEDVP is scoped to this call. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return (error);
		}
	}

	/* Absolute path or no usable dirfd: plain namei. */
	return (namei(ndp));
}
2992
2993 /*
2994 * Change current working directory to a given file descriptor.
2995 */
2996 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;	/* root of a covering mount during traversal */
	vnode_t tvp;	/* previous cwd, released at the end */
	struct mount *mp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return (0);
			}
		}
		return (EBADF);
	}

	/* Resolve the fd to a vnode and take an iocount. */
	if ( (error = file_vnode(uap->fd, &vp)) )
		return(error);
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error)
		goto out;
#endif
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error)
		goto out;

	/*
	 * If the directory is a mount point, descend through the chain of
	 * covering mounts to the root of the topmost mounted filesystem.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error)
			break;
		vnode_put(vp);
		vp = tdp;
	}
	if (error)
		goto out;
	/* Convert the iocount to a long-lived usecount for the cwd. */
	if ( (error = vnode_ref(vp)) )
		goto out;
	vnode_put(vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/*
			 * NOTE(review): this path returns without file_drop();
			 * vfs_context_thread() of the current context should never
			 * be NULL here, so it appears unreachable — confirm.
			 */
			vnode_rele(vp);
			return (ENOENT);
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
	}

	/* Drop the usecount on the previous cwd, if there was one. */
	if (tvp)
		vnode_rele(tvp);
	file_drop(uap->fd);

	return (0);
out:
	vnode_put(vp);
	file_drop(uap->fd);

	return(error);
}
3099
/* fchdir(2): change the per-process working directory to an open fd. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
3105
/* Per-thread variant of fchdir; fd == -1 reverts to the process cwd. */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
3111
3112 /*
3113 * Change current working directory (".").
3114 *
3115 * Returns: 0 Success
3116 * change_dir:ENOTDIR
3117 * change_dir:???
3118 * vnode_ref:ENOENT No such file or directory
3119 */
3120 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;	/* previous cwd, released at the end */
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
		UIO_USERSPACE, uap->path, ctx);
	/* change_dir returns nd.ni_vp with an iocount on success. */
	error = change_dir(&nd, ctx);
	if (error)
		return (error);
	/* Take a long-lived usecount for the cwd before dropping the iocount. */
	if ( (error = vnode_ref(nd.ni_vp)) ) {
		vnode_put(nd.ni_vp);
		return (error);
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(nd.ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = nd.ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(nd.ni_vp);
			return (ENOENT);
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = nd.ni_vp;
		proc_fdunlock(p);
	}

	/* Drop the usecount on the previous cwd, if there was one. */
	if (tvp)
		vnode_rele(tvp);

	return (0);
}
3167
3168
3169 /*
3170 * chdir
3171 *
3172 * Change current working directory (".") for the entire process
3173 *
3174 * Parameters: p Process requesting the call
3175 * uap User argument descriptor (see below)
3176 * retval (ignored)
3177 *
3178 * Indirect parameters: uap->path Directory path
3179 *
3180 * Returns: 0 Success
3181 * common_chdir: ENOTDIR
3182 * common_chdir: ENOENT No such file or directory
3183 * common_chdir: ???
3184 *
3185 */
/* chdir(2): see the block comment above for the full contract. */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	return common_chdir(p, (void *)uap, 0);
}
3191
3192 /*
3193 * __pthread_chdir
3194 *
3195 * Change current working directory (".") for a single thread
3196 *
3197 * Parameters: p Process requesting the call
3198 * uap User argument descriptor (see below)
3199 * retval (ignored)
3200 *
3201 * Indirect parameters: uap->path Directory path
3202 *
3203 * Returns: 0 Success
3204 * common_chdir: ENOTDIR
3205 * common_chdir: ENOENT No such file or directory
3206 * common_chdir: ???
3207 *
3208 */
/* Per-thread chdir: see the block comment above for the full contract. */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	return common_chdir(p, (void *)uap, 1);
}
3214
3215
3216 /*
3217 * Change notion of root (``/'') directory.
3218 */
3219 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;	/* previous root, released at the end */
	vfs_context_t ctx = vfs_context_current();

	/* chroot requires superuser privilege. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag)))
		return (error);

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
		UIO_USERSPACE, uap->path, ctx);
	/* change_dir returns nd.ni_vp with an iocount on success. */
	error = change_dir(&nd, ctx);
	if (error)
		return (error);

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return (error);
	}
#endif

	/* Take a long-lived usecount for the root before dropping the iocount. */
	if ( (error = vnode_ref(nd.ni_vp)) ) {
		vnode_put(nd.ni_vp);
		return (error);
	}
	vnode_put(nd.ni_vp);

	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);

	if (tvp != NULL)
		vnode_rele(tvp);

	return (0);
}
3264
3265 /*
3266 * Common routine for chroot and chdir.
3267 *
3268 * Returns: 0 Success
3269 * ENOTDIR Not a directory
3270 * namei:??? [anything namei can return]
3271 * vnode_authorize:??? [anything vnode_authorize can return]
3272 */
3273 static int
3274 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3275 {
3276 vnode_t vp;
3277 int error;
3278
3279 if ((error = namei(ndp)))
3280 return (error);
3281 nameidone(ndp);
3282 vp = ndp->ni_vp;
3283
3284 if (vp->v_type != VDIR) {
3285 vnode_put(vp);
3286 return (ENOTDIR);
3287 }
3288
3289 #if CONFIG_MACF
3290 error = mac_vnode_check_chdir(ctx, vp);
3291 if (error) {
3292 vnode_put(vp);
3293 return (error);
3294 }
3295 #endif
3296
3297 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3298 if (error) {
3299 vnode_put(vp);
3300 return (error);
3301 }
3302
3303 return (error);
3304 }
3305
3306 /*
3307 * Free the vnode data (for directories) associated with the file glob.
3308 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data; M_WAITOK | M_ZERO blocks until it succeeds. */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	       M_FD_VN_DATA, M_WAITOK | M_ZERO);
	/* fv_lock serializes directory-read state hung off this fd. */
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3320
3321 /*
3322 * Free the vnode data (for directories) associated with the file glob.
3323 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the directory read-ahead buffer, if one was allocated. */
	if (fvdata->fv_buf)
		FREE(fvdata->fv_buf, M_FD_DIRBUF);
	lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
	FREE(fvdata, M_FD_VN_DATA);
}
3334
3335 /*
3336 * Check permissions, allocate an open file structure,
3337 * and call the device open routine if any.
3338 *
3339 * Returns: 0 Success
3340 * EINVAL
3341 * EINTR
3342 * falloc:ENFILE
3343 * falloc:EMFILE
3344 * falloc:ENOMEM
3345 * vn_open_auth:???
3346 * dupfdopen:???
3347 * VNOP_ADVLOCK:???
3348 * vnode_setsize:???
3349 *
3350 * XXX Need to implement uid, gid
3351 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* O_RDWR|O_WRONLY together is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE)
		return(EINVAL);

	/* Convert open(2) flags to kernel fflags; strip encryption hints. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return (error);
	}
	/* Encode the reserved index for the fdopen() back-channel below. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		/*
		 * /dev/fd "fdopen" support: a driver open may redirect to an
		 * existing descriptor via uu_dupfd instead of a new vnode.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){	/* XXX from fdopen */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return (0);
			}
		}
		if (error == ERESTART)
			error = EINTR;
		fp_free(p, indx, fp);
		return (error);
	}
	uu->uu_dupfd = 0;
	/* vn_open_auth returned vp with an iocount held. */
	vp = ndp->ni_vp;

	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	/* Apply an advisory whole-file lock when O_EXLOCK/O_SHLOCK requested. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0)
			type |= F_WAIT;
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error)
			goto bad;
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
			goto bad;
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
		goto bad;

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	/* Drop the iocount; fg_data retains the long-lived reference. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	/* NOTE(review): vp is consulted after vnode_put above — confirm this is safe here. */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: set close-on-exec/fork flags, then release the slot. */
	proc_fdlock(p);
	if (flags & O_CLOEXEC)
		*fdflags(p, indx) |= UF_EXCLOSE;
	if (flags & O_CLOFORK)
		*fdflags(p, indx) |= UF_FORKCLOSE;
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages may live in the secluded pool,
	 * based on writability and heuristics about the path/name.
	 */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->f_fglob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
								 FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				copyinstr(ndp->ni_dirp,
					  pathname,
					  sizeof (pathname),
					  &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
					pathname,
					sizeof (pathname),
					&copied);
			}
			pathname[sizeof (pathname) - 1] = '\0';
			if (strncmp(pathname,
				    "/Applications/",
				    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
				    "/Applications/Camera.app/",
				    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
									 TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			/* not implemented... */
			if (!strncmp(vp->v_name,
				     DYLD_SHARED_CACHE_NAME,
				     strlen(DYLD_SHARED_CACHE_NAME)) ||
			    !strncmp(vp->v_name,
				     "dyld",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "launchd",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "Camera",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "mediaserverd",
				     strlen(vp->v_name))) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
									 FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return (0);
bad:
	/* Error cleanup: undo the advisory lock (if set), close, free the slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return (error);
}
3561
/*
 * While most of the *at syscall handlers can call nameiat() which
 * is a wrapper around namei, the use of namei and initialisation
 * of nameidata are far removed and in different functions  - namei
 * gets called in vn_open_auth for open1. So we'll just do here what
 * nameiat() does.
 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/*
		 * Peek at the first byte of the path: only relative paths
		 * are interpreted against dirfd; absolute paths fall
		 * through to the plain open1() call below.
		 */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error)
				return (error);
		} else {
			/*
			 * NOTE(review): kernel-space paths are dereferenced
			 * directly here, while other call sites use
			 * CAST_DOWN on ni_dirp -- confirm these are
			 * equivalent for all kernel callers.
			 */
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Resolve dirfd to the vnode used as lookup base. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error)
				return (error);

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return (ENOTDIR);
			}

			/* Hand the starting directory to open1 via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return (error);
		}
	}

	return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
}
3610
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return value from the system call
 *
 * Indirect:	uap->path		Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
 *		uap->uid		UID to set, if creating
 *		uap->gid		GID to set, if creating
 *		uap->mode		File mode, if creating (same as 'open')
 *		uap->xsecurity		ACL to set, if creating
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *		in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller's filesec (ACL) blob, if one was supplied. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
		return ciferror;

	/* Creation mode honors the process umask; sticky bit is stripped. */
	VATTR_INIT(&va);
	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);
	if (xsecdst != NULL)
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	       uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	if (xsecdst != NULL)
		kauth_filesec_free(xsecdst);

	return ciferror;
}
3671
3672 /*
3673 * Go through the data-protected atomically controlled open (2)
3674 *
3675 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3676 */
3677 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3678 int flags = uap->flags;
3679 int class = uap->class;
3680 int dpflags = uap->dpflags;
3681
3682 /*
3683 * Follow the same path as normal open(2)
3684 * Look up the item if it exists, and acquire the vnode.
3685 */
3686 struct filedesc *fdp = p->p_fd;
3687 struct vnode_attr va;
3688 struct nameidata nd;
3689 int cmode;
3690 int error;
3691
3692 VATTR_INIT(&va);
3693 /* Mask off all but regular access permissions */
3694 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3695 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3696
3697 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3698 uap->path, vfs_context_current());
3699
3700 /*
3701 * Initialize the extra fields in vnode_attr to pass down our
3702 * extra fields.
3703 * 1. target cprotect class.
3704 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3705 */
3706 if (flags & O_CREAT) {
3707 /* lower level kernel code validates that the class is valid before applying it. */
3708 if (class != PROTECTION_CLASS_DEFAULT) {
3709 /*
3710 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3711 * file behave the same as open (2)
3712 */
3713 VATTR_SET(&va, va_dataprotect_class, class);
3714 }
3715 }
3716
3717 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3718 if ( flags & (O_RDWR | O_WRONLY)) {
3719 /* Not allowed to write raw encrypted bytes */
3720 return EINVAL;
3721 }
3722 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3723 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3724 }
3725 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3726 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3727 }
3728 }
3729
3730 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3731 fileproc_alloc_init, NULL, retval);
3732
3733 return error;
3734 }
3735
/*
 * Common implementation behind open(2)/openat(2): builds the creation
 * vnode_attr (mode masked by the process umask, sticky bit stripped)
 * and the nameidata, then defers to open1at() so relative paths can be
 * resolved against 'fd'.
 */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	VATTR_INIT(&va);
	/* Mask off all but regular access permissions */
	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
	    retval, fd));
}
3756
3757 int
3758 open(proc_t p, struct open_args *uap, int32_t *retval)
3759 {
3760 __pthread_testcancel(1);
3761 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3762 }
3763
3764 int
3765 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3766 int32_t *retval)
3767 {
3768 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3769 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3770 }
3771
3772 int
3773 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3774 int32_t *retval)
3775 {
3776 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3777 uap->mode, uap->fd, UIO_USERSPACE, retval));
3778 }
3779
3780 int
3781 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3782 {
3783 __pthread_testcancel(1);
3784 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3785 }
3786
/*
 * openbyid_np: open a file given a file system id and a file system object id
 *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
 *	for file systems that don't support object ids it is a node id (uint64_t).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			Pointer to an area to receive the
 *					return value from the system call
 *
 * Indirect:	uap->path		Path to open (same as 'open')
 *
 *		uap->fsid		id of target file system
 *		uap->objid		id of target file system object
 *		uap->flags		Flags to open (same as 'open')
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *		in the code they originated.
 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Open-by-id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return (error);
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return (error);
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return (error);
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from fsid/objid, growing the buffer by MAXPATHLEN
	 * each time fsgetpath_internal() reports ENOSPC.
	 * NOTE(review): the retry loop has no upper bound on buflen --
	 * confirm fsgetpath_internal() bounds path lengths so buflen
	 * cannot grow (or overflow as an int) indefinitely.
	 */
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return (ENOMEM);
		}

		error = fsgetpath_internal(
			ctx, fsid.val[0], objid,
			buflen, buf, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* Re-open by the resolved path; buf is a kernel-space string. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
3867
3868
3869 /*
3870 * Create a special file.
3871 */
3872 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3873
/*
 * mknod(2): create a character or block special file.  A request for a
 * FIFO (S_IFIFO) is redirected to mkfifo1(); creating device nodes
 * requires superuser privilege.
 */
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t	vp, dvp;

	VATTR_INIT(&va);
	/* Mode is masked by the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO)
		return(mkfifo1(ctx, uap->path, &va));

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Only the superuser may create device nodes. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
		return (error);
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
		UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error)
		goto out;
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out;

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
		goto out;

	if (vp) {
		int	update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);

	return (error);
}
3966
/*
 * Create a named pipe.
 *
 * Common worker for mkfifo(2)/mkfifo_extended(2)/mknod(2)-of-a-FIFO.
 * The caller supplies the vnode_attr; this routine stamps va_type to
 * VFIFO before creation.  'upath' is always a user-space path.
 *
 * Returns:	0			Success
 *		EEXIST
 *	namei:???
 *	vnode_authorize:???
 *	vn_create:???
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t	vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
		UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
		goto out;

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);

	return error;
}
4015
4016
/*
 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			(Ignored)
 *
 * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
 *		uap->uid		UID to set
 *		uap->gid		GID to set
 *		uap->mode		File mode to set (same as 'mkfifo')
 *		uap->xsecurity		ACL to set, if creating
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *		in the code they originated.
 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller's filesec (ACL) blob, if one was supplied. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return ciferror;
	}

	/* Mode is masked by the process umask, as for plain mkfifo(2). */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);
	if (xsecdst != KAUTH_FILESEC_NONE)
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	if (xsecdst != KAUTH_FILESEC_NONE)
		kauth_filesec_free(xsecdst);
	return ciferror;
}
4068
4069 /* ARGSUSED */
4070 int
4071 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4072 {
4073 struct vnode_attr va;
4074
4075 VATTR_INIT(&va);
4076 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4077
4078 return(mkfifo1(vfs_context_current(), uap->path, &va));
4079 }
4080
4081
/*
 * Return a pointer to the last occurrence of 'ch' in the NUL-terminated
 * string 'p', or NULL if it never appears.  Searching for '\0' yields a
 * pointer to the terminator, matching strrchr() semantics (kernel-local
 * replacement for strrchr()).
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	do {
		if (*p == ch)
			last = p;
	} while (*p++);

	return (last);
}
4095
4096 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4097
/*
 * Build an absolute path for 'dvp' (optionally with 'leafname' appended)
 * into 'path', a buffer of '_len' bytes (MAXPATHLEN at call sites).
 * This routine never fails: when the real path cannot be obtained or
 * does not fit, *truncated_path is set and the nearest obtainable
 * ancestor path (or the mount point, or "/") is stored instead.
 *
 * Returns the length of the string in 'path' including the NUL.
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	int ret, len = _len;

	*truncated_path = 0;
	ret = vn_getpath(dvp, path, &len);
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf. */
			path[len-1] = '/';
			/*
			 * NOTE(review): strlcpy returns the length of the
			 * source, so 'len' can exceed MAXPATHLEN on
			 * truncation -- which the branch below relies on.
			 */
			len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit only barely; report it as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp=dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			       dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			ret = vn_getpath(mydvp, path, &len);
		} while (ret == ENOSPC);
	}

	return len;
}
4156
4157
/*
 * Make a hard file link.
 *
 * Common worker for link(2)/linkat(2): looks up the existing object
 * (relative to fd1), then the new name (relative to fd2), authorizes
 * and performs VNOP_LINK, and emits kauth/fsevent notifications.
 *
 * Returns:	0			Success
 *		EPERM
 *		EEXIST
 *		EXDEV
 *	namei:???
 *	vnode_authorize:???
 *	VNOP_LINK:???
 */
/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t	vp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners;
	char *target_path = NULL;
	int truncated=0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * lookup the target node: 'nd' is reused, switched to a CREATE
	 * lookup on the new link name, resolved relative to fd2.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0)
		goto out;
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
		goto out2;
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
		goto out2;

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out2;

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error)
		goto out2;

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	if (need_event || has_listeners) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
						       (uintptr_t)link_to_path,
						       (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
					    FSE_ARG_STRING, len, target_path,
					    FSE_ARG_FINFO, &finfo,
					    FSE_ARG_DONE);
			}
			if (vp->v_parent) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
					    FSE_ARG_VNODE, vp->v_parent,
					    FSE_ARG_DONE);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp)
		vnode_put(lvp);
	if (dvp)
		vnode_put(dvp);
	vnode_put(vp);
	return (error);
}
4351
4352 int
4353 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4354 {
4355 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4356 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4357 }
4358
4359 int
4360 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4361 {
4362 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4363 return (EINVAL);
4364
4365 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4366 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4367 }
4368
/*
 * Make a symbolic link.
 *
 * Common worker for symlink(2)/symlinkat(2): 'path_data' is the link
 * target string, 'link' is the name to create (resolved relative to
 * 'fd').  We could add support for ACLs here too...
 */
/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t	vp, dvp;
	size_t dummy=0;
	proc_t p;

	error = 0;
	/* Copy the link target string into a kernel buffer if needed. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error)
		goto out;
	AUDIT_ARG(text, path);	/* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error)
		goto out;
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Symlink mode: ACCESSPERMS masked by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* Link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0)
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	/* get default ownership, etc. */
	if (error == 0)
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	if (error == 0)
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);

#if CONFIG_MACF
	if (error == 0 && vp)
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
#endif

	/* do fallback attribute handling */
	if (error == 0 && vp)
		error = vnode_setattr_fallback(vp, &va, ctx);

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL)
				goto skipit;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
					   (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL)
				release_pathbuff(new_link_path);
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);
out:
	/* Only free 'path' when we allocated it (user-space case). */
	if (path && (path != (char *)path_data))
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);

	return (error);
}
4516
4517 int
4518 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4519 {
4520 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4521 uap->link, UIO_USERSPACE));
4522 }
4523
4524 int
4525 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4526 __unused int32_t *retval)
4527 {
4528 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4529 uap->path2, UIO_USERSPACE));
4530 }
4531
4532 /*
4533 * Delete a whiteout from the filesystem.
4534 * No longer supported.
4535 */
4536 int
4537 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4538 {
4539 return (ENOTSUP);
4540 }
4541
4542 /*
4543 * Delete a name from the filesystem.
4544 */
4545 /* ARGSUSED */
4546 static int
4547 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4548 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4549 {
4550 struct nameidata nd;
4551 vnode_t vp, dvp;
4552 int error;
4553 struct componentname *cnp;
4554 char *path = NULL;
4555 int len=0;
4556 #if CONFIG_FSE
4557 fse_info finfo;
4558 struct vnode_attr va;
4559 #endif
4560 int flags;
4561 int need_event;
4562 int has_listeners;
4563 int truncated_path;
4564 int batched;
4565 struct vnode_attr *vap;
4566 int do_retry;
4567 int retry_count = 0;
4568 int cn_flags;
4569
4570 cn_flags = LOCKPARENT;
4571 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4572 cn_flags |= AUDITVNPATH1;
4573 /* If a starting dvp is passed, it trumps any fd passed. */
4574 if (start_dvp)
4575 cn_flags |= USEDVP;
4576
4577 #if NAMEDRSRCFORK
4578 /* unlink or delete is allowed on rsrc forks and named streams */
4579 cn_flags |= CN_ALLOWRSRCFORK;
4580 #endif
4581
4582 retry:
4583 do_retry = 0;
4584 flags = 0;
4585 need_event = 0;
4586 has_listeners = 0;
4587 truncated_path = 0;
4588 vap = NULL;
4589
4590 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4591
4592 nd.ni_dvp = start_dvp;
4593 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4594 cnp = &nd.ni_cnd;
4595
4596 lookup_continue:
4597 error = nameiat(&nd, fd);
4598 if (error)
4599 return (error);
4600
4601 dvp = nd.ni_dvp;
4602 vp = nd.ni_vp;
4603
4604
4605 /* With Carbon delete semantics, busy files cannot be deleted */
4606 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4607 flags |= VNODE_REMOVE_NODELETEBUSY;
4608 }
4609
4610 /* Skip any potential upcalls if told to. */
4611 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4612 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4613 }
4614
4615 if (vp) {
4616 batched = vnode_compound_remove_available(vp);
4617 /*
4618 * The root of a mounted filesystem cannot be deleted.
4619 */
4620 if (vp->v_flag & VROOT) {
4621 error = EBUSY;
4622 }
4623
4624 if (!batched) {
4625 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4626 if (error) {
4627 if (error == ENOENT) {
4628 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4629 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4630 do_retry = 1;
4631 retry_count++;
4632 }
4633 }
4634 goto out;
4635 }
4636 }
4637 } else {
4638 batched = 1;
4639
4640 if (!vnode_compound_remove_available(dvp)) {
4641 panic("No vp, but no compound remove?");
4642 }
4643 }
4644
4645 #if CONFIG_FSE
4646 need_event = need_fsevent(FSE_DELETE, dvp);
4647 if (need_event) {
4648 if (!batched) {
4649 if ((vp->v_flag & VISHARDLINK) == 0) {
4650 /* XXX need to get these data in batched VNOP */
4651 get_fse_info(vp, &finfo, ctx);
4652 }
4653 } else {
4654 error = vfs_get_notify_attributes(&va);
4655 if (error) {
4656 goto out;
4657 }
4658
4659 vap = &va;
4660 }
4661 }
4662 #endif
4663 has_listeners = kauth_authorize_fileop_has_listeners();
4664 if (need_event || has_listeners) {
4665 if (path == NULL) {
4666 GET_PATH(path);
4667 if (path == NULL) {
4668 error = ENOMEM;
4669 goto out;
4670 }
4671 }
4672 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4673 }
4674
4675 #if NAMEDRSRCFORK
4676 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4677 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4678 else
4679 #endif
4680 {
4681 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4682 vp = nd.ni_vp;
4683 if (error == EKEEPLOOKING) {
4684 if (!batched) {
4685 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4686 }
4687
4688 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4689 panic("EKEEPLOOKING, but continue flag not set?");
4690 }
4691
4692 if (vnode_isdir(vp)) {
4693 error = EISDIR;
4694 goto out;
4695 }
4696 goto lookup_continue;
4697 } else if (error == ENOENT && batched) {
4698 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4699 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4700 /*
4701 * For compound VNOPs, the authorization callback may
4702 * return ENOENT in case of racing hardlink lookups
4703 * hitting the name cache, redrive the lookup.
4704 */
4705 do_retry = 1;
4706 retry_count += 1;
4707 goto out;
4708 }
4709 }
4710 }
4711
4712 /*
4713 * Call out to allow 3rd party notification of delete.
4714 * Ignore result of kauth_authorize_fileop call.
4715 */
4716 if (!error) {
4717 if (has_listeners) {
4718 kauth_authorize_fileop(vfs_context_ucred(ctx),
4719 KAUTH_FILEOP_DELETE,
4720 (uintptr_t)vp,
4721 (uintptr_t)path);
4722 }
4723
4724 if (vp->v_flag & VISHARDLINK) {
4725 //
4726 // if a hardlink gets deleted we want to blow away the
4727 // v_parent link because the path that got us to this
4728 // instance of the link is no longer valid. this will
4729 // force the next call to get the path to ask the file
4730 // system instead of just following the v_parent link.
4731 //
4732 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4733 }
4734
4735 #if CONFIG_FSE
4736 if (need_event) {
4737 if (vp->v_flag & VISHARDLINK) {
4738 get_fse_info(vp, &finfo, ctx);
4739 } else if (vap) {
4740 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4741 }
4742 if (truncated_path) {
4743 finfo.mode |= FSE_TRUNCATED_PATH;
4744 }
4745 add_fsevent(FSE_DELETE, ctx,
4746 FSE_ARG_STRING, len, path,
4747 FSE_ARG_FINFO, &finfo,
4748 FSE_ARG_DONE);
4749 }
4750 #endif
4751 }
4752
4753 out:
4754 if (path != NULL)
4755 RELEASE_PATH(path);
4756
4757 #if NAMEDRSRCFORK
4758 /* recycle the deleted rsrc fork vnode to force a reclaim, which
4759 * will cause its shadow file to go away if necessary.
4760 */
4761 if (vp && (vnode_isnamedstream(vp)) &&
4762 (vp->v_parent != NULLVP) &&
4763 vnode_isshadow(vp)) {
4764 vnode_recycle(vp);
4765 }
4766 #endif
4767 /*
4768 * nameidone has to happen before we vnode_put(dvp)
4769 * since it may need to release the fs_nodelock on the dvp
4770 */
4771 nameidone(&nd);
4772 vnode_put(dvp);
4773 if (vp) {
4774 vnode_put(vp);
4775 }
4776
4777 if (do_retry) {
4778 goto retry;
4779 }
4780
4781 return (error);
4782 }
4783
4784 int
4785 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4786 enum uio_seg segflg, int unlink_flags)
4787 {
4788 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4789 unlink_flags));
4790 }
4791
4792 /*
4793 * Delete a name from the filesystem using Carbon semantics.
4794 */
4795 int
4796 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4797 {
4798 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4799 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4800 }
4801
4802 /*
4803 * Delete a name from the filesystem using POSIX semantics.
4804 */
4805 int
4806 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4807 {
4808 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4809 uap->path, UIO_USERSPACE, 0));
4810 }
4811
4812 int
4813 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4814 {
4815 if (uap->flag & ~AT_REMOVEDIR)
4816 return (EINVAL);
4817
4818 if (uap->flag & AT_REMOVEDIR)
4819 return (rmdirat_internal(vfs_context_current(), uap->fd,
4820 uap->path, UIO_USERSPACE));
4821 else
4822 return (unlinkat_internal(vfs_context_current(), uap->fd,
4823 NULLVP, uap->path, UIO_USERSPACE, 0));
4824 }
4825
4826 /*
4827 * Reposition read/write file offset.
4828 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/*
	 * Resolve the fd to its backing vnode; fp_getfvp() reports ENOTSUP
	 * for descriptors that are not vnode-backed, which we translate to
	 * ESPIPE as required for non-seekable objects.
	 */
	if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
		if (error == ENOTSUP)
			return (ESPIPE);
		return (error);
	}
	/* Seeking a FIFO is meaningless; also ESPIPE. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return(ESPIPE);
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only observes the current offset, so it is
	 * vetted with the weaker "get offset" MAC hook; anything else can
	 * move the offset and uses the "change offset" hook.
	 */
	if (uap->whence == L_INCR && uap->offset == 0)
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	else
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	if (error) {
		file_drop(uap->fd);
		return (error);
	}
#endif
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}

	/* Compute the tentative new absolute offset. */
	switch (uap->whence) {
	case L_INCR:		/* SEEK_CUR: relative to current position */
		offset += fp->f_fglob->fg_offset;
		break;
	case L_XTND:		/* SEEK_END: relative to end of file */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0)
			break;
		offset += file_size;
		break;
	case L_SET:		/* SEEK_SET: absolute */
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->f_fglob->fg_offset = offset;
				*retval = fp->f_fglob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
4911
4912
4913 /*
4914 * Check access permissions.
4915 *
4916 * Returns: 0 Success
4917 * vnode_authorize:???
4918 */
4919 static int
4920 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4921 {
4922 kauth_action_t action;
4923 int error;
4924
4925 /*
4926 * If just the regular access bits, convert them to something
4927 * that vnode_authorize will understand.
4928 */
4929 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4930 action = 0;
4931 if (uflags & R_OK)
4932 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
4933 if (uflags & W_OK) {
4934 if (vnode_isdir(vp)) {
4935 action |= KAUTH_VNODE_ADD_FILE |
4936 KAUTH_VNODE_ADD_SUBDIRECTORY;
4937 /* might want delete rights here too */
4938 } else {
4939 action |= KAUTH_VNODE_WRITE_DATA;
4940 }
4941 }
4942 if (uflags & X_OK) {
4943 if (vnode_isdir(vp)) {
4944 action |= KAUTH_VNODE_SEARCH;
4945 } else {
4946 action |= KAUTH_VNODE_EXECUTE;
4947 }
4948 }
4949 } else {
4950 /* take advantage of definition of uflags */
4951 action = uflags >> 8;
4952 }
4953
4954 #if CONFIG_MACF
4955 error = mac_vnode_check_access(ctx, vp, uflags);
4956 if (error)
4957 return (error);
4958 #endif /* MAC */
4959
4960 /* action == 0 means only check for existence */
4961 if (action != 0) {
4962 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4963 } else {
4964 error = 0;
4965 }
4966
4967 return(error);
4968 }
4969
4970
4971
4972 /*
4973 * access_extended: Check access permissions in bulk.
4974 *
4975 * Description: uap->entries Pointer to an array of accessx
4976 * descriptor structs, plus one or
4977 * more NULL terminated strings (see
4978 * "Notes" section below).
4979 * uap->size Size of the area pointed to by
4980 * uap->entries.
4981 * uap->results Pointer to the results array.
4982 *
4983 * Returns: 0 Success
4984 * ENOMEM Insufficient memory
4985 * EINVAL Invalid arguments
4986 * namei:EFAULT Bad address
4987 * namei:ENAMETOOLONG Filename too long
4988 * namei:ENOENT No such file or directory
4989 * namei:ELOOP Too many levels of symbolic links
4990 * namei:EBADF Bad file descriptor
4991 * namei:ENOTDIR Not a directory
4992 * namei:???
4993 * access1:
4994 *
4995 * Implicit returns:
4996 * uap->results Array contents modified
4997 *
4998 * Notes: The uap->entries are structured as an arbitrary length array
4999 * of accessx descriptors, followed by one or more NULL terminated
5000 * strings
5001 *
5002 * struct accessx_descriptor[0]
5003 * ...
5004 * struct accessx_descriptor[n]
5005 * char name_data[0];
5006 *
5007 * We determine the entry count by walking the buffer containing
5008 * the uap->entries argument descriptor. For each descriptor we
5009 * see, the valid values for the offset ad_name_offset will be
5010 * in the byte range:
5011 *
5012 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5013 * to
5014 * [ uap->entries + uap->size - 2 ]
5015 *
5016 * since we must have at least one string, and the string must
5017 * be at least one character plus the NULL terminator in length.
5018 *
5019 * XXX: Need to support the check-as uid argument
5020 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	unsigned int desc_max, desc_actual, i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL marks "not yet allocated" for the credential cleanup below. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE)
		return(ENOMEM);
	if (uap->size < (sizeof(struct accessx_descriptor) + 2))
		return(EINVAL);
	/* Small requests use the stack buffer; larger ones are heap-allocated. */
	if (uap->size <= sizeof (stack_input)) {
		input = stack_input;
	} else {
		MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error)
		goto out;

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual)
			desc_actual = j;
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested exceeds this limit, the request is rejected outright.
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
				if (input[j].ad_flags & _DELETE_OK)
					wantdelete = 1;

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete)
				niopts |= WANTPARENT;

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			       CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			       &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete)
					dvp = nd.ni_dvp;
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are reported in
		 * the result array; anything else aborts the whole call.
		 */
		switch(error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input)
		FREE(input, M_TEMP);
	if (result)
		FREE(result, M_TEMP);
	if (vp)
		vnode_put(vp);
	if (dvp)
		vnode_put(dvp);
	if (IS_VALID_CRED(context.vc_ucred))
 		kauth_cred_unref(&context.vc_ucred);
	return(error);
}
5248
5249
5250 /*
5251 * Returns: 0 Success
5252 * namei:EFAULT Bad address
5253 * namei:ENAMETOOLONG Filename too long
5254 * namei:ENOENT No such file or directory
5255 * namei:ELOOP Too many levels of symbolic links
5256 * namei:EBADF Bad file descriptor
5257 * namei:ENOTDIR Not a directory
5258 * namei:???
5259 * access1:
5260 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
 	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS))
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	else
		context.vc_ucred = ctx->vc_ucred;
	context.vc_thread = ctx->vc_thread;


	niopts = FOLLOW | AUDITVNPATH1;
 	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK)
 		niopts |= WANTPARENT;
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	       path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK)
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif
 	error = nameiat(&nd, fd);
 	if (error)
 		goto out;

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* The actual permission check. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* Release lookup iocounts; dvp was only taken for _DELETE_OK. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK)
 		vnode_put(nd.ni_dvp);
 	nameidone(&nd);

out:
	/* Only drop the cred if we allocated a real-identity copy above. */
 	if (!(flag & AT_EACCESS))
 		kauth_cred_unref(&context.vc_ucred);
 	return (error);
}
5333
5334 int
5335 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5336 {
5337 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5338 uap->path, uap->flags, 0, UIO_USERSPACE));
5339 }
5340
5341 int
5342 faccessat(__unused proc_t p, struct faccessat_args *uap,
5343 __unused int32_t *retval)
5344 {
5345 if (uap->flag & ~AT_EACCESS)
5346 return (EINVAL);
5347
5348 return (faccessat_internal(vfs_context_current(), uap->fd,
5349 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5350 }
5351
5352 /*
5353 * Returns: 0 Success
5354 * EFAULT
5355 * copyout:EFAULT
5356 * namei:???
5357 * vn_stat:???
5358 */
5359 static int
5360 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5361 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5362 enum uio_seg segflg, int fd, int flag)
5363 {
5364 struct nameidata nd;
5365 int follow;
5366 union {
5367 struct stat sb;
5368 struct stat64 sb64;
5369 } source;
5370 union {
5371 struct user64_stat user64_sb;
5372 struct user32_stat user32_sb;
5373 struct user64_stat64 user64_sb64;
5374 struct user32_stat64 user32_sb64;
5375 } dest;
5376 caddr_t sbp;
5377 int error, my_size;
5378 kauth_filesec_t fsec;
5379 size_t xsecurity_bufsize;
5380 void * statptr;
5381
5382 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5383 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5384 segflg, path, ctx);
5385
5386 #if NAMEDRSRCFORK
5387 int is_namedstream = 0;
5388 /* stat calls are allowed for resource forks. */
5389 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5390 #endif
5391 error = nameiat(&nd, fd);
5392 if (error)
5393 return (error);
5394 fsec = KAUTH_FILESEC_NONE;
5395
5396 statptr = (void *)&source;
5397
5398 #if NAMEDRSRCFORK
5399 /* Grab reference on the shadow stream file vnode to
5400 * force an inactive on release which will mark it
5401 * for recycle.
5402 */
5403 if (vnode_isnamedstream(nd.ni_vp) &&
5404 (nd.ni_vp->v_parent != NULLVP) &&
5405 vnode_isshadow(nd.ni_vp)) {
5406 is_namedstream = 1;
5407 vnode_ref(nd.ni_vp);
5408 }
5409 #endif
5410
5411 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5412
5413 #if NAMEDRSRCFORK
5414 if (is_namedstream) {
5415 vnode_rele(nd.ni_vp);
5416 }
5417 #endif
5418 vnode_put(nd.ni_vp);
5419 nameidone(&nd);
5420
5421 if (error)
5422 return (error);
5423 /* Zap spare fields */
5424 if (isstat64 != 0) {
5425 source.sb64.st_lspare = 0;
5426 source.sb64.st_qspare[0] = 0LL;
5427 source.sb64.st_qspare[1] = 0LL;
5428 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5429 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5430 my_size = sizeof(dest.user64_sb64);
5431 sbp = (caddr_t)&dest.user64_sb64;
5432 } else {
5433 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5434 my_size = sizeof(dest.user32_sb64);
5435 sbp = (caddr_t)&dest.user32_sb64;
5436 }
5437 /*
5438 * Check if we raced (post lookup) against the last unlink of a file.
5439 */
5440 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5441 source.sb64.st_nlink = 1;
5442 }
5443 } else {
5444 source.sb.st_lspare = 0;
5445 source.sb.st_qspare[0] = 0LL;
5446 source.sb.st_qspare[1] = 0LL;
5447 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5448 munge_user64_stat(&source.sb, &dest.user64_sb);
5449 my_size = sizeof(dest.user64_sb);
5450 sbp = (caddr_t)&dest.user64_sb;
5451 } else {
5452 munge_user32_stat(&source.sb, &dest.user32_sb);
5453 my_size = sizeof(dest.user32_sb);
5454 sbp = (caddr_t)&dest.user32_sb;
5455 }
5456
5457 /*
5458 * Check if we raced (post lookup) against the last unlink of a file.
5459 */
5460 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5461 source.sb.st_nlink = 1;
5462 }
5463 }
5464 if ((error = copyout(sbp, ub, my_size)) != 0)
5465 goto out;
5466
5467 /* caller wants extended security information? */
5468 if (xsecurity != USER_ADDR_NULL) {
5469
5470 /* did we get any? */
5471 if (fsec == KAUTH_FILESEC_NONE) {
5472 if (susize(xsecurity_size, 0) != 0) {
5473 error = EFAULT;
5474 goto out;
5475 }
5476 } else {
5477 /* find the user buffer size */
5478 xsecurity_bufsize = fusize(xsecurity_size);
5479
5480 /* copy out the actual data size */
5481 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5482 error = EFAULT;
5483 goto out;
5484 }
5485
5486 /* if the caller supplied enough room, copy out to it */
5487 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5488 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5489 }
5490 }
5491 out:
5492 if (fsec != KAUTH_FILESEC_NONE)
5493 kauth_filesec_free(fsec);
5494 return (error);
5495 }
5496
5497 /*
5498 * stat_extended: Get file status; with extended security (ACL).
5499 *
5500 * Parameters: p (ignored)
5501 * uap User argument descriptor (see below)
5502 * retval (ignored)
5503 *
5504 * Indirect: uap->path Path of file to get status from
5505 * uap->ub User buffer (holds file status info)
5506 * uap->xsecurity ACL to get (extended security)
5507 * uap->xsecurity_size Size of ACL
5508 *
5509 * Returns: 0 Success
5510 * !0 errno value
5511 *
5512 */
5513 int
5514 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5515 __unused int32_t *retval)
5516 {
5517 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5518 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5519 0));
5520 }
5521
5522 /*
5523 * Returns: 0 Success
5524 * fstatat_internal:??? [see fstatat_internal() in this file]
5525 */
5526 int
5527 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5528 {
5529 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5530 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5531 }
5532
5533 int
5534 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5535 {
5536 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5537 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5538 }
5539
5540 /*
5541 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5542 *
5543 * Parameters: p (ignored)
5544 * uap User argument descriptor (see below)
5545 * retval (ignored)
5546 *
5547 * Indirect: uap->path Path of file to get status from
5548 * uap->ub User buffer (holds file status info)
5549 * uap->xsecurity ACL to get (extended security)
5550 * uap->xsecurity_size Size of ACL
5551 *
5552 * Returns: 0 Success
5553 * !0 errno value
5554 *
5555 */
5556 int
5557 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5558 {
5559 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5560 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5561 0));
5562 }
5563
5564 /*
5565 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5566 *
5567 * Parameters: p (ignored)
5568 * uap User argument descriptor (see below)
5569 * retval (ignored)
5570 *
5571 * Indirect: uap->path Path of file to get status from
5572 * uap->ub User buffer (holds file status info)
5573 * uap->xsecurity ACL to get (extended security)
5574 * uap->xsecurity_size Size of ACL
5575 *
5576 * Returns: 0 Success
5577 * !0 errno value
5578 *
5579 */
5580 int
5581 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5582 {
5583 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5584 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5585 AT_SYMLINK_NOFOLLOW));
5586 }
5587
5588 /*
5589 * Get file status; this version does not follow links.
5590 */
5591 int
5592 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5593 {
5594 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5595 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5596 }
5597
5598 int
5599 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5600 {
5601 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5602 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5603 }
5604
5605 /*
5606 * lstat64_extended: Get file status; can handle large inode numbers; does not
5607 * follow links; with extended security (ACL).
5608 *
5609 * Parameters: p (ignored)
5610 * uap User argument descriptor (see below)
5611 * retval (ignored)
5612 *
5613 * Indirect: uap->path Path of file to get status from
5614 * uap->ub User buffer (holds file status info)
5615 * uap->xsecurity ACL to get (extended security)
5616 * uap->xsecurity_size Size of ACL
5617 *
5618 * Returns: 0 Success
5619 * !0 errno value
5620 *
5621 */
5622 int
5623 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5624 {
5625 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5626 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5627 AT_SYMLINK_NOFOLLOW));
5628 }
5629
5630 int
5631 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5632 {
5633 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5634 return (EINVAL);
5635
5636 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5637 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5638 }
5639
5640 int
5641 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5642 __unused int32_t *retval)
5643 {
5644 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5645 return (EINVAL);
5646
5647 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5648 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5649 }
5650
5651 /*
5652 * Get configurable pathname variables.
5653 *
5654 * Returns: 0 Success
5655 * namei:???
5656 * vn_pathconf:???
5657 *
5658 * Notes: Global implementation constants are intended to be
5659 * implemented in this function directly; all other constants
5660 * are per-FS implementation, and therefore must be handled in
5661 * each respective FS, instead.
5662 *
5663 * XXX We implement some things globally right now that should actually be
5664 * XXX per-FS; we will need to deal with this at some point.
5665 */
5666 /* ARGSUSED */
5667 int
5668 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5669 {
5670 int error;
5671 struct nameidata nd;
5672 vfs_context_t ctx = vfs_context_current();
5673
5674 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5675 UIO_USERSPACE, uap->path, ctx);
5676 error = namei(&nd);
5677 if (error)
5678 return (error);
5679
5680 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5681
5682 vnode_put(nd.ni_vp);
5683 nameidone(&nd);
5684 return (error);
5685 }
5686
5687 /*
5688 * Return target name of a symbolic link.
5689 */
5690 /* ARGSUSED */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* NOFOLLOW: we want the link object itself, not its target. */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Describe the caller's buffer as a single-iovec uio. */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
		                    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
		/* MAC check, then kauth read authorization, then the read. */
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0)
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		if (error == 0)
			error = VNOP_READLINK(vp, auio, ctx);
	}
	vnode_put(vp);

	/* Bytes produced (computed even on error, matching historical behavior). */
	*retval = bufsize - (int)uio_resid(auio);
	return (error);
}
5732
5733 int
5734 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5735 {
5736 enum uio_seg procseg;
5737
5738 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5739 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5740 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5741 uap->count, procseg, retval));
5742 }
5743
5744 int
5745 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5746 {
5747 enum uio_seg procseg;
5748
5749 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5750 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5751 procseg, uap->buf, uap->bufsize, procseg, retval));
5752 }
5753
5754 /*
5755 * Change file flags.
5756 */
/*
 * Common worker for chflags(2)/fchflags(2): set va_flags on vp.
 * NB: consumes the caller's iocount on vp on ALL paths (vnode_put at out:).
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
 	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, flags);
	if (error)
		goto out;
#endif

	/* request authorisation, disregard immutability */
 	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setflags(ctx, vp, flags);
#endif

	/* Filesystem accepted the setattr but does not support flags. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}
out:
	vnode_put(vp);
	return(error);
}
5797
5798 /*
5799 * Change flags of a file given a path name.
5800 */
5801 /* ARGSUSED */
5802 int
5803 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5804 {
5805 vnode_t vp;
5806 vfs_context_t ctx = vfs_context_current();
5807 int error;
5808 struct nameidata nd;
5809
5810 AUDIT_ARG(fflags, uap->flags);
5811 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5812 UIO_USERSPACE, uap->path, ctx);
5813 error = namei(&nd);
5814 if (error)
5815 return (error);
5816 vp = nd.ni_vp;
5817 nameidone(&nd);
5818
5819 error = chflags1(vp, uap->flags, ctx);
5820
5821 return(error);
5822 }
5823
5824 /*
5825 * Change flags of a file given a file descriptor.
5826 */
5827 /* ARGSUSED */
5828 int
5829 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5830 {
5831 vnode_t vp;
5832 int error;
5833
5834 AUDIT_ARG(fd, uap->fd);
5835 AUDIT_ARG(fflags, uap->flags);
5836 if ( (error = file_vnode(uap->fd, &vp)) )
5837 return (error);
5838
5839 if ((error = vnode_getwithref(vp))) {
5840 file_drop(uap->fd);
5841 return(error);
5842 }
5843
5844 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5845
5846 error = chflags1(vp, uap->flags, vfs_context_current());
5847
5848 file_drop(uap->fd);
5849 return (error);
5850 }
5851
5852 /*
5853 * Change security information on a filesystem object.
5854 *
5855 * Returns: 0 Success
5856 * EPERM Operation not permitted
5857 * vnode_authattr:??? [anything vnode_authattr can return]
5858 * vnode_authorize:??? [anything vnode_authorize can return]
5859 * vnode_setattr:??? [anything vnode_setattr can return]
5860 *
5861 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
5862 * translated to EPERM before being returned.
5863 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return (EPERM);
	}
#endif

#if CONFIG_MACF
	/*
	 * Let MAC policies veto each class of change (mode, ownership, ACL)
	 * before any kauth authorization is attempted.
	 */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
		return (error);

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
			return (error);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
		return (error);
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures here are reported as EPERM, not EACCES */
		if (error == EACCES)
			error = EPERM;
		return(error);
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0)
		return (error);

#if CONFIG_MACF
	/* Mirror the checks above: notify policies of the applied changes. */
	if (VATTR_IS_ACTIVE(vap, va_mode))
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);

	if (VATTR_IS_ACTIVE(vap, va_acl))
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
#endif

	return (error);
}
5923
5924
5925 /*
5926 * Change mode of a file given a path name.
5927 *
5928 * Returns: 0 Success
5929 * namei:??? [anything namei can return]
5930 * chmod_vnode:??? [anything chmod_vnode can return]
5931 */
5932 static int
5933 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5934 int fd, int flag, enum uio_seg segflg)
5935 {
5936 struct nameidata nd;
5937 int follow, error;
5938
5939 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5940 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5941 segflg, path, ctx);
5942 if ((error = nameiat(&nd, fd)))
5943 return (error);
5944 error = chmod_vnode(ctx, nd.ni_vp, vap);
5945 vnode_put(nd.ni_vp);
5946 nameidone(&nd);
5947 return(error);
5948 }
5949
5950 /*
5951 * chmod_extended: Change the mode of a file given a path name; with extended
5952 * argument list (including extended security (ACL)).
5953 *
5954 * Parameters: p Process requesting the open
5955 * uap User argument descriptor (see below)
5956 * retval (ignored)
5957 *
5958 * Indirect: uap->path Path to object (same as 'chmod')
5959 * uap->uid UID to set
5960 * uap->gid GID to set
5961 * uap->mode File mode to set (same as 'chmod')
5962 * uap->xsecurity ACL to set (or delete)
5963 *
5964 * Returns: 0 Success
5965 * !0 errno value
5966 *
5967 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5968 *
5969 * XXX: We should enummerate the possible errno values here, and where
5970 * in the code they originated.
5971 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Only request changes for the fields the caller actually supplied. */
	VATTR_INIT(&va);
	if (uap->mode != -1)
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);

	xsecdst = NULL;
	switch(uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/*
		 * Copy the caller's filesec in; va_acl then points into
		 * xsecdst, so it must stay allocated until chmodat() returns.
		 */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return(error);
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* Free the copied-in filesec, if one was allocated above. */
	if (xsecdst != NULL)
		kauth_filesec_free(xsecdst);
	return(error);
}
6012
6013 /*
6014 * Returns: 0 Success
6015 * chmodat:??? [anything chmodat can return]
6016 */
6017 static int
6018 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6019 int flag, enum uio_seg segflg)
6020 {
6021 struct vnode_attr va;
6022
6023 VATTR_INIT(&va);
6024 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6025
6026 return (chmodat(ctx, path, &va, fd, flag, segflg));
6027 }
6028
6029 int
6030 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6031 {
6032 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6033 AT_FDCWD, 0, UIO_USERSPACE));
6034 }
6035
6036 int
6037 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6038 {
6039 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6040 return (EINVAL);
6041
6042 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6043 uap->fd, uap->flag, UIO_USERSPACE));
6044 }
6045
6046 /*
6047 * Change mode of a file given a file descriptor.
6048 */
6049 static int
6050 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6051 {
6052 vnode_t vp;
6053 int error;
6054
6055 AUDIT_ARG(fd, fd);
6056
6057 if ((error = file_vnode(fd, &vp)) != 0)
6058 return (error);
6059 if ((error = vnode_getwithref(vp)) != 0) {
6060 file_drop(fd);
6061 return(error);
6062 }
6063 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6064
6065 error = chmod_vnode(vfs_context_current(), vp, vap);
6066 (void)vnode_put(vp);
6067 file_drop(fd);
6068
6069 return (error);
6070 }
6071
6072 /*
6073 * fchmod_extended: Change mode of a file given a file descriptor; with
6074 * extended argument list (including extended security (ACL)).
6075 *
6076 * Parameters: p Process requesting to change file mode
6077 * uap User argument descriptor (see below)
6078 * retval (ignored)
6079 *
6080 * Indirect: uap->mode File mode to set (same as 'chmod')
6081 * uap->uid UID to set
6082 * uap->gid GID to set
6083 * uap->xsecurity ACL to set (or delete)
6084 * uap->fd File descriptor of file to change mode
6085 *
6086 * Returns: 0 Success
6087 * !0 errno value
6088 *
6089 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Only request changes for the fields the caller actually supplied. */
	VATTR_INIT(&va);
	if (uap->mode != -1)
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);

	xsecdst = NULL;
	switch(uap->xsecurity) {
	/*
	 * NOTE(review): unlike chmod_extended(), a NULL xsecurity here is
	 * treated as an ACL-removal request; "leave the ACL alone" is
	 * expressed as (user_addr_t)-1 instead.
	 */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/*
		 * va_acl points into xsecdst, which must remain allocated
		 * until fchmod1() has consumed it.
		 */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return(error);
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	error = fchmod1(p, uap->fd, &va);


	/* Free the copied-in filesec, if one was allocated above. */
	switch(uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL)
			kauth_filesec_free(xsecdst);
	}
	return(error);
}
6137
6138 int
6139 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6140 {
6141 struct vnode_attr va;
6142
6143 VATTR_INIT(&va);
6144 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6145
6146 return(fchmod1(p, uap->fd, &va));
6147 }
6148
6149
6150 /*
6151 * Set ownership given a path name.
6152 */
6153 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* AT_SYMLINK_NOFOLLOW selects lchown()-style symlink handling. */
	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL for either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL)
		VATTR_SET(&va, va_uid, uid);
	if (gid != (gid_t)VNOVAL)
		VATTR_SET(&va, va_gid, gid);

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error)
		goto out;
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES)
		error = EPERM;

	vnode_put(vp);
	return (error);
}
6212
6213 int
6214 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6215 {
6216 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6217 uap->uid, uap->gid, 0, UIO_USERSPACE));
6218 }
6219
6220 int
6221 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6222 {
6223 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6224 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6225 }
6226
6227 int
6228 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6229 {
6230 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6231 return (EINVAL);
6232
6233 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6234 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6235 }
6236
6237 /*
6238 * Set ownership given a file descriptor.
6239 */
6240 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL for either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != VNOVAL)
		VATTR_SET(&va, va_gid, uap->gid);

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error)
		goto out;
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures on chown are reported as EPERM */
		if (error == EACCES)
			error = EPERM;
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
6302
/*
 * Load the {access, modify} timestamp pair for the *utimes() calls.
 *
 * If usrtvp is USER_ADDR_NULL, the current time is used for both entries;
 * otherwise two struct timevals are copied in from user space using the
 * 32- or 64-bit layout matching the calling process, then converted.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* The user-space timeval layout depends on the caller's ABI. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error)
				return (error);
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error)
				return (error);
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
6333
/*
 * Apply an {access, modify} timestamp pair to a vnode.  'nullflag' is set
 * when the caller passed a NULL times pointer ("set to now"); it relaxes
 * the permission requirements via VA_UTIMES_NULL and suppresses the
 * EACCES -> EPERM translation below.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag)
		va.va_vaflags |= VA_UTIMES_NULL;

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error)
		goto out;
#endif
	/* explicit-times requests that fail permission checks return EPERM */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES)
			error = EPERM;
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES)
			error = EPERM;
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
#endif

out:
	return error;
}
6385
6386 /*
6387 * Set the access and modification times of a file.
6388 */
6389 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0)
		goto out;

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Drop the iocount taken by namei() on all paths. */
	vnode_put(nd.ni_vp);
	return (error);
}
6424
6425 /*
6426 * Set the access and modification times of a file.
6427 */
6428 /* ARGSUSED */
6429 int
6430 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6431 {
6432 struct timespec ts[2];
6433 vnode_t vp;
6434 user_addr_t usrtvp;
6435 int error;
6436
6437 AUDIT_ARG(fd, uap->fd);
6438 usrtvp = uap->tptr;
6439 if ((error = getutimes(usrtvp, ts)) != 0)
6440 return (error);
6441 if ((error = file_vnode(uap->fd, &vp)) != 0)
6442 return (error);
6443 if((error = vnode_getwithref(vp))) {
6444 file_drop(uap->fd);
6445 return(error);
6446 }
6447
6448 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6449 vnode_put(vp);
6450 file_drop(uap->fd);
6451 return(error);
6452 }
6453
6454 /*
6455 * Truncate a file given its path name.
6456 */
6457 /* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	/* Negative lengths are rejected before any lookup happens. */
	if (uap->length < 0)
		return(EINVAL);
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd)))
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Truncation is expressed as a va_data_size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	/* NOCRED: a path-based truncate carries no file-descriptor credential. */
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error)
		goto out;
#endif

	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
#endif

out:
	vnode_put(vp);
	return (error);
}
6502
6503 /*
6504 * Truncate a file given a file descriptor.
6505 */
6506 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error ;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	if (uap->length < 0)
		return(EINVAL);

	if ( (error = fp_lookup(p,fd,&fp,0)) ) {
		return(error);
	}

	/* POSIX shared memory objects get their own truncate path. */
	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* the descriptor must have been opened for writing */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	/* the descriptor's opening credential is used for the MAC check */
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
#endif

	(void)vnode_put(vp);
out:
	/* balances the fp_lookup() above on every path */
	file_drop(fd);
	return (error);
}
6572
6573
6574 /*
6575 * Sync an open file with synchronized I/O _file_ integrity completion
6576 */
6577 /* ARGSUSED */
6578 int
6579 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6580 {
6581 __pthread_testcancel(1);
6582 return(fsync_common(p, uap, MNT_WAIT));
6583 }
6584
6585
6586 /*
6587 * Sync an open file with synchronized I/O _file_ integrity completion
6588 *
6589 * Notes: This is a legacy support function that does not test for
6590 * thread cancellation points.
6591 */
6592 /* ARGSUSED */
6593 int
6594 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6595 {
6596 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6597 }
6598
6599
6600 /*
6601 * Sync an open file with synchronized I/O _data_ integrity completion
6602 */
6603 /* ARGSUSED */
6604 int
6605 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6606 {
6607 __pthread_testcancel(1);
6608 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6609 }
6610
6611
6612 /*
6613 * fsync_common
6614 *
6615 * Common fsync code to support both synchronized I/O file integrity completion
6616 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6617 *
6618 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6619 * will only guarantee that the file data contents are retrievable. If
6620 * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
6621 * includes additional metadata unnecessary for retrieving the file data
6622 * contents, such as atime, mtime, ctime, etc., also be committed to stable
6623 * storage.
6624 *
6625 * Parameters: p The process
6626 * uap->fd The descriptor to synchronize
6627 * flags The data integrity flags
6628 *
6629 * Returns: int Success
6630 * fp_getfvp:EBADF Bad file descriptor
6631 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6632 * VNOP_FSYNC:??? unspecified
6633 *
6634 * Notes: We use struct fsync_args because it is a short name, and all
6635 * caller argument structures are otherwise identical.
6636 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
		return (error);
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (fsync) or MNT_DWAIT (fdatasync) */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
6673
6674 /*
6675 * Duplicate files. Source must be a file, target must be a file or
6676 * must not exist.
6677 *
6678 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6679 * perform inheritance correctly.
6680 */
6681 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
#endif

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return(EINVAL);
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd)))
		return (error);
	fvp = fromnd.ni_vp;

	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only permitted with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories may not be copied, on either side. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* This calls existing MAC hooks for open */
	if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
	    NULL))) {
		goto out;
	}

	if (tvp) {
		/*
		 * See unlinkat_internal for an explanation of the potential
		 * ENOENT from the MAC hook but the gist is that the MAC hook
		 * can fail because vn_getpath isn't able to return the full
		 * path. We choose to ignore this failure.
		 */
		error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
		if (error && error != ENOENT)
			goto out;
		error = 0;
	}

#if CONFIG_MACF
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, fvp->v_type);
	/* Mask off all but regular access permissions */
	VATTR_SET(&va, va_mode,
	    ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
	error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
	if (error)
		goto out;
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out;

	/* copying a file onto its own parent directory is an error */
	if (fvp == tdvp)
		error = EINVAL;
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp)
		error = -1;
	if (!error)
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp)
		vnode_put(tvp);
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 is the internal "nothing to do" marker, reported as success */
	if (error == -1)
		return (0);
	return (error);
}
6792
6793 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
6794
6795 /*
6796 * Helper function for doing clones. The caller is expected to provide an
6797 * iocounted source vnode and release it.
6798 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;

	/* Only regular files, symlinks and (non-root, non-mount) dirs clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return (EINVAL);
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return (EINVAL);
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd)))
		return (error);
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single filesystem. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
		goto out;
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
		goto out;

	/*
	 * The caller may have already authorized data reads (e.g. via an
	 * FREAD descriptor); in that case skip re-checking READ_DATA.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised)
		action &= ~KAUTH_VNODE_READ_DATA;
	if ((error = vnode_authorize(fvp, NULL, action, ctx)))
		goto out;

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_type);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
		goto out;

	/* vnode_getattr() allocated the ACL; remember to free it at 'out'. */
	if (!VATTR_IS_SUPPORTED(&va, va_acl))
		VATTR_CLEAR_ACTIVE(&va, va_acl);
	else if (va.va_acl != NULL)
		free_acl = TRUE;

	if (!VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_CLEAR_ACTIVE(&va, va_mode);
	} else {
		proc_t p = vfs_context_proc(ctx);

		/* the clone gets the source's permissions filtered by umask */
		VATTR_SET(&va, va_mode,
		    (va.va_mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
	}

	if (!VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_CLEAR_ACTIVE(&va, va_flags);
	} else if (va.va_flags & SF_RESTRICTED) {
		/*
		 * Turn off SF_RESTRICTED from source, if the destination needs
		 * it, it will be handled in vnode_authattr_new.
		 */
		VATTR_SET(&va, va_flags, (va.va_flags & ~SF_RESTRICTED));
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &va, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &va, &defaulted, ctx);
		attr_cleanup = TRUE;
	}

	if (error) {
		/* vn_attribute_prepare() failed; nothing to clean up. */
		attr_cleanup = FALSE;
		goto out;
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &va, flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif
		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&va))
			(void)vnode_setattr_fallback(tvp, &va, ctx);

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (tvp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}
#if CLONE_SNAPSHOT_FALLBACKS_ENABLED
	else if (error == ENOTSUP) {
		struct vfs_attr vfa;

		/*
		 * Fallback to VNOP_COPYFILE but check first that the
		 * filesystem supports cloning.
		 */
		VFSATTR_INIT(&vfa);
		VFSATTR_WANTED(&vfa, f_capabilities);
		if ((vfs_getattr(vnode_mount(tdvp), &vfa, ctx) == 0) &&
		    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
		    (vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_CLONE) &&
		    (vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_CLONE)) {

			error = VNOP_COPYFILE(fvp, tdvp, tvp, cnp, 0,
			    0, ctx);
		}
	}
#endif /* CLONE_SNAPSHOT_FALLBACKS_ENABLED */

out:
	/* Unwind in reverse order of acquisition; see the flags set above. */
	if (attr_cleanup)
		vn_attribute_cleanup(&va, defaulted);
	if (free_acl && va.va_acl)
		kauth_acl_free(va.va_acl);
	nameidone(&tond);
	if (tvp)
		vnode_put(tvp);
	vnode_put(tdvp);
	return (error);
}
7003
7004 /*
7005 * clone files or directories, target must not exist.
7006 */
7007 /* ARGSUSED */
7008 int
7009 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7010 __unused int32_t *retval)
7011 {
7012 vnode_t fvp;
7013 struct nameidata fromnd;
7014 int follow;
7015 int error;
7016 vfs_context_t ctx = vfs_context_current();
7017
7018 /* Check that the flags are valid. */
7019 if (uap->flags & ~CLONE_NOFOLLOW)
7020 return (EINVAL);
7021
7022 AUDIT_ARG(fd, uap->src_dirfd);
7023
7024 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7025 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7026 UIO_USERSPACE, uap->src, ctx);
7027 if ((error = nameiat(&fromnd, uap->src_dirfd)))
7028 return (error);
7029
7030 fvp = fromnd.ni_vp;
7031 nameidone(&fromnd);
7032
7033 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7034 uap->flags, ctx);
7035
7036 vnode_put(fvp);
7037 return (error);
7038 }
7039
7040 int
7041 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7042 __unused int32_t *retval)
7043 {
7044 vnode_t fvp;
7045 struct fileproc *fp;
7046 int error;
7047 vfs_context_t ctx = vfs_context_current();
7048
7049 AUDIT_ARG(fd, uap->src_fd);
7050 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7051 if (error)
7052 return (error);
7053
7054 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7055 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7056 error = EBADF;
7057 goto out;
7058 }
7059
7060 if ((error = vnode_getwithref(fvp)))
7061 goto out;
7062
7063 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7064
7065 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7066 uap->flags, ctx);
7067
7068 vnode_put(fvp);
7069 out:
7070 file_drop(uap->src_fd);
7071 return (error);
7072 }
7073
/*
 * Rename files. Source and destination must either both be directories,
 * or both not be directories. If target is a directory, it must be empty.
 *
 * Common backend for rename(2), renameat(2) and renameatx_np(2).
 * 'from' and 'to' (in segment 'segflg') are resolved relative to
 * 'fromfd'/'tofd' respectively.  'flags' may carry VFS_RENAME_SWAP or
 * VFS_RENAME_EXCL (mutually exclusive).  The whole lookup may be
 * re-driven from "retry:" after races (ENOENT from authorization,
 * ERECYCLE from the filesystem); directory renames that reshape the
 * tree are serialized under the per-mount rename lock.
 */
/* ARGSUSED */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
{
	if (flags & ~VFS_RENAME_FLAGS_MASK)
		return EINVAL;

	/* Swap and exclusive-target are contradictory requests. */
	if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
		return EINVAL;

	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	struct nameidata *fromnd, *tond;
	int error;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	int from_len=0, to_len=0;
	int holding_mntlock;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo, to_finfo;
#endif
	int from_truncated=0, to_truncated;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	/*
	 * NOTE(review): 'continuing' is initialized to 0 and never updated,
	 * so the "|| !continuing" lookup conditions below are always taken
	 * on entry — confirm whether this is intentional.
	 */
	int continuing = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;
	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset per-attempt state; a retry re-does both lookups from scratch. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mntrename = FALSE;

	/* Both lookups are set up as compound-rename capable. */
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME;

continue_lookup:
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ( (error = nameiat(fromnd, fromfd)) )
			goto out1;
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR)
			tond->ni_cnd.cn_flags |= WILLBEDIR;
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ( (error = nameiat(tond, tofd)) ) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR)
				error = EINVAL;
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

	/* Swap requires both names to exist; EXCL requires the target not to. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		error = EEXIST;
		goto out1;
	}

	/* A compound-rename capable FS performs authorization in the VNOP. */
	batched = vnode_compound_rename_available(fdvp);
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	if (!batched) {
		error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL);
		if (error) {
			if (error == ENOENT) {
				assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei, tvp stops
					 * being valid. If so, simply re-drive the rename call from the
					 * top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - "/" cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL)  &&
	    (fvp->v_mountedhere == NULL)  &&
	    (fdvp == tdvp)  &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ( (vnode_getwithref(coveredvp)) ) {
			error = ENOENT;
			goto out1;
		}
		vnode_put(fvp);

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
			  fromnd->ni_cnd.cn_namelen)) {
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp)
				vnode_put(tvp);
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	// save these off so we can later verify that fvp is the same
	oname   = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	/* Capture both pathnames up front for fsevents / fileop listeners. */
	if (need_event || kauth_authorize_fileop_has_listeners()) {
		if (from_name == NULL) {
			GET_PATH(from_name);
			if (from_name == NULL) {
				error = ENOMEM;
				goto out1;
			}
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (to_name == NULL) {
			GET_PATH(to_name);
			if (to_name == NULL) {
				error = ENOMEM;
				goto out1;
			}
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
	}
	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
			    tdvp, &tvp, &tond->ni_cnd, tvap,
			    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			do_retry = 1;
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
			KAUTH_FILEOP_RENAME,
			(uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		kauth_authorize_fileop(vfs_context_ucred(ctx),
				       KAUTH_FILEOP_RENAME,
				       (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, from_len, from_name,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_STRING, to_len, to_name,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
					    FSE_ARG_STRING, to_len, to_name,
					    FSE_ARG_FINFO, &to_finfo,
					    FSE_ARG_STRING, from_len, from_name,
					    FSE_ARG_FINFO, &from_finfo,
					    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, from_len, from_name,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_STRING, to_len, to_name,
				    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 * (rewrite the tail of f_mntonname with the mount point's new name)
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);

		if (UIO_SEG_IS_USER_SPACE(segflg))
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		else
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/')
					pathend = cp + 1;
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/')
					mpname = cp + 1;
			}
			/* append name to prefix */
			maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);
			strlcpy(pathend, mpname, maxlen);
		}
		FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);

		vfs_unbusy(mp);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp)
			update_flags |= VNODE_UPDATE_PARENT;

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp)
			vnode_put(tvp);
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp)
			vnode_put(fvp);
		vnode_put(fdvp);
	}

	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	FREE(__rename_data, M_TEMP);
	return (error);
}
7652
7653 int
7654 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7655 {
7656 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7657 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7658 }
7659
7660 int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7661 {
7662 return renameat_internal(
7663 vfs_context_current(),
7664 uap->fromfd, uap->from,
7665 uap->tofd, uap->to,
7666 UIO_USERSPACE, uap->flags);
7667 }
7668
7669 int
7670 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7671 {
7672 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7673 uap->tofd, uap->to, UIO_USERSPACE, 0));
7674 }
7675
/*
 * Make a directory file.
 *
 * 'path' (in segment 'segflg') is resolved relative to 'fd'; 'vap'
 * supplies the attributes (va_type is forced to VDIR here).  Uses a
 * compound-mkdir lookup when the filesystem supports it, continuing
 * the lookup on EKEEPLOOKING.
 *
 * Returns:	0			Success
 *		EEXIST
 *	namei:???
 *	vnode_authorize:???
 *	vn_create:???
 */
/* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t	vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A populated ni_vp means the name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/*
	 * NOTE(review): 'batched' is computed here but never consulted in
	 * this function — confirm whether the availability probe is needed
	 * for a side effect or is simply dead.
	 */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP asked us to finish the lookup and try again. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL)
		update_flags |= VNODE_UPDATE_NAME;
	if (vp->v_parent == NULLVP)
		update_flags |= VNODE_UPDATE_PARENT;

	if (update_flags)
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	if (dvp)
		vnode_put(dvp);

	return (error);
}
7791
7792 /*
7793 * mkdir_extended: Create a directory; with extended security (ACL).
7794 *
7795 * Parameters: p Process requesting to create the directory
7796 * uap User argument descriptor (see below)
7797 * retval (ignored)
7798 *
7799 * Indirect: uap->path Path of directory to create
7800 * uap->mode Access permissions to set
7801 * uap->xsecurity ACL to set
7802 *
7803 * Returns: 0 Success
7804 * !0 Not success
7805 *
7806 */
7807 int
7808 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7809 {
7810 int ciferror;
7811 kauth_filesec_t xsecdst;
7812 struct vnode_attr va;
7813
7814 AUDIT_ARG(owner, uap->uid, uap->gid);
7815
7816 xsecdst = NULL;
7817 if ((uap->xsecurity != USER_ADDR_NULL) &&
7818 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7819 return ciferror;
7820
7821 VATTR_INIT(&va);
7822 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7823 if (xsecdst != NULL)
7824 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7825
7826 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7827 UIO_USERSPACE);
7828 if (xsecdst != NULL)
7829 kauth_filesec_free(xsecdst);
7830 return ciferror;
7831 }
7832
7833 int
7834 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7835 {
7836 struct vnode_attr va;
7837
7838 VATTR_INIT(&va);
7839 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7840
7841 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7842 UIO_USERSPACE));
7843 }
7844
7845 int
7846 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7847 {
7848 struct vnode_attr va;
7849
7850 VATTR_INIT(&va);
7851 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7852
7853 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7854 UIO_USERSPACE));
7855 }
7856
/*
 * Remove the directory named by 'dirpath' (segment 'segflg'), resolved
 * relative to 'fd'.  Backend for rmdir(2).  Uses a compound-rmdir lookup
 * when available and may restart the whole operation (the outer do/while)
 * after authorization races or after clearing orphaned AppleDouble files.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char     *path = NULL;
	int       len=0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated = 0;
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error)
			return (error);

		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					/*
					 * Authorization can race a concurrent lookup; bounded
					 * number of full restarts before giving up with ENOENT.
					 */
					if (error == ENOENT) {
						assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
#if CONFIG_FSE
			if (truncated) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}
#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (error == EBUSY) {
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!error)
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len, path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}
		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp)
			vnode_put(vp);

		/*
		 * NOTE(review): vp has been vnode_put() above; from here on it
		 * is used only as an opaque address for the sleep/wakeup channel
		 * that pairs with the AppleDouble restart path — confirm this is
		 * intentional, since the pointer itself is no longer held.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			return (error);
		}
		tsleep(vp, PVFS, "rm AD", 1);

	} while (restart_flag != 0);

	return (error);

}
8076
8077 /*
8078 * Remove a directory file.
8079 */
8080 /* ARGSUSED */
8081 int
8082 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8083 {
8084 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8085 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8086 }
8087
/* Get direntry length padded to 8 byte alignment */
/*
 * Trims the unused tail of struct direntry's fixed-size d_name for a
 * name of 'namlen' bytes, then rounds the record up to the next 8-byte
 * boundary (assumes d_name is MAXPATHLEN bytes — TODO confirm).
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
8091
/*
 * Read directory entries from 'vp' into 'uio' in the extended
 * (struct direntry) format.  If the filesystem natively supports
 * VNODE_READDIR_EXTENDED (and the mount does not deny it), the request
 * is passed straight through; otherwise legacy struct dirent records
 * are read into a wired kernel buffer and re-packed one entry at a time
 * into struct direntry before being copied out to the caller's uio.
 */
errno_t
vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
                int *numdirent, vfs_context_t ctxp)
{
	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
		   ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0))	{
		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
	} else {
		size_t bufsize;
		void * bufptr;
		uio_t auio;
		struct direntry *entry64;
		struct dirent *dep;
		int bytesread;
		int error;

		/*
		 * Our kernel buffer needs to be smaller since re-packing
		 * will expand each dirent.  The worse case (when the name
		 * length is 3) corresponds to a struct direntry size of 32
		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
		 * (4-byte aligned).  So having a buffer that is 3/8 the size
		 * will prevent us from reading more than we can pack.
		 *
		 * Since this buffer is wired memory, we will limit the
		 * buffer size to a maximum of 32K. We would really like to
		 * use 32K in the MIN(), but we use magic number 87371 to
		 * prevent uio_resid() * 3 / 8 from overflowing.
		 */
		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
		MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
		if (bufptr == NULL) {
			return ENOMEM;
		}

		/* Read legacy-format entries into the kernel buffer. */
		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
		auio->uio_offset = uio->uio_offset;

		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);

		dep = (struct dirent *)bufptr;
		bytesread = bufsize - uio_resid(auio);

		/* One scratch direntry, reused for every converted record. */
		MALLOC(entry64, struct direntry *, sizeof(struct direntry),
		       M_TEMP, M_WAITOK);
		/*
		 * Convert all the entries and copy them out to user's buffer.
		 * (Skipped entirely if VNOP_READDIR above failed.)
		 *
		 * NOTE(review): advancement relies on the FS returning a
		 * nonzero d_reclen for every entry — confirm this invariant,
		 * as a zero reclen would prevent the loop from progressing.
		 */
		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
			size_t	enbufsize = DIRENT64_LEN(dep->d_namlen);

			bzero(entry64, enbufsize);
			/* Convert a dirent to a dirent64. */
			entry64->d_ino = dep->d_ino;
			entry64->d_seekoff = 0;
			entry64->d_reclen = enbufsize;
			entry64->d_namlen = dep->d_namlen;
			entry64->d_type = dep->d_type;
			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);

			/* Move to next entry. */
			dep = (struct dirent *)((char *)dep + dep->d_reclen);

			/* Copy entry64 to user's buffer. */
			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
		}

		/* Update the real offset using the offset we got from VNOP_READDIR. */
		if (error == 0) {
			uio->uio_offset = auio->uio_offset;
		}
		uio_free(auio);
		FREE(bufptr, M_TEMP);
		FREE(entry64, M_TEMP);
		return (error);
	}
}
8171
8172 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
8173
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared implementation for getdirentries() and getdirentries64().  Reads
 * entries from the directory open on 'fd' into the user buffer described by
 * 'bufp'/'bufsize'.  On success, *bytesread is the number of bytes produced
 * and *offset (if non-NULL) is the directory offset *before* the read.
 * When 'flags' contains VNODE_READDIR_EXTENDED, entries are delivered in
 * the extended (struct direntry) format via vnode_readdir64().
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
                     off_t *offset, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();	/* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, eofflag, numdirent;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* Resolve fd to its fileproc and vnode; takes a reference on fp. */
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return (error);
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request; the uio buffer is not wired, but keep it sane. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
		bufsize = GETDIRENTRIES_MAXBUFSIZE;

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error)
		goto out;
#endif
	if ( (error = vnode_getwithref(vp)) ) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	/* Re-checked on each union-mount iteration: vp may have changed. */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Build a uio over the caller's buffer, starting at the file offset. */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/*
	 * Nothing was read.  For union mounts this may just mean we've hit
	 * the end of the top layer: drop down to the covered directory and
	 * retry the read there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio)){
		if (union_dircheckp) {
			/* -1 means "switched layers, retry"; vp was swapped. */
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1)
				goto unionread;
			if (error)
				goto out;
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				/* Point the open file at the lower vnode. */
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;		/* pre-read offset for the caller's basep */
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return (error);
}
8277
8278
8279 int
8280 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8281 {
8282 off_t offset;
8283 ssize_t bytesread;
8284 int error;
8285
8286 AUDIT_ARG(fd, uap->fd);
8287 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8288
8289 if (error == 0) {
8290 if (proc_is64bit(p)) {
8291 user64_long_t base = (user64_long_t)offset;
8292 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8293 } else {
8294 user32_long_t base = (user32_long_t)offset;
8295 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8296 }
8297 *retval = bytesread;
8298 }
8299 return (error);
8300 }
8301
8302 int
8303 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8304 {
8305 off_t offset;
8306 ssize_t bytesread;
8307 int error;
8308
8309 AUDIT_ARG(fd, uap->fd);
8310 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8311
8312 if (error == 0) {
8313 *retval = bytesread;
8314 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8315 }
8316 return (error);
8317 }
8318
8319
8320 /*
8321 * Set the mode mask for creation of filesystem nodes.
8322 * XXX implement xsecurity
8323 */
8324 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8325 static int
8326 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8327 {
8328 struct filedesc *fdp;
8329
8330 AUDIT_ARG(mask, newmask);
8331 proc_fdlock(p);
8332 fdp = p->p_fd;
8333 *retval = fdp->fd_cmask;
8334 fdp->fd_cmask = newmask & ALLPERMS;
8335 proc_fdunlock(p);
8336 return (0);
8337 }
8338
8339 /*
8340 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8341 *
8342 * Parameters: p Process requesting to set the umask
8343 * uap User argument descriptor (see below)
8344 * retval umask of the process (parameter p)
8345 *
8346 * Indirect: uap->newmask umask to set
8347 * uap->xsecurity ACL to set
8348 *
8349 * Returns: 0 Success
8350 * !0 Not success
8351 *
8352 */
8353 int
8354 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8355 {
8356 int ciferror;
8357 kauth_filesec_t xsecdst;
8358
8359 xsecdst = KAUTH_FILESEC_NONE;
8360 if (uap->xsecurity != USER_ADDR_NULL) {
8361 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8362 return ciferror;
8363 } else {
8364 xsecdst = KAUTH_FILESEC_NONE;
8365 }
8366
8367 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8368
8369 if (xsecdst != KAUTH_FILESEC_NONE)
8370 kauth_filesec_free(xsecdst);
8371 return ciferror;
8372 }
8373
8374 int
8375 umask(proc_t p, struct umask_args *uap, int32_t *retval)
8376 {
8377 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8378 }
8379
/*
 * revoke(2): Void all references to a file by ripping the underlying
 * filesystem away from the vnode.
 *
 * Only character and block special files are supported (ENOTSUP
 * otherwise), and a block device that is currently mounted-on is refused
 * with EBUSY.  The caller must either own the node or be superuser.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	       uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke() is only meaningful for device special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that is backing a mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error)
		goto out;
#endif

	/* Owner or superuser only. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx)))
		goto out;
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
		goto out;
	/* Only bother if someone actually holds the node (or an alias). */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
		VNOP_REVOKE(vp, REVOKEALL, ctx);
out:
	vnode_put(vp);		/* drop the iocount taken by namei() */
	return (error);
}
8432
8433
8434 /*
8435 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
8436 * The following system calls are designed to support features
8437 * which are specific to the HFS & HFS Plus volume formats
8438 */
8439
8440
/*
 * getdirentriesattr(2): Obtain attribute information on objects in a
 * directory while enumerating the directory.
 *
 * Copies in an attrlist and a max-entry count, then calls
 * VNOP_READDIRATTR() to fill the user buffer.  On return, *uap->count is
 * the number of entries actually produced, *uap->newstate is the
 * directory's change cookie, *uap->basep is the pre-read offset, and
 * *retval is the eof flag (0 or 1).  Union mounts are handled by
 * transparently descending to the covered directory when the top layer
 * is exhausted.
 */
/* ARGSUSED */
int
getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count, savecount;
	uint32_t newstate;
	int error, eofflag;
	uint32_t loff;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[ UIO_SIZEOF(1) ];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return(error);
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return(error);
	}
	/* Remember the original count so it can be restored on layer switch. */
	savecount = count;
	if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
		return (error);
	}
	/* Descriptor must be open for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error)
		goto out;
#endif


	if ( (error = vnode_getwithref(vp)) )
		goto out;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	/* Re-checked each union-mount iteration: vp may have been swapped. */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr)
		action |= KAUTH_VNODE_SEARCH;

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {

		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Repoint the open file at the lower layer. */
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	if (error)
		goto out;
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the per-call results back to the caller. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
		goto out;
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
		goto out;
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
		goto out;

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return (error); /* return error earlier, an retval of 0 or 1 now */

} /* end of getdirentriesattr system call */
8585
/*
 * exchangedata(2): Exchange data between two files.
 *
 * Atomically swaps the data forks of path1 and path2 via VNOP_EXCHANGE.
 * Both objects must be regular files on the same volume and distinct
 * vnodes; the caller needs read/write data rights on both.  On success
 * the cached v_name/v_parent of the two vnodes are also swapped, and an
 * FSE_EXCHANGE fsevent plus a KAUTH_FILEOP_EXCHANGE callout are issued
 * when anyone is listening.
 */

/* ARGSUSED */
int
exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{

	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen=0, slen=0;
	int from_truncated=0, to_truncated=0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	/* FSOPT_NOFOLLOW suppresses symlink traversal on both paths. */
	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error)
		goto out2;

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	       UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error)
		goto out;
#endif
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
		goto out;

	/*
	 * Only resolve the full paths (expensive) if someone will actually
	 * consume them: an fsevent watcher or a fileop listener.
	 */
	if (
#if CONFIG_FSE
	need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
	kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
					       (uintptr_t)fpath, (uintptr_t)spath);
		}
		/* Swap the cached names/parents so namecache stays consistent. */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
				    FSE_ARG_STRING, flen, fpath,
				    FSE_ARG_FINFO, &f_finfo,
				    FSE_ARG_STRING, slen, spath,
				    FSE_ARG_FINFO, &s_finfo,
				    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL)
		RELEASE_PATH(fpath);
	if (spath != NULL)
		RELEASE_PATH(spath);
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return (error);
}
8739
8740 /*
8741 * Return (in MB) the amount of freespace on the given vnode's volume.
8742 */
8743 uint32_t freespace_mb(vnode_t vp);
8744
8745 uint32_t
8746 freespace_mb(vnode_t vp)
8747 {
8748 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8749 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8750 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
8751 }
8752
8753 #if CONFIG_SEARCHFS
8754
8755 /* ARGSUSED */
8756
8757 int
8758 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8759 {
8760 vnode_t vp, tvp;
8761 int i, error=0;
8762 int fserror = 0;
8763 struct nameidata nd;
8764 struct user64_fssearchblock searchblock;
8765 struct searchstate *state;
8766 struct attrlist *returnattrs;
8767 struct timeval timelimit;
8768 void *searchparams1,*searchparams2;
8769 uio_t auio = NULL;
8770 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8771 uint32_t nummatches;
8772 int mallocsize;
8773 uint32_t nameiflags;
8774 vfs_context_t ctx = vfs_context_current();
8775 char uio_buf[ UIO_SIZEOF(1) ];
8776
8777 /* Start by copying in fsearchblock parameter list */
8778 if (IS_64BIT_PROCESS(p)) {
8779 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8780 timelimit.tv_sec = searchblock.timelimit.tv_sec;
8781 timelimit.tv_usec = searchblock.timelimit.tv_usec;
8782 }
8783 else {
8784 struct user32_fssearchblock tmp_searchblock;
8785
8786 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8787 // munge into 64-bit version
8788 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8789 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8790 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8791 searchblock.maxmatches = tmp_searchblock.maxmatches;
8792 /*
8793 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8794 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8795 */
8796 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8797 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8798 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8799 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8800 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8801 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8802 searchblock.searchattrs = tmp_searchblock.searchattrs;
8803 }
8804 if (error)
8805 return(error);
8806
8807 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8808 */
8809 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8810 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8811 return(EINVAL);
8812
8813 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8814 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
8815 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8816 /* block. */
8817 /* */
8818 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
8819 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
8820 /* assumes the size is still 556 bytes it will continue to work */
8821
8822 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8823 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8824
8825 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8826
8827 /* Now set up the various pointers to the correct place in our newly allocated memory */
8828
8829 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8830 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8831 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8832
8833 /* Now copy in the stuff given our local variables. */
8834
8835 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8836 goto freeandexit;
8837
8838 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8839 goto freeandexit;
8840
8841 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8842 goto freeandexit;
8843
8844 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8845 goto freeandexit;
8846
8847 /*
8848 * When searching a union mount, need to set the
8849 * start flag at the first call on each layer to
8850 * reset state for the new volume.
8851 */
8852 if (uap->options & SRCHFS_START)
8853 state->ss_union_layer = 0;
8854 else
8855 uap->options |= state->ss_union_flags;
8856 state->ss_union_flags = 0;
8857
8858 /*
8859 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8860 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8861 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8862 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8863 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8864 */
8865
8866 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8867 attrreference_t* string_ref;
8868 u_int32_t* start_length;
8869 user64_size_t param_length;
8870
8871 /* validate searchparams1 */
8872 param_length = searchblock.sizeofsearchparams1;
8873 /* skip the word that specifies length of the buffer */
8874 start_length= (u_int32_t*) searchparams1;
8875 start_length= start_length+1;
8876 string_ref= (attrreference_t*) start_length;
8877
8878 /* ensure no negative offsets or too big offsets */
8879 if (string_ref->attr_dataoffset < 0 ) {
8880 error = EINVAL;
8881 goto freeandexit;
8882 }
8883 if (string_ref->attr_length > MAXPATHLEN) {
8884 error = EINVAL;
8885 goto freeandexit;
8886 }
8887
8888 /* Check for pointer overflow in the string ref */
8889 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8890 error = EINVAL;
8891 goto freeandexit;
8892 }
8893
8894 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8895 error = EINVAL;
8896 goto freeandexit;
8897 }
8898 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8899 error = EINVAL;
8900 goto freeandexit;
8901 }
8902 }
8903
8904 /* set up the uio structure which will contain the users return buffer */
8905 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8906 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8907
8908 nameiflags = 0;
8909 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8910 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8911 UIO_USERSPACE, uap->path, ctx);
8912
8913 error = namei(&nd);
8914 if (error)
8915 goto freeandexit;
8916 vp = nd.ni_vp;
8917 nameidone(&nd);
8918
8919 /*
8920 * Switch to the root vnode for the volume
8921 */
8922 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8923 vnode_put(vp);
8924 if (error)
8925 goto freeandexit;
8926 vp = tvp;
8927
8928 /*
8929 * If it's a union mount, the path lookup takes
8930 * us to the top layer. But we may need to descend
8931 * to a lower layer. For non-union mounts the layer
8932 * is always zero.
8933 */
8934 for (i = 0; i < (int) state->ss_union_layer; i++) {
8935 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
8936 break;
8937 tvp = vp;
8938 vp = vp->v_mount->mnt_vnodecovered;
8939 if (vp == NULL) {
8940 vnode_put(tvp);
8941 error = ENOENT;
8942 goto freeandexit;
8943 }
8944 vnode_getwithref(vp);
8945 vnode_put(tvp);
8946 }
8947
8948 #if CONFIG_MACF
8949 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
8950 if (error) {
8951 vnode_put(vp);
8952 goto freeandexit;
8953 }
8954 #endif
8955
8956
8957 /*
8958 * If searchblock.maxmatches == 0, then skip the search. This has happened
8959 * before and sometimes the underlying code doesnt deal with it well.
8960 */
8961 if (searchblock.maxmatches == 0) {
8962 nummatches = 0;
8963 goto saveandexit;
8964 }
8965
8966 /*
8967 * Allright, we have everything we need, so lets make that call.
8968 *
8969 * We keep special track of the return value from the file system:
8970 * EAGAIN is an acceptable error condition that shouldn't keep us
8971 * from copying out any results...
8972 */
8973
8974 fserror = VNOP_SEARCHFS(vp,
8975 searchparams1,
8976 searchparams2,
8977 &searchblock.searchattrs,
8978 (u_long)searchblock.maxmatches,
8979 &timelimit,
8980 returnattrs,
8981 &nummatches,
8982 (u_long)uap->scriptcode,
8983 (u_long)uap->options,
8984 auio,
8985 (struct searchstate *) &state->ss_fsstate,
8986 ctx);
8987
8988 /*
8989 * If it's a union mount we need to be called again
8990 * to search the mounted-on filesystem.
8991 */
8992 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
8993 state->ss_union_flags = SRCHFS_START;
8994 state->ss_union_layer++; // search next layer down
8995 fserror = EAGAIN;
8996 }
8997
8998 saveandexit:
8999
9000 vnode_put(vp);
9001
9002 /* Now copy out the stuff that needs copying out. That means the number of matches, the
9003 search state. Everything was already put into he return buffer by the vop call. */
9004
9005 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
9006 goto freeandexit;
9007
9008 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
9009 goto freeandexit;
9010
9011 error = fserror;
9012
9013 freeandexit:
9014
9015 FREE(searchparams1,M_TEMP);
9016
9017 return(error);
9018
9019
9020 } /* end of searchfs system call */
9021
9022 #else /* CONFIG_SEARCHFS */
9023
9024 int
9025 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9026 {
9027 return (ENOTSUP);
9028 }
9029
9030 #endif /* CONFIG_SEARCHFS */
9031
9032
/*
 * Namespace-handler ("nspace") support state.
 *
 * Lock machinery protecting the nspace_items table and handler records;
 * initialized by nspace_handler_init().
 */
lck_grp_attr_t *  nspace_group_attr;
lck_attr_t *      nspace_lock_attr;
lck_grp_t *       nspace_mutex_group;

lck_mtx_t         nspace_handler_lock;
lck_mtx_t         nspace_handler_exclusion_lock;

time_t snapshot_timestamp=0;
/* Allow snapshot events on virtual (disk-image) devices when non-zero. */
int nspace_allow_virtual_devs=0;

void nspace_handler_init(void);

/* One pending namespace event awaiting resolution by a handler process. */
typedef struct nspace_item_info {
	struct vnode *vp;	/* vnode the event refers to */
	void         *arg;	/* handler-private argument */
	uint64_t      op;	/* event operation bits */
	uint32_t      vid;	/* vnode id, to detect recycling */
	uint32_t      flags;	/* NSPACE_ITEM_* state bits below */
	uint32_t      token;	/* token handed to the handler */
	uint32_t      refcount;
} nspace_item_info;

#define MAX_NSPACE_ITEMS   128
nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
uint32_t      nspace_item_idx=0;              // also used as the sleep/wakeup rendezvous address
uint32_t      nspace_token_id=0;
uint32_t      nspace_handler_timeout = 15;    // seconds

/* nspace_item_info.flags state bits. */
#define NSPACE_ITEM_NEW              0x0001
#define NSPACE_ITEM_PROCESSING      0x0002
#define NSPACE_ITEM_DEAD            0x0004
#define NSPACE_ITEM_CANCELLED       0x0008
#define NSPACE_ITEM_DONE            0x0010
#define NSPACE_ITEM_RESET_TIMER     0x0020

/* Event-type bits (at most one set per item). */
#define NSPACE_ITEM_NSPACE_EVENT    0x0040
#define NSPACE_ITEM_SNAPSHOT_EVENT  0x0080

#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)

//#pragma optimization_level 0

/* The two classes of handler a userspace daemon may register as. */
typedef enum {
	NSPACE_HANDLER_NSPACE = 0,
	NSPACE_HANDLER_SNAPSHOT = 1,

	NSPACE_HANDLER_COUNT,
} nspace_type_t;

/* Per-handler registration record. */
typedef struct {
	uint64_t      handler_tid;	/* thread id of the registered handler */
	struct proc  *handler_proc;	/* handler process, NULL if none */
	int           handler_busy;
} nspace_handler_t;

nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];

/* namespace fsctl functions */
static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
static int nspace_item_flags_for_type(nspace_type_t nspace_type);
static int nspace_open_flags_for_type(nspace_type_t nspace_type);
static nspace_type_t nspace_type_for_op(uint64_t op);
static int nspace_is_special_process(struct proc *proc);
static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
static int validate_namespace_args (int is64bit, int size);
static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9101
9102 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9103 {
9104 switch(nspace_type) {
9105 case NSPACE_HANDLER_NSPACE:
9106 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9107 case NSPACE_HANDLER_SNAPSHOT:
9108 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9109 default:
9110 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9111 return 0;
9112 }
9113 }
9114
9115 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9116 {
9117 switch(nspace_type) {
9118 case NSPACE_HANDLER_NSPACE:
9119 return NSPACE_ITEM_NSPACE_EVENT;
9120 case NSPACE_HANDLER_SNAPSHOT:
9121 return NSPACE_ITEM_SNAPSHOT_EVENT;
9122 default:
9123 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9124 return 0;
9125 }
9126 }
9127
9128 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9129 {
9130 switch(nspace_type) {
9131 case NSPACE_HANDLER_NSPACE:
9132 return FREAD | FWRITE | O_EVTONLY;
9133 case NSPACE_HANDLER_SNAPSHOT:
9134 return FREAD | O_EVTONLY;
9135 default:
9136 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9137 return 0;
9138 }
9139 }
9140
9141 static inline nspace_type_t nspace_type_for_op(uint64_t op)
9142 {
9143 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9144 case NAMESPACE_HANDLER_NSPACE_EVENT:
9145 return NSPACE_HANDLER_NSPACE;
9146 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9147 return NSPACE_HANDLER_SNAPSHOT;
9148 default:
9149 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9150 return NSPACE_HANDLER_NSPACE;
9151 }
9152 }
9153
9154 static inline int nspace_is_special_process(struct proc *proc)
9155 {
9156 int i;
9157 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9158 if (proc == nspace_handlers[i].handler_proc)
9159 return 1;
9160 }
9161 return 0;
9162 }
9163
9164 void
9165 nspace_handler_init(void)
9166 {
9167 nspace_lock_attr = lck_attr_alloc_init();
9168 nspace_group_attr = lck_grp_attr_alloc_init();
9169 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9170 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9171 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9172 memset(&nspace_items[0], 0, sizeof(nspace_items));
9173 }
9174
/*
 * Called on process exit: if the exiting process was a registered
 * namespace handler, deregister it and complete any events of its class
 * that are still pending, waking their waiters so they don't block on a
 * handler that no longer exists.
 */
void
nspace_proc_exit(struct proc *p)
{
	int i, event_mask = 0;

	/* Deregister p from every handler slot it occupies. */
	for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
		if (p == nspace_handlers[i].handler_proc) {
			event_mask |= nspace_item_flags_for_type(i);
			nspace_handlers[i].handler_tid = 0;
			nspace_handlers[i].handler_proc = NULL;
		}
	}

	/* Not a handler process: nothing else to do. */
	if (event_mask == 0) {
		return;
	}

	lck_mtx_lock(&nspace_handler_lock);
	if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
		// if this process was the snapshot handler, zero snapshot_timeout
		snapshot_timestamp = 0;
	}

	//
	// unblock anyone that's waiting for the handler that died
	//
	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
		if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {

			if ( nspace_items[i].flags & event_mask ) {

				/* Clear the pending-snapshot mark on the vnode, if set. */
				if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
					vnode_lock_spin(nspace_items[i].vp);
					nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
					vnode_unlock(nspace_items[i].vp);
				}
				nspace_items[i].vp = NULL;
				nspace_items[i].vid = 0;
				nspace_items[i].flags = NSPACE_ITEM_DONE;
				nspace_items[i].token = 0;

				/* Waiters sleep on the item's vp field address. */
				wakeup((caddr_t)&(nspace_items[i].vp));
			}
		}
	}

	/* Also wake anyone waiting for a free slot / new work. */
	wakeup((caddr_t)&nspace_item_idx);
	lck_mtx_unlock(&nspace_handler_lock);
}
9224
9225
/*
 * Queue a namespace event of type `op` for vnode `vp` and wait for the
 * user-space handler to service it.  Convenience wrapper around
 * resolve_nspace_item_ext() with no handler-specific argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
9231
/*
 * Queue a namespace event (op) for `vp` in the global nspace_items table
 * and block until the user-space handler marks it done/cancelled or the
 * wait times out.
 *
 * Returns 0 on success (or when the event is not applicable), EDEADLK if
 * the caller is itself a namespace handler, ETIMEDOUT on handler timeout,
 * or the token value set by the handler on cancellation.
 *
 * `arg` is {NULL, NSPACE_REARM_NO_ARG, or a uio *}; only a uio is passed
 * through to userland (see wait_for_namespace_event()).
 */
int
resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
{
	int i, error, keep_waiting;
	struct timespec ts;
	nspace_type_t nspace_type = nspace_type_for_op(op);

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return 0;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
	    && (vp->v_mount != NULL)
	    && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
	    && !nspace_allow_virtual_devs) {

		return 0;
	}

	// if (thread_tid(current_thread()) == namespace_handler_tid) {
	/* No handler registered for this event type: nothing to wait for. */
	if (nspace_handlers[nspace_type].handler_proc == NULL) {
		return 0;
	}

	/* A handler must never wait on itself. */
	if (nspace_is_special_process(current_proc())) {
		return EDEADLK;
	}

	lck_mtx_lock(&nspace_handler_lock);

retry:
	/* First look for an existing item for this (vp, op) pair. */
	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
		if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
			break;
		}
	}

	if (i >= MAX_NSPACE_ITEMS) {
		/* No match: find a free slot instead. */
		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].flags == 0) {
				break;
			}
		}
	} else {
		/* Piggy-back on the existing item. */
		nspace_items[i].refcount++;
	}

	if (i >= MAX_NSPACE_ITEMS) {
		/* Table is full: sleep until a slot frees up (bounded wait). */
		ts.tv_sec = nspace_handler_timeout;
		ts.tv_nsec = 0;

		error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
		if (error == 0) {
			// an entry got free'd up, go see if we can get a slot
			goto retry;
		} else {
			lck_mtx_unlock(&nspace_handler_lock);
			return error;
		}
	}

	//
	// if it didn't already exist, add it.  if it did exist
	// we'll get woken up when someone does a wakeup() on
	// the slot in the nspace_items table.
	//
	if (vp != nspace_items[i].vp) {
		nspace_items[i].vp = vp;
		nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user
		nspace_items[i].op = op;
		nspace_items[i].vid = vnode_vid(vp);
		nspace_items[i].flags = NSPACE_ITEM_NEW;
		nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
		if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
			if (arg) {
				/* Tag the vnode so stale snapshot state can be
				 * cleared if the handler dies (nspace_proc_exit). */
				vnode_lock_spin(vp);
				vp->v_flag |= VNEEDSSNAPSHOT;
				vnode_unlock(vp);
			}
		}

		nspace_items[i].token = 0;
		nspace_items[i].refcount = 1;

		/* Wake the handler thread waiting for new items. */
		wakeup((caddr_t)&nspace_item_idx);
	}

	//
	// Now go to sleep until the handler does a wakeup on this
	// slot in the nspace_items table (or we timeout).
	//
	keep_waiting = 1;
	while(keep_waiting) {
		ts.tv_sec = nspace_handler_timeout;
		ts.tv_nsec = 0;
		error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);

		if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
			error = 0;
		} else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
			/* Handler cancelled us; token carries the error to return. */
			error = nspace_items[i].token;
		} else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
			if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
				/* Handler asked for more time: re-arm and sleep again. */
				nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
				continue;
			} else {
				error = ETIMEDOUT;
			}
		} else if (error == 0) {
			// hmmm, why did we get woken up?
			printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
			       nspace_items[i].token);
		}

		/* Last waiter out clears the slot for re-use. */
		if (--nspace_items[i].refcount == 0) {
			nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
			nspace_items[i].arg = NULL;
			nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
			nspace_items[i].flags = 0;     // this clears it for re-use
		}
		/* Wake anyone sleeping in the "table full" path above. */
		wakeup(&nspace_token_id);
		keep_waiting = 0;
	}

	lck_mtx_unlock(&nspace_handler_lock);

	return error;
}
9372
9373 int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9374 {
9375 int snapshot_error = 0;
9376
9377 if (vp == NULL) {
9378 return 0;
9379 }
9380
9381 /* Swap files are special; skip them */
9382 if (vnode_isswap(vp)) {
9383 return 0;
9384 }
9385
9386 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9387 // the change time is within this epoch
9388 int error;
9389
9390 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9391 if (error == EDEADLK) {
9392 snapshot_error = 0;
9393 } else if (error) {
9394 if (error == EAGAIN) {
9395 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9396 } else if (error == EINTR) {
9397 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9398 snapshot_error = EINTR;
9399 }
9400 }
9401 }
9402
9403 return snapshot_error;
9404 }
9405
9406 int
9407 get_nspace_item_status(struct vnode *vp, int32_t *status)
9408 {
9409 int i;
9410
9411 lck_mtx_lock(&nspace_handler_lock);
9412 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9413 if (nspace_items[i].vp == vp) {
9414 break;
9415 }
9416 }
9417
9418 if (i >= MAX_NSPACE_ITEMS) {
9419 lck_mtx_unlock(&nspace_handler_lock);
9420 return ENOENT;
9421 }
9422
9423 *status = nspace_items[i].flags;
9424 lck_mtx_unlock(&nspace_handler_lock);
9425 return 0;
9426 }
9427
9428
/*
 * NOTE(review): compiled out (#if 0) — retained for reference only.
 * Builds a /.vol/<fsid>/<fileid> style path for `vp`; on getattr
 * failure it writes a sentinel path and returns -1.
 */
#if 0
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
9451
9452 //
9453 // Note: this function does NOT check permissions on all of the
9454 // parent directories leading to this vnode. It should only be
9455 // called on behalf of a root process. Otherwise a process may
9456 // get access to a file because the file itself is readable even
9457 // though its parent directories would prevent access.
9458 //
/*
 * Open an already-resolved vnode with mode `fmode` on behalf of a root
 * process (caller must pass the suser() check below).  Performs MAC and
 * kauth authorization, calls VNOP_OPEN, and takes a usecount reference
 * via vnode_ref_ext(); on success the caller owns that reference.
 * Returns 0 or an errno.
 */
static int
vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
{
	int error, action;

	/* Root-only: parent directory permissions are NOT checked here. */
	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_open(ctx, vp, fmode);
	if (error)
		return error;
#endif

	/* compute action to be authorized */
	action = 0;
	if (fmode & FREAD) {
		action |= KAUTH_VNODE_READ_DATA;
	}
	if (fmode & (FWRITE | O_TRUNC)) {
		/*
		 * If we are writing, appending, and not truncating,
		 * indicate that we are appending so that if the
		 * UF_APPEND or SF_APPEND bits are set, we do not deny
		 * the open.
		 */
		if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
			action |= KAUTH_VNODE_APPEND_DATA;
		} else {
			action |= KAUTH_VNODE_WRITE_DATA;
		}
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
		return error;


	//
	// if the vnode is tagged VOPENEVT and the current process
	// has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
	// flag to the open mode so that this open won't count against
	// the vnode when carbon delete() does a vnode_isinuse() to see
	// if a file is currently in use.  this allows spotlight
	// importers to not interfere with carbon apps that depend on
	// the no-delete-if-busy semantics of carbon delete().
	//
	if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
		fmode |= O_EVTONLY;
	}

	if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
		return error;
	}
	/* Take a usecount ref; undo the open if that fails. */
	if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
		VNOP_CLOSE(vp, fmode, ctx);
		return error;
	}

	/* Call out to allow 3rd party notification of open.
	 * Ignore result of kauth_authorize_fileop call.
	 */
#if CONFIG_MACF
	mac_vnode_notify_open(ctx, vp, fmode);
#endif
	kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
	    (uintptr_t)vp, 0);


	return 0;
}
9530
/*
 * Called by a user-space namespace handler (via fsctl) to wait for the
 * next namespace event of `nspace_type`.  Registers the caller as the
 * handler if none exists, blocks for a NEW item, opens the item's vnode
 * and hands a file descriptor plus token/flags (and optional info/objid
 * fields) back to userland through the pointers in `nhd`.
 *
 * Only one handler per type may be inside this function at a time
 * (handler_busy / nspace_handler_exclusion_lock); returns EBUSY if
 * another thread is already waiting.
 */
static int
wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
{
	int i;
	int error = 0;
	int unblock = 0;
	task_t curtask;

	lck_mtx_lock(&nspace_handler_exclusion_lock);
	if (nspace_handlers[nspace_type].handler_busy) {
		lck_mtx_unlock(&nspace_handler_exclusion_lock);
		return EBUSY;
	}

	nspace_handlers[nspace_type].handler_busy = 1;
	lck_mtx_unlock(&nspace_handler_exclusion_lock);

	/*
	 * Any process that gets here will be one of the namespace handlers.
	 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
	 * as we can cause deadlocks to occur, because the namespace handler may prevent
	 * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
	 * process.
	 */
	curtask = current_task();
	bsd_set_dependency_capable (curtask);

	lck_mtx_lock(&nspace_handler_lock);
	if (nspace_handlers[nspace_type].handler_proc == NULL) {
		/* First caller becomes the registered handler for this type. */
		nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
		nspace_handlers[nspace_type].handler_proc = current_proc();
	}

	/* Snapshot handlers need a valid epoch set via
	 * FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME first. */
	if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
	    (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
		error = EINVAL;
	}

	while (error == 0) {

		/* Try to find matching namespace item */
		for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
				if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
					break;
				}
			}
		}

		if (i >= MAX_NSPACE_ITEMS) {
			/* Nothing is there yet. Wait for wake up and retry */
			error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
			if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
				/* Prevent infinite loop if snapshot handler exited */
				error = EINVAL;
				break;
			}
			continue;
		}

		/* Claim the item and stamp it with a fresh token. */
		nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
		nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
		nspace_items[i].token = ++nspace_token_id;

		assert(nspace_items[i].vp);
		struct fileproc *fp;
		int32_t indx;
		int32_t fmode;
		struct proc *p = current_proc();
		vfs_context_t ctx = vfs_context_current();
		struct vnode_attr va;
		bool vn_get_succsessful = false;
		bool vn_open_successful = false;
		bool fp_alloc_successful = false;

		/*
		 * Use vnode pointer to acquire a file descriptor for
		 * hand-off to userland
		 */
		fmode = nspace_open_flags_for_type(nspace_type);
		error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
		if (error) goto cleanup;
		vn_get_succsessful = true;

		error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
		if (error) goto cleanup;
		vn_open_successful = true;

		error = falloc(p, &fp, &indx, ctx);
		if (error) goto cleanup;
		fp_alloc_successful = true;

		fp->f_fglob->fg_flag = fmode;
		fp->f_fglob->fg_ops = &vnops;
		fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;

		/* Publish the new fd in the caller's descriptor table. */
		proc_fdlock(p);
		procfdtbl_releasefd(p, indx, NULL);
		fp_drop(p, indx, fp, 1);
		proc_fdunlock(p);

		/*
		 * All variants of the namespace handler struct support these three fields:
		 * token, flags, and the FD pointer
		 */
		error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
		if (error) goto cleanup;
		error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
		if (error) goto cleanup;
		error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
		if (error) goto cleanup;

		/*
		 * Handle optional fields:
		 * extended version support an info ptr (offset, length), and the
		 *
		 * namedata version supports a unique per-link object ID
		 *
		 */
		if (nhd->infoptr) {
			uio_t uio = (uio_t)nspace_items[i].arg;
			uint64_t u_offset, u_length;

			if (uio) {
				u_offset = uio_offset(uio);
				u_length = uio_resid(uio);
			} else {
				u_offset = 0;
				u_length = 0;
			}
			error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
			if (error) goto cleanup;
			error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
			if (error) goto cleanup;
		}

		if (nhd->objid) {
			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_linkid);
			error = vnode_getattr(nspace_items[i].vp, &va, ctx);
			if (error) goto cleanup;

			uint64_t linkid = 0;
			if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
				linkid = (uint64_t)va.va_linkid;
			}
			error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
		}
cleanup:
		/* Unwind in reverse order of acquisition on any failure. */
		if (error) {
			if (fp_alloc_successful) fp_free(p, indx, fp);
			if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
			unblock = 1;
		}

		if (vn_get_succsessful) vnode_put(nspace_items[i].vp);

		break;
	}

	/* On failure, mark item i done and wake its waiters (i still
	 * indexes the claimed item from the loop above). */
	if (unblock) {
		if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
			vnode_lock_spin(nspace_items[i].vp);
			nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
			vnode_unlock(nspace_items[i].vp);
		}
		nspace_items[i].vp = NULL;
		nspace_items[i].vid = 0;
		nspace_items[i].flags = NSPACE_ITEM_DONE;
		nspace_items[i].token = 0;

		wakeup((caddr_t)&(nspace_items[i].vp));
	}

	if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
		// just go through every snapshot event and unblock it immediately.
		if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
			for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
				if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
					if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
						nspace_items[i].vp = NULL;
						nspace_items[i].vid = 0;
						nspace_items[i].flags = NSPACE_ITEM_DONE;
						nspace_items[i].token = 0;

						wakeup((caddr_t)&(nspace_items[i].vp));
					}
				}
			}
		}
	}

	lck_mtx_unlock(&nspace_handler_lock);

	lck_mtx_lock(&nspace_handler_exclusion_lock);
	nspace_handlers[nspace_type].handler_busy = 0;
	lck_mtx_unlock(&nspace_handler_exclusion_lock);

	return error;
}
9731
9732 static inline int validate_namespace_args (int is64bit, int size) {
9733
9734 if (is64bit) {
9735 /* Must be one of these */
9736 if (size == sizeof(user64_namespace_handler_info)) {
9737 goto sizeok;
9738 }
9739 if (size == sizeof(user64_namespace_handler_info_ext)) {
9740 goto sizeok;
9741 }
9742 if (size == sizeof(user64_namespace_handler_data)) {
9743 goto sizeok;
9744 }
9745 return EINVAL;
9746 }
9747 else {
9748 /* 32 bit -- must be one of these */
9749 if (size == sizeof(user32_namespace_handler_info)) {
9750 goto sizeok;
9751 }
9752 if (size == sizeof(user32_namespace_handler_info_ext)) {
9753 goto sizeok;
9754 }
9755 if (size == sizeof(user32_namespace_handler_data)) {
9756 goto sizeok;
9757 }
9758 return EINVAL;
9759 }
9760
9761 sizeok:
9762
9763 return 0;
9764
9765 }
9766
/*
 * Marshal the userland namespace-handler argument struct (pointed to by
 * `data`, already copied into the kernel) into a kernel-only
 * namespace_handler_data of user_addr_t fields, then block in
 * wait_for_namespace_event().  Root only.  `size` selects which struct
 * variant (info / info_ext / data) the caller passed; optional fields
 * absent from smaller variants stay zero from the bzero below.
 */
static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
{
	int error = 0;
	namespace_handler_data nhd;

	bzero (&nhd, sizeof(namespace_handler_data));

	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
		return error;
	}

	error = validate_namespace_args (is64bit, size);
	if (error) {
		return error;
	}

	/* Copy in the userland pointers into our kernel-only struct */

	if (is64bit) {
		/* 64 bit userland structures */
		nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
		nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
		nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;

		/* If the size is greater than the standard info struct, add in extra fields */
		if (size > (sizeof(user64_namespace_handler_info))) {
			if (size >= (sizeof(user64_namespace_handler_info_ext))) {
				nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
			}
			if (size == (sizeof(user64_namespace_handler_data))) {
				nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
			}
			/* Otherwise the fields were pre-zeroed when we did the bzero above. */
		}
	}
	else {
		/* 32 bit userland structures */
		nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
		nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
		nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);

		if (size > (sizeof(user32_namespace_handler_info))) {
			if (size >= (sizeof(user32_namespace_handler_info_ext))) {
				nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
			}
			if (size == (sizeof(user32_namespace_handler_data))) {
				nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
			}
			/* Otherwise the fields were pre-zeroed when we did the bzero above. */
		}
	}

	return wait_for_namespace_event(&nhd, nspace_type);
}
9821
9822 /*
9823 * Make a filesystem-specific control call:
9824 */
9825 /* ARGSUSED */
/*
 * Common implementation for fsctl()/ffsctl(): marshal the ioctl-style
 * argument between userland and a kernel buffer, handle the generic
 * FSCTL_* commands inline, and forward anything else to the filesystem
 * via VNOP_IOCTL().
 *
 * NOTE: FSCTL_SYNC_VOLUME drops the iocount on *arg_vp and sets it to
 * NULL; callers must re-check *arg_vp before calling vnode_put().
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error=0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) return (EINVAL);

	is64bit = proc_is64bit(p);

	memp = NULL;


	/*
	 * ensure the buffer is large enough for underlying calls
	 */
#ifndef HFSIOC_GETPATH
	typedef char pn_t[MAXPATHLEN];
#define HFSIOC_GETPATH _IOWR('h', 13, pn_t)
#endif

#ifndef HFS_GETPATH
#define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH)
#endif
	if (IOCBASECMD(cmd) == HFS_GETPATH) {
		/* Round up to MAXPATHLEN regardless of user input */
		size = MAXPATHLEN;
	}

	/* Large arguments go to a heap buffer; small ones use the stack. */
	if (size > sizeof (stkbuf)) {
		if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Input command with a payload: copy it in. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree (memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length input: the "argument" is the pointer itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			}
			else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		}
		else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (IOCBASECMD(cmd)) {

		case FSCTL_SYNC_VOLUME: {
			mount_t mp = vp->v_mount;
			int arg = *(uint32_t*)data;

			/* record vid of vp so we can drop it below. */
			uint32_t vvid = vp->v_id;

			/*
			 * Then grab mount_iterref so that we can release the vnode.
			 * Without this, a thread may call vnode_iterate_prepare then
			 * get into a deadlock because we've never released the root vp
			 */
			error = mount_iterref (mp, 0);
			if (error)  {
				break;
			}
			vnode_put(vp);

			/* issue the sync for this volume */
			(void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);

			/*
			 * Then release the mount_iterref once we're done syncing; it's not
			 * needed for the VNOP_IOCTL below
			 */
			mount_iterdrop(mp);

			if (arg & FSCTL_SYNC_FULLSYNC) {
				/* re-obtain vnode iocount on the root vp, if possible */
				error = vnode_getwithvid (vp, vvid);
				if (error == 0) {
					error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
					vnode_put (vp);
				}
			}
			/* mark the argument VP as having been released */
			*arg_vp = NULL;
		}
		break;

		case FSCTL_ROUTEFS_SETROUTEID: {
#if ROUTEFS
			char routepath[MAXPATHLEN];
			size_t len = 0;

			/* Root only: mounts the routefs at the given path. */
			if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
				break;
			}
			bzero(routepath, MAXPATHLEN);
			error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
			if (error) {
				break;
			}
			error = routefs_kernel_mount(routepath);
			if (error) {
				break;
			}
#endif
		}
		break;

		case FSCTL_SET_PACKAGE_EXTS: {
			user_addr_t ext_strings;
			uint32_t    num_entries;
			uint32_t    max_width;

			if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
				break;

			if (   (is64bit && size != sizeof(user64_package_ext_info))
			    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {

				// either you're 64-bit and passed a 64-bit struct or
				// you're 32-bit and passed a 32-bit struct.  otherwise
				// it's not ok.
				error = EINVAL;
				break;
			}

			if (is64bit) {
				ext_strings = ((user64_package_ext_info *)data)->strings;
				num_entries = ((user64_package_ext_info *)data)->num_entries;
				max_width = ((user64_package_ext_info *)data)->max_width;
			} else {
				ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
				num_entries = ((user32_package_ext_info *)data)->num_entries;
				max_width = ((user32_package_ext_info *)data)->max_width;
			}
			error = set_package_extensions_table(ext_strings, num_entries, max_width);
		}
		break;

		/* namespace handlers */
		case FSCTL_NAMESPACE_HANDLER_GET: {
			error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
		}
		break;

		/* Snapshot handlers */
		case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
			error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
		}
		break;

		case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
			error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
		}
		break;

		case FSCTL_NAMESPACE_HANDLER_UPDATE: {
			/* Handler asks for more time on an in-flight event:
			 * set the RESET_TIMER flag so the waiter re-arms its
			 * timeout instead of giving up (see
			 * resolve_nspace_item_ext()). */
			uint32_t token, val;
			int i;

			if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
				break;
			}

			if (!nspace_is_special_process(p)) {
				error = EINVAL;
				break;
			}

			token = ((uint32_t *)data)[0];
			val   = ((uint32_t *)data)[1];

			lck_mtx_lock(&nspace_handler_lock);

			for(i=0; i < MAX_NSPACE_ITEMS; i++) {
				if (nspace_items[i].token == token) {
					break;  /* exit for loop, not case stmt */
				}
			}

			if (i >= MAX_NSPACE_ITEMS) {
				error = ENOENT;
			} else {
				//
				// if this bit is set, when resolve_nspace_item() times out
				// it will loop and go back to sleep.
				//
				nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
			}

			lck_mtx_unlock(&nspace_handler_lock);

			if (error) {
				printf("nspace-handler-update: did not find token %u\n", token);
			}
		}
		break;

		case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
			/* Handler finished an event: mark the item DONE and
			 * wake the blocked thread. */
			uint32_t token, val;
			int i;

			if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
				break;
			}

			if (!nspace_is_special_process(p)) {
				error = EINVAL;
				break;
			}

			token = ((uint32_t *)data)[0];
			val   = ((uint32_t *)data)[1];

			lck_mtx_lock(&nspace_handler_lock);

			for(i=0; i < MAX_NSPACE_ITEMS; i++) {
				if (nspace_items[i].token == token) {
					break;  /* exit for loop, not case statement */
				}
			}

			if (i >= MAX_NSPACE_ITEMS) {
				printf("nspace-handler-unblock: did not find token %u\n", token);
				error = ENOENT;
			} else {
				if (val == 0 && nspace_items[i].vp) {
					vnode_lock_spin(nspace_items[i].vp);
					nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
					vnode_unlock(nspace_items[i].vp);
				}

				nspace_items[i].vp = NULL;
				nspace_items[i].arg = NULL;
				nspace_items[i].op = 0;
				nspace_items[i].vid = 0;
				nspace_items[i].flags = NSPACE_ITEM_DONE;
				nspace_items[i].token = 0;

				wakeup((caddr_t)&(nspace_items[i].vp));
			}

			lck_mtx_unlock(&nspace_handler_lock);
		}
		break;

		case FSCTL_NAMESPACE_HANDLER_CANCEL: {
			/* Handler cancels an event: the waiter will return
			 * `val` (stored in the token field) as its error. */
			uint32_t token, val;
			int i;

			if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
				break;
			}

			if (!nspace_is_special_process(p)) {
				error = EINVAL;
				break;
			}

			token = ((uint32_t *)data)[0];
			val   = ((uint32_t *)data)[1];

			lck_mtx_lock(&nspace_handler_lock);

			for(i=0; i < MAX_NSPACE_ITEMS; i++) {
				if (nspace_items[i].token == token) {
					break;  /* exit for loop, not case stmt */
				}
			}

			if (i >= MAX_NSPACE_ITEMS) {
				printf("nspace-handler-cancel: did not find token %u\n", token);
				error = ENOENT;
			} else {
				if (nspace_items[i].vp) {
					vnode_lock_spin(nspace_items[i].vp);
					nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
					vnode_unlock(nspace_items[i].vp);
				}

				nspace_items[i].vp = NULL;
				nspace_items[i].arg = NULL;
				nspace_items[i].vid = 0;
				nspace_items[i].token = val;
				nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
				nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;

				wakeup((caddr_t)&(nspace_items[i].vp));
			}

			lck_mtx_unlock(&nspace_handler_lock);
		}
		break;

		case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
			if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
				break;
			}

			// we explicitly do not do the namespace_handler_proc check here

			lck_mtx_lock(&nspace_handler_lock);
			snapshot_timestamp = ((uint32_t *)data)[0];
			wakeup(&nspace_item_idx);
			lck_mtx_unlock(&nspace_handler_lock);
			printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);

		}
		break;

		case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
		{
			/* Root-only toggle: let snapshot events fire for vnodes
			 * on virtual (disk-image) devices. */
			if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
				break;
			}

			lck_mtx_lock(&nspace_handler_lock);
			nspace_allow_virtual_devs = ((uint32_t *)data)[0];
			lck_mtx_unlock(&nspace_handler_lock);
			printf("nspace-snapshot-handler will%s allow events on disk-images\n",
			       nspace_allow_virtual_devs ? "" : " NOT");
			error = 0;

		}
		break;

		case FSCTL_SET_FSTYPENAME_OVERRIDE:
		{
			/* Root-only: override the reported filesystem type name
			 * for this mount; the empty string clears the override. */
			if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
				break;
			}
			if (vp->v_mount) {
				mount_lock(vp->v_mount);
				if (data[0] != 0) {
					strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
					vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
					if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
						vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
						vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
					}
				} else {
					if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
						vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
					}
					vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
					vp->v_mount->fstypename_override[0] = '\0';
				}
				mount_unlock(vp->v_mount);
			}
		}
		break;

		default: {
			/* Invoke the filesystem-specific code */
			error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
		}

	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size)
		error = copyout(data, udata, size);

	if (memp) {
		kfree(memp, size);
	}

	return error;
}
10228
10229 /* ARGSUSED */
10230 int
10231 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10232 {
10233 int error;
10234 struct nameidata nd;
10235 u_long nameiflags;
10236 vnode_t vp = NULL;
10237 vfs_context_t ctx = vfs_context_current();
10238
10239 AUDIT_ARG(cmd, uap->cmd);
10240 AUDIT_ARG(value32, uap->options);
10241 /* Get the vnode for the file we are getting info on: */
10242 nameiflags = 0;
10243 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
10244 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10245 UIO_USERSPACE, uap->path, ctx);
10246 if ((error = namei(&nd))) goto done;
10247 vp = nd.ni_vp;
10248 nameidone(&nd);
10249
10250 #if CONFIG_MACF
10251 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
10252 if (error) {
10253 goto done;
10254 }
10255 #endif
10256
10257 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10258
10259 done:
10260 if (vp)
10261 vnode_put(vp);
10262 return error;
10263 }
10264 /* ARGSUSED */
10265 int
10266 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
10267 {
10268 int error;
10269 vnode_t vp = NULL;
10270 vfs_context_t ctx = vfs_context_current();
10271 int fd = -1;
10272
10273 AUDIT_ARG(fd, uap->fd);
10274 AUDIT_ARG(cmd, uap->cmd);
10275 AUDIT_ARG(value32, uap->options);
10276
10277 /* Get the vnode for the file we are getting info on: */
10278 if ((error = file_vnode(uap->fd, &vp)))
10279 return error;
10280 fd = uap->fd;
10281 if ((error = vnode_getwithref(vp))) {
10282 file_drop(fd);
10283 return error;
10284 }
10285
10286 #if CONFIG_MACF
10287 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
10288 file_drop(fd);
10289 vnode_put(vp);
10290 return error;
10291 }
10292 #endif
10293
10294 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10295
10296 file_drop(fd);
10297
10298 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
10299 if (vp) {
10300 vnode_put(vp);
10301 }
10302
10303 return error;
10304 }
10305 /* end of fsctl system call */
10306
10307 /*
10308 * Retrieve the data of an extended attribute.
10309 */
10310 int
10311 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
10312 {
10313 vnode_t vp;
10314 struct nameidata nd;
10315 char attrname[XATTR_MAXNAMELEN+1];
10316 vfs_context_t ctx = vfs_context_current();
10317 uio_t auio = NULL;
10318 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10319 size_t attrsize = 0;
10320 size_t namelen;
10321 u_int32_t nameiflags;
10322 int error;
10323 char uio_buf[ UIO_SIZEOF(1) ];
10324
10325 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10326 return (EINVAL);
10327
10328 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10329 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
10330 if ((error = namei(&nd))) {
10331 return (error);
10332 }
10333 vp = nd.ni_vp;
10334 nameidone(&nd);
10335
10336 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10337 goto out;
10338 }
10339 if (xattr_protected(attrname)) {
10340 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
10341 error = EPERM;
10342 goto out;
10343 }
10344 }
10345 /*
10346 * the specific check for 0xffffffff is a hack to preserve
10347 * binaray compatibilty in K64 with applications that discovered
10348 * that passing in a buf pointer and a size of -1 resulted in
10349 * just the size of the indicated extended attribute being returned.
10350 * this isn't part of the documented behavior, but because of the
10351 * original implemtation's check for "uap->size > 0", this behavior
10352 * was allowed. In K32 that check turned into a signed comparison
10353 * even though uap->size is unsigned... in K64, we blow by that
10354 * check because uap->size is unsigned and doesn't get sign smeared
10355 * in the munger for a 32 bit user app. we also need to add a
10356 * check to limit the maximum size of the buffer being passed in...
10357 * unfortunately, the underlying fileystems seem to just malloc
10358 * the requested size even if the actual extended attribute is tiny.
10359 * because that malloc is for kernel wired memory, we have to put a
10360 * sane limit on it.
10361 *
10362 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
10363 * U64 running on K64 will yield -1 (64 bits wide)
10364 * U32/U64 running on K32 will yield -1 (32 bits wide)
10365 */
10366 if (uap->size == 0xffffffff || uap->size == (size_t)-1)
10367 goto no_uio;
10368
10369 if (uap->value) {
10370 if (uap->size > (size_t)XATTR_MAXSIZE)
10371 uap->size = XATTR_MAXSIZE;
10372
10373 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10374 &uio_buf[0], sizeof(uio_buf));
10375 uio_addiov(auio, uap->value, uap->size);
10376 }
10377 no_uio:
10378 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
10379 out:
10380 vnode_put(vp);
10381
10382 if (auio) {
10383 *retval = uap->size - uio_resid(auio);
10384 } else {
10385 *retval = (user_ssize_t)attrsize;
10386 }
10387
10388 return (error);
10389 }
10390
10391 /*
10392 * Retrieve the data of an extended attribute.
10393 */
10394 int
10395 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10396 {
10397 vnode_t vp;
10398 char attrname[XATTR_MAXNAMELEN+1];
10399 uio_t auio = NULL;
10400 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10401 size_t attrsize = 0;
10402 size_t namelen;
10403 int error;
10404 char uio_buf[ UIO_SIZEOF(1) ];
10405
10406 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10407 return (EINVAL);
10408
10409 if ( (error = file_vnode(uap->fd, &vp)) ) {
10410 return (error);
10411 }
10412 if ( (error = vnode_getwithref(vp)) ) {
10413 file_drop(uap->fd);
10414 return(error);
10415 }
10416 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10417 goto out;
10418 }
10419 if (xattr_protected(attrname)) {
10420 error = EPERM;
10421 goto out;
10422 }
10423 if (uap->value && uap->size > 0) {
10424 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10425 &uio_buf[0], sizeof(uio_buf));
10426 uio_addiov(auio, uap->value, uap->size);
10427 }
10428
10429 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10430 out:
10431 (void)vnode_put(vp);
10432 file_drop(uap->fd);
10433
10434 if (auio) {
10435 *retval = uap->size - uio_resid(auio);
10436 } else {
10437 *retval = (user_ssize_t)attrsize;
10438 }
10439 return (error);
10440 }
10441
10442 /*
10443 * Set the data of an extended attribute.
10444 */
10445 int
10446 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10447 {
10448 vnode_t vp;
10449 struct nameidata nd;
10450 char attrname[XATTR_MAXNAMELEN+1];
10451 vfs_context_t ctx = vfs_context_current();
10452 uio_t auio = NULL;
10453 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10454 size_t namelen;
10455 u_int32_t nameiflags;
10456 int error;
10457 char uio_buf[ UIO_SIZEOF(1) ];
10458
10459 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10460 return (EINVAL);
10461
10462 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10463 if (error == EPERM) {
10464 /* if the string won't fit in attrname, copyinstr emits EPERM */
10465 return (ENAMETOOLONG);
10466 }
10467 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10468 return error;
10469 }
10470 if (xattr_protected(attrname))
10471 return(EPERM);
10472 if (uap->size != 0 && uap->value == 0) {
10473 return (EINVAL);
10474 }
10475
10476 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10477 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10478 if ((error = namei(&nd))) {
10479 return (error);
10480 }
10481 vp = nd.ni_vp;
10482 nameidone(&nd);
10483
10484 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10485 &uio_buf[0], sizeof(uio_buf));
10486 uio_addiov(auio, uap->value, uap->size);
10487
10488 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10489 #if CONFIG_FSE
10490 if (error == 0) {
10491 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10492 FSE_ARG_VNODE, vp,
10493 FSE_ARG_DONE);
10494 }
10495 #endif
10496 vnode_put(vp);
10497 *retval = 0;
10498 return (error);
10499 }
10500
10501 /*
10502 * Set the data of an extended attribute.
10503 */
10504 int
10505 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
10506 {
10507 vnode_t vp;
10508 char attrname[XATTR_MAXNAMELEN+1];
10509 uio_t auio = NULL;
10510 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10511 size_t namelen;
10512 int error;
10513 char uio_buf[ UIO_SIZEOF(1) ];
10514 #if CONFIG_FSE
10515 vfs_context_t ctx = vfs_context_current();
10516 #endif
10517
10518 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10519 return (EINVAL);
10520
10521 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10522 if (error == EPERM) {
10523 /* if the string won't fit in attrname, copyinstr emits EPERM */
10524 return (ENAMETOOLONG);
10525 }
10526 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10527 return error;
10528 }
10529 if (xattr_protected(attrname))
10530 return(EPERM);
10531 if (uap->size != 0 && uap->value == 0) {
10532 return (EINVAL);
10533 }
10534 if ( (error = file_vnode(uap->fd, &vp)) ) {
10535 return (error);
10536 }
10537 if ( (error = vnode_getwithref(vp)) ) {
10538 file_drop(uap->fd);
10539 return(error);
10540 }
10541 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10542 &uio_buf[0], sizeof(uio_buf));
10543 uio_addiov(auio, uap->value, uap->size);
10544
10545 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
10546 #if CONFIG_FSE
10547 if (error == 0) {
10548 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10549 FSE_ARG_VNODE, vp,
10550 FSE_ARG_DONE);
10551 }
10552 #endif
10553 vnode_put(vp);
10554 file_drop(uap->fd);
10555 *retval = 0;
10556 return (error);
10557 }
10558
10559 /*
10560 * Remove an extended attribute.
10561 * XXX Code duplication here.
10562 */
10563 int
10564 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10565 {
10566 vnode_t vp;
10567 struct nameidata nd;
10568 char attrname[XATTR_MAXNAMELEN+1];
10569 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10570 vfs_context_t ctx = vfs_context_current();
10571 size_t namelen;
10572 u_int32_t nameiflags;
10573 int error;
10574
10575 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10576 return (EINVAL);
10577
10578 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10579 if (error != 0) {
10580 return (error);
10581 }
10582 if (xattr_protected(attrname))
10583 return(EPERM);
10584 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10585 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10586 if ((error = namei(&nd))) {
10587 return (error);
10588 }
10589 vp = nd.ni_vp;
10590 nameidone(&nd);
10591
10592 error = vn_removexattr(vp, attrname, uap->options, ctx);
10593 #if CONFIG_FSE
10594 if (error == 0) {
10595 add_fsevent(FSE_XATTR_REMOVED, ctx,
10596 FSE_ARG_VNODE, vp,
10597 FSE_ARG_DONE);
10598 }
10599 #endif
10600 vnode_put(vp);
10601 *retval = 0;
10602 return (error);
10603 }
10604
10605 /*
10606 * Remove an extended attribute.
10607 * XXX Code duplication here.
10608 */
10609 int
10610 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10611 {
10612 vnode_t vp;
10613 char attrname[XATTR_MAXNAMELEN+1];
10614 size_t namelen;
10615 int error;
10616 #if CONFIG_FSE
10617 vfs_context_t ctx = vfs_context_current();
10618 #endif
10619
10620 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10621 return (EINVAL);
10622
10623 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10624 if (error != 0) {
10625 return (error);
10626 }
10627 if (xattr_protected(attrname))
10628 return(EPERM);
10629 if ( (error = file_vnode(uap->fd, &vp)) ) {
10630 return (error);
10631 }
10632 if ( (error = vnode_getwithref(vp)) ) {
10633 file_drop(uap->fd);
10634 return(error);
10635 }
10636
10637 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10638 #if CONFIG_FSE
10639 if (error == 0) {
10640 add_fsevent(FSE_XATTR_REMOVED, ctx,
10641 FSE_ARG_VNODE, vp,
10642 FSE_ARG_DONE);
10643 }
10644 #endif
10645 vnode_put(vp);
10646 file_drop(uap->fd);
10647 *retval = 0;
10648 return (error);
10649 }
10650
10651 /*
10652 * Retrieve the list of extended attribute names.
10653 * XXX Code duplication here.
10654 */
10655 int
10656 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10657 {
10658 vnode_t vp;
10659 struct nameidata nd;
10660 vfs_context_t ctx = vfs_context_current();
10661 uio_t auio = NULL;
10662 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10663 size_t attrsize = 0;
10664 u_int32_t nameiflags;
10665 int error;
10666 char uio_buf[ UIO_SIZEOF(1) ];
10667
10668 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10669 return (EINVAL);
10670
10671 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10672 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10673 if ((error = namei(&nd))) {
10674 return (error);
10675 }
10676 vp = nd.ni_vp;
10677 nameidone(&nd);
10678 if (uap->namebuf != 0 && uap->bufsize > 0) {
10679 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10680 &uio_buf[0], sizeof(uio_buf));
10681 uio_addiov(auio, uap->namebuf, uap->bufsize);
10682 }
10683
10684 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10685
10686 vnode_put(vp);
10687 if (auio) {
10688 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10689 } else {
10690 *retval = (user_ssize_t)attrsize;
10691 }
10692 return (error);
10693 }
10694
10695 /*
10696 * Retrieve the list of extended attribute names.
10697 * XXX Code duplication here.
10698 */
10699 int
10700 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10701 {
10702 vnode_t vp;
10703 uio_t auio = NULL;
10704 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10705 size_t attrsize = 0;
10706 int error;
10707 char uio_buf[ UIO_SIZEOF(1) ];
10708
10709 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10710 return (EINVAL);
10711
10712 if ( (error = file_vnode(uap->fd, &vp)) ) {
10713 return (error);
10714 }
10715 if ( (error = vnode_getwithref(vp)) ) {
10716 file_drop(uap->fd);
10717 return(error);
10718 }
10719 if (uap->namebuf != 0 && uap->bufsize > 0) {
10720 auio = uio_createwithbuffer(1, 0, spacetype,
10721 UIO_READ, &uio_buf[0], sizeof(uio_buf));
10722 uio_addiov(auio, uap->namebuf, uap->bufsize);
10723 }
10724
10725 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10726
10727 vnode_put(vp);
10728 file_drop(uap->fd);
10729 if (auio) {
10730 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10731 } else {
10732 *retval = (user_ssize_t)attrsize;
10733 }
10734 return (error);
10735 }
10736
/*
 * fsgetpath_internal: resolve a (volfs id, object id) pair to the
 * object's absolute path.
 *
 * Looks the mount up by volfs id (taking a busy reference), vgets the
 * vnode by file id -- objid 2 conventionally names the volume root --
 * and builds the path into the caller-supplied 'buf' (at most
 * PAGE_SIZE bytes).  On success *pathlen is the path length.
 */
static int fsgetpath_internal(
    vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, int *pathlen)
{
    int error;
    struct mount *mp = NULL;
    vnode_t vp;
    int length;
    int bpflags;

    if (bufsize > PAGE_SIZE) {
        return (EINVAL);
    }

    if (buf == NULL) {
        return (ENOMEM);
    }

    /* Takes a busy reference on the mount; released via vfs_unbusy() below. */
    if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
        error = ENOTSUP;  /* unexpected failure */
        return ENOTSUP;
    }

unionget:
    /* objid 2 is the conventional root-directory file id. */
    if (objid == 2) {
        error = VFS_ROOT(mp, &vp, ctx);
    } else {
        error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
    }

    if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
        /*
         * If the fileid isn't found and we're in a union
         * mount volume, then see if the fileid is in the
         * mounted-on volume.
         */
        struct mount *tmp = mp;
        mp = vnode_mount(tmp->mnt_vnodecovered);
        vfs_unbusy(tmp);
        /* If the covered mount is busy, give up rather than block. */
        if (vfs_busy(mp, LK_NOWAIT) == 0)
            goto unionget;
    } else {
        vfs_unbusy(mp);
    }

    if (error) {
        return error;
    }

#if CONFIG_MACF
    error = mac_vnode_check_fsgetpath(ctx, vp);
    if (error) {
        vnode_put(vp);
        return error;
    }
#endif

    /* Obtain the absolute path to this vnode. */
    bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
    bpflags |= BUILDPATH_CHECK_MOVED;
    error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
    vnode_put(vp);

    if (error) {
        goto out;
    }

    AUDIT_ARG(text, buf);

    if (kdebug_enable) {
        long dbg_parms[NUMPARMS];
        int dbg_namelen;

        dbg_namelen = (int)sizeof(dbg_parms);

        if (length < dbg_namelen) {
            /* Short path: copy it whole and zero-pad the remainder. */
            memcpy((char *)dbg_parms, buf, length);
            memset((char *)dbg_parms + length, 0, dbg_namelen - length);

            dbg_namelen = length;
        } else {
            /* Long path: keep only the trailing dbg_namelen bytes. */
            memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
        }

        /*
         * NOTE(review): vp's iocount was dropped above; it appears to be
         * passed here only as an opaque tag for the trace record --
         * confirm kdebug_lookup_gen_events never dereferences it.
         */
        kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
    }

    *pathlen = (user_ssize_t)length;  /* may be superseded by error */

out:
    return (error);
}
10829
10830 /*
10831 * Obtain the full pathname of a file system object by id.
10832 *
10833 * This is a private SPI used by the File Manager.
10834 */
10835 __private_extern__
10836 int
10837 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10838 {
10839 vfs_context_t ctx = vfs_context_current();
10840 fsid_t fsid;
10841 char *realpath;
10842 int length;
10843 int error;
10844
10845 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10846 return (error);
10847 }
10848 AUDIT_ARG(value32, fsid.val[0]);
10849 AUDIT_ARG(value64, uap->objid);
10850 /* Restrict output buffer size for now. */
10851
10852 if (uap->bufsize > PAGE_SIZE) {
10853 return (EINVAL);
10854 }
10855 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10856 if (realpath == NULL) {
10857 return (ENOMEM);
10858 }
10859
10860 error = fsgetpath_internal(
10861 ctx, fsid.val[0], uap->objid,
10862 uap->bufsize, realpath, &length);
10863
10864 if (error) {
10865 goto out;
10866 }
10867
10868 error = copyout((caddr_t)realpath, uap->buf, length);
10869
10870 *retval = (user_ssize_t)length; /* may be superseded by error */
10871 out:
10872 if (realpath) {
10873 FREE(realpath, M_TEMP);
10874 }
10875 return (error);
10876 }
10877
/*
 * Common routine to handle various flavors of statfs data heading out
 *	to user space.
 *
 * Fills either a user64_statfs or user32_statfs image from the mount's
 * vfsstatfs and copies it out to 'bufp'.  When 'partial_copy' is set,
 * the trailing reserved fields are omitted from the copyout (legacy
 * callers use the shorter layout).  *sizep, if non-NULL, receives the
 * full (non-partial) structure size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
    int error;
    int my_size, copy_size;

    if (is_64_bit) {
        struct user64_statfs sfs;
        my_size = copy_size = sizeof(sfs);
        bzero(&sfs, my_size);
        sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
        sfs.f_type = mp->mnt_vtable->vfc_typenum;
        sfs.f_reserved1 = (short)sfsp->f_fssubtype;
        sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
        sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
        sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
        sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
        sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
        sfs.f_files = (user64_long_t)sfsp->f_files;
        sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
        sfs.f_fsid = sfsp->f_fsid;
        sfs.f_owner = sfsp->f_owner;
        /* Some mounts advertise an overridden filesystem type name. */
        if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
            strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
        } else {
            strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
        }
        strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
        strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

        if (partial_copy) {
            copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
        }
        error = copyout((caddr_t)&sfs, bufp, copy_size);
    }
    else {
        struct user32_statfs sfs;

        my_size = copy_size = sizeof(sfs);
        bzero(&sfs, my_size);

        sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
        sfs.f_type = mp->mnt_vtable->vfc_typenum;
        sfs.f_reserved1 = (short)sfsp->f_fssubtype;

        /*
         * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
         * have to fudge the numbers here in that case.   We inflate the blocksize in order
         * to reflect the filesystem size as best we can.
         */
        if ((sfsp->f_blocks > INT_MAX)
            /* Hack for 4061702 . I think the real fix is for Carbon to
             * look for some volume capability and not depend on hidden
             * semantics agreed between a FS and carbon.
             * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
             * for Carbon to set bNoVolumeSizes volume attribute.
             * Without this the webdavfs files cannot be copied onto
             * disk as they look huge. This change should not affect
             * XSAN as they should not setting these to -1..
             */
             && (sfsp->f_blocks != 0xffffffffffffffffULL)
             && (sfsp->f_bfree != 0xffffffffffffffffULL)
             && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
            int		shift;

            /*
             * Work out how far we have to shift the block count down to make it fit.
             * Note that it's possible to have to shift so far that the resulting
             * blocksize would be unreportably large.  At that point, we will clip
             * any values that don't fit.
             *
             * For safety's sake, we also ensure that f_iosize is never reported as
             * being smaller than f_bsize.
             */
            for (shift = 0; shift < 32; shift++) {
                if ((sfsp->f_blocks >> shift) <= INT_MAX)
                    break;
                if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
                    break;
            }
#define __SHIFT_OR_CLIP(x, s)	((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
            sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
            sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
            sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
            sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
            sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
        } else {
            /* filesystem is small enough to be reported honestly */
            sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
            sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
            sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
            sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
            sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
        }
        sfs.f_files = (user32_long_t)sfsp->f_files;
        sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
        sfs.f_fsid = sfsp->f_fsid;
        sfs.f_owner = sfsp->f_owner;
        if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
            strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
        } else {
            strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
        }
        strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
        strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

        if (partial_copy) {
            copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
        }
        error = copyout((caddr_t)&sfs, bufp, copy_size);
    }

    if (sizep != NULL) {
        *sizep = my_size;
    }
    return(error);
}
11005
/*
 * copy stat structure into user_stat structure.
 *
 * Field-for-field ABI copy from the kernel's struct stat into the
 * 64-bit-user layout.  The destination is zeroed first so padding and
 * reserved fields never leak kernel stack contents.  Timestamps are
 * copied either as timespecs or as split sec/nsec fields depending on
 * whether _POSIX_C_SOURCE is defined.
 */
void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
    bzero(usbp, sizeof(*usbp));

    usbp->st_dev = sbp->st_dev;
    usbp->st_ino = sbp->st_ino;
    usbp->st_mode = sbp->st_mode;
    usbp->st_nlink = sbp->st_nlink;
    usbp->st_uid = sbp->st_uid;
    usbp->st_gid = sbp->st_gid;
    usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
    usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
    usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
    usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
    usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
    usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
    usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
    usbp->st_atime = sbp->st_atime;
    usbp->st_atimensec = sbp->st_atimensec;
    usbp->st_mtime = sbp->st_mtime;
    usbp->st_mtimensec = sbp->st_mtimensec;
    usbp->st_ctime = sbp->st_ctime;
    usbp->st_ctimensec = sbp->st_ctimensec;
#endif
    usbp->st_size = sbp->st_size;
    usbp->st_blocks = sbp->st_blocks;
    usbp->st_blksize = sbp->st_blksize;
    usbp->st_flags = sbp->st_flags;
    usbp->st_gen = sbp->st_gen;
    usbp->st_lspare = sbp->st_lspare;
    usbp->st_qspare[0] = sbp->st_qspare[0];
    usbp->st_qspare[1] = sbp->st_qspare[1];
}
11044
/*
 * Field-for-field ABI copy from the kernel's struct stat into the
 * 32-bit-user layout (sizes may narrow per the user32 field types).
 * The destination is zeroed first so padding never leaks kernel data.
 */
void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
    bzero(usbp, sizeof(*usbp));

    usbp->st_dev = sbp->st_dev;
    usbp->st_ino = sbp->st_ino;
    usbp->st_mode = sbp->st_mode;
    usbp->st_nlink = sbp->st_nlink;
    usbp->st_uid = sbp->st_uid;
    usbp->st_gid = sbp->st_gid;
    usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
    usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
    usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
    usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
    usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
    usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
    usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
    usbp->st_atime = sbp->st_atime;
    usbp->st_atimensec = sbp->st_atimensec;
    usbp->st_mtime = sbp->st_mtime;
    usbp->st_mtimensec = sbp->st_mtimensec;
    usbp->st_ctime = sbp->st_ctime;
    usbp->st_ctimensec = sbp->st_ctimensec;
#endif
    usbp->st_size = sbp->st_size;
    usbp->st_blocks = sbp->st_blocks;
    usbp->st_blksize = sbp->st_blksize;
    usbp->st_flags = sbp->st_flags;
    usbp->st_gen = sbp->st_gen;
    usbp->st_lspare = sbp->st_lspare;
    usbp->st_qspare[0] = sbp->st_qspare[0];
    usbp->st_qspare[1] = sbp->st_qspare[1];
}
11080
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Same field-for-field ABI copy as munge_user64_stat() but for the
 * stat64 layout, which adds the birthtime fields.  Destination is
 * zeroed first so padding never leaks kernel data.
 */
void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
    bzero(usbp, sizeof(*usbp));

    usbp->st_dev = sbp->st_dev;
    usbp->st_ino = sbp->st_ino;
    usbp->st_mode = sbp->st_mode;
    usbp->st_nlink = sbp->st_nlink;
    usbp->st_uid = sbp->st_uid;
    usbp->st_gid = sbp->st_gid;
    usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
    usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
    usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
    usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
    usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
    usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
    usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
    usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
    usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
    usbp->st_atime = sbp->st_atime;
    usbp->st_atimensec = sbp->st_atimensec;
    usbp->st_mtime = sbp->st_mtime;
    usbp->st_mtimensec = sbp->st_mtimensec;
    usbp->st_ctime = sbp->st_ctime;
    usbp->st_ctimensec = sbp->st_ctimensec;
    usbp->st_birthtime = sbp->st_birthtime;
    usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
    usbp->st_size = sbp->st_size;
    usbp->st_blocks = sbp->st_blocks;
    usbp->st_blksize = sbp->st_blksize;
    usbp->st_flags = sbp->st_flags;
    usbp->st_gen = sbp->st_gen;
    usbp->st_lspare = sbp->st_lspare;
    usbp->st_qspare[0] = sbp->st_qspare[0];
    usbp->st_qspare[1] = sbp->st_qspare[1];
}
11123
/*
 * Field-for-field ABI copy of the stat64 layout for 32-bit user
 * processes (includes birthtime).  Destination is zeroed first so
 * padding never leaks kernel data.
 */
void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
    bzero(usbp, sizeof(*usbp));

    usbp->st_dev = sbp->st_dev;
    usbp->st_ino = sbp->st_ino;
    usbp->st_mode = sbp->st_mode;
    usbp->st_nlink = sbp->st_nlink;
    usbp->st_uid = sbp->st_uid;
    usbp->st_gid = sbp->st_gid;
    usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
    usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
    usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
    usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
    usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
    usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
    usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
    usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
    usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
    usbp->st_atime = sbp->st_atime;
    usbp->st_atimensec = sbp->st_atimensec;
    usbp->st_mtime = sbp->st_mtime;
    usbp->st_mtimensec = sbp->st_mtimensec;
    usbp->st_ctime = sbp->st_ctime;
    usbp->st_ctimensec = sbp->st_ctimensec;
    usbp->st_birthtime = sbp->st_birthtime;
    usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
    usbp->st_size = sbp->st_size;
    usbp->st_blocks = sbp->st_blocks;
    usbp->st_blksize = sbp->st_blksize;
    usbp->st_flags = sbp->st_flags;
    usbp->st_gen = sbp->st_gen;
    usbp->st_lspare = sbp->st_lspare;
    usbp->st_qspare[0] = sbp->st_qspare[0];
    usbp->st_qspare[1] = sbp->st_qspare[1];
}
11163
11164 /*
11165 * Purge buffer cache for simulating cold starts
11166 */
11167 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11168 {
11169 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11170
11171 return VNODE_RETURNED;
11172 }
11173
11174 static int vfs_purge_callback(mount_t mp, __unused void * arg)
11175 {
11176 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11177
11178 return VFS_RETURNED;
11179 }
11180
11181 int
11182 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11183 {
11184 if (!kauth_cred_issuser(kauth_cred_get()))
11185 return EPERM;
11186
11187 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11188
11189 return 0;
11190 }
11191
/*
 * gets the vnode associated with the (unnamed) snapshot directory
 * for a Filesystem. The snapshot directory vnode is returned with
 * an iocount on it.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
    int error;

    /* Ask the filesystem for its snapshot directory first. */
    error = VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);

#if CLONE_SNAPSHOT_FALLBACKS_ENABLED
    if (error == ENOTSUP) {
        struct nameidata snapnd;

        /*
         * Temporary fallback to <mountpoint>/.snaps lookup
         * XXX: To be removed.
         */
        NDINIT(&snapnd, LOOKUP, OP_LOOKUP, USEDVP,
               UIO_SYSSPACE, CAST_USER_ADDR_T(".snaps"), ctx);
        snapnd.ni_dvp = rvp;

        /* Any lookup failure is still reported as "not supported". */
        if ((error = namei(&snapnd))) {
            error = ENOTSUP;
            *sdvpp = NULLVP;
        } else {
            *sdvpp = snapnd.ni_vp;
            nameidone(&snapnd);
        }
    }
#endif /* CLONE_SNAPSHOT_FALLBACKS_ENABLED */
    return (error);
}
11227
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
    int error, i;
    caddr_t name_buf;
    size_t name_len;
    struct vfs_attr vfa;

    *sdvpp = NULLVP;
    *rvpp = NULLVP;

    /* dirfd must name the root of the volume being snapshotted. */
    error = vnode_getfromfd(ctx, dirfd, rvpp);
    if (error)
        return (error);

    if (!vnode_isvroot(*rvpp)) {
        error = EINVAL;
        goto out;
    }

    /* Make sure the filesystem supports snapshots */
    VFSATTR_INIT(&vfa);
    VFSATTR_WANTED(&vfa, f_capabilities);
    if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
        !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
        !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
        VOL_CAP_INT_SNAPSHOT)) ||
        !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
        VOL_CAP_INT_SNAPSHOT))) {
        error = ENOTSUP;
        goto out;
    }

    error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
    if (error)
        goto out;

    /* M_WAITOK: allocation blocks until memory is available. */
    MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
    error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
    if (error)
        goto out1;

    /*
     * Some sanity checks- name can't be empty, "." or ".." or have slashes.
     * (the length returned by copyinstr includes the terminating NUL)
     */
    if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
        (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
        error = EINVAL;
        goto out1;
    }
    /* Scan for a '/'; if the loop stops early, one was found. */
    for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
    if (i < (int)name_len) {
        error = EINVAL;
        goto out1;
    }

#if CONFIG_MACF
    if (op == CREATE) {
        error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
            name_buf);
    } else if (op == DELETE) {
        error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
            name_buf);
    }
    if (error)
        goto out1;
#endif

    /* Check if the snapshot already exists ... */
    NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
           UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
    ndp->ni_dvp = *sdvpp;

    error = namei(ndp);
out1:
    FREE(name_buf, M_TEMP);
out:
    /* On any failure, drop both iocounts and NULL the out-params. */
    if (error) {
        if (*sdvpp) {
            vnode_put(*sdvpp);
            *sdvpp = NULLVP;
        }
        if (*rvpp) {
            vnode_put(*rvpp);
            *rvpp = NULLVP;
        }
    }
    return (error);
}
11335
11336 /*
11337 * create a filesystem snapshot (for supporting filesystems)
11338 *
11339 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
11340 * We get to the (unnamed) snapshot directory vnode and create the vnode
11341 * for the snapshot in it.
11342 *
11343 * Restrictions:
11344 *
11345 * a) Passed in name for snapshot cannot have slashes.
11346 * b) name can't be "." or ".."
11347 *
11348 * Since this requires superuser privileges, vnode_authorize calls are not
11349 * made.
11350 */
static int
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata namend;

	/*
	 * Resolve the filesystem root (from dirfd), its snapshot directory
	 * and attempt a CREATE-intent lookup of the snapshot name.  On
	 * success we hold iocounts on rvp and snapdvp, and namend describes
	 * the (possibly pre-existing) snapshot entry.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
	    OP_LINK, ctx);
	if (error)
		return (error);

	if (namend.ni_vp) {
		/* Lookup found an existing snapshot with this name. */
		vnode_put(namend.ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr va;
		vnode_t vp = NULLVP;

		/*
		 * Create the snapshot as a zero-mode regular file.  The
		 * filesystem recognizes creation in the snapshot directory
		 * as a snapshot request.
		 */
		VATTR_INIT(&va);
		VATTR_SET(&va, va_type, VREG);
		VATTR_SET(&va, va_mode, 0);

		/*
		 * Caller already passed the superuser privilege check (see
		 * fs_snapshot()), so authorization and inheritance are
		 * skipped here.
		 */
		error = vn_create(snapdvp, &vp, &namend, &va,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp)
			vnode_put(vp);
#if CLONE_SNAPSHOT_FALLBACKS_ENABLED
		else if (error) {
			/* Fall back to asking the FS to clone the root. */
			error = VNOP_COPYFILE(rvp, rvp, NULLVP, &namend.ni_cnd,
			    0, 0, ctx);
		}
#endif /* CLONE_SNAPSHOT_FALLBACKS_ENABLED */
	}

	/* Release namei state and the iocounts taken by vnode_get_snapshot(). */
	nameidone(&namend);
	vnode_put(snapdvp);
	vnode_put(rvp);
	return (error);
}
11392
11393 /*
11394 * Delete a Filesystem snapshot
11395 *
11396 * get the vnode for the unnamed snapshot directory and the snapshot and
11397 * delete the snapshot.
11398 */
11399 static int
11400 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11401 vfs_context_t ctx)
11402 {
11403 vnode_t rvp, snapdvp;
11404 int error;
11405 struct nameidata namend;
11406
11407 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11408 OP_UNLINK, ctx);
11409 if (error)
11410 goto out;
11411
11412 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11413 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11414
11415 vnode_put(namend.ni_vp);
11416 nameidone(&namend);
11417 vnode_put(snapdvp);
11418 vnode_put(rvp);
11419 out:
11420 return (error);
11421 }
11422
11423 /*
11424 * Revert a filesystem to a snapshot
11425 *
11426 * Marks the filesystem to revert to the given snapshot on next mount.
11427 */
static int
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return (error);
	}
	mp = vnode_mount(rvp);

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref (mp, 0);
	vnode_put(rvp);
	if (error) {
		return (error);
	}

	/* Copy the user-supplied snapshot name into a kernel buffer. */
	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		mount_iterdrop(mp);
		FREE(name_buf, M_TEMP);
		return (error);
	}

	/*
	 * Build a minimal componentname describing the snapshot name for the
	 * filesystem-level revert ioctl.  HASBUF marks cn_pnbuf as ours.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	/* First attempt: ask the mount itself to schedule the revert. */
	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot vnode so we can ioctl it directly. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return (error);
		}


		/*
		 * Local fallback definitions of the APFS revert ioctl, used
		 * when the APFS headers are not available at build time.
		 */
#ifndef APFSIOC_REVERT_TO_SNAPSHOT
#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
#endif

#ifndef APFS_REVERT_TO_SNAPSHOT
#define APFS_REVERT_TO_SNAPSHOT IOCBASECMD(APFSIOC_REVERT_TO_SNAPSHOT)
#endif

		/* Second attempt: issue the revert directly on the snapshot vnode. */
		error = VNOP_IOCTL(namend.ni_vp, APFS_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		/* Release namei state and all iocounts taken above. */
		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return (error);
}
11509
11510 /*
11511 * rename a Filesystem snapshot
11512 *
11513 * get the vnode for the unnamed snapshot directory and the snapshot and
11514 * rename the snapshot. This is a very specialised (and simple) case of
11515 * rename(2) (which has to deal with a lot more complications). It differs
11516 * slightly from rename(2) in that EEXIST is returned if the new name exists.
11517 */
static int
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Resolve the root, the snapshot directory and the existing (old)
	 * snapshot with DELETE intent; on success we hold iocounts on rvp,
	 * snapdvp and fromnd->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error)
		goto out;
	fvp = fromnd->ni_vp;

	MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error)
		goto out1;

	/*
	 * Some sanity checks - new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; if the loop stops early, a slash was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that name for MAC. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error)
		goto out1;
#endif

	/* Look up the destination name inside the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in the same snapshot directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* Cascading cleanup: each label releases what was acquired before it. */
out2:
	nameidone(tond);
out1:
	FREE(newname_buf, M_TEMP);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	FREE(__rename_data, M_TEMP);
	return (error);
}
11607
11608 /*
11609 * Mount a Filesystem snapshot
11610 *
11611 * get the vnode for the unnamed snapshot directory and the snapshot and
11612 * mount the snapshot.
11613 */
static int
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
	    M_TEMP, M_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Resolve the root, the snapshot directory and the named snapshot;
	 * on success we hold iocounts on rvp, snapdvp and snapndp->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error)
		goto out;

	snapvp = snapndp->ni_vp;
	/* Refuse if the source filesystem has been unmounted underneath us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error)
		goto out1;

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;

	/* Mounting over the root of the root filesystem is not allowed. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
	} else {
		mount_t mp = vnode_mount(rvp);
		struct fs_snapshot_mount_args smnt_data;

		/*
		 * First try a kernel-initiated snapshot mount, handing the
		 * FS the source mount and the snapshot's componentname.
		 */
		smnt_data.sm_mp = mp;
		smnt_data.sm_cnp = &snapndp->ni_cnd;
		error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
		    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), 0,
		    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
		if (error) {
			/* Retry with user passed args */
			error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp,
			    vp, &dirndp->ni_cnd, CAST_USER_ADDR_T(mnt_data), 0,
			    0, NULL, FALSE, ctx);
		}
	}

	/* Release the cover vnode, its parent and the directory namei state. */
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	FREE(__snapshot_mount_data, M_TEMP);
	return (error);
}
11684
11685 /*
11686 * FS snapshot operations dispatcher
11687 */
11688 int
11689 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
11690 __unused int32_t *retval)
11691 {
11692 int error;
11693 vfs_context_t ctx = vfs_context_current();
11694
11695 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
11696 if (error)
11697 return (error);
11698
11699 switch (uap->op) {
11700 case SNAPSHOT_OP_CREATE:
11701 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
11702 break;
11703 case SNAPSHOT_OP_DELETE:
11704 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
11705 break;
11706 case SNAPSHOT_OP_RENAME:
11707 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
11708 uap->flags, ctx);
11709 break;
11710 case SNAPSHOT_OP_MOUNT:
11711 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
11712 uap->data, uap->flags, ctx);
11713 break;
11714 case SNAPSHOT_OP_REVERT:
11715 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
11716 break;
11717 default:
11718 error = ENOSYS;
11719 }
11720
11721 return (error);
11722 }