bsd/vfs/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1995-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1989, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  * (c) UNIX System Laboratories, Inc.
  32  * All or some portions of this file are derived from material licensed
  33  * to the University of California by American Telephone and Telegraph
  34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  35  * the permission of UNIX System Laboratories, Inc.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)vfs_syscalls.c      8.41 (Berkeley) 6/15/95
  66  */
  67 /*
  68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  69  * support for mandatory and extensible security protections.  This notice
  70  * is included in support of clause 2.2 (b) of the Apple Public License,
  71  * Version 2.0.
  72  */
  73
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/namei.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/kernel.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/stat.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/proc_internal.h>
  84 #include <sys/kauth.h>
  85 #include <sys/uio_internal.h>
  86 #include <sys/malloc.h>
  87 #include <sys/mman.h>
  88 #include <sys/dirent.h>
  89 #include <sys/attr.h>
  90 #include <sys/sysctl.h>
  91 #include <sys/ubc.h>
  92 #include <sys/quota.h>
  93 #include <sys/kdebug.h>
  94 #include <sys/fsevents.h>
  95 #include <sys/imgsrc.h>
  96 #include <sys/sysproto.h>
  97 #include <sys/xattr.h>
  98 #include <sys/fcntl.h>
  99 #include <sys/fsctl.h>
 100 #include <sys/ubc_internal.h>
 101 #include <sys/disk.h>
 102 #include <sys/content_protection.h>
 103 #include <machine/cons.h>
 104 #include <machine/limits.h>
 105 #include <miscfs/specfs/specdev.h>
 106
 107 #include <security/audit/audit.h>
 108 #include <bsm/audit_kevents.h>
 109
 110 #include <mach/mach_types.h>
 111 #include <kern/kern_types.h>
 112 #include <kern/kalloc.h>
 113 #include <kern/task.h>
 114
 115 #include <vm/vm_pageout.h>
 116
 117 #include <libkern/OSAtomic.h>
 118 #include <pexpert/pexpert.h>
 119 #include <IOKit/IOBSD.h>
 120
 121 #if CONFIG_MACF
 122 #include <security/mac.h>
 123 #include <security/mac_framework.h>
 124 #endif
 125
 126 #if CONFIG_FSE
 127 #define GET_PATH(x) \
 128         (x) = get_pathbuff();
 129 #define RELEASE_PATH(x) \
 130         release_pathbuff(x);
 131 #else
 132 #define GET_PATH(x)     \
 133         MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
 134 #define RELEASE_PATH(x) \
 135         FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
 136 #endif /* CONFIG_FSE */
 137
 138 /* struct for checkdirs iteration */
 139 struct cdirargs {
 140         vnode_t olddp;
 141         vnode_t newdp;
 142 };
 143 /* callback  for checkdirs iteration */
 144 static int checkdirs_callback(proc_t p, void * arg);
 145
 146 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
 147 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
 148 void enablequotas(struct mount *mp, vfs_context_t ctx);
 149 static int getfsstat_callback(mount_t mp, void * arg);
 150 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
 151 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
 152 static int sync_callback(mount_t, void *);
 153 static void sync_thread(void *, __unused wait_result_t);
 154 static int sync_async(int);
 155 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
 156                         user_addr_t bufp, int *sizep, boolean_t is_64_bit,
 157                                                 boolean_t partial_copy);
 158 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
 159                         user_addr_t bufp);
 160 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
 161 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 162                         struct componentname *cnp, user_addr_t fsmountargs,
 163                         int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
 164                         vfs_context_t ctx);
 165 void vfs_notify_mount(vnode_t pdvp);
 166
 167 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
 168
 169 struct fd_vn_data * fg_vn_data_alloc(void);
 170
 171 /*
 172  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
 173  * Concurrent lookups (or lookups by ids) on hard links can cause the
 174  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
 175  * does) to return ENOENT as the path cannot be returned from the name cache
 176  * alone. We have no option but to retry and hope to get one namei->reverse path
 177  * generation done without an intervening lookup, lookup by id on the hard link
 178  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
 179  * which currently are the MAC hooks for rename, unlink and rmdir.
 180  */
 181 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
 182
 183 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
 184
 185 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
 186
 187 #ifdef CONFIG_IMGSRC_ACCESS
 188 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
 189 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
 190 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
 191 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
 192 static void mount_end_update(mount_t mp);
 193 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
 194 #endif /* CONFIG_IMGSRC_ACCESS */
 195
 196 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
 197
 198 __private_extern__
 199 int sync_internal(void);
 200
 201 __private_extern__
 202 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
 203
 204 extern lck_grp_t *fd_vn_lck_grp;
 205 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
 206 extern lck_attr_t *fd_vn_lck_attr;
 207
 208 /*
 209  * incremented each time a mount or unmount operation occurs
 210  * used to invalidate the cached value of the rootvp in the
 211  * mount structure utilized by cache_lookup_path
 212  */
 213 uint32_t mount_generation = 0;
 214
 215 /* counts number of mount and unmount operations */
 216 unsigned int vfs_nummntops=0;
 217
 218 extern const struct fileops vnops;
 219 #if CONFIG_APPLEDOUBLE
 220 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 221 #endif /* CONFIG_APPLEDOUBLE */
 222
 223 typedef uint32_t vfs_rename_flags_t;
 224 #if CONFIG_SECLUDED_RENAME
 225 enum {
 226         VFS_SECLUDE_RENAME              = 0x00000001
 227 };
 228 #endif
 229
 230 /*
 231  * Virtual File System System Calls
 232  */
 233
 234 #if NFSCLIENT || DEVFS
 235 /*
 236  * Private in-kernel mounting spi (NFS only, not exported)
 237  */
 238  __private_extern__
 239 boolean_t
 240 vfs_iskernelmount(mount_t mp)
 241 {
 242         return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
 243 }
 244
 245  __private_extern__
 246 int
 247 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
 248              void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
 249 {
 250         struct nameidata nd;
 251         boolean_t did_namei;
 252         int error;
 253
 254         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 255                UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
 256
 257         /*
 258          * Get the vnode to be covered if it's not supplied
 259          */
 260         if (vp == NULLVP) {
 261                 error = namei(&nd);
 262                 if (error)
 263                         return (error);
 264                 vp = nd.ni_vp;
 265                 pvp = nd.ni_dvp;
 266                 did_namei = TRUE;
 267         } else {
 268                 char *pnbuf = CAST_DOWN(char *, path);
 269
 270                 nd.ni_cnd.cn_pnbuf = pnbuf;
 271                 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
 272                 did_namei = FALSE;
 273         }
 274
 275         error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
 276                              syscall_flags, kern_flags, NULL, TRUE, ctx);
 277
 278         if (did_namei) {
 279                 vnode_put(vp);
 280                 vnode_put(pvp);
 281                 nameidone(&nd);
 282         }
 283
 284         return (error);
 285 }
 286 #endif /* NFSCLIENT || DEVFS */
 287
 288 /*
 289  * Mount a file system.
 290  */
 291 /* ARGSUSED */
 292 int
 293 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
 294 {
 295         struct __mac_mount_args muap;
 296
 297         muap.type = uap->type;
 298         muap.path = uap->path;
 299         muap.flags = uap->flags;
 300         muap.data = uap->data;
 301         muap.mac_p = USER_ADDR_NULL;
 302         return (__mac_mount(p, &muap, retval));
 303 }
 304
 305 void
 306 vfs_notify_mount(vnode_t pdvp)
 307 {
 308         vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
 309         lock_vnode_and_post(pdvp, NOTE_WRITE);
 310 }
 311
 312 /*
 313  * __mac_mount:
 314  *      Mount a file system taking into account MAC label behavior.
 315  *      See mount(2) man page for more information
 316  *
 317  * Parameters:    p                        Process requesting the mount
 318  *                uap                      User argument descriptor (see below)
 319  *                retval                   (ignored)
 320  *
 321  * Indirect:      uap->type                Filesystem type
 322  *                uap->path                Path to mount
 323  *                uap->data                Mount arguments
 324  *                uap->mac_p               MAC info
 325  *                uap->flags               Mount flags
 326  *
 327  *
 328  * Returns:        0                       Success
 329  *                !0                       Not success
 330  */
 331 boolean_t root_fs_upgrade_try = FALSE;
 332
 333 int
 334 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
 335 {
 336         vnode_t pvp = NULL;
 337         vnode_t vp = NULL;
 338         int need_nameidone = 0;
 339         vfs_context_t ctx = vfs_context_current();
 340         char fstypename[MFSNAMELEN];
 341         struct nameidata nd;
 342         size_t dummy=0;
 343         char *labelstr = NULL;
 344         int flags = uap->flags;
 345         int error;
 346 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
 347         boolean_t is_64bit = IS_64BIT_PROCESS(p);
 348 #else
 349 #pragma unused(p)
 350 #endif
 351         /*
 352          * Get the fs type name from user space
 353          */
 354         error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
 355         if (error)
 356                 return (error);
 357
 358         /*
 359          * Get the vnode to be covered
 360          */
 361         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 362                UIO_USERSPACE, uap->path, ctx);
 363         error = namei(&nd);
 364         if (error) {
 365                 goto out;
 366         }
 367         need_nameidone = 1;
 368         vp = nd.ni_vp;
 369         pvp = nd.ni_dvp;
 370
 371 #ifdef CONFIG_IMGSRC_ACCESS
 372         /* Mounting image source cannot be batched with other operations */
 373         if (flags == MNT_IMGSRC_BY_INDEX) {
 374                 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
 375                                                   ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
 376                 goto out;
 377         }
 378 #endif /* CONFIG_IMGSRC_ACCESS */
 379
 380 #if CONFIG_MACF
 381         /*
 382          * Get the label string (if any) from user space
 383          */
 384         if (uap->mac_p != USER_ADDR_NULL) {
 385                 struct user_mac mac;
 386                 size_t ulen = 0;
 387
 388                 if (is_64bit) {
 389                         struct user64_mac mac64;
 390                         error = copyin(uap->mac_p, &mac64, sizeof(mac64));
 391                         mac.m_buflen = mac64.m_buflen;
 392                         mac.m_string = mac64.m_string;
 393                 } else {
 394                         struct user32_mac mac32;
 395                         error = copyin(uap->mac_p, &mac32, sizeof(mac32));
 396                         mac.m_buflen = mac32.m_buflen;
 397                         mac.m_string = mac32.m_string;
 398                 }
 399                 if (error)
 400                         goto out;
 401                 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
 402                     (mac.m_buflen < 2)) {
 403                         error = EINVAL;
 404                         goto out;
 405                 }
 406                 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
 407                 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
 408                 if (error) {
 409                         goto out;
 410                 }
 411                 AUDIT_ARG(mac_string, labelstr);
 412         }
 413 #endif /* CONFIG_MACF */
 414
 415         AUDIT_ARG(fflags, flags);
 416
 417 #if SECURE_KERNEL
 418         if (flags & MNT_UNION) {
 419                 /* No union mounts on release kernels */
 420                 error = EPERM;
 421                 goto out;
 422         }
 423 #endif
 424
 425         if ((vp->v_flag & VROOT) &&
 426                         (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
 427                 if (!(flags & MNT_UNION)) {
 428                         flags |= MNT_UPDATE;
 429                 }
 430                 else {
 431                         /*
 432                          * For a union mount on '/', treat it as fresh
 433                          * mount instead of update.
 434                          * Otherwise, union mouting on '/' used to panic the
 435                          * system before, since mnt_vnodecovered was found to
 436                          * be NULL for '/' which is required for unionlookup
 437                          * after it gets ENOENT on union mount.
 438                          */
 439                         flags = (flags & ~(MNT_UPDATE));
 440                 }
 441
 442 #if SECURE_KERNEL
 443                 if ((flags & MNT_RDONLY) == 0) {
 444                         /* Release kernels are not allowed to mount "/" as rw */
 445                         error = EPERM;
 446                         goto out;
 447                 }
 448 #endif
 449                 /*
 450                  * See 7392553 for more details on why this check exists.
 451                  * Suffice to say: If this check is ON and something tries
 452                  * to mount the rootFS RW, we'll turn off the codesign
 453                  * bitmap optimization.
 454                  */
 455 #if CHECK_CS_VALIDATION_BITMAP
 456                 if ((flags & MNT_RDONLY) == 0 ) {
 457                         root_fs_upgrade_try = TRUE;
 458                 }
 459 #endif
 460         }
 461
 462         error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
 463                              labelstr, FALSE, ctx);
 464
 465 out:
 466
 467 #if CONFIG_MACF
 468         if (labelstr)
 469                 FREE(labelstr, M_MACTEMP);
 470 #endif /* CONFIG_MACF */
 471
 472         if (vp) {
 473                 vnode_put(vp);
 474         }
 475         if (pvp) {
 476                 vnode_put(pvp);
 477         }
 478         if (need_nameidone) {
 479                 nameidone(&nd);
 480         }
 481
 482         return (error);
 483 }
 484
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename      file system type (i.e. its vfs name)
 *  pvp             parent of covered vnode
 *  vp              covered vnode
 *  cnp             component name (i.e. path) of covered vnode
 *  flags           generic mount flags
 *  fsmountargs     file system specific data
 *  internal_flags  kernel-internal mount flags (KERNEL_MOUNT_*)
 *  labelstr        optional MAC label
 *  kernelmount     TRUE for mounts initiated from inside the kernel
 *  ctx             caller's context
 */
 499 static int
 500 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 501              struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
 502              char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
 503 {
 504 #if !CONFIG_MACF
 505 #pragma unused(labelstr)
 506 #endif
 507         struct vnode *devvp = NULLVP;
 508         struct vnode *device_vnode = NULLVP;
 509 #if CONFIG_MACF
 510         struct vnode *rvp;
 511 #endif
 512         struct mount *mp;
 513         struct vfstable *vfsp = (struct vfstable *)0;
 514         struct proc *p = vfs_context_proc(ctx);
 515         int error, flag = 0;
 516         user_addr_t devpath = USER_ADDR_NULL;
 517         int ronly = 0;
 518         int mntalloc = 0;
 519         boolean_t vfsp_ref = FALSE;
 520         boolean_t is_rwlock_locked = FALSE;
 521         boolean_t did_rele = FALSE;
 522         boolean_t have_usecount = FALSE;
 523
 524         /*
 525          * Process an update for an existing mount
 526          */
 527         if (flags & MNT_UPDATE) {
 528                 if ((vp->v_flag & VROOT) == 0) {
 529                         error = EINVAL;
 530                         goto out1;
 531                 }
 532                 mp = vp->v_mount;
 533
 534                 /* unmount in progress return error */
 535                 mount_lock_spin(mp);
 536                 if (mp->mnt_lflag & MNT_LUNMOUNT) {
 537                         mount_unlock(mp);
 538                         error = EBUSY;
 539                         goto out1;
 540                 }
 541                 mount_unlock(mp);
 542                 lck_rw_lock_exclusive(&mp->mnt_rwlock);
 543                 is_rwlock_locked = TRUE;
 544                 /*
 545                  * We only allow the filesystem to be reloaded if it
 546                  * is currently mounted read-only.
 547                  */
 548                 if ((flags & MNT_RELOAD) &&
 549                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 550                         error = ENOTSUP;
 551                         goto out1;
 552                 }
 553
 554                 /*
 555                  * If content protection is enabled, update mounts are not
 556                  * allowed to turn it off.
 557                  */
 558                 if ((mp->mnt_flag & MNT_CPROTECT) &&
 559                            ((flags & MNT_CPROTECT) == 0)) {
 560                         error = EINVAL;
 561                         goto out1;
 562                 }
 563
 564 #ifdef CONFIG_IMGSRC_ACCESS
 565                 /* Can't downgrade the backer of the root FS */
 566                 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
 567                         (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
 568                         error = ENOTSUP;
 569                         goto out1;
 570                 }
 571 #endif /* CONFIG_IMGSRC_ACCESS */
 572
 573                 /*
 574                  * Only root, or the user that did the original mount is
 575                  * permitted to update it.
 576                  */
 577                 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
 578                     (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
 579                         goto out1;
 580                 }
 581 #if CONFIG_MACF
 582                 error = mac_mount_check_remount(ctx, mp);
 583                 if (error != 0) {
 584                         goto out1;
 585                 }
 586 #endif
 587                 /*
 588                  * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
 589                  * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
 590                  */
 591                 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 592                         flags |= MNT_NOSUID | MNT_NODEV;
 593                         if (mp->mnt_flag & MNT_NOEXEC)
 594                                 flags |= MNT_NOEXEC;
 595                 }
 596                 flag = mp->mnt_flag;
 597
 598
 599
 600                 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 601
 602                 vfsp = mp->mnt_vtable;
 603                 goto update;
 604         }
 605         /*
 606          * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
 607          * MNT_NOEXEC if mount point is already MNT_NOEXEC.
 608          */
 609         if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 610                 flags |= MNT_NOSUID | MNT_NODEV;
 611                 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 612                         flags |= MNT_NOEXEC;
 613         }
 614
 615         /* XXXAUDIT: Should we capture the type on the error path as well? */
 616         AUDIT_ARG(text, fstypename);
 617         mount_list_lock();
 618         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 619                 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
 620                         vfsp->vfc_refcount++;
 621                         vfsp_ref = TRUE;
 622                         break;
 623                 }
 624         mount_list_unlock();
 625         if (vfsp == NULL) {
 626                 error = ENODEV;
 627                 goto out1;
 628         }
 629
 630         /*
 631          * VFC_VFSLOCALARGS is not currently supported for kernel mounts
 632          */
 633         if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
 634                 error = EINVAL;  /* unsupported request */
 635                 goto out1;
 636         }
 637
 638         error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
 639         if (error != 0) {
 640                 goto out1;
 641         }
 642
 643         /*
 644          * Allocate and initialize the filesystem (mount_t)
 645          */
 646         MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
 647                 M_MOUNT, M_WAITOK);
 648         bzero((char *)mp, (u_int32_t)sizeof(struct mount));
 649         mntalloc = 1;
 650
 651         /* Initialize the default IO constraints */
 652         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
 653         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
 654         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
 655         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
 656         mp->mnt_devblocksize = DEV_BSIZE;
 657         mp->mnt_alignmentmask = PAGE_MASK;
 658         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
 659         mp->mnt_ioscale = 1;
 660         mp->mnt_ioflags = 0;
 661         mp->mnt_realrootvp = NULLVP;
 662         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
 663
 664         TAILQ_INIT(&mp->mnt_vnodelist);
 665         TAILQ_INIT(&mp->mnt_workerqueue);
 666         TAILQ_INIT(&mp->mnt_newvnodes);
 667         mount_lock_init(mp);
 668         lck_rw_lock_exclusive(&mp->mnt_rwlock);
 669         is_rwlock_locked = TRUE;
 670         mp->mnt_op = vfsp->vfc_vfsops;
 671         mp->mnt_vtable = vfsp;
 672         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
 673         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 674         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
 675         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
 676         mp->mnt_vnodecovered = vp;
 677         mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
 678         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
 679         mp->mnt_devbsdunit = 0;
 680
 681         /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
 682         vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
 683
 684 #if NFSCLIENT || DEVFS
 685         if (kernelmount)
 686                 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
 687         if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
 688                 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
 689 #endif /* NFSCLIENT || DEVFS */
 690
 691 update:
 692         /*
 693          * Set the mount level flags.
 694          */
 695         if (flags & MNT_RDONLY)
 696                 mp->mnt_flag |= MNT_RDONLY;
 697         else if (mp->mnt_flag & MNT_RDONLY) {
 698                 // disallow read/write upgrades of file systems that
 699                 // had the TYPENAME_OVERRIDE feature set.
 700                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
 701                         error = EPERM;
 702                         goto out1;
 703                 }
 704                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
 705         }
 706         mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 707                           MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 708                           MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 709                           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 710                           MNT_QUARANTINE | MNT_CPROTECT);
 711         mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 712                                  MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 713                                  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 714                                  MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 715                                  MNT_QUARANTINE | MNT_CPROTECT);
 716
 717 #if CONFIG_MACF
 718         if (flags & MNT_MULTILABEL) {
 719                 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
 720                         error = EINVAL;
 721                         goto out1;
 722                 }
 723                 mp->mnt_flag |= MNT_MULTILABEL;
 724         }
 725 #endif
 726         /*
 727          * Process device path for local file systems if requested
 728          */
 729         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
 730                 if (vfs_context_is64bit(ctx)) {
 731                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
 732                                 goto out1;
 733                         fsmountargs += sizeof(devpath);
 734                 } else {
 735                         user32_addr_t tmp;
 736                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
 737                                 goto out1;
 738                         /* munge into LP64 addr */
 739                         devpath = CAST_USER_ADDR_T(tmp);
 740                         fsmountargs += sizeof(tmp);
 741                 }
 742
 743                 /* Lookup device and authorize access to it */
 744                 if ((devpath)) {
 745                         struct nameidata nd;
 746
 747                         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
 748                         if ( (error = namei(&nd)) )
 749                                 goto out1;
 750
 751                         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
 752                         devvp = nd.ni_vp;
 753
 754                         nameidone(&nd);
 755
 756                         if (devvp->v_type != VBLK) {
 757                                 error = ENOTBLK;
 758                                 goto out2;
 759                         }
 760                         if (major(devvp->v_rdev) >= nblkdev) {
 761                                 error = ENXIO;
 762                                 goto out2;
 763                         }
 764                         /*
 765                         * If mount by non-root, then verify that user has necessary
 766                         * permissions on the device.
 767                         */
 768                         if (suser(vfs_context_ucred(ctx), NULL) != 0) {
 769                                 mode_t accessmode = KAUTH_VNODE_READ_DATA;
 770
 771                                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
 772                                         accessmode |= KAUTH_VNODE_WRITE_DATA;
 773                                 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
 774                                         goto out2;
 775                         }
 776                 }
 777                 /* On first mount, preflight and open device */
 778                 if (devpath && ((flags & MNT_UPDATE) == 0)) {
 779                         if ( (error = vnode_ref(devvp)) )
 780                                 goto out2;
 781                         /*
 782                         * Disallow multiple mounts of the same device.
 783                         * Disallow mounting of a device that is currently in use
 784                         * (except for root, which might share swap device for miniroot).
 785                         * Flush out any old buffers remaining from a previous use.
 786                         */
 787                         if ( (error = vfs_mountedon(devvp)) )
 788                                 goto out3;
 789
 790                         if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
 791                                 error = EBUSY;
 792                                 goto out3;
 793                         }
 794                         if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
 795                                 error = ENOTBLK;
 796                                 goto out3;
 797                         }
 798                         if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
 799                                 goto out3;
 800
 801                         ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 802 #if CONFIG_MACF
 803                         error = mac_vnode_check_open(ctx,
 804                             devvp,
 805                             ronly ? FREAD : FREAD|FWRITE);
 806                         if (error)
 807                                 goto out3;
 808 #endif /* MAC */
 809                         if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
 810                                 goto out3;
 811
 812                         mp->mnt_devvp = devvp;
 813                         device_vnode = devvp;
 814
 815                 } else if ((mp->mnt_flag & MNT_RDONLY) &&
 816                            (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
 817                            (device_vnode = mp->mnt_devvp)) {
 818                         dev_t dev;
 819                         int maj;
 820                         /*
 821                          * If upgrade to read-write by non-root, then verify
 822                          * that user has necessary permissions on the device.
 823                          */
 824                         vnode_getalways(device_vnode);
 825
 826                         if (suser(vfs_context_ucred(ctx), NULL) &&
 827                             (error = vnode_authorize(device_vnode, NULL,
 828                              KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
 829                              ctx)) != 0) {
 830                                 vnode_put(device_vnode);
 831                                 goto out2;
 832                         }
 833
 834                         /* Tell the device that we're upgrading */
 835                         dev = (dev_t)device_vnode->v_rdev;
 836                         maj = major(dev);
 837
 838                         if ((u_int)maj >= (u_int)nblkdev)
 839                                 panic("Volume mounted on a device with invalid major number.");
 840
 841                         error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
 842                         vnode_put(device_vnode);
 843                         device_vnode = NULLVP;
 844                         if (error != 0) {
 845                                 goto out2;
 846                         }
 847                 }
 848         }
 849 #if CONFIG_MACF
 850         if ((flags & MNT_UPDATE) == 0) {
 851                 mac_mount_label_init(mp);
 852                 mac_mount_label_associate(ctx, mp);
 853         }
 854         if (labelstr) {
 855                 if ((flags & MNT_UPDATE) != 0) {
 856                         error = mac_mount_check_label_update(ctx, mp);
 857                         if (error != 0)
 858                                 goto out3;
 859                 }
 860         }
 861 #endif
 862         /*
 863          * Mount the filesystem.
 864          */
 865         error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
 866
 867         if (flags & MNT_UPDATE) {
 868                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 869                         mp->mnt_flag &= ~MNT_RDONLY;
 870                 mp->mnt_flag &=~
 871                     (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
 872                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 873                 if (error)
 874                         mp->mnt_flag = flag;  /* restore flag value */
 875                 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
 876                 lck_rw_done(&mp->mnt_rwlock);
 877                 is_rwlock_locked = FALSE;
 878                 if (!error)
 879                         enablequotas(mp, ctx);
 880                 goto exit;
 881         }
 882
 883         /*
 884          * Put the new filesystem on the mount list after root.
 885          */
 886         if (error == 0) {
 887                 struct vfs_attr vfsattr;
 888 #if CONFIG_MACF
 889                 if (vfs_flags(mp) & MNT_MULTILABEL) {
 890                         error = VFS_ROOT(mp, &rvp, ctx);
 891                         if (error) {
 892                                 printf("%s() VFS_ROOT returned %d\n", __func__, error);
 893                                 goto out3;
 894                         }
 895                         error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
 896                         /*
 897                          * drop reference provided by VFS_ROOT
 898                          */
 899                         vnode_put(rvp);
 900
 901                         if (error)
 902                                 goto out3;
 903                 }
 904 #endif  /* MAC */
 905
 906                 vnode_lock_spin(vp);
 907                 CLR(vp->v_flag, VMOUNT);
 908                 vp->v_mountedhere = mp;
 909                 vnode_unlock(vp);
 910
 911                 /*
 912                  * taking the name_cache_lock exclusively will
 913                  * insure that everyone is out of the fast path who
 914                  * might be trying to use a now stale copy of
 915                  * vp->v_mountedhere->mnt_realrootvp
 916                  * bumping mount_generation causes the cached values
 917                  * to be invalidated
 918                  */
 919                 name_cache_lock();
 920                 mount_generation++;
 921                 name_cache_unlock();
 922
 923                 error = vnode_ref(vp);
 924                 if (error != 0) {
 925                         goto out4;
 926                 }
 927
 928                 have_usecount = TRUE;
 929
 930                 error = checkdirs(vp, ctx);
 931                 if (error != 0)  {
 932                         /* Unmount the filesystem as cdir/rdirs cannot be updated */
 933                         goto out4;
 934                 }
 935                 /*
 936                  * there is no cleanup code here so I have made it void
 937                  * we need to revisit this
 938                  */
 939                 (void)VFS_START(mp, 0, ctx);
 940
 941                 if (mount_list_add(mp) != 0) {
 942                         /*
 943                          * The system is shutting down trying to umount
 944                          * everything, so fail with a plausible errno.
 945                          */
 946                         error = EBUSY;
 947                         goto out4;
 948                 }
 949                 lck_rw_done(&mp->mnt_rwlock);
 950                 is_rwlock_locked = FALSE;
 951
 952                 /* Check if this mounted file system supports EAs or named streams. */
 953                 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
 954                 VFSATTR_INIT(&vfsattr);
 955                 VFSATTR_WANTED(&vfsattr, f_capabilities);
 956                 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
 957                     vfs_getattr(mp, &vfsattr, ctx) == 0 &&
 958                     VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
 959                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
 960                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
 961                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
 962                         }
 963 #if NAMEDSTREAMS
 964                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
 965                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
 966                                 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
 967                         }
 968 #endif
 969                         /* Check if this file system supports path from id lookups. */
 970                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
 971                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
 972                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
 973                         } else if (mp->mnt_flag & MNT_DOVOLFS) {
 974                                 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
 975                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
 976                         }
 977                 }
 978                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
 979                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
 980                 }
 981                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
 982                         mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
 983                 }
 984                 /* increment the operations count */
 985                 OSAddAtomic(1, &vfs_nummntops);
 986                 enablequotas(mp, ctx);
 987
 988                 if (device_vnode) {
 989                         device_vnode->v_specflags |= SI_MOUNTEDON;
 990
 991                         /*
 992                          *   cache the IO attributes for the underlying physical media...
 993                          *   an error return indicates the underlying driver doesn't
 994                          *   support all the queries necessary... however, reasonable
 995                          *   defaults will have been set, so no reason to bail or care
 996                          */
 997                         vfs_init_io_attributes(device_vnode, mp);
 998                 }
 999
1000                 /* Now that mount is setup, notify the listeners */
1001                 vfs_notify_mount(pvp);
1002                 IOBSDMountChange(mp, kIOMountChangeMount);
1003
1004         } else {
1005                 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1006                 if (mp->mnt_vnodelist.tqh_first != NULL) {
1007                         panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1008                                         mp->mnt_vtable->vfc_name, error);
1009                 }
1010
1011                 vnode_lock_spin(vp);
1012                 CLR(vp->v_flag, VMOUNT);
1013                 vnode_unlock(vp);
1014                 mount_list_lock();
1015                 mp->mnt_vtable->vfc_refcount--;
1016                 mount_list_unlock();
1017
1018                 if (device_vnode ) {
1019                         vnode_rele(device_vnode);
1020                         VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1021                 }
1022                 lck_rw_done(&mp->mnt_rwlock);
1023                 is_rwlock_locked = FALSE;
1024
1025                 /*
1026                  * if we get here, we have a mount structure that needs to be freed,
1027                  * but since the coveredvp hasn't yet been updated to point at it,
1028                  * no need to worry about other threads holding a crossref on this mp
1029                  * so it's ok to just free it
1030                  */
1031                 mount_lock_destroy(mp);
1032 #if CONFIG_MACF
1033                 mac_mount_label_destroy(mp);
1034 #endif
1035                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1036         }
1037 exit:
1038         /*
1039          * drop I/O count on the device vp if there was one
1040          */
1041         if (devpath && devvp)
1042                 vnode_put(devvp);
1043
1044         return(error);
1045
1046 /* Error condition exits */
1047 out4:
1048         (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1049
1050         /*
1051          * If the mount has been placed on the covered vp,
1052          * it may have been discovered by now, so we have
1053          * to treat this just like an unmount
1054          */
1055         mount_lock_spin(mp);
1056         mp->mnt_lflag |= MNT_LDEAD;
1057         mount_unlock(mp);
1058
1059         if (device_vnode != NULLVP) {
1060                 vnode_rele(device_vnode);
1061                 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1062                        ctx);
1063                 did_rele = TRUE;
1064         }
1065
1066         vnode_lock_spin(vp);
1067
1068         mp->mnt_crossref++;
1069         vp->v_mountedhere = (mount_t) 0;
1070
1071         vnode_unlock(vp);
1072
1073         if (have_usecount) {
1074                 vnode_rele(vp);
1075         }
1076 out3:
1077         if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1078                 vnode_rele(devvp);
1079 out2:
1080         if (devpath && devvp)
1081                 vnode_put(devvp);
1082 out1:
1083         /* Release mnt_rwlock only when it was taken */
1084         if (is_rwlock_locked == TRUE) {
1085                 lck_rw_done(&mp->mnt_rwlock);
1086         }
1087
1088         if (mntalloc) {
1089                 if (mp->mnt_crossref)
1090                         mount_dropcrossref(mp, vp, 0);
1091                 else {
1092                         mount_lock_destroy(mp);
1093 #if CONFIG_MACF
1094                         mac_mount_label_destroy(mp);
1095 #endif
1096                         FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1097                 }
1098         }
1099         if (vfsp_ref) {
1100                 mount_list_lock();
1101                 vfsp->vfc_refcount--;
1102                 mount_list_unlock();
1103         }
1104
1105         return(error);
1106 }
1107
1108 /*
1109  * Flush in-core data, check for competing mount attempts,
1110  * and set VMOUNT
1111  */
1112 int
1113 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1114 {
1115 #if !CONFIG_MACF
1116 #pragma unused(cnp,fsname)
1117 #endif
1118         struct vnode_attr va;
1119         int error;
1120
1121         if (!skip_auth) {
1122                 /*
1123                  * If the user is not root, ensure that they own the directory
1124                  * onto which we are attempting to mount.
1125                  */
1126                 VATTR_INIT(&va);
1127                 VATTR_WANTED(&va, va_uid);
1128                 if ((error = vnode_getattr(vp, &va, ctx)) ||
1129                                 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1130                                  (!vfs_context_issuser(ctx)))) {
1131                         error = EPERM;
1132                         goto out;
1133                 }
1134         }
1135
1136         if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1137                 goto out;
1138
1139         if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1140                 goto out;
1141
1142         if (vp->v_type != VDIR) {
1143                 error = ENOTDIR;
1144                 goto out;
1145         }
1146
1147         if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1148                 error = EBUSY;
1149                 goto out;
1150         }
1151
1152 #if CONFIG_MACF
1153         error = mac_mount_check_mount(ctx, vp,
1154             cnp, fsname);
1155         if (error != 0)
1156                 goto out;
1157 #endif
1158
1159         vnode_lock_spin(vp);
1160         SET(vp->v_flag, VMOUNT);
1161         vnode_unlock(vp);
1162
1163 out:
1164         return error;
1165 }
1166
1167 #if CONFIG_IMGSRC_ACCESS
1168
/*
 * Debug tracing for the imageboot-source relocation code: forwards its
 * arguments to printf() when DEBUG is set, and expands to an empty
 * statement otherwise.
 */
#if DEBUG
#define IMGSRC_DEBUG(...) printf(__VA_ARGS__)
#else
#define IMGSRC_DEBUG(...) do { } while(0)
#endif
1174
1175 static int
1176 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1177 {
1178         struct nameidata nd;
1179         vnode_t vp, realdevvp;
1180         mode_t accessmode;
1181         int error;
1182
1183         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1184         if ( (error = namei(&nd)) ) {
1185                 IMGSRC_DEBUG("namei() failed with %d\n", error);
1186                 return error;
1187         }
1188
1189         vp = nd.ni_vp;
1190
1191         if (!vnode_isblk(vp)) {
1192                 IMGSRC_DEBUG("Not block device.\n");
1193                 error = ENOTBLK;
1194                 goto out;
1195         }
1196
1197         realdevvp = mp->mnt_devvp;
1198         if (realdevvp == NULLVP) {
1199                 IMGSRC_DEBUG("No device backs the mount.\n");
1200                 error = ENXIO;
1201                 goto out;
1202         }
1203
1204         error = vnode_getwithref(realdevvp);
1205         if (error != 0) {
1206                 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1207                 goto out;
1208         }
1209
1210         if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1211                 IMGSRC_DEBUG("Wrong dev_t.\n");
1212                 error = ENXIO;
1213                 goto out1;
1214         }
1215
1216         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1217
1218         /*
1219          * If mount by non-root, then verify that user has necessary
1220          * permissions on the device.
1221          */
1222         if (!vfs_context_issuser(ctx)) {
1223                 accessmode = KAUTH_VNODE_READ_DATA;
1224                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1225                         accessmode |= KAUTH_VNODE_WRITE_DATA;
1226                 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1227                         IMGSRC_DEBUG("Access denied.\n");
1228                         goto out1;
1229                 }
1230         }
1231
1232         *devvpp = vp;
1233
1234 out1:
1235         vnode_put(realdevvp);
1236 out:
1237         nameidone(&nd);
1238         if (error) {
1239                 vnode_put(vp);
1240         }
1241
1242         return error;
1243 }
1244
1245 /*
1246  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1247  * and call checkdirs()
1248  */
1249 static int
1250 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1251 {
1252         int error;
1253
1254         mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1255
1256         vnode_lock_spin(vp);
1257         CLR(vp->v_flag, VMOUNT);
1258         vp->v_mountedhere = mp;
1259         vnode_unlock(vp);
1260
1261         /*
1262          * taking the name_cache_lock exclusively will
1263          * insure that everyone is out of the fast path who
1264          * might be trying to use a now stale copy of
1265          * vp->v_mountedhere->mnt_realrootvp
1266          * bumping mount_generation causes the cached values
1267          * to be invalidated
1268          */
1269         name_cache_lock();
1270         mount_generation++;
1271         name_cache_unlock();
1272
1273         error = vnode_ref(vp);
1274         if (error != 0) {
1275                 goto out;
1276         }
1277
1278         error = checkdirs(vp, ctx);
1279         if (error != 0)  {
1280                 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1281                 vnode_rele(vp);
1282                 goto out;
1283         }
1284
1285 out:
1286         if (error != 0) {
1287                 mp->mnt_vnodecovered = NULLVP;
1288         }
1289         return error;
1290 }
1291
1292 static void
1293 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1294 {
1295         vnode_rele(vp);
1296         vnode_lock_spin(vp);
1297         vp->v_mountedhere = (mount_t)NULL;
1298         vnode_unlock(vp);
1299
1300         mp->mnt_vnodecovered = NULLVP;
1301 }
1302
1303 static int
1304 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1305 {
1306         int error;
1307
1308         /* unmount in progress return error */
1309         mount_lock_spin(mp);
1310         if (mp->mnt_lflag & MNT_LUNMOUNT) {
1311                 mount_unlock(mp);
1312                 return EBUSY;
1313         }
1314         mount_unlock(mp);
1315         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1316
1317         /*
1318          * We only allow the filesystem to be reloaded if it
1319          * is currently mounted read-only.
1320          */
1321         if ((flags & MNT_RELOAD) &&
1322                         ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1323                 error = ENOTSUP;
1324                 goto out;
1325         }
1326
1327         /*
1328          * Only root, or the user that did the original mount is
1329          * permitted to update it.
1330          */
1331         if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1332                         (!vfs_context_issuser(ctx))) {
1333                 error = EPERM;
1334                 goto out;
1335         }
1336 #if CONFIG_MACF
1337         error = mac_mount_check_remount(ctx, mp);
1338         if (error != 0) {
1339                 goto out;
1340         }
1341 #endif
1342
1343 out:
1344         if (error) {
1345                 lck_rw_done(&mp->mnt_rwlock);
1346         }
1347
1348         return error;
1349 }
1350
1351 static void
1352 mount_end_update(mount_t mp)
1353 {
1354         lck_rw_done(&mp->mnt_rwlock);
1355 }
1356
1357 static int
1358 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1359 {
1360         vnode_t vp;
1361
1362         if (height >= MAX_IMAGEBOOT_NESTING) {
1363                 return EINVAL;
1364         }
1365
1366         vp = imgsrc_rootvnodes[height];
1367         if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1368                 *rvpp = vp;
1369                 return 0;
1370         } else {
1371                 return ENOENT;
1372         }
1373 }
1374
1375 static int
1376 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1377                 const char *fsname, vfs_context_t ctx,
1378                 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1379 {
1380         int error;
1381         mount_t mp;
1382         boolean_t placed = FALSE;
1383         vnode_t devvp = NULLVP;
1384         struct vfstable *vfsp;
1385         user_addr_t devpath;
1386         char *old_mntonname;
1387         vnode_t rvp;
1388         uint32_t height;
1389         uint32_t flags;
1390
1391         /* If we didn't imageboot, nothing to move */
1392         if (imgsrc_rootvnodes[0] == NULLVP) {
1393                 return EINVAL;
1394         }
1395
1396         /* Only root can do this */
1397         if (!vfs_context_issuser(ctx)) {
1398                 return EPERM;
1399         }
1400
1401         IMGSRC_DEBUG("looking for root vnode.\n");
1402
1403         /*
1404          * Get root vnode of filesystem we're moving.
1405          */
1406         if (by_index) {
1407                 if (is64bit) {
1408                         struct user64_mnt_imgsrc_args mia64;
1409                         error = copyin(fsmountargs, &mia64, sizeof(mia64));
1410                         if (error != 0) {
1411                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1412                                 return error;
1413                         }
1414
1415                         height = mia64.mi_height;
1416                         flags = mia64.mi_flags;
1417                         devpath = mia64.mi_devpath;
1418                 } else {
1419                         struct user32_mnt_imgsrc_args mia32;
1420                         error = copyin(fsmountargs, &mia32, sizeof(mia32));
1421                         if (error != 0) {
1422                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1423                                 return error;
1424                         }
1425
1426                         height = mia32.mi_height;
1427                         flags = mia32.mi_flags;
1428                         devpath = mia32.mi_devpath;
1429                 }
1430         } else {
1431                 /*
1432                  * For binary compatibility--assumes one level of nesting.
1433                  */
1434                 if (is64bit) {
1435                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1436                                 return error;
1437                 } else {
1438                         user32_addr_t tmp;
1439                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1440                                 return error;
1441
1442                         /* munge into LP64 addr */
1443                         devpath = CAST_USER_ADDR_T(tmp);
1444                 }
1445
1446                 height = 0;
1447                 flags = 0;
1448         }
1449
1450         if (flags != 0) {
1451                 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1452                 return EINVAL;
1453         }
1454
1455         error = get_imgsrc_rootvnode(height, &rvp);
1456         if (error != 0) {
1457                 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1458                 return error;
1459         }
1460
1461         IMGSRC_DEBUG("got root vnode.\n");
1462
1463         MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1464
1465         /* Can only move once */
1466         mp = vnode_mount(rvp);
1467         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1468                 IMGSRC_DEBUG("Already moved.\n");
1469                 error = EBUSY;
1470                 goto out0;
1471         }
1472
1473         IMGSRC_DEBUG("Starting updated.\n");
1474
1475         /* Get exclusive rwlock on mount, authorize update on mp */
1476         error = mount_begin_update(mp , ctx, 0);
1477         if (error != 0) {
1478                 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1479                 goto out0;
1480         }
1481
1482         /*
1483          * It can only be moved once.  Flag is set under the rwlock,
1484          * so we're now safe to proceed.
1485          */
1486         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1487                 IMGSRC_DEBUG("Already moved [2]\n");
1488                 goto out1;
1489         }
1490
1491
1492         IMGSRC_DEBUG("Preparing coveredvp.\n");
1493
1494         /* Mark covered vnode as mount in progress, authorize placing mount on top */
1495         error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1496         if (error != 0) {
1497                 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1498                 goto out1;
1499         }
1500
1501         IMGSRC_DEBUG("Covered vp OK.\n");
1502
1503         /* Sanity check the name caller has provided */
1504         vfsp = mp->mnt_vtable;
1505         if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1506                 IMGSRC_DEBUG("Wrong fs name.\n");
1507                 error = EINVAL;
1508                 goto out2;
1509         }
1510
1511         /* Check the device vnode and update mount-from name, for local filesystems */
1512         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1513                 IMGSRC_DEBUG("Local, doing device validation.\n");
1514
1515                 if (devpath != USER_ADDR_NULL) {
1516                         error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1517                         if (error) {
1518                                 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1519                                 goto out2;
1520                         }
1521
1522                         vnode_put(devvp);
1523                 }
1524         }
1525
1526         /*
1527          * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1528          * and increment the name cache's mount generation
1529          */
1530
1531         IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1532         error = place_mount_and_checkdirs(mp, vp, ctx);
1533         if (error != 0) {
1534                 goto out2;
1535         }
1536
1537         placed = TRUE;
1538
1539         strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1540         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1541
1542         /* Forbid future moves */
1543         mount_lock(mp);
1544         mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1545         mount_unlock(mp);
1546
1547         /* Finally, add to mount list, completely ready to go */
1548         if (mount_list_add(mp) != 0) {
1549                 /*
1550                  * The system is shutting down trying to umount
1551                  * everything, so fail with a plausible errno.
1552                  */
1553                 error = EBUSY;
1554                 goto out3;
1555         }
1556
1557         mount_end_update(mp);
1558         vnode_put(rvp);
1559         FREE(old_mntonname, M_TEMP);
1560
1561         vfs_notify_mount(pvp);
1562
1563         return 0;
1564 out3:
1565         strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1566
1567         mount_lock(mp);
1568         mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1569         mount_unlock(mp);
1570
1571 out2:
1572         /*
1573          * Placing the mp on the vnode clears VMOUNT,
1574          * so cleanup is different after that point
1575          */
1576         if (placed) {
1577                 /* Rele the vp, clear VMOUNT and v_mountedhere */
1578                 undo_place_on_covered_vp(mp, vp);
1579         } else {
1580                 vnode_lock_spin(vp);
1581                 CLR(vp->v_flag, VMOUNT);
1582                 vnode_unlock(vp);
1583         }
1584 out1:
1585         mount_end_update(mp);
1586
1587 out0:
1588         vnode_put(rvp);
1589         FREE(old_mntonname, M_TEMP);
1590         return error;
1591 }
1592
1593 #endif /* CONFIG_IMGSRC_ACCESS */
1594
1595 void
1596 enablequotas(struct mount *mp, vfs_context_t ctx)
1597 {
1598         struct nameidata qnd;
1599         int type;
1600         char qfpath[MAXPATHLEN];
1601         const char *qfname = QUOTAFILENAME;
1602         const char *qfopsname = QUOTAOPSNAME;
1603         const char *qfextension[] = INITQFNAMES;
1604
1605         /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
1606         if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1607                 return;
1608         }
1609         /*
1610          * Enable filesystem disk quotas if necessary.
1611          * We ignore errors as this should not interfere with final mount
1612          */
1613         for (type=0; type < MAXQUOTAS; type++) {
1614                 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1615                 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1616                        CAST_USER_ADDR_T(qfpath), ctx);
1617                 if (namei(&qnd) != 0)
1618                         continue;           /* option file to trigger quotas is not present */
1619                 vnode_put(qnd.ni_vp);
1620                 nameidone(&qnd);
1621                 snprintf(qfpath, sizeof(qfpath),  "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1622
1623                 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1624         }
1625         return;
1626 }
1627
1628
1629 static int
1630 checkdirs_callback(proc_t p, void * arg)
1631 {
1632         struct cdirargs * cdrp = (struct cdirargs * )arg;
1633         vnode_t olddp = cdrp->olddp;
1634         vnode_t newdp = cdrp->newdp;
1635         struct filedesc *fdp;
1636         vnode_t tvp;
1637         vnode_t fdp_cvp;
1638         vnode_t fdp_rvp;
1639         int cdir_changed = 0;
1640         int rdir_changed = 0;
1641
1642         /*
1643          * XXX Also needs to iterate each thread in the process to see if it
1644          * XXX is using a per-thread current working directory, and, if so,
1645          * XXX update that as well.
1646          */
1647
1648         proc_fdlock(p);
1649         fdp = p->p_fd;
1650         if (fdp == (struct filedesc *)0) {
1651                 proc_fdunlock(p);
1652                 return(PROC_RETURNED);
1653         }
1654         fdp_cvp = fdp->fd_cdir;
1655         fdp_rvp = fdp->fd_rdir;
1656         proc_fdunlock(p);
1657
1658         if (fdp_cvp == olddp) {
1659                 vnode_ref(newdp);
1660                 tvp = fdp->fd_cdir;
1661                 fdp_cvp = newdp;
1662                 cdir_changed = 1;
1663                 vnode_rele(tvp);
1664         }
1665         if (fdp_rvp == olddp) {
1666                 vnode_ref(newdp);
1667                 tvp = fdp->fd_rdir;
1668                 fdp_rvp = newdp;
1669                 rdir_changed = 1;
1670                 vnode_rele(tvp);
1671         }
1672         if (cdir_changed || rdir_changed) {
1673                 proc_fdlock(p);
1674                 fdp->fd_cdir = fdp_cvp;
1675                 fdp->fd_rdir = fdp_rvp;
1676                 proc_fdunlock(p);
1677         }
1678         return(PROC_RETURNED);
1679 }
1680
1681
1682
1683 /*
1684  * Scan all active processes to see if any of them have a current
1685  * or root directory onto which the new filesystem has just been
1686  * mounted. If so, replace them with the new mount point.
1687  */
1688 static int
1689 checkdirs(vnode_t olddp, vfs_context_t ctx)
1690 {
1691         vnode_t newdp;
1692         vnode_t tvp;
1693         int err;
1694         struct cdirargs cdr;
1695
1696         if (olddp->v_usecount == 1)
1697                 return(0);
1698         err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1699
1700         if (err != 0) {
1701 #if DIAGNOSTIC
1702                 panic("mount: lost mount: error %d", err);
1703 #endif
1704                 return(err);
1705         }
1706
1707         cdr.olddp = olddp;
1708         cdr.newdp = newdp;
1709         /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1710         proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1711
1712         if (rootvnode == olddp) {
1713                 vnode_ref(newdp);
1714                 tvp = rootvnode;
1715                 rootvnode = newdp;
1716                 vnode_rele(tvp);
1717         }
1718
1719         vnode_put(newdp);
1720         return(0);
1721 }
1722
1723 /*
1724  * Unmount a file system.
1725  *
1726  * Note: unmount takes a path to the vnode mounted on as argument,
1727  * not special file (as before).
1728  */
1729 /* ARGSUSED */
1730 int
1731 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1732 {
1733         vnode_t vp;
1734         struct mount *mp;
1735         int error;
1736         struct nameidata nd;
1737         vfs_context_t ctx = vfs_context_current();
1738
1739         NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1740                 UIO_USERSPACE, uap->path, ctx);
1741         error = namei(&nd);
1742         if (error)
1743                 return (error);
1744         vp = nd.ni_vp;
1745         mp = vp->v_mount;
1746         nameidone(&nd);
1747
1748 #if CONFIG_MACF
1749         error = mac_mount_check_umount(ctx, mp);
1750         if (error != 0) {
1751                 vnode_put(vp);
1752                 return (error);
1753         }
1754 #endif
1755         /*
1756          * Must be the root of the filesystem
1757          */
1758         if ((vp->v_flag & VROOT) == 0) {
1759                 vnode_put(vp);
1760                 return (EINVAL);
1761         }
1762         mount_ref(mp, 0);
1763         vnode_put(vp);
1764         /* safedounmount consumes the mount ref */
1765         return (safedounmount(mp, uap->flags, ctx));
1766 }
1767
1768 int
1769 vfs_unmountbyfsid(fsid_t * fsid, int flags, vfs_context_t ctx)
1770 {
1771         mount_t mp;
1772
1773         mp = mount_list_lookupby_fsid(fsid, 0, 1);
1774         if (mp == (mount_t)0) {
1775                 return(ENOENT);
1776         }
1777         mount_ref(mp, 0);
1778         mount_iterdrop(mp);
1779         /* safedounmount consumes the mount ref */
1780         return(safedounmount(mp, flags, ctx));
1781 }
1782
1783
1784 /*
1785  * The mount struct comes with a mount ref which will be consumed.
1786  * Do the actual file system unmount, prevent some common foot shooting.
1787  */
1788 int
1789 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1790 {
1791         int error;
1792         proc_t p = vfs_context_proc(ctx);
1793
1794         /*
1795          * If the file system is not responding and MNT_NOBLOCK
1796          * is set and not a forced unmount then return EBUSY.
1797          */
1798         if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1799                 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1800                 error = EBUSY;
1801                 goto out;
1802         }
1803
1804         /*
1805          * Skip authorization if the mount is tagged as permissive and
1806          * this is not a forced-unmount attempt.
1807          */
1808         if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1809                 /*
1810                  * Only root, or the user that did the original mount is
1811                  * permitted to unmount this filesystem.
1812                  */
1813                 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1814                                 (error = suser(kauth_cred_get(), &p->p_acflag)))
1815                         goto out;
1816         }
1817         /*
1818          * Don't allow unmounting the root file system.
1819          */
1820         if (mp->mnt_flag & MNT_ROOTFS) {
1821                 error = EBUSY; /* the root is always busy */
1822                 goto out;
1823         }
1824
1825 #ifdef CONFIG_IMGSRC_ACCESS
1826         if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1827                 error = EBUSY;
1828                 goto out;
1829         }
1830 #endif /* CONFIG_IMGSRC_ACCESS */
1831
1832         return (dounmount(mp, flags, 1, ctx));
1833
1834 out:
1835         mount_drop(mp, 0);
1836         return(error);
1837 }
1838
1839 /*
1840  * Do the actual file system unmount.
      *
      * mp       mount to tear down
      * flags    unmount flags; MNT_FORCE, MNT_NOBLOCK and MNT_LNOSUB are
      *          examined here and the value is passed on to VFS_UNMOUNT()
      * withref  nonzero if the caller holds a mount ref that this routine
      *          must drop; it is dropped on every path, including EBUSY
      * ctx      context handed to the filesystem and to the trigger callouts
      *
      * Returns 0 on success, EBUSY if an unmount of mp is already in
      * progress, or the error from VFS_SYNC(), vflush() (non-forced unmounts
      * only) or VFS_UNMOUNT().  On those later failures the MNTK_UNMOUNT,
      * MNT_LUNMOUNT and MNT_LFORCE state set at entry is cleared again.
1841  */
1842 int
1843 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1844 {
1845         vnode_t coveredvp = (vnode_t)0;
1846         int error;
1847         int needwakeup = 0;
1848         int forcedunmount = 0;
1849         int lflags = 0;
1850         struct vnode *devvp = NULLVP;
1851 #if CONFIG_TRIGGERS
1852         proc_t p = vfs_context_proc(ctx);
1853         int did_vflush = 0;
1854         int pflags_save = 0;
1855 #endif /* CONFIG_TRIGGERS */
1856
1857         mount_lock(mp);
1858
1859         /*
1860          * If already an unmount in progress just return EBUSY.
1861          * Even a forced unmount cannot override.
1862          */
1863         if (mp->mnt_lflag & MNT_LUNMOUNT) {
1864                 if (withref != 0)
1865                         mount_drop(mp, 1);
1866                 mount_unlock(mp);
1867                 return (EBUSY);
1868         }
1869
1870         if (flags & MNT_FORCE) {
1871                 forcedunmount = 1;
1872                 mp->mnt_lflag |= MNT_LFORCE;
1873         }
1874
1875 #if CONFIG_TRIGGERS
             /*
              * Remember the prior p_flag value so that P_NOREMOTEHANG is only
              * cleared at the end if it was not already set beforehand.
              */
1876         if (flags & MNT_NOBLOCK && p != kernproc)
1877                 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1878 #endif
1879
1880         mp->mnt_kern_flag |= MNTK_UNMOUNT;
1881         mp->mnt_lflag |= MNT_LUNMOUNT;
1882         mp->mnt_flag &=~ MNT_ASYNC;
1883         /*
1884          * anyone currently in the fast path that
1885          * trips over the cached rootvp will be
1886          * dumped out and forced into the slow path
1887          * to regenerate a new cached value
1888          */
1889         mp->mnt_realrootvp = NULLVP;
1890         mount_unlock(mp);
1891
1892         if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
1893                 /*
1894                  * Force unmount any mounts in this filesystem.
1895                  * If any unmounts fail - just leave them dangling.
1896                  * Avoids recursion.
1897                  */
1898                 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
1899         }
1900
1901         /*
1902          * taking the name_cache_lock exclusively will
1903          * ensure that everyone is out of the fast path who
1904          * might be trying to use a now stale copy of
1905          * vp->v_mountedhere->mnt_realrootvp
1906          * bumping mount_generation causes the cached values
1907          * to be invalidated
1908          */
1909         name_cache_lock();
1910         mount_generation++;
1911         name_cache_unlock();
1912
1913
             /*
              * The mount rwlock is held exclusively from here on; it is
              * released briefly around mount_list_remove() and for good after
              * the out: label.  The caller's mount ref is given up here.
              */
1914         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1915         if (withref != 0)
1916                 mount_drop(mp, 0);
1917 #if CONFIG_FSE
1918         fsevent_unmount(mp);  /* has to come first! */
1919 #endif
1920         error = 0;
1921         if (forcedunmount == 0) {
1922                 ubc_umount(mp); /* release cached vnodes */
1923                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1924                         error = VFS_SYNC(mp, MNT_WAIT, ctx);
1925                         if (error) {
1926                                 mount_lock(mp);
1927                                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1928                                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1929                                 mp->mnt_lflag &= ~MNT_LFORCE;
1930                                 goto out;
1931                         }
1932                 }
1933         }
1934
1935         IOBSDMountChange(mp, kIOMountChangeUnmount);
1936
1937 #if CONFIG_TRIGGERS
1938         vfs_nested_trigger_unmounts(mp, flags, ctx);
1939         did_vflush = 1;
1940 #endif
1941         if (forcedunmount)
1942                 lflags |= FORCECLOSE;
             /* A vflush() failure only aborts the unmount when it is not forced. */
1943         error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
1944         if ((forcedunmount == 0) && error) {
1945                 mount_lock(mp);
1946                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1947                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1948                 mp->mnt_lflag &= ~MNT_LFORCE;
1949                 goto out;
1950         }
1951
1952         /* make sure no one is left in the mount iterations or lookup */
1953         mount_iterdrain(mp);
1954
1955         error = VFS_UNMOUNT(mp, flags, ctx);
1956         if (error) {
1957                 mount_iterreset(mp);
1958                 mount_lock(mp);
1959                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1960                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1961                 mp->mnt_lflag &= ~MNT_LFORCE;
1962                 goto out;
1963         }
1964
1965         /* increment the operations count (error is necessarily 0 here) */
1966         if (!error)
1967                 OSAddAtomic(1, &vfs_nummntops);
1968
1969         if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
1970                 /* hold an io reference and drop the usecount before close */
1971                 devvp = mp->mnt_devvp;
1972                 vnode_getalways(devvp);
1973                 vnode_rele(devvp);
1974                 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1975                        ctx);
1976                 vnode_clearmountedon(devvp);
1977                 vnode_put(devvp);
1978         }
             /* The rwlock is not held across the removal from the mount list. */
1979         lck_rw_done(&mp->mnt_rwlock);
1980         mount_list_remove(mp);
1981         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1982
1983         /* mark the mount point hook in the vp but not drop the ref yet */
1984         if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
1985                 /*
1986                  * The covered vnode needs special handling. Trying to get an
1987                  * iocount must not block here as this may lead to deadlocks
1988                  * if the Filesystem to which the covered vnode belongs is
1989                  * undergoing forced unmounts. Since we hold a usecount, the
1990                  * vnode cannot be reused (it can, however, still be terminated)
1991                  */
1992                 vnode_getalways(coveredvp);
1993                 vnode_lock_spin(coveredvp);
1994
                     /* This cross reference is given back via mount_dropcrossref() below. */
1995                 mp->mnt_crossref++;
1996                 coveredvp->v_mountedhere = (struct mount *)0;
1997                 CLR(coveredvp->v_flag, VMOUNT);
1998
1999                 vnode_unlock(coveredvp);
2000                 vnode_put(coveredvp);
2001         }
2002
2003         mount_list_lock();
2004         mp->mnt_vtable->vfc_refcount--;
2005         mount_list_unlock();
2006
2007         cache_purgevfs(mp);     /* remove cache entries for this file sys */
2008         vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2009         mount_lock(mp);
2010         mp->mnt_lflag |= MNT_LDEAD;
2011
2012         if (mp->mnt_lflag & MNT_LWAIT) {
2013                 /*
2014                  * do the wakeup here
2015                  * in case we block in mount_refdrain
2016                  * which will drop the mount lock
2017                  * and allow anyone blocked in vfs_busy
2018                  * to wakeup and see the LDEAD state
2019                  */
2020                 mp->mnt_lflag &= ~MNT_LWAIT;
2021                 wakeup((caddr_t)mp);
2022         }
2023         mount_refdrain(mp);
             /*
              * Every path that gets here has called mount_lock(mp) beforehand
              * (the success path above, and each failure path just before its
              * goto); the lock is released below.
              */
2024 out:
2025         if (mp->mnt_lflag & MNT_LWAIT) {
2026                 mp->mnt_lflag &= ~MNT_LWAIT;
2027                 needwakeup = 1;
2028         }
2029
2030 #if CONFIG_TRIGGERS
2031         if (flags & MNT_NOBLOCK && p != kernproc) {
2032                 // Restore P_NOREMOTEHANG bit to its previous value
2033                 if ((pflags_save & P_NOREMOTEHANG) == 0)
2034                         OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2035         }
2036
2037         /*
2038          * Callback and context are set together under the mount lock, and
2039          * never cleared, so we're safe to examine them here, drop the lock,
2040          * and call out.
2041          */
2042         if (mp->mnt_triggercallback != NULL) {
2043                 mount_unlock(mp);
                     /*
                      * VTC_RELEASE after a successful unmount; VTC_REPLACE when the
                      * unmount failed after did_vflush had been set.
                      */
2044                 if (error == 0) {
2045                         mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2046                 } else if (did_vflush) {
2047                         mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2048                 }
2049         } else {
2050                 mount_unlock(mp);
2051         }
2052 #else
2053         mount_unlock(mp);
2054 #endif /* CONFIG_TRIGGERS */
2055
2056         lck_rw_done(&mp->mnt_rwlock);
2057
             /* Deferred wakeup for waiters that set MNT_LWAIT (noted at out: above). */
2058         if (needwakeup)
2059                 wakeup((caddr_t)mp);
2060
2061         if (!error) {
2062                 if ((coveredvp != NULLVP)) {
2063                         vnode_t pvp = NULLVP;
2064
2065                         /*
2066                          * The covered vnode needs special handling. Trying to
2067                          * get an iocount must not block here as this may lead
2068                          * to deadlocks if the Filesystem to which the covered
2069                          * vnode belongs is undergoing forced unmounts. Since we
2070                          * hold a usecount, the  vnode cannot be reused
2071                          * (it can, however, still be terminated).
2072                          */
2073                         vnode_getalways(coveredvp);
2074
2075                         mount_dropcrossref(mp, coveredvp, 0);
2076                         /*
2077                          * We'll _try_ to detect if this really needs to be
2078                          * done. The coveredvp can only be in termination (or
2079                          * terminated) if the coveredvp's mount point is in a
2080                          * forced unmount (or has been) since we still hold the
2081                          * ref.
2082                          */
2083                         if (!vnode_isrecycled(coveredvp)) {
2084                                 pvp = vnode_getparent(coveredvp);
2085 #if CONFIG_TRIGGERS
2086                                 if (coveredvp->v_resolve) {
2087                                         vnode_trigger_rearm(coveredvp, ctx);
2088                                 }
2089 #endif
2090                         }
2091
2092                         vnode_rele(coveredvp);
2093                         vnode_put(coveredvp);
2094                         coveredvp = NULLVP;
2095
                             /* Post NOTE_WRITE on the parent of the former mount point. */
2096                         if (pvp) {
2097                                 lock_vnode_and_post(pvp, NOTE_WRITE);
2098                                 vnode_put(pvp);
2099                         }
2100                 } else if (mp->mnt_flag & MNT_ROOTFS) {
                                     /*
                                      * No covered vnode is only acceptable for the root
                                      * filesystem; its mount structure is freed right here.
                                      */
2101                                 mount_lock_destroy(mp);
2102 #if CONFIG_MACF
2103                                 mac_mount_label_destroy(mp);
2104 #endif
2105                                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2106                 } else
2107                         panic("dounmount: no coveredvp");
2108         }
2109         return (error);
2110 }
2111
2112 /*
2113  * Unmount any mounts in this filesystem.
2114  */
2115 void
2116 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2117 {
2118         mount_t smp;
2119         fsid_t *fsids, fsid;
2120         int fsids_sz;
2121         int count = 0, i, m = 0;
2122         vnode_t vp;
2123
2124         mount_list_lock();
2125
2126         // Get an array to hold the submounts fsids.
2127         TAILQ_FOREACH(smp, &mountlist, mnt_list)
2128                 count++;
2129         fsids_sz = count * sizeof(fsid_t);
2130         MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2131         if (fsids == NULL) {
2132                 mount_list_unlock();
2133                 goto out;
2134         }
2135         fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
2136
2137         /*
2138          * Fill the array with submount fsids.
2139          * Since mounts are always added to the tail of the mount list, the
2140          * list is always in mount order.
2141          * For each mount check if the mounted-on vnode belongs to a
2142          * mount that's already added to our array of mounts to be unmounted.
2143          */
2144         for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2145                 vp = smp->mnt_vnodecovered;
2146                 if (vp == NULL)
2147                         continue;
2148                 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
2149                 for (i = 0; i <= m; i++) {
2150                         if (fsids[i].val[0] == fsid.val[0] &&
2151                             fsids[i].val[1] == fsid.val[1]) {
2152                                 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2153                                 break;
2154                         }
2155                 }
2156         }
2157         mount_list_unlock();
2158
2159         // Unmount the submounts in reverse order. Ignore errors.
2160         for (i = m; i > 0; i--) {
2161                 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2162                 if (smp) {
2163                         mount_ref(smp, 0);
2164                         mount_iterdrop(smp);
2165                         (void) dounmount(smp, flags, 1, ctx);
2166                 }
2167         }
2168 out:
2169         if (fsids)
2170                 FREE(fsids, M_TEMP);
2171 }
2172
2173 void
2174 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2175 {
2176         vnode_lock(dp);
2177         mp->mnt_crossref--;
2178
2179         if (mp->mnt_crossref < 0)
2180                 panic("mount cross refs -ve");
2181
2182         if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2183
2184                 if (need_put)
2185                         vnode_put_locked(dp);
2186                 vnode_unlock(dp);
2187
2188                 mount_lock_destroy(mp);
2189 #if CONFIG_MACF
2190                 mac_mount_label_destroy(mp);
2191 #endif
2192                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2193                 return;
2194         }
2195         if (need_put)
2196                 vnode_put_locked(dp);
2197         vnode_unlock(dp);
2198 }
2199
2200
2201 /*
2202  * Sync each mounted filesystem.
2203  */
2204 #if DIAGNOSTIC
2205 int syncprt = 0;        /* nonzero: sync() and sync_thread() call vfs_bufstats() when done */
2206 #endif
2207
2208 int print_vmpage_stat=0;        /* nonzero: sync() and sync_thread() call vm_countdirtypages() when done */
2209 int sync_timeout = 60;  // Sync time limit (sec); sync_internal() passes it to sync_async()
2210
2211 static int
2212 sync_callback(mount_t mp, __unused void *arg)
2213 {
2214         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2215                 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2216
2217                 mp->mnt_flag &= ~MNT_ASYNC;
2218                 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2219                 if (asyncflag)
2220                         mp->mnt_flag |= MNT_ASYNC;
2221         }
2222
2223         return (VFS_RETURNED);
2224 }
2225
2226 /* ARGSUSED */
2227 int
2228 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2229 {
2230         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2231
2232         if (print_vmpage_stat) {
2233                 vm_countdirtypages();
2234         }
2235
2236 #if DIAGNOSTIC
2237         if (syncprt)
2238                 vfs_bufstats();
2239 #endif /* DIAGNOSTIC */
2240         return 0;
2241 }
2242
2243 static void
2244 sync_thread(void *arg, __unused wait_result_t wr)
2245 {
2246         int *timeout = (int *) arg;
2247
2248         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2249
2250         if (timeout)
2251                 wakeup((caddr_t) timeout);
2252         if (print_vmpage_stat) {
2253                 vm_countdirtypages();
2254         }
2255
2256 #if DIAGNOSTIC
2257         if (syncprt)
2258                 vfs_bufstats();
2259 #endif /* DIAGNOSTIC */
2260 }
2261
2262 /*
2263  * Sync in a separate thread so we can time out if it blocks.
2264  */
2265 static int
2266 sync_async(int timeout)
2267 {
2268         thread_t thd;
2269         int error;
2270         struct timespec ts = {timeout, 0};
2271
2272         lck_mtx_lock(sync_mtx_lck);
2273         if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
2274                 printf("sync_thread failed\n");
2275                 lck_mtx_unlock(sync_mtx_lck);
2276                 return (0);
2277         }
2278
2279         error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2280         if (error) {
2281                 printf("sync timed out: %d sec\n", timeout);
2282         }
2283         thread_deallocate(thd);
2284
2285         return (0);
2286 }
2287
2288 /*
2289  * An in-kernel sync for power management to call.
2290  */
2291 __private_extern__ int
2292 sync_internal(void)
2293 {
2294         (void) sync_async(sync_timeout);
2295
2296         return 0;
2297 } /* end of sync_internal call */
2298
2299 /*
2300  * Change filesystem quotas.
2301  */
2302 #if QUOTA
2303 int
2304 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2305 {
2306         struct mount *mp;
2307         int error, quota_cmd, quota_status;
2308         caddr_t datap;
2309         size_t fnamelen;
2310         struct nameidata nd;
2311         vfs_context_t ctx = vfs_context_current();
2312         struct dqblk my_dqblk;
2313
2314         AUDIT_ARG(uid, uap->uid);
2315         AUDIT_ARG(cmd, uap->cmd);
2316         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2317                uap->path, ctx);
2318         error = namei(&nd);
2319         if (error)
2320                 return (error);
2321         mp = nd.ni_vp->v_mount;
2322         vnode_put(nd.ni_vp);
2323         nameidone(&nd);
2324
2325         /* copyin any data we will need for downstream code */
2326         quota_cmd = uap->cmd >> SUBCMDSHIFT;
2327
2328         switch (quota_cmd) {
2329         case Q_QUOTAON:
2330                 /* uap->arg specifies a file from which to take the quotas */
2331                 fnamelen = MAXPATHLEN;
2332                 datap = kalloc(MAXPATHLEN);
2333                 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2334                 break;
2335         case Q_GETQUOTA:
2336                 /* uap->arg is a pointer to a dqblk structure. */
2337                 datap = (caddr_t) &my_dqblk;
2338                 break;
2339         case Q_SETQUOTA:
2340         case Q_SETUSE:
2341                 /* uap->arg is a pointer to a dqblk structure. */
2342                 datap = (caddr_t) &my_dqblk;
2343                 if (proc_is64bit(p)) {
2344                         struct user_dqblk       my_dqblk64;
2345                         error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2346                         if (error == 0) {
2347                                 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2348                         }
2349                 }
2350                 else {
2351                         error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2352                 }
2353                 break;
2354         case Q_QUOTASTAT:
2355                 /* uap->arg is a pointer to an integer */
2356                 datap = (caddr_t) &quota_status;
2357                 break;
2358         default:
2359                 datap = NULL;
2360                 break;
2361         } /* switch */
2362
2363         if (error == 0) {
2364                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2365         }
2366
2367         switch (quota_cmd) {
2368         case Q_QUOTAON:
2369                 if (datap != NULL)
2370                         kfree(datap, MAXPATHLEN);
2371                 break;
2372         case Q_GETQUOTA:
2373                 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2374                 if (error == 0) {
2375                         if (proc_is64bit(p)) {
2376                                 struct user_dqblk       my_dqblk64 = {.dqb_bhardlimit = 0};
2377                                 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2378                                 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2379                         }
2380                         else {
2381                                 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2382                         }
2383                 }
2384                 break;
2385         case Q_QUOTASTAT:
2386                 /* uap->arg is a pointer to an integer */
2387                 if (error == 0) {
2388                         error = copyout(datap, uap->arg, sizeof(quota_status));
2389                 }
2390                 break;
2391         default:
2392                 break;
2393         } /* switch */
2394
2395         return (error);
2396 }
2397 #else
2398 int
2399 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2400 {
2401         return (EOPNOTSUPP);
2402 }
2403 #endif /* QUOTA */
2404
2405 /*
2406  * Get filesystem statistics.
2407  *
2408  * Returns:     0                       Success
2409  *      namei:???
2410  *      vfs_update_vfsstat:???
2411  *      munge_statfs:EFAULT
2412  */
2413 /* ARGSUSED */
2414 int
2415 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2416 {
2417         struct mount *mp;
2418         struct vfsstatfs *sp;
2419         int error;
2420         struct nameidata nd;
2421         vfs_context_t ctx = vfs_context_current();
2422         vnode_t vp;
2423
2424         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2425                 UIO_USERSPACE, uap->path, ctx);
2426         error = namei(&nd);
2427         if (error)
2428                 return (error);
2429         vp = nd.ni_vp;
2430         mp = vp->v_mount;
2431         sp = &mp->mnt_vfsstat;
2432         nameidone(&nd);
2433
2434         error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2435         if (error != 0) {
2436                 vnode_put(vp);
2437                 return (error);
2438         }
2439
2440         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2441         vnode_put(vp);
2442         return (error);
2443 }
2444
2445 /*
2446  * Get filesystem statistics.
2447  */
2448 /* ARGSUSED */
2449 int
2450 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2451 {
2452         vnode_t vp;
2453         struct mount *mp;
2454         struct vfsstatfs *sp;
2455         int error;
2456
2457         AUDIT_ARG(fd, uap->fd);
2458
2459         if ( (error = file_vnode(uap->fd, &vp)) )
2460                 return (error);
2461
2462         error = vnode_getwithref(vp);
2463         if (error) {
2464                 file_drop(uap->fd);
2465                 return (error);
2466         }
2467
2468         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2469
2470         mp = vp->v_mount;
2471         if (!mp) {
2472                 error = EBADF;
2473                 goto out;
2474         }
2475         sp = &mp->mnt_vfsstat;
2476         if ((error = vfs_update_vfsstat(mp,vfs_context_current(),VFS_USER_EVENT)) != 0) {
2477                 goto out;
2478         }
2479
2480         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2481
2482 out:
2483         file_drop(uap->fd);
2484         vnode_put(vp);
2485
2486         return (error);
2487 }
2488
2489 /*
2490  * Common routine to handle copying of statfs64 data to user space
2491  */
2492 static int
2493 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2494 {
2495         int error;
2496         struct statfs64 sfs;
2497
2498         bzero(&sfs, sizeof(sfs));
2499
2500         sfs.f_bsize = sfsp->f_bsize;
2501         sfs.f_iosize = (int32_t)sfsp->f_iosize;
2502         sfs.f_blocks = sfsp->f_blocks;
2503         sfs.f_bfree = sfsp->f_bfree;
2504         sfs.f_bavail = sfsp->f_bavail;
2505         sfs.f_files = sfsp->f_files;
2506         sfs.f_ffree = sfsp->f_ffree;
2507         sfs.f_fsid = sfsp->f_fsid;
2508         sfs.f_owner = sfsp->f_owner;
2509         sfs.f_type = mp->mnt_vtable->vfc_typenum;
2510         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2511         sfs.f_fssubtype = sfsp->f_fssubtype;
2512         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2513                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2514         } else {
2515                 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2516         }
2517         strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2518         strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2519
2520         error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2521
2522         return(error);
2523 }
2524
2525 /*
2526  * Get file system statistics in 64-bit mode
2527  */
2528 int
2529 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2530 {
2531         struct mount *mp;
2532         struct vfsstatfs *sp;
2533         int error;
2534         struct nameidata nd;
2535         vfs_context_t ctxp = vfs_context_current();
2536         vnode_t vp;
2537
2538         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2539                 UIO_USERSPACE, uap->path, ctxp);
2540         error = namei(&nd);
2541         if (error)
2542                 return (error);
2543         vp = nd.ni_vp;
2544         mp = vp->v_mount;
2545         sp = &mp->mnt_vfsstat;
2546         nameidone(&nd);
2547
2548         error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2549         if (error != 0) {
2550                 vnode_put(vp);
2551                 return (error);
2552         }
2553
2554         error = statfs64_common(mp, sp, uap->buf);
2555         vnode_put(vp);
2556
2557         return (error);
2558 }
2559
2560 /*
2561  * Get file system statistics in 64-bit mode
2562  */
2563 int
2564 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2565 {
2566         struct vnode *vp;
2567         struct mount *mp;
2568         struct vfsstatfs *sp;
2569         int error;
2570
2571         AUDIT_ARG(fd, uap->fd);
2572
2573         if ( (error = file_vnode(uap->fd, &vp)) )
2574                 return (error);
2575
2576         error = vnode_getwithref(vp);
2577         if (error) {
2578                 file_drop(uap->fd);
2579                 return (error);
2580         }
2581
2582         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2583
2584         mp = vp->v_mount;
2585         if (!mp) {
2586                 error = EBADF;
2587                 goto out;
2588         }
2589         sp = &mp->mnt_vfsstat;
2590         if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2591                 goto out;
2592         }
2593
2594         error = statfs64_common(mp, sp, uap->buf);
2595
2596 out:
2597         file_drop(uap->fd);
2598         vnode_put(vp);
2599
2600         return (error);
2601 }
2602
/*
 * Iteration state handed to getfsstat_callback() and
 * getfsstat64_callback() through vfs_iterate().
 */
struct getfsstat_struct {
        user_addr_t     sfsp;           /* next user address to fill; advanced per entry written */
        user_addr_t     *mp;            /* cursor into array of user MAC label addresses, or NULL */
        int             count;          /* mounts counted so far (incremented even when nothing is copied) */
        int             maxcount;       /* number of entries the user buffer can hold */
        int             flags;          /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
        int             error;          /* error recorded by a callback before stopping, else 0 */
};
2611
2612
2613 static int
2614 getfsstat_callback(mount_t mp, void * arg)
2615 {
2616
2617         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2618         struct vfsstatfs *sp;
2619         int error, my_size;
2620         vfs_context_t ctx = vfs_context_current();
2621
2622         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2623                 sp = &mp->mnt_vfsstat;
2624                 /*
2625                  * If MNT_NOWAIT is specified, do not refresh the
2626                  * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2627                  */
2628                 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2629                         (error = vfs_update_vfsstat(mp, ctx,
2630                             VFS_USER_EVENT))) {
2631                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2632                         return(VFS_RETURNED);
2633                 }
2634
2635                 /*
2636                  * Need to handle LP64 version of struct statfs
2637                  */
2638                 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2639                 if (error) {
2640                         fstp->error = error;
2641                         return(VFS_RETURNED_DONE);
2642                 }
2643                 fstp->sfsp += my_size;
2644
2645                 if (fstp->mp) {
2646 #if CONFIG_MACF
2647                         error = mac_mount_label_get(mp, *fstp->mp);
2648                         if (error) {
2649                                 fstp->error = error;
2650                                 return(VFS_RETURNED_DONE);
2651                         }
2652 #endif
2653                         fstp->mp++;
2654                 }
2655         }
2656         fstp->count++;
2657         return(VFS_RETURNED);
2658 }
2659
2660 /*
2661  * Get statistics on all filesystems.
2662  */
2663 int
2664 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2665 {
2666         struct __mac_getfsstat_args muap;
2667
2668         muap.buf = uap->buf;
2669         muap.bufsize = uap->bufsize;
2670         muap.mac = USER_ADDR_NULL;
2671         muap.macsize = 0;
2672         muap.flags = uap->flags;
2673
2674         return (__mac_getfsstat(p, &muap, retval));
2675 }
2676
2677 /*
2678  * __mac_getfsstat: Get MAC-related file system statistics
2679  *
2680  * Parameters:    p                        (ignored)
2681  *                uap                      User argument descriptor (see below)
2682  *                retval                   Count of file system statistics (N stats)
2683  *
2684  * Indirect:      uap->bufsize             Buffer size
2685  *                uap->macsize             MAC info size
2686  *                uap->buf                 Buffer where information will be returned
2687  *                uap->mac                 MAC info
2688  *                uap->flags               File system flags
2689  *
2690  *
2691  * Returns:        0                       Success
2692  *                !0                       Not success
2693  *
2694  */
2695 int
2696 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2697 {
2698         user_addr_t sfsp;
2699         user_addr_t *mp;
2700         size_t count, maxcount, bufsize, macsize;
2701         struct getfsstat_struct fst;
2702
2703         bufsize = (size_t) uap->bufsize;
2704         macsize = (size_t) uap->macsize;
2705
2706         if (IS_64BIT_PROCESS(p)) {
2707                 maxcount = bufsize / sizeof(struct user64_statfs);
2708         }
2709         else {
2710                 maxcount = bufsize / sizeof(struct user32_statfs);
2711         }
2712         sfsp = uap->buf;
2713         count = 0;
2714
2715         mp = NULL;
2716
2717 #if CONFIG_MACF
2718         if (uap->mac != USER_ADDR_NULL) {
2719                 u_int32_t *mp0;
2720                 int error;
2721                 unsigned int i;
2722
2723                 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2724                 if (count != maxcount)
2725                         return (EINVAL);
2726
2727                 /* Copy in the array */
2728                 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2729                 if (mp0 == NULL) {
2730                         return (ENOMEM);
2731                 }
2732
2733                 error = copyin(uap->mac, mp0, macsize);
2734                 if (error) {
2735                         FREE(mp0, M_MACTEMP);
2736                         return (error);
2737                 }
2738
2739                 /* Normalize to an array of user_addr_t */
2740                 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2741                 if (mp == NULL) {
2742                         FREE(mp0, M_MACTEMP);
2743                         return (ENOMEM);
2744                 }
2745
2746                 for (i = 0; i < count; i++) {
2747                         if (IS_64BIT_PROCESS(p))
2748                                 mp[i] = ((user_addr_t *)mp0)[i];
2749                         else
2750                                 mp[i] = (user_addr_t)mp0[i];
2751                 }
2752                 FREE(mp0, M_MACTEMP);
2753         }
2754 #endif
2755
2756
2757         fst.sfsp = sfsp;
2758         fst.mp = mp;
2759         fst.flags = uap->flags;
2760         fst.count = 0;
2761         fst.error = 0;
2762         fst.maxcount = maxcount;
2763
2764
2765         vfs_iterate(0, getfsstat_callback, &fst);
2766
2767         if (mp)
2768                 FREE(mp, M_MACTEMP);
2769
2770         if (fst.error ) {
2771                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2772                 return(fst.error);
2773         }
2774
2775         if (fst.sfsp && fst.count > fst.maxcount)
2776                 *retval = fst.maxcount;
2777         else
2778                 *retval = fst.count;
2779         return (0);
2780 }
2781
2782 static int
2783 getfsstat64_callback(mount_t mp, void * arg)
2784 {
2785         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2786         struct vfsstatfs *sp;
2787         int error;
2788
2789         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2790                 sp = &mp->mnt_vfsstat;
2791                 /*
2792                  * If MNT_NOWAIT is specified, do not refresh the fsstat
2793                  * cache. MNT_WAIT overrides MNT_NOWAIT.
2794                  *
2795                  * We treat MNT_DWAIT as MNT_WAIT for all instances of
2796                  * getfsstat, since the constants are out of the same
2797                  * namespace.
2798                  */
2799                 if (((fstp->flags & MNT_NOWAIT) == 0 ||
2800                      (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2801                     (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2802                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2803                         return(VFS_RETURNED);
2804                 }
2805
2806                 error = statfs64_common(mp, sp, fstp->sfsp);
2807                 if (error) {
2808                         fstp->error = error;
2809                         return(VFS_RETURNED_DONE);
2810                 }
2811                 fstp->sfsp += sizeof(struct statfs64);
2812         }
2813         fstp->count++;
2814         return(VFS_RETURNED);
2815 }
2816
2817 /*
2818  * Get statistics on all file systems in 64 bit mode.
2819  */
2820 int
2821 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2822 {
2823         user_addr_t sfsp;
2824         int count, maxcount;
2825         struct getfsstat_struct fst;
2826
2827         maxcount = uap->bufsize / sizeof(struct statfs64);
2828
2829         sfsp = uap->buf;
2830         count = 0;
2831
2832         fst.sfsp = sfsp;
2833         fst.flags = uap->flags;
2834         fst.count = 0;
2835         fst.error = 0;
2836         fst.maxcount = maxcount;
2837
2838         vfs_iterate(0, getfsstat64_callback, &fst);
2839
2840         if (fst.error ) {
2841                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2842                 return(fst.error);
2843         }
2844
2845         if (fst.sfsp && fst.count > fst.maxcount)
2846                 *retval = fst.maxcount;
2847         else
2848                 *retval = fst.count;
2849
2850         return (0);
2851 }
2852
2853 /*
2854  * gets the associated vnode with the file descriptor passed.
2855  * as input
2856  *
2857  * INPUT
2858  * ctx - vfs context of caller
2859  * fd - file descriptor for which vnode is required.
2860  * vpp - Pointer to pointer to vnode to be returned.
2861  *
2862  * The vnode is returned with an iocount so any vnode obtained
2863  * by this call needs a vnode_put
2864  *
2865  */
2866 static int
2867 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2868 {
2869         int error;
2870         vnode_t vp;
2871         struct fileproc *fp;
2872         proc_t p = vfs_context_proc(ctx);
2873
2874         *vpp =  NULLVP;
2875
2876         error = fp_getfvp(p, fd, &fp, &vp);
2877         if (error)
2878                 return (error);
2879
2880         error = vnode_getwithref(vp);
2881         if (error) {
2882                 (void)fp_drop(p, fd, fp, 0);
2883                 return (error);
2884         }
2885
2886         (void)fp_drop(p, fd, fp, 0);
2887         *vpp = vp;
2888         return (error);
2889 }
2890
2891 /*
2892  * Wrapper function around namei to start lookup from a directory
2893  * specified by a file descriptor ni_dirfd.
2894  *
2895  * In addition to all the errors returned by namei, this call can
2896  * return ENOTDIR if the file descriptor does not refer to a directory.
2897  * and EBADF if the file descriptor is not valid.
2898  */
2899 int
2900 nameiat(struct nameidata *ndp, int dirfd)
2901 {
2902         if ((dirfd != AT_FDCWD) &&
2903             !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
2904             !(ndp->ni_cnd.cn_flags & USEDVP)) {
2905                 int error = 0;
2906                 char c;
2907
2908                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
2909                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
2910                         if (error)
2911                                 return (error);
2912                 } else {
2913                         c = *((char *)(ndp->ni_dirp));
2914                 }
2915
2916                 if (c != '/') {
2917                         vnode_t dvp_at;
2918
2919                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
2920                             &dvp_at);
2921                         if (error)
2922                                 return (error);
2923
2924                         if (vnode_vtype(dvp_at) != VDIR) {
2925                                 vnode_put(dvp_at);
2926                                 return (ENOTDIR);
2927                         }
2928
2929                         ndp->ni_dvp = dvp_at;
2930                         ndp->ni_cnd.cn_flags |= USEDVP;
2931                         error = namei(ndp);
2932                         ndp->ni_cnd.cn_flags &= ~USEDVP;
2933                         vnode_put(dvp_at);
2934                         return (error);
2935                 }
2936         }
2937
2938         return (namei(ndp));
2939 }
2940
2941 /*
2942  * Change current working directory to a given file descriptor.
2943  */
2944 /* ARGSUSED */
2945 static int
2946 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
2947 {
2948         struct filedesc *fdp = p->p_fd;
2949         vnode_t vp;
2950         vnode_t tdp;
2951         vnode_t tvp;
2952         struct mount *mp;
2953         int error;
2954         vfs_context_t ctx = vfs_context_current();
2955
2956         AUDIT_ARG(fd, uap->fd);
2957         if (per_thread && uap->fd == -1) {
2958                 /*
2959                  * Switching back from per-thread to per process CWD; verify we
2960                  * in fact have one before proceeding.  The only success case
2961                  * for this code path is to return 0 preemptively after zapping
2962                  * the thread structure contents.
2963                  */
2964                 thread_t th = vfs_context_thread(ctx);
2965                 if (th) {
2966                         uthread_t uth = get_bsdthread_info(th);
2967                         tvp = uth->uu_cdir;
2968                         uth->uu_cdir = NULLVP;
2969                         if (tvp != NULLVP) {
2970                                 vnode_rele(tvp);
2971                                 return (0);
2972                         }
2973                 }
2974                 return (EBADF);
2975         }
2976
2977         if ( (error = file_vnode(uap->fd, &vp)) )
2978                 return(error);
2979         if ( (error = vnode_getwithref(vp)) ) {
2980                 file_drop(uap->fd);
2981                 return(error);
2982         }
2983
2984         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
2985
2986         if (vp->v_type != VDIR) {
2987                 error = ENOTDIR;
2988                 goto out;
2989         }
2990
2991 #if CONFIG_MACF
2992         error = mac_vnode_check_chdir(ctx, vp);
2993         if (error)
2994                 goto out;
2995 #endif
2996         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
2997         if (error)
2998                 goto out;
2999
3000         while (!error && (mp = vp->v_mountedhere) != NULL) {
3001                 if (vfs_busy(mp, LK_NOWAIT)) {
3002                         error = EACCES;
3003                         goto out;
3004                 }
3005                 error = VFS_ROOT(mp, &tdp, ctx);
3006                 vfs_unbusy(mp);
3007                 if (error)
3008                         break;
3009                 vnode_put(vp);
3010                 vp = tdp;
3011         }
3012         if (error)
3013                 goto out;
3014         if ( (error = vnode_ref(vp)) )
3015                 goto out;
3016         vnode_put(vp);
3017
3018         if (per_thread) {
3019                 thread_t th = vfs_context_thread(ctx);
3020                 if (th) {
3021                         uthread_t uth = get_bsdthread_info(th);
3022                         tvp = uth->uu_cdir;
3023                         uth->uu_cdir = vp;
3024                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3025                 } else {
3026                         vnode_rele(vp);
3027                         return (ENOENT);
3028                 }
3029         } else {
3030                 proc_fdlock(p);
3031                 tvp = fdp->fd_cdir;
3032                 fdp->fd_cdir = vp;
3033                 proc_fdunlock(p);
3034         }
3035
3036         if (tvp)
3037                 vnode_rele(tvp);
3038         file_drop(uap->fd);
3039
3040         return (0);
3041 out:
3042         vnode_put(vp);
3043         file_drop(uap->fd);
3044
3045         return(error);
3046 }
3047
3048 int
3049 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3050 {
3051         return common_fchdir(p, uap, 0);
3052 }
3053
3054 int
3055 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3056 {
3057         return common_fchdir(p, (void *)uap, 1);
3058 }
3059
3060 /*
3061  * Change current working directory (".").
3062  *
3063  * Returns:     0                       Success
3064  *      change_dir:ENOTDIR
3065  *      change_dir:???
3066  *      vnode_ref:ENOENT                No such file or directory
3067  */
3068 /* ARGSUSED */
3069 static int
3070 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3071 {
3072         struct filedesc *fdp = p->p_fd;
3073         int error;
3074         struct nameidata nd;
3075         vnode_t tvp;
3076         vfs_context_t ctx = vfs_context_current();
3077
3078         NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3079                 UIO_USERSPACE, uap->path, ctx);
3080         error = change_dir(&nd, ctx);
3081         if (error)
3082                 return (error);
3083         if ( (error = vnode_ref(nd.ni_vp)) ) {
3084                 vnode_put(nd.ni_vp);
3085                 return (error);
3086         }
3087         /*
3088          * drop the iocount we picked up in change_dir
3089          */
3090         vnode_put(nd.ni_vp);
3091
3092         if (per_thread) {
3093                 thread_t th = vfs_context_thread(ctx);
3094                 if (th) {
3095                         uthread_t uth = get_bsdthread_info(th);
3096                         tvp = uth->uu_cdir;
3097                         uth->uu_cdir = nd.ni_vp;
3098                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3099                 } else {
3100                         vnode_rele(nd.ni_vp);
3101                         return (ENOENT);
3102                 }
3103         } else {
3104                 proc_fdlock(p);
3105                 tvp = fdp->fd_cdir;
3106                 fdp->fd_cdir = nd.ni_vp;
3107                 proc_fdunlock(p);
3108         }
3109
3110         if (tvp)
3111                 vnode_rele(tvp);
3112
3113         return (0);
3114 }
3115
3116
3117 /*
3118  * chdir
3119  *
3120  * Change current working directory (".") for the entire process
3121  *
3122  * Parameters:  p       Process requesting the call
3123  *              uap     User argument descriptor (see below)
3124  *              retval  (ignored)
3125  *
3126  * Indirect parameters: uap->path       Directory path
3127  *
3128  * Returns:     0                       Success
3129  *              common_chdir: ENOTDIR
3130  *              common_chdir: ENOENT    No such file or directory
3131  *              common_chdir: ???
3132  *
3133  */
3134 int
3135 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3136 {
3137         return common_chdir(p, (void *)uap, 0);
3138 }
3139
3140 /*
3141  * __pthread_chdir
3142  *
3143  * Change current working directory (".") for a single thread
3144  *
3145  * Parameters:  p       Process requesting the call
3146  *              uap     User argument descriptor (see below)
3147  *              retval  (ignored)
3148  *
3149  * Indirect parameters: uap->path       Directory path
3150  *
3151  * Returns:     0                       Success
3152  *              common_chdir: ENOTDIR
3153  *              common_chdir: ENOENT    No such file or directory
3154  *              common_chdir: ???
3155  *
3156  */
3157 int
3158 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3159 {
3160         return common_chdir(p, (void *)uap, 1);
3161 }
3162
3163
3164 /*
3165  * Change notion of root (``/'') directory.
3166  */
3167 /* ARGSUSED */
3168 int
3169 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3170 {
3171         struct filedesc *fdp = p->p_fd;
3172         int error;
3173         struct nameidata nd;
3174         vnode_t tvp;
3175         vfs_context_t ctx = vfs_context_current();
3176
3177         if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3178                 return (error);
3179
3180         NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3181                 UIO_USERSPACE, uap->path, ctx);
3182         error = change_dir(&nd, ctx);
3183         if (error)
3184                 return (error);
3185
3186 #if CONFIG_MACF
3187         error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3188             &nd.ni_cnd);
3189         if (error) {
3190                 vnode_put(nd.ni_vp);
3191                 return (error);
3192         }
3193 #endif
3194
3195         if ( (error = vnode_ref(nd.ni_vp)) ) {
3196                 vnode_put(nd.ni_vp);
3197                 return (error);
3198         }
3199         vnode_put(nd.ni_vp);
3200
3201         proc_fdlock(p);
3202         tvp = fdp->fd_rdir;
3203         fdp->fd_rdir = nd.ni_vp;
3204         fdp->fd_flags |= FD_CHROOT;
3205         proc_fdunlock(p);
3206
3207         if (tvp != NULL)
3208                 vnode_rele(tvp);
3209
3210         return (0);
3211 }
3212
3213 /*
3214  * Common routine for chroot and chdir.
3215  *
3216  * Returns:     0                       Success
3217  *              ENOTDIR                 Not a directory
3218  *              namei:???               [anything namei can return]
3219  *              vnode_authorize:???     [anything vnode_authorize can return]
3220  */
3221 static int
3222 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3223 {
3224         vnode_t vp;
3225         int error;
3226
3227         if ((error = namei(ndp)))
3228                 return (error);
3229         nameidone(ndp);
3230         vp = ndp->ni_vp;
3231
3232         if (vp->v_type != VDIR) {
3233                 vnode_put(vp);
3234                 return (ENOTDIR);
3235         }
3236
3237 #if CONFIG_MACF
3238         error = mac_vnode_check_chdir(ctx, vp);
3239         if (error) {
3240                 vnode_put(vp);
3241                 return (error);
3242         }
3243 #endif
3244
3245         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3246         if (error) {
3247                 vnode_put(vp);
3248                 return (error);
3249         }
3250
3251         return (error);
3252 }
3253
3254 /*
3255  * Free the vnode data (for directories) associated with the file glob.
3256  */
3257 struct fd_vn_data *
3258 fg_vn_data_alloc(void)
3259 {
3260         struct fd_vn_data *fvdata;
3261
3262         /* Allocate per fd vnode data */
3263         MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3264                M_FD_VN_DATA, M_WAITOK | M_ZERO);
3265         lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3266         return fvdata;
3267 }
3268
3269 /*
3270  * Free the vnode data (for directories) associated with the file glob.
3271  */
3272 void
3273 fg_vn_data_free(void *fgvndata)
3274 {
3275         struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3276
3277         if (fvdata->fv_buf)
3278                 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3279         lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3280         FREE(fvdata, M_FD_VN_DATA);
3281 }
3282
3283 /*
3284  * Check permissions, allocate an open file structure,
3285  * and call the device open routine if any.
3286  *
3287  * Returns:     0                       Success
3288  *              EINVAL
3289  *              EINTR
3290  *      falloc:ENFILE
3291  *      falloc:EMFILE
3292  *      falloc:ENOMEM
3293  *      vn_open_auth:???
3294  *      dupfdopen:???
3295  *      VNOP_ADVLOCK:???
3296  *      vnode_setsize:???
3297  *
3298  * XXX Need to implement uid, gid
3299  */
3300 int
3301 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3302     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3303     int32_t *retval)
3304 {
3305         proc_t p = vfs_context_proc(ctx);
3306         uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3307         struct fileproc *fp;
3308         vnode_t vp;
3309         int flags, oflags;
3310         int type, indx, error;
3311         struct flock lf;
3312         struct vfs_context context;
3313
3314         oflags = uflags;
3315
3316         if ((oflags & O_ACCMODE) == O_ACCMODE)
3317                 return(EINVAL);
3318
3319         flags = FFLAGS(uflags);
3320         CLR(flags, FENCRYPTED);
3321         CLR(flags, FUNENCRYPTED);
3322
3323         AUDIT_ARG(fflags, oflags);
3324         AUDIT_ARG(mode, vap->va_mode);
3325
3326         if ((error = falloc_withalloc(p,
3327             &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3328                 return (error);
3329         }
3330         uu->uu_dupfd = -indx - 1;
3331
3332         if ((error = vn_open_auth(ndp, &flags, vap))) {
3333                 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){        /* XXX from fdopen */
3334                         if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3335                                 fp_drop(p, indx, NULL, 0);
3336                                 *retval = indx;
3337                                 return (0);
3338                         }
3339                 }
3340                 if (error == ERESTART)
3341                         error = EINTR;
3342                 fp_free(p, indx, fp);
3343                 return (error);
3344         }
3345         uu->uu_dupfd = 0;
3346         vp = ndp->ni_vp;
3347
3348         fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3349         fp->f_fglob->fg_ops = &vnops;
3350         fp->f_fglob->fg_data = (caddr_t)vp;
3351
3352         if (flags & (O_EXLOCK | O_SHLOCK)) {
3353                 lf.l_whence = SEEK_SET;
3354                 lf.l_start = 0;
3355                 lf.l_len = 0;
3356                 if (flags & O_EXLOCK)
3357                         lf.l_type = F_WRLCK;
3358                 else
3359                         lf.l_type = F_RDLCK;
3360                 type = F_FLOCK;
3361                 if ((flags & FNONBLOCK) == 0)
3362                         type |= F_WAIT;
3363 #if CONFIG_MACF
3364                 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3365                     F_SETLK, &lf);
3366                 if (error)
3367                         goto bad;
3368 #endif
3369                 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3370                         goto bad;
3371                 fp->f_fglob->fg_flag |= FHASLOCK;
3372         }
3373
3374         /* try to truncate by setting the size attribute */
3375         if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3376                 goto bad;
3377
3378         /*
3379          * For directories we hold some additional information in the fd.
3380          */
3381         if (vnode_vtype(vp) == VDIR) {
3382                 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3383         } else {
3384                 fp->f_fglob->fg_vn_data = NULL;
3385         }
3386
3387         vnode_put(vp);
3388
3389         /*
3390          * The first terminal open (without a O_NOCTTY) by a session leader
3391          * results in it being set as the controlling terminal.
3392          */
3393         if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3394             !(flags & O_NOCTTY)) {
3395                 int tmp = 0;
3396
3397                 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3398                     (caddr_t)&tmp, ctx);
3399         }
3400
3401         proc_fdlock(p);
3402         if (flags & O_CLOEXEC)
3403                 *fdflags(p, indx) |= UF_EXCLOSE;
3404         if (flags & O_CLOFORK)
3405                 *fdflags(p, indx) |= UF_FORKCLOSE;
3406         procfdtbl_releasefd(p, indx, NULL);
3407         fp_drop(p, indx, fp, 1);
3408         proc_fdunlock(p);
3409
3410         *retval = indx;
3411
3412         return (0);
3413 bad:
3414         context = *vfs_context_current();
3415         context.vc_ucred = fp->f_fglob->fg_cred;
3416
3417         if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3418             (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3419                 lf.l_whence = SEEK_SET;
3420                 lf.l_start = 0;
3421                 lf.l_len = 0;
3422                 lf.l_type = F_UNLCK;
3423
3424                 (void)VNOP_ADVLOCK(
3425                         vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3426         }
3427
3428         vn_close(vp, fp->f_fglob->fg_flag, &context);
3429         vnode_put(vp);
3430         fp_free(p, indx, fp);
3431
3432         return (error);
3433 }
3434
3435 /*
3436  * While most of the *at syscall handlers can call nameiat() which
3437  * is a wrapper around namei, the use of namei and initialisation
3438  * of nameidata are far removed and in different functions  - namei
3439  * gets called in vn_open_auth for open1. So we'll just do here what
3440  * nameiat() does.
3441  */
3442 static int
3443 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3444     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3445     int dirfd)
3446 {
3447         if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3448                 int error;
3449                 char c;
3450
3451                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3452                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
3453                         if (error)
3454                                 return (error);
3455                 } else {
3456                         c = *((char *)(ndp->ni_dirp));
3457                 }
3458
3459                 if (c != '/') {
3460                         vnode_t dvp_at;
3461
3462                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3463                             &dvp_at);
3464                         if (error)
3465                                 return (error);
3466
3467                         if (vnode_vtype(dvp_at) != VDIR) {
3468                                 vnode_put(dvp_at);
3469                                 return (ENOTDIR);
3470                         }
3471
3472                         ndp->ni_dvp = dvp_at;
3473                         ndp->ni_cnd.cn_flags |= USEDVP;
3474                         error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3475                             retval);
3476                         vnode_put(dvp_at);
3477                         return (error);
3478                 }
3479         }
3480
3481         return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3482 }
3483
3484 /*
3485  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3486  *
3487  * Parameters:  p                       Process requesting the open
3488  *              uap                     User argument descriptor (see below)
3489  *              retval                  Pointer to an area to receive the
3490  *                                      return calue from the system call
3491  *
3492  * Indirect:    uap->path               Path to open (same as 'open')
3493  *              uap->flags              Flags to open (same as 'open'
3494  *              uap->uid                UID to set, if creating
3495  *              uap->gid                GID to set, if creating
3496  *              uap->mode               File mode, if creating (same as 'open')
3497  *              uap->xsecurity          ACL to set, if creating
3498  *
3499  * Returns:     0                       Success
3500  *              !0                      errno value
3501  *
3502  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3503  *
3504  * XXX:         We should enummerate the possible errno values here, and where
3505  *              in the code they originated.
3506  */
3507 int
3508 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3509 {
3510         struct filedesc *fdp = p->p_fd;
3511         int ciferror;
3512         kauth_filesec_t xsecdst;
3513         struct vnode_attr va;
3514         struct nameidata nd;
3515         int cmode;
3516
3517         AUDIT_ARG(owner, uap->uid, uap->gid);
3518
3519         xsecdst = NULL;
3520         if ((uap->xsecurity != USER_ADDR_NULL) &&
3521             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3522                 return ciferror;
3523
3524         VATTR_INIT(&va);
3525         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3526         VATTR_SET(&va, va_mode, cmode);
3527         if (uap->uid != KAUTH_UID_NONE)
3528                 VATTR_SET(&va, va_uid, uap->uid);
3529         if (uap->gid != KAUTH_GID_NONE)
3530                 VATTR_SET(&va, va_gid, uap->gid);
3531         if (xsecdst != NULL)
3532                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3533
3534         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3535                uap->path, vfs_context_current());
3536
3537         ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3538                          fileproc_alloc_init, NULL, retval);
3539         if (xsecdst != NULL)
3540                 kauth_filesec_free(xsecdst);
3541
3542         return ciferror;
3543 }
3544
3545 /*
3546  * Go through the data-protected atomically controlled open (2)
3547  *
3548  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3549  */
3550 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3551         int flags = uap->flags;
3552         int class = uap->class;
3553         int dpflags = uap->dpflags;
3554
3555         /*
3556          * Follow the same path as normal open(2)
3557          * Look up the item if it exists, and acquire the vnode.
3558          */
3559         struct filedesc *fdp = p->p_fd;
3560         struct vnode_attr va;
3561         struct nameidata nd;
3562         int cmode;
3563         int error;
3564
3565         VATTR_INIT(&va);
3566         /* Mask off all but regular access permissions */
3567         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3568         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3569
3570         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3571                uap->path, vfs_context_current());
3572
3573         /*
3574          * Initialize the extra fields in vnode_attr to pass down our
3575          * extra fields.
3576          * 1. target cprotect class.
3577          * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3578          */
3579         if (flags & O_CREAT) {
3580                /* lower level kernel code validates that the class is valid before applying it. */
3581                if (class != PROTECTION_CLASS_DEFAULT) {
3582                        /*
3583                         * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3584                         * file behave the same as open (2)
3585                         */
3586                        VATTR_SET(&va, va_dataprotect_class, class);
3587                }
3588         }
3589
3590         if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3591                 if ( flags & (O_RDWR | O_WRONLY)) {
3592                         /* Not allowed to write raw encrypted bytes */
3593                         return EINVAL;
3594                 }
3595                 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3596                     VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3597                 }
3598                 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3599                     VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3600                 }
3601         }
3602
3603         error = open1(vfs_context_current(), &nd, uap->flags, &va,
3604                       fileproc_alloc_init, NULL, retval);
3605
3606         return error;
3607 }
3608
3609 static int
3610 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3611     int fd, enum uio_seg segflg, int *retval)
3612 {
3613         struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3614         struct vnode_attr va;
3615         struct nameidata nd;
3616         int cmode;
3617
3618         VATTR_INIT(&va);
3619         /* Mask off all but regular access permissions */
3620         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3621         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3622
3623         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3624             segflg, path, ctx);
3625
3626         return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3627             retval, fd));
3628 }
3629
3630 int
3631 open(proc_t p, struct open_args *uap, int32_t *retval)
3632 {
3633         __pthread_testcancel(1);
3634         return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3635 }
3636
3637 int
3638 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3639     int32_t *retval)
3640 {
3641         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3642             uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3643 }
3644
3645 int
3646 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3647                 int32_t *retval)
3648 {
3649         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3650             uap->mode, uap->fd, UIO_USERSPACE, retval));
3651 }
3652
3653 int
3654 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3655 {
3656         __pthread_testcancel(1);
3657         return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3658 }
3659
3660 /*
3661  * openbyid_np: open a file given a file system id and a file system object id
3662  *      the hfs file system object id is an fsobj_id_t {uint32, uint32}
3663  *      file systems that don't support object ids it is a node id (uint64_t).
3664  *
3665  * Parameters:  p                       Process requesting the open
3666  *              uap                     User argument descriptor (see below)
3667  *              retval                  Pointer to an area to receive the
3668  *                                      return calue from the system call
3669  *
3670  * Indirect:    uap->path               Path to open (same as 'open')
3671  *
3672  *              uap->fsid               id of target file system
3673  *              uap->objid              id of target file system object
3674  *              uap->flags              Flags to open (same as 'open')
3675  *
3676  * Returns:     0                       Success
3677  *              !0                      errno value
3678  *
3679  *
3680  * XXX:         We should enummerate the possible errno values here, and where
3681  *              in the code they originated.
3682  */
3683 int
3684 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
3685 {
3686         fsid_t fsid;
3687         uint64_t objid;
3688         int error;
3689         char *buf = NULL;
3690         int buflen = MAXPATHLEN;
3691         int pathlen = 0;
3692         vfs_context_t ctx = vfs_context_current();
3693
3694         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
3695                 return (error);
3696         }
3697
3698         /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
3699         if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
3700                 return (error);
3701         }
3702
3703         AUDIT_ARG(value32, fsid.val[0]);
3704         AUDIT_ARG(value64, objid);
3705
3706         /*resolve path from fsis, objid*/
3707         do {
3708                 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
3709                 if (buf == NULL) {
3710                         return (ENOMEM);
3711                 }
3712
3713                 error = fsgetpath_internal(
3714                         ctx, fsid.val[0], objid,
3715                         buflen, buf, &pathlen);
3716
3717                 if (error) {
3718                         FREE(buf, M_TEMP);
3719                         buf = NULL;
3720                 }
3721         } while (error == ENOSPC && (buflen += MAXPATHLEN));
3722
3723         if (error) {
3724                 return error;
3725         }
3726
3727         buf[pathlen] = 0;
3728
3729         error = openat_internal(
3730                 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
3731
3732         FREE(buf, M_TEMP);
3733
3734         return error;
3735 }
3736
3737
3738 /*
3739  * Create a special file.
3740  */
3741 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3742
3743 int
3744 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
3745 {
3746         struct vnode_attr va;
3747         vfs_context_t ctx = vfs_context_current();
3748         int error;
3749         struct nameidata nd;
3750         vnode_t vp, dvp;
3751
3752         VATTR_INIT(&va);
3753         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3754         VATTR_SET(&va, va_rdev, uap->dev);
3755
3756         /* If it's a mknod() of a FIFO, call mkfifo1() instead */
3757         if ((uap->mode & S_IFMT) == S_IFIFO)
3758                 return(mkfifo1(ctx, uap->path, &va));
3759
3760         AUDIT_ARG(mode, uap->mode);
3761         AUDIT_ARG(value32, uap->dev);
3762
3763         if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
3764                 return (error);
3765         NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
3766                 UIO_USERSPACE, uap->path, ctx);
3767         error = namei(&nd);
3768         if (error)
3769                 return (error);
3770         dvp = nd.ni_dvp;
3771         vp = nd.ni_vp;
3772
3773         if (vp != NULL) {
3774                 error = EEXIST;
3775                 goto out;
3776         }
3777
3778         switch (uap->mode & S_IFMT) {
3779         case S_IFCHR:
3780                 VATTR_SET(&va, va_type, VCHR);
3781                 break;
3782         case S_IFBLK:
3783                 VATTR_SET(&va, va_type, VBLK);
3784                 break;
3785         default:
3786                 error = EINVAL;
3787                 goto out;
3788         }
3789
3790 #if CONFIG_MACF
3791         error = mac_vnode_check_create(ctx,
3792             nd.ni_dvp, &nd.ni_cnd, &va);
3793         if (error)
3794                 goto out;
3795 #endif
3796
3797         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3798                 goto out;
3799
3800         if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
3801                 goto out;
3802
3803         if (vp) {
3804                 int     update_flags = 0;
3805
3806                 // Make sure the name & parent pointers are hooked up
3807                 if (vp->v_name == NULL)
3808                         update_flags |= VNODE_UPDATE_NAME;
3809                 if (vp->v_parent == NULLVP)
3810                         update_flags |= VNODE_UPDATE_PARENT;
3811
3812                 if (update_flags)
3813                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3814
3815 #if CONFIG_FSE
3816                 add_fsevent(FSE_CREATE_FILE, ctx,
3817                     FSE_ARG_VNODE, vp,
3818                     FSE_ARG_DONE);
3819 #endif
3820         }
3821
3822 out:
3823         /*
3824          * nameidone has to happen before we vnode_put(dvp)
3825          * since it may need to release the fs_nodelock on the dvp
3826          */
3827         nameidone(&nd);
3828
3829         if (vp)
3830                 vnode_put(vp);
3831         vnode_put(dvp);
3832
3833         return (error);
3834 }
3835
3836 /*
3837  * Create a named pipe.
3838  *
3839  * Returns:     0                       Success
3840  *              EEXIST
3841  *      namei:???
3842  *      vnode_authorize:???
3843  *      vn_create:???
3844  */
3845 static int
3846 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
3847 {
3848         vnode_t vp, dvp;
3849         int error;
3850         struct nameidata nd;
3851
3852         NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
3853                 UIO_USERSPACE, upath, ctx);
3854         error = namei(&nd);
3855         if (error)
3856                 return (error);
3857         dvp = nd.ni_dvp;
3858         vp = nd.ni_vp;
3859
3860         /* check that this is a new file and authorize addition */
3861         if (vp != NULL) {
3862                 error = EEXIST;
3863                 goto out;
3864         }
3865         VATTR_SET(vap, va_type, VFIFO);
3866
3867         if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
3868                 goto out;
3869
3870         error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
3871 out:
3872         /*
3873          * nameidone has to happen before we vnode_put(dvp)
3874          * since it may need to release the fs_nodelock on the dvp
3875          */
3876         nameidone(&nd);
3877
3878         if (vp)
3879                 vnode_put(vp);
3880         vnode_put(dvp);
3881
3882         return error;
3883 }
3884
3885
3886 /*
3887  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
3888  *
3889  * Parameters:  p                       Process requesting the open
3890  *              uap                     User argument descriptor (see below)
3891  *              retval                  (Ignored)
3892  *
3893  * Indirect:    uap->path               Path to fifo (same as 'mkfifo')
3894  *              uap->uid                UID to set
3895  *              uap->gid                GID to set
3896  *              uap->mode               File mode to set (same as 'mkfifo')
3897  *              uap->xsecurity          ACL to set, if creating
3898  *
3899  * Returns:     0                       Success
3900  *              !0                      errno value
3901  *
3902  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3903  *
3904  * XXX:         We should enummerate the possible errno values here, and where
3905  *              in the code they originated.
3906  */
3907 int
3908 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
3909 {
3910         int ciferror;
3911         kauth_filesec_t xsecdst;
3912         struct vnode_attr va;
3913
3914         AUDIT_ARG(owner, uap->uid, uap->gid);
3915
3916         xsecdst = KAUTH_FILESEC_NONE;
3917         if (uap->xsecurity != USER_ADDR_NULL) {
3918                 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
3919                         return ciferror;
3920         }
3921
3922         VATTR_INIT(&va);
3923         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3924         if (uap->uid != KAUTH_UID_NONE)
3925                 VATTR_SET(&va, va_uid, uap->uid);
3926         if (uap->gid != KAUTH_GID_NONE)
3927                 VATTR_SET(&va, va_gid, uap->gid);
3928         if (xsecdst != KAUTH_FILESEC_NONE)
3929                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3930
3931         ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
3932
3933         if (xsecdst != KAUTH_FILESEC_NONE)
3934                 kauth_filesec_free(xsecdst);
3935         return ciferror;
3936 }
3937
3938 /* ARGSUSED */
3939 int
3940 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
3941 {
3942         struct vnode_attr va;
3943
3944         VATTR_INIT(&va);
3945         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3946
3947         return(mkfifo1(vfs_context_current(), uap->path, &va));
3948 }
3949
3950
/*
 * my_strrchr: find the last occurrence of 'ch' in the NUL-terminated
 * string 'p'.
 *
 * As with strrchr(3), 'ch' is converted to char before comparing, so a
 * byte value above 0x7f passed as an int still matches where plain char
 * is signed.  The terminating NUL counts as part of the string, so
 * searching for 0 returns a pointer to the terminator.
 *
 * Returns:	pointer to the last matching character, or
 *		NULL if 'ch' does not occur in the string.
 */
static char *
my_strrchr(char *p, int ch)
{
	const char target = (char)ch;
	char *last = NULL;

	for (;; ++p) {
		if (*p == target)
			last = p;
		/* Test for the end after the compare so a NUL target matches. */
		if (*p == '\0')
			return (last);
	}
	/* NOTREACHED */
}
3964
3965 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
3966
3967 int
3968 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
3969 {
3970         int ret, len = _len;
3971
3972         *truncated_path = 0;
3973         ret = vn_getpath(dvp, path, &len);
3974         if (ret == 0 && len < (MAXPATHLEN - 1)) {
3975                 if (leafname) {
3976                         path[len-1] = '/';
3977                         len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
3978                         if (len > MAXPATHLEN) {
3979                                 char *ptr;
3980
3981                                 // the string got truncated!
3982                                 *truncated_path = 1;
3983                                 ptr = my_strrchr(path, '/');
3984                                 if (ptr) {
3985                                         *ptr = '\0';   // chop off the string at the last directory component
3986                                 }
3987                                 len = strlen(path) + 1;
3988                         }
3989                 }
3990         } else if (ret == 0) {
3991                 *truncated_path = 1;
3992         } else if (ret != 0) {
3993                 struct vnode *mydvp=dvp;
3994
3995                 if (ret != ENOSPC) {
3996                         printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
3997                                dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
3998                 }
3999                 *truncated_path = 1;
4000
4001                 do {
4002                         if (mydvp->v_parent != NULL) {
4003                                 mydvp = mydvp->v_parent;
4004                         } else if (mydvp->v_mount) {
4005                                 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4006                                 break;
4007                         } else {
4008                                 // no parent and no mount point?  only thing is to punt and say "/" changed
4009                                 strlcpy(path, "/", _len);
4010                                 len = 2;
4011                                 mydvp = NULL;
4012                         }
4013
4014                         if (mydvp == NULL) {
4015                                 break;
4016                         }
4017
4018                         len = _len;
4019                         ret = vn_getpath(mydvp, path, &len);
4020                 } while (ret == ENOSPC);
4021         }
4022
4023         return len;
4024 }
4025
4026
4027 /*
4028  * Make a hard file link.
4029  *
4030  * Returns:     0                       Success
4031  *              EPERM
4032  *              EEXIST
4033  *              EXDEV
4034  *      namei:???
4035  *      vnode_authorize:???
4036  *      VNOP_LINK:???
4037  */
4038 /* ARGSUSED */
4039 static int
4040 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4041     user_addr_t link, int flag, enum uio_seg segflg)
4042 {
4043         vnode_t vp, dvp, lvp;
4044         struct nameidata nd;
4045         int follow;
4046         int error;
4047 #if CONFIG_FSE
4048         fse_info finfo;
4049 #endif
4050         int need_event, has_listeners;
4051         char *target_path = NULL;
4052         int truncated=0;
4053
4054         vp = dvp = lvp = NULLVP;
4055
4056         /* look up the object we are linking to */
4057         follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4058         NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4059             segflg, path, ctx);
4060
4061         error = nameiat(&nd, fd1);
4062         if (error)
4063                 return (error);
4064         vp = nd.ni_vp;
4065
4066         nameidone(&nd);
4067
4068         /*
4069          * Normally, linking to directories is not supported.
4070          * However, some file systems may have limited support.
4071          */
4072         if (vp->v_type == VDIR) {
4073                 if (!(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
4074                         error = EPERM;   /* POSIX */
4075                         goto out;
4076                 }
4077                 /* Linking to a directory requires ownership. */
4078                 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4079                         struct vnode_attr dva;
4080
4081                         VATTR_INIT(&dva);
4082                         VATTR_WANTED(&dva, va_uid);
4083                         if (vnode_getattr(vp, &dva, ctx) != 0 ||
4084                             !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4085                             (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4086                                 error = EACCES;
4087                                 goto out;
4088                         }
4089                 }
4090         }
4091
4092         /* lookup the target node */
4093 #if CONFIG_TRIGGERS
4094         nd.ni_op = OP_LINK;
4095 #endif
4096         nd.ni_cnd.cn_nameiop = CREATE;
4097         nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4098         nd.ni_dirp = link;
4099         error = nameiat(&nd, fd2);
4100         if (error != 0)
4101                 goto out;
4102         dvp = nd.ni_dvp;
4103         lvp = nd.ni_vp;
4104
4105 #if CONFIG_MACF
4106         if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4107                 goto out2;
4108 #endif
4109
4110         /* or to anything that kauth doesn't want us to (eg. immutable items) */
4111         if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4112                 goto out2;
4113
4114         /* target node must not exist */
4115         if (lvp != NULLVP) {
4116                 error = EEXIST;
4117                 goto out2;
4118         }
4119         /* cannot link across mountpoints */
4120         if (vnode_mount(vp) != vnode_mount(dvp)) {
4121                 error = EXDEV;
4122                 goto out2;
4123         }
4124
4125         /* authorize creation of the target note */
4126         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4127                 goto out2;
4128
4129         /* and finally make the link */
4130         error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4131         if (error)
4132                 goto out2;
4133
4134 #if CONFIG_MACF
4135         (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4136 #endif
4137
4138 #if CONFIG_FSE
4139         need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4140 #else
4141         need_event = 0;
4142 #endif
4143         has_listeners = kauth_authorize_fileop_has_listeners();
4144
4145         if (need_event || has_listeners) {
4146                 char *link_to_path = NULL;
4147                 int len, link_name_len;
4148
4149                 /* build the path to the new link file */
4150                 GET_PATH(target_path);
4151                 if (target_path == NULL) {
4152                         error = ENOMEM;
4153                         goto out2;
4154                 }
4155
4156                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4157
4158                 if (has_listeners) {
4159                         /* build the path to file we are linking to */
4160                         GET_PATH(link_to_path);
4161                         if (link_to_path == NULL) {
4162                                 error = ENOMEM;
4163                                 goto out2;
4164                         }
4165
4166                         link_name_len = MAXPATHLEN;
4167                         if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4168                                 /*
4169                                  * Call out to allow 3rd party notification of rename.
4170                                  * Ignore result of kauth_authorize_fileop call.
4171                                  */
4172                                 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4173                                                        (uintptr_t)link_to_path,
4174                                                        (uintptr_t)target_path);
4175                         }
4176                         if (link_to_path != NULL) {
4177                                 RELEASE_PATH(link_to_path);
4178                         }
4179                 }
4180 #if CONFIG_FSE
4181                 if (need_event) {
4182                         /* construct fsevent */
4183                         if (get_fse_info(vp, &finfo, ctx) == 0) {
4184                                 if (truncated) {
4185                                         finfo.mode |= FSE_TRUNCATED_PATH;
4186                                 }
4187
4188                                 // build the path to the destination of the link
4189                                 add_fsevent(FSE_CREATE_FILE, ctx,
4190                                             FSE_ARG_STRING, len, target_path,
4191                                             FSE_ARG_FINFO, &finfo,
4192                                             FSE_ARG_DONE);
4193                         }
4194                         if (vp->v_parent) {
4195                             add_fsevent(FSE_STAT_CHANGED, ctx,
4196                                 FSE_ARG_VNODE, vp->v_parent,
4197                                 FSE_ARG_DONE);
4198                         }
4199                 }
4200 #endif
4201         }
4202 out2:
4203         /*
4204          * nameidone has to happen before we vnode_put(dvp)
4205          * since it may need to release the fs_nodelock on the dvp
4206          */
4207         nameidone(&nd);
4208         if (target_path != NULL) {
4209                 RELEASE_PATH(target_path);
4210         }
4211 out:
4212         if (lvp)
4213                 vnode_put(lvp);
4214         if (dvp)
4215                 vnode_put(dvp);
4216         vnode_put(vp);
4217         return (error);
4218 }
4219
4220 int
4221 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4222 {
4223         return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4224             AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4225 }
4226
4227 int
4228 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4229 {
4230         if (uap->flag & ~AT_SYMLINK_FOLLOW)
4231                 return (EINVAL);
4232
4233         return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4234             uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4235 }
4236
4237 /*
4238  * Make a symbolic link.
4239  *
4240  * We could add support for ACLs here too...
4241  */
4242 /* ARGSUSED */
4243 static int
4244 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4245     user_addr_t link, enum uio_seg segflg)
4246 {
4247         struct vnode_attr va;
4248         char *path;
4249         int error;
4250         struct nameidata nd;
4251         vnode_t vp, dvp;
4252         uint32_t dfflags;       // Directory file flags
4253         size_t dummy=0;
4254         proc_t p;
4255
4256         error = 0;
4257         if (UIO_SEG_IS_USER_SPACE(segflg)) {
4258                 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4259                 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4260         } else {
4261                 path = (char *)path_data;
4262         }
4263         if (error)
4264                 goto out;
4265         AUDIT_ARG(text, path);  /* This is the link string */
4266
4267         NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4268             segflg, link, ctx);
4269
4270         error = nameiat(&nd, fd);
4271         if (error)
4272                 goto out;
4273         dvp = nd.ni_dvp;
4274         vp = nd.ni_vp;
4275
4276         p = vfs_context_proc(ctx);
4277         VATTR_INIT(&va);
4278         VATTR_SET(&va, va_type, VLNK);
4279         VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4280
4281         /*
4282          * Handle inheritance of restricted flag
4283          */
4284         error = vnode_flags(dvp, &dfflags, ctx);
4285         if (error)
4286                 goto skipit;
4287         if (dfflags & SF_RESTRICTED)
4288                 VATTR_SET(&va, va_flags, SF_RESTRICTED);
4289
4290 #if CONFIG_MACF
4291         error = mac_vnode_check_create(ctx,
4292                         dvp, &nd.ni_cnd, &va);
4293 #endif
4294         if (error != 0) {
4295             goto skipit;
4296         }
4297
4298         if (vp != NULL) {
4299             error = EEXIST;
4300             goto skipit;
4301         }
4302
4303         /* authorize */
4304         if (error == 0)
4305                 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4306         /* get default ownership, etc. */
4307         if (error == 0)
4308                 error = vnode_authattr_new(dvp, &va, 0, ctx);
4309         if (error == 0)
4310                 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4311
4312 #if CONFIG_MACF
4313         if (error == 0 && vp)
4314                 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4315 #endif
4316
4317         /* do fallback attribute handling */
4318         if (error == 0 && vp)
4319                 error = vnode_setattr_fallback(vp, &va, ctx);
4320
4321         if (error == 0) {
4322                 int     update_flags = 0;
4323
4324                 /*check if a new vnode was created, else try to get one*/
4325                 if (vp == NULL) {
4326                         nd.ni_cnd.cn_nameiop = LOOKUP;
4327 #if CONFIG_TRIGGERS
4328                         nd.ni_op = OP_LOOKUP;
4329 #endif
4330                         nd.ni_cnd.cn_flags = 0;
4331                         error = nameiat(&nd, fd);
4332                         vp = nd.ni_vp;
4333
4334                         if (vp == NULL)
4335                                 goto skipit;
4336                 }
4337
4338 #if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4339                 /* call out to allow 3rd party notification of rename.
4340                  * Ignore result of kauth_authorize_fileop call.
4341                  */
4342                 if (kauth_authorize_fileop_has_listeners() &&
4343                     namei(&nd) == 0) {
4344                         char *new_link_path = NULL;
4345                         int             len;
4346
4347                         /* build the path to the new link file */
4348                         new_link_path = get_pathbuff();
4349                         len = MAXPATHLEN;
4350                         vn_getpath(dvp, new_link_path, &len);
4351                         if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4352                                 new_link_path[len - 1] = '/';
4353                                 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4354                         }
4355
4356                         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4357                                            (uintptr_t)path, (uintptr_t)new_link_path);
4358                         if (new_link_path != NULL)
4359                                 release_pathbuff(new_link_path);
4360                 }
4361 #endif
4362                 // Make sure the name & parent pointers are hooked up
4363                 if (vp->v_name == NULL)
4364                         update_flags |= VNODE_UPDATE_NAME;
4365                 if (vp->v_parent == NULLVP)
4366                         update_flags |= VNODE_UPDATE_PARENT;
4367
4368                 if (update_flags)
4369                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4370
4371 #if CONFIG_FSE
4372                 add_fsevent(FSE_CREATE_FILE, ctx,
4373                             FSE_ARG_VNODE, vp,
4374                             FSE_ARG_DONE);
4375 #endif
4376         }
4377
4378 skipit:
4379         /*
4380          * nameidone has to happen before we vnode_put(dvp)
4381          * since it may need to release the fs_nodelock on the dvp
4382          */
4383         nameidone(&nd);
4384
4385         if (vp)
4386                 vnode_put(vp);
4387         vnode_put(dvp);
4388 out:
4389         if (path && (path != (char *)path_data))
4390                 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4391
4392         return (error);
4393 }
4394
4395 int
4396 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4397 {
4398         return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4399             uap->link, UIO_USERSPACE));
4400 }
4401
4402 int
4403 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4404     __unused int32_t *retval)
4405 {
4406         return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4407             uap->path2, UIO_USERSPACE));
4408 }
4409
4410 /*
4411  * Delete a whiteout from the filesystem.
4412  * No longer supported.
4413  */
4414 int
4415 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4416 {
4417         return (ENOTSUP);
4418 }
4419
4420 /*
4421  * Delete a name from the filesystem.
4422  */
4423 /* ARGSUSED */
4424 static int
4425 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4426     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4427 {
4428         struct nameidata nd;
4429         vnode_t vp, dvp;
4430         int error;
4431         struct componentname *cnp;
4432         char  *path = NULL;
4433         int  len=0;
4434 #if CONFIG_FSE
4435         fse_info  finfo;
4436         struct vnode_attr va;
4437 #endif
4438         int flags;
4439         int need_event;
4440         int has_listeners;
4441         int truncated_path;
4442         int batched;
4443         struct vnode_attr *vap;
4444         int do_retry;
4445         int retry_count = 0;
4446         int cn_flags;
4447
4448         cn_flags = LOCKPARENT;
4449         if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4450                 cn_flags |= AUDITVNPATH1;
4451         /* If a starting dvp is passed, it trumps any fd passed. */
4452         if (start_dvp)
4453                 cn_flags |= USEDVP;
4454
4455 #if NAMEDRSRCFORK
4456         /* unlink or delete is allowed on rsrc forks and named streams */
4457         cn_flags |= CN_ALLOWRSRCFORK;
4458 #endif
4459
4460 retry:
4461         do_retry = 0;
4462         flags = 0;
4463         need_event = 0;
4464         has_listeners = 0;
4465         truncated_path = 0;
4466         vap = NULL;
4467
4468         NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4469
4470         nd.ni_dvp = start_dvp;
4471         nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4472         cnp = &nd.ni_cnd;
4473
4474 lookup_continue:
4475         error = nameiat(&nd, fd);
4476         if (error)
4477                 return (error);
4478
4479         dvp = nd.ni_dvp;
4480         vp = nd.ni_vp;
4481
4482
4483         /* With Carbon delete semantics, busy files cannot be deleted */
4484         if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4485                 flags |= VNODE_REMOVE_NODELETEBUSY;
4486         }
4487
4488         /* Skip any potential upcalls if told to. */
4489         if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4490                 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4491         }
4492
4493         if (vp) {
4494                 batched = vnode_compound_remove_available(vp);
4495                 /*
4496                  * The root of a mounted filesystem cannot be deleted.
4497                  */
4498                 if (vp->v_flag & VROOT) {
4499                         error = EBUSY;
4500                 }
4501
4502                 if (!batched) {
4503                         error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4504                         if (error) {
4505                                 if (error == ENOENT) {
4506                                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4507                                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4508                                                 do_retry = 1;
4509                                                 retry_count++;
4510                                         }
4511                                 }
4512                                 goto out;
4513                         }
4514                 }
4515         } else {
4516                 batched = 1;
4517
4518                 if (!vnode_compound_remove_available(dvp)) {
4519                         panic("No vp, but no compound remove?");
4520                 }
4521         }
4522
4523 #if CONFIG_FSE
4524         need_event = need_fsevent(FSE_DELETE, dvp);
4525         if (need_event) {
4526                 if (!batched) {
4527                         if ((vp->v_flag & VISHARDLINK) == 0) {
4528                                 /* XXX need to get these data in batched VNOP */
4529                                 get_fse_info(vp, &finfo, ctx);
4530                         }
4531                 } else {
4532                         error = vfs_get_notify_attributes(&va);
4533                         if (error) {
4534                                 goto out;
4535                         }
4536
4537                         vap = &va;
4538                 }
4539         }
4540 #endif
4541         has_listeners = kauth_authorize_fileop_has_listeners();
4542         if (need_event || has_listeners) {
4543                 if (path == NULL) {
4544                         GET_PATH(path);
4545                         if (path == NULL) {
4546                                 error = ENOMEM;
4547                                 goto out;
4548                         }
4549                 }
4550                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4551         }
4552
4553 #if NAMEDRSRCFORK
4554         if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4555                 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4556         else
4557 #endif
4558         {
4559                 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4560                 vp = nd.ni_vp;
4561                 if (error == EKEEPLOOKING) {
4562                         if (!batched) {
4563                                 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4564                         }
4565
4566                         if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4567                                 panic("EKEEPLOOKING, but continue flag not set?");
4568                         }
4569
4570                         if (vnode_isdir(vp)) {
4571                                 error = EISDIR;
4572                                 goto out;
4573                         }
4574                         goto lookup_continue;
4575                 } else if (error == ENOENT && batched) {
4576                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4577                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4578                                 /*
4579                                  * For compound VNOPs, the authorization callback may
4580                                  * return ENOENT in case of racing hardlink lookups
4581                                  * hitting the name  cache, redrive the lookup.
4582                                  */
4583                                 do_retry = 1;
4584                                 retry_count += 1;
4585                                 goto out;
4586                         }
4587                 }
4588         }
4589
4590         /*
4591          * Call out to allow 3rd party notification of delete.
4592          * Ignore result of kauth_authorize_fileop call.
4593          */
4594         if (!error) {
4595                 if (has_listeners) {
4596                         kauth_authorize_fileop(vfs_context_ucred(ctx),
4597                                 KAUTH_FILEOP_DELETE,
4598                                 (uintptr_t)vp,
4599                                 (uintptr_t)path);
4600                 }
4601
4602                 if (vp->v_flag & VISHARDLINK) {
4603                     //
4604                     // if a hardlink gets deleted we want to blow away the
4605                     // v_parent link because the path that got us to this
4606                     // instance of the link is no longer valid.  this will
4607                     // force the next call to get the path to ask the file
4608                     // system instead of just following the v_parent link.
4609                     //
4610                     vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4611                 }
4612
4613 #if CONFIG_FSE
4614                 if (need_event) {
4615                         if (vp->v_flag & VISHARDLINK) {
4616                                 get_fse_info(vp, &finfo, ctx);
4617                         } else if (vap) {
4618                                 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4619                         }
4620                         if (truncated_path) {
4621                                 finfo.mode |= FSE_TRUNCATED_PATH;
4622                         }
4623                         add_fsevent(FSE_DELETE, ctx,
4624                                                 FSE_ARG_STRING, len, path,
4625                                                 FSE_ARG_FINFO, &finfo,
4626                                                 FSE_ARG_DONE);
4627                 }
4628 #endif
4629         }
4630
4631 out:
4632         if (path != NULL)
4633                 RELEASE_PATH(path);
4634
4635 #if NAMEDRSRCFORK
4636         /* recycle the deleted rsrc fork vnode to force a reclaim, which
4637          * will cause its shadow file to go away if necessary.
4638          */
4639          if (vp && (vnode_isnamedstream(vp)) &&
4640                 (vp->v_parent != NULLVP) &&
4641                 vnode_isshadow(vp)) {
4642                         vnode_recycle(vp);
4643          }
4644 #endif
4645         /*
4646          * nameidone has to happen before we vnode_put(dvp)
4647          * since it may need to release the fs_nodelock on the dvp
4648          */
4649         nameidone(&nd);
4650         vnode_put(dvp);
4651         if (vp) {
4652                 vnode_put(vp);
4653         }
4654
4655         if (do_retry) {
4656                 goto retry;
4657         }
4658
4659         return (error);
4660 }
4661
4662 int
4663 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4664     enum uio_seg segflg, int unlink_flags)
4665 {
4666         return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4667             unlink_flags));
4668 }
4669
4670 /*
4671  * Delete a name from the filesystem using Carbon semantics.
4672  */
4673 int
4674 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4675 {
4676         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4677             uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4678 }
4679
4680 /*
4681  * Delete a name from the filesystem using POSIX semantics.
4682  */
4683 int
4684 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4685 {
4686         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4687             uap->path, UIO_USERSPACE, 0));
4688 }
4689
4690 int
4691 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4692 {
4693         if (uap->flag & ~AT_REMOVEDIR)
4694                 return (EINVAL);
4695
4696         if (uap->flag & AT_REMOVEDIR)
4697                 return (rmdirat_internal(vfs_context_current(), uap->fd,
4698                     uap->path, UIO_USERSPACE));
4699         else
4700                 return (unlinkat_internal(vfs_context_current(), uap->fd,
4701                     NULLVP, uap->path, UIO_USERSPACE, 0));
4702 }
4703
4704 /*
4705  * Reposition read/write file offset.
4706  */
4707 int
4708 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4709 {
4710         struct fileproc *fp;
4711         vnode_t vp;
4712         struct vfs_context *ctx;
4713         off_t offset = uap->offset, file_size;
4714         int error;
4715
4716         if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4717                 if (error == ENOTSUP)
4718                         return (ESPIPE);
4719                 return (error);
4720         }
4721         if (vnode_isfifo(vp)) {
4722                 file_drop(uap->fd);
4723                 return(ESPIPE);
4724         }
4725
4726
4727         ctx = vfs_context_current();
4728 #if CONFIG_MACF
4729         if (uap->whence == L_INCR && uap->offset == 0)
4730                 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4731                     fp->f_fglob);
4732         else
4733                 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4734                     fp->f_fglob);
4735         if (error) {
4736                 file_drop(uap->fd);
4737                 return (error);
4738         }
4739 #endif
4740         if ( (error = vnode_getwithref(vp)) ) {
4741                 file_drop(uap->fd);
4742                 return(error);
4743         }
4744
4745         switch (uap->whence) {
4746         case L_INCR:
4747                 offset += fp->f_fglob->fg_offset;
4748                 break;
4749         case L_XTND:
4750                 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
4751                         break;
4752                 offset += file_size;
4753                 break;
4754         case L_SET:
4755                 break;
4756         default:
4757                 error = EINVAL;
4758         }
4759         if (error == 0) {
4760                 if (uap->offset > 0 && offset < 0) {
4761                         /* Incremented/relative move past max size */
4762                         error = EOVERFLOW;
4763                 } else {
4764                         /*
4765                          * Allow negative offsets on character devices, per
4766                          * POSIX 1003.1-2001.  Most likely for writing disk
4767                          * labels.
4768                          */
4769                         if (offset < 0 && vp->v_type != VCHR) {
4770                                 /* Decremented/relative move before start */
4771                                 error = EINVAL;
4772                         } else {
4773                                 /* Success */
4774                                 fp->f_fglob->fg_offset = offset;
4775                                 *retval = fp->f_fglob->fg_offset;
4776                         }
4777                 }
4778         }
4779
4780         /*
4781          * An lseek can affect whether data is "available to read."  Use
4782          * hint of NOTE_NONE so no EVFILT_VNODE events fire
4783          */
4784         post_event_if_success(vp, error, NOTE_NONE);
4785         (void)vnode_put(vp);
4786         file_drop(uap->fd);
4787         return (error);
4788 }
4789
4790
4791 /*
4792  * Check access permissions.
4793  *
4794  * Returns:     0                       Success
4795  *              vnode_authorize:???
4796  */
4797 static int
4798 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4799 {
4800         kauth_action_t action;
4801         int error;
4802
4803         /*
4804          * If just the regular access bits, convert them to something
4805          * that vnode_authorize will understand.
4806          */
4807         if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4808                 action = 0;
4809                 if (uflags & R_OK)
4810                         action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
4811                 if (uflags & W_OK) {
4812                         if (vnode_isdir(vp)) {
4813                                 action |= KAUTH_VNODE_ADD_FILE |
4814                                     KAUTH_VNODE_ADD_SUBDIRECTORY;
4815                                 /* might want delete rights here too */
4816                         } else {
4817                                 action |= KAUTH_VNODE_WRITE_DATA;
4818                         }
4819                 }
4820                 if (uflags & X_OK) {
4821                         if (vnode_isdir(vp)) {
4822                                 action |= KAUTH_VNODE_SEARCH;
4823                         } else {
4824                                 action |= KAUTH_VNODE_EXECUTE;
4825                         }
4826                 }
4827         } else {
4828                 /* take advantage of definition of uflags */
4829                 action = uflags >> 8;
4830         }
4831
4832 #if CONFIG_MACF
4833         error = mac_vnode_check_access(ctx, vp, uflags);
4834         if (error)
4835                 return (error);
4836 #endif /* MAC */
4837
4838         /* action == 0 means only check for existence */
4839         if (action != 0) {
4840                 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4841         } else {
4842                 error = 0;
4843         }
4844
4845         return(error);
4846 }
4847
4848
4849
4850 /*
4851  * access_extended: Check access permissions in bulk.
4852  *
4853  * Description: uap->entries            Pointer to an array of accessx
4854  *                                      descriptor structs, plus one or
4855  *                                      more NULL terminated strings (see
4856  *                                      "Notes" section below).
4857  *              uap->size               Size of the area pointed to by
4858  *                                      uap->entries.
4859  *              uap->results            Pointer to the results array.
4860  *
4861  * Returns:     0                       Success
4862  *              ENOMEM                  Insufficient memory
4863  *              EINVAL                  Invalid arguments
4864  *              namei:EFAULT            Bad address
4865  *              namei:ENAMETOOLONG      Filename too long
4866  *              namei:ENOENT            No such file or directory
4867  *              namei:ELOOP             Too many levels of symbolic links
4868  *              namei:EBADF             Bad file descriptor
4869  *              namei:ENOTDIR           Not a directory
4870  *              namei:???
4871  *              access1:
4872  *
4873  * Implicit returns:
4874  *              uap->results            Array contents modified
4875  *
4876  * Notes:       The uap->entries are structured as an arbitrary length array
4877  *              of accessx descriptors, followed by one or more NULL terminated
4878  *              strings
4879  *
4880  *                      struct accessx_descriptor[0]
4881  *                      ...
4882  *                      struct accessx_descriptor[n]
4883  *                      char name_data[0];
4884  *
4885  *              We determine the entry count by walking the buffer containing
4886  *              the uap->entries argument descriptor.  For each descriptor we
4887  *              see, the valid values for the offset ad_name_offset will be
4888  *              in the byte range:
4889  *
4890  *                      [ uap->entries + sizeof(struct accessx_descriptor) ]
4891  *                                              to
4892  *                              [ uap->entries + uap->size - 2 ]
4893  *
4894  *              since we must have at least one string, and the string must
4895  *              be at least one character plus the NULL terminator in length.
4896  *
4897  * XXX:         Need to support the check-as uid argument
4898  */
4899 int
4900 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
4901 {
4902         struct accessx_descriptor *input = NULL;
4903         errno_t *result = NULL;
4904         errno_t error = 0;
4905         int wantdelete = 0;
4906         unsigned int desc_max, desc_actual, i, j;
4907         struct vfs_context context;
4908         struct nameidata nd;
4909         int niopts;
4910         vnode_t vp = NULL;
4911         vnode_t dvp = NULL;
4912 #define ACCESSX_MAX_DESCR_ON_STACK 10
4913         struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
4914
4915         context.vc_ucred = NULL;
4916
4917         /*
4918          * Validate parameters; if valid, copy the descriptor array and string
4919          * arguments into local memory.  Before proceeding, the following
4920          * conditions must have been met:
4921          *
4922          * o    The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
4923          * o    There must be sufficient room in the request for at least one
4924          *      descriptor and a one yte NUL terminated string.
4925          * o    The allocation of local storage must not fail.
4926          */
4927         if (uap->size > ACCESSX_MAX_TABLESIZE)
4928                 return(ENOMEM);
4929         if (uap->size < (sizeof(struct accessx_descriptor) + 2))
4930                 return(EINVAL);
4931         if (uap->size <= sizeof (stack_input)) {
4932                 input = stack_input;
4933         } else {
4934         MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
4935         if (input == NULL) {
4936                 error = ENOMEM;
4937                 goto out;
4938         }
4939         }
4940         error = copyin(uap->entries, input, uap->size);
4941         if (error)
4942                 goto out;
4943
4944         AUDIT_ARG(opaque, input, uap->size);
4945
4946         /*
4947          * Force NUL termination of the copyin buffer to avoid nami() running
4948          * off the end.  If the caller passes us bogus data, they may get a
4949          * bogus result.
4950          */
4951         ((char *)input)[uap->size - 1] = 0;
4952
4953         /*
4954          * Access is defined as checking against the process' real identity,
4955          * even if operations are checking the effective identity.  This
4956          * requires that we use a local vfs context.
4957          */
4958         context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
4959         context.vc_thread = current_thread();
4960
4961         /*
4962          * Find out how many entries we have, so we can allocate the result
4963          * array by walking the list and adjusting the count downward by the
4964          * earliest string offset we see.
4965          */
4966         desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
4967         desc_actual = desc_max;
4968         for (i = 0; i < desc_actual; i++) {
4969                 /*
4970                  * Take the offset to the name string for this entry and
4971                  * convert to an input array index, which would be one off
4972                  * the end of the array if this entry was the lowest-addressed
4973                  * name string.
4974                  */
4975                 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
4976
4977                 /*
4978                  * An offset greater than the max allowable offset is an error.
4979                  * It is also an error for any valid entry to point
4980                  * to a location prior to the end of the current entry, if
4981                  * it's not a reference to the string of the previous entry.
4982                  */
4983                 if (j > desc_max || (j != 0 && j <= i)) {
4984                         error = EINVAL;
4985                         goto out;
4986                 }
4987
4988                 /*
4989                  * An offset of 0 means use the previous descriptor's offset;
4990                  * this is used to chain multiple requests for the same file
4991                  * to avoid multiple lookups.
4992                  */
4993                 if (j == 0) {
4994                         /* This is not valid for the first entry */
4995                         if (i == 0) {
4996                                 error = EINVAL;
4997                                 goto out;
4998                         }
4999                         continue;
5000                 }
5001
5002                 /*
5003                  * If the offset of the string for this descriptor is before
5004                  * what we believe is the current actual last descriptor,
5005                  * then we need to adjust our estimate downward; this permits
5006                  * the string table following the last descriptor to be out
5007                  * of order relative to the descriptor list.
5008                  */
5009                 if (j < desc_actual)
5010                         desc_actual = j;
5011         }
5012
5013         /*
5014          * We limit the actual number of descriptors we are willing to process
5015          * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
5016          * requested does not exceed this limit,
5017          */
5018         if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5019                 error = ENOMEM;
5020                 goto out;
5021         }
5022         MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
5023         if (result == NULL) {
5024                 error = ENOMEM;
5025                 goto out;
5026         }
5027
5028         /*
5029          * Do the work by iterating over the descriptor entries we know to
5030          * at least appear to contain valid data.
5031          */
5032         error = 0;
5033         for (i = 0; i < desc_actual; i++) {
5034                 /*
5035                  * If the ad_name_offset is 0, then we use the previous
5036                  * results to make the check; otherwise, we are looking up
5037                  * a new file name.
5038                  */
5039                 if (input[i].ad_name_offset != 0) {
5040                         /* discard old vnodes */
5041                         if (vp) {
5042                                 vnode_put(vp);
5043                                 vp = NULL;
5044                         }
5045                         if (dvp) {
5046                                 vnode_put(dvp);
5047                                 dvp = NULL;
5048                         }
5049
5050                         /*
5051                          * Scan forward in the descriptor list to see if we
5052                          * need the parent vnode.  We will need it if we are
5053                          * deleting, since we must have rights  to remove
5054                          * entries in the parent directory, as well as the
5055                          * rights to delete the object itself.
5056                          */
5057                         wantdelete = input[i].ad_flags & _DELETE_OK;
5058                         for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5059                                 if (input[j].ad_flags & _DELETE_OK)
5060                                         wantdelete = 1;
5061
5062                         niopts = FOLLOW | AUDITVNPATH1;
5063
5064                         /* need parent for vnode_authorize for deletion test */
5065                         if (wantdelete)
5066                                 niopts |= WANTPARENT;
5067
5068                         /* do the lookup */
5069                         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5070                                CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5071                                &context);
5072                         error = namei(&nd);
5073                         if (!error) {
5074                                 vp = nd.ni_vp;
5075                                 if (wantdelete)
5076                                         dvp = nd.ni_dvp;
5077                         }
5078                         nameidone(&nd);
5079                 }
5080
5081                 /*
5082                  * Handle lookup errors.
5083                  */
5084                 switch(error) {
5085                 case ENOENT:
5086                 case EACCES:
5087                 case EPERM:
5088                 case ENOTDIR:
5089                         result[i] = error;
5090                         break;
5091                 case 0:
5092                         /* run this access check */
5093                         result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5094                         break;
5095                 default:
5096                         /* fatal lookup error */
5097
5098                         goto out;
5099                 }
5100         }
5101
5102         AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5103
5104         /* copy out results */
5105         error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5106
5107 out:
5108         if (input && input != stack_input)
5109                 FREE(input, M_TEMP);
5110         if (result)
5111                 FREE(result, M_TEMP);
5112         if (vp)
5113                 vnode_put(vp);
5114         if (dvp)
5115                 vnode_put(dvp);
5116         if (IS_VALID_CRED(context.vc_ucred))
5117                 kauth_cred_unref(&context.vc_ucred);
5118         return(error);
5119 }
5120
5121
5122 /*
5123  * Returns:     0                       Success
5124  *              namei:EFAULT            Bad address
5125  *              namei:ENAMETOOLONG      Filename too long
5126  *              namei:ENOENT            No such file or directory
5127  *              namei:ELOOP             Too many levels of symbolic links
5128  *              namei:EBADF             Bad file descriptor
5129  *              namei:ENOTDIR           Not a directory
5130  *              namei:???
5131  *              access1:
5132  */
5133 static int
5134 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5135     int flag, enum uio_seg segflg)
5136 {
5137         int error;
5138         struct nameidata nd;
5139         int niopts;
5140         struct vfs_context context;
5141 #if NAMEDRSRCFORK
5142         int is_namedstream = 0;
5143 #endif
5144
5145         /*
5146          * Unless the AT_EACCESS option is used, Access is defined as checking
5147          * against the process' real identity, even if operations are checking
5148          * the effective identity.  So we need to tweak the credential
5149          * in the context for that case.
5150          */
5151         if (!(flag & AT_EACCESS))
5152                 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5153         else
5154                 context.vc_ucred = ctx->vc_ucred;
5155         context.vc_thread = ctx->vc_thread;
5156
5157
5158         niopts = FOLLOW | AUDITVNPATH1;
5159         /* need parent for vnode_authorize for deletion test */
5160         if (amode & _DELETE_OK)
5161                 niopts |= WANTPARENT;
5162         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5163                path, &context);
5164
5165 #if NAMEDRSRCFORK
5166         /* access(F_OK) calls are allowed for resource forks. */
5167         if (amode == F_OK)
5168                 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5169 #endif
5170         error = nameiat(&nd, fd);
5171         if (error)
5172                 goto out;
5173
5174 #if NAMEDRSRCFORK
5175         /* Grab reference on the shadow stream file vnode to
5176          * force an inactive on release which will mark it
5177          * for recycle.
5178          */
5179         if (vnode_isnamedstream(nd.ni_vp) &&
5180             (nd.ni_vp->v_parent != NULLVP) &&
5181             vnode_isshadow(nd.ni_vp)) {
5182                 is_namedstream = 1;
5183                 vnode_ref(nd.ni_vp);
5184         }
5185 #endif
5186
5187         error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5188
5189 #if NAMEDRSRCFORK
5190         if (is_namedstream) {
5191                 vnode_rele(nd.ni_vp);
5192         }
5193 #endif
5194
5195         vnode_put(nd.ni_vp);
5196         if (amode & _DELETE_OK)
5197                 vnode_put(nd.ni_dvp);
5198         nameidone(&nd);
5199
5200 out:
5201         if (!(flag & AT_EACCESS))
5202                 kauth_cred_unref(&context.vc_ucred);
5203         return (error);
5204 }
5205
5206 int
5207 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5208 {
5209         return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5210             uap->path, uap->flags, 0, UIO_USERSPACE));
5211 }
5212
5213 int
5214 faccessat(__unused proc_t p, struct faccessat_args *uap,
5215           __unused int32_t *retval)
5216 {
5217         if (uap->flag & ~AT_EACCESS)
5218                 return (EINVAL);
5219
5220         return (faccessat_internal(vfs_context_current(), uap->fd,
5221             uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5222 }
5223
5224 /*
5225  * Returns:     0                       Success
5226  *              EFAULT
5227  *      copyout:EFAULT
5228  *      namei:???
5229  *      vn_stat:???
5230  */
5231 static int
5232 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5233     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5234     enum uio_seg segflg, int fd, int flag)
5235 {
5236         struct nameidata nd;
5237         int follow;
5238         union {
5239                 struct stat sb;
5240                 struct stat64 sb64;
5241         } source;
5242         union {
5243                 struct user64_stat user64_sb;
5244                 struct user32_stat user32_sb;
5245                 struct user64_stat64 user64_sb64;
5246                 struct user32_stat64 user32_sb64;
5247         } dest;
5248         caddr_t sbp;
5249         int error, my_size;
5250         kauth_filesec_t fsec;
5251         size_t xsecurity_bufsize;
5252         void * statptr;
5253
5254         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5255         NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5256             segflg, path, ctx);
5257
5258 #if NAMEDRSRCFORK
5259         int is_namedstream = 0;
5260         /* stat calls are allowed for resource forks. */
5261         nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5262 #endif
5263         error = nameiat(&nd, fd);
5264         if (error)
5265                 return (error);
5266         fsec = KAUTH_FILESEC_NONE;
5267
5268         statptr = (void *)&source;
5269
5270 #if NAMEDRSRCFORK
5271         /* Grab reference on the shadow stream file vnode to
5272          * force an inactive on release which will mark it
5273          * for recycle.
5274          */
5275         if (vnode_isnamedstream(nd.ni_vp) &&
5276             (nd.ni_vp->v_parent != NULLVP) &&
5277             vnode_isshadow(nd.ni_vp)) {
5278                 is_namedstream = 1;
5279                 vnode_ref(nd.ni_vp);
5280         }
5281 #endif
5282
5283         error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5284
5285 #if NAMEDRSRCFORK
5286         if (is_namedstream) {
5287                 vnode_rele(nd.ni_vp);
5288         }
5289 #endif
5290         vnode_put(nd.ni_vp);
5291         nameidone(&nd);
5292
5293         if (error)
5294                 return (error);
5295         /* Zap spare fields */
5296         if (isstat64 != 0) {
5297                 source.sb64.st_lspare = 0;
5298                 source.sb64.st_qspare[0] = 0LL;
5299                 source.sb64.st_qspare[1] = 0LL;
5300                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5301                         munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5302                         my_size = sizeof(dest.user64_sb64);
5303                         sbp = (caddr_t)&dest.user64_sb64;
5304                 } else {
5305                         munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5306                         my_size = sizeof(dest.user32_sb64);
5307                         sbp = (caddr_t)&dest.user32_sb64;
5308                 }
5309                 /*
5310                  * Check if we raced (post lookup) against the last unlink of a file.
5311                  */
5312                 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5313                         source.sb64.st_nlink = 1;
5314                 }
5315         } else {
5316                 source.sb.st_lspare = 0;
5317                 source.sb.st_qspare[0] = 0LL;
5318                 source.sb.st_qspare[1] = 0LL;
5319                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5320                         munge_user64_stat(&source.sb, &dest.user64_sb);
5321                         my_size = sizeof(dest.user64_sb);
5322                         sbp = (caddr_t)&dest.user64_sb;
5323                 } else {
5324                         munge_user32_stat(&source.sb, &dest.user32_sb);
5325                         my_size = sizeof(dest.user32_sb);
5326                         sbp = (caddr_t)&dest.user32_sb;
5327                 }
5328
5329                 /*
5330                  * Check if we raced (post lookup) against the last unlink of a file.
5331                  */
5332                 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5333                         source.sb.st_nlink = 1;
5334                 }
5335         }
5336         if ((error = copyout(sbp, ub, my_size)) != 0)
5337                 goto out;
5338
5339         /* caller wants extended security information? */
5340         if (xsecurity != USER_ADDR_NULL) {
5341
5342                 /* did we get any? */
5343                 if (fsec == KAUTH_FILESEC_NONE) {
5344                         if (susize(xsecurity_size, 0) != 0) {
5345                                 error = EFAULT;
5346                                 goto out;
5347                         }
5348                 } else {
5349                         /* find the user buffer size */
5350                         xsecurity_bufsize = fusize(xsecurity_size);
5351
5352                         /* copy out the actual data size */
5353                         if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5354                                 error = EFAULT;
5355                                 goto out;
5356                         }
5357
5358                         /* if the caller supplied enough room, copy out to it */
5359                         if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5360                                 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5361                 }
5362         }
5363 out:
5364         if (fsec != KAUTH_FILESEC_NONE)
5365                 kauth_filesec_free(fsec);
5366         return (error);
5367 }
5368
5369 /*
5370  * stat_extended: Get file status; with extended security (ACL).
5371  *
5372  * Parameters:    p                       (ignored)
5373  *                uap                     User argument descriptor (see below)
5374  *                retval                  (ignored)
5375  *
5376  * Indirect:      uap->path               Path of file to get status from
5377  *                uap->ub                 User buffer (holds file status info)
5378  *                uap->xsecurity          ACL to get (extended security)
5379  *                uap->xsecurity_size     Size of ACL
5380  *
5381  * Returns:        0                      Success
5382  *                !0                      errno value
5383  *
5384  */
5385 int
5386 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5387     __unused int32_t *retval)
5388 {
5389         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5390             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5391             0));
5392 }
5393
5394 /*
5395  * Returns:     0                       Success
5396  *      fstatat_internal:???            [see fstatat_internal() in this file]
5397  */
5398 int
5399 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5400 {
5401         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5402             0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5403 }
5404
5405 int
5406 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5407 {
5408         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5409             0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5410 }
5411
5412 /*
5413  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5414  *
5415  * Parameters:    p                       (ignored)
5416  *                uap                     User argument descriptor (see below)
5417  *                retval                  (ignored)
5418  *
5419  * Indirect:      uap->path               Path of file to get status from
5420  *                uap->ub                 User buffer (holds file status info)
5421  *                uap->xsecurity          ACL to get (extended security)
5422  *                uap->xsecurity_size     Size of ACL
5423  *
5424  * Returns:        0                      Success
5425  *                !0                      errno value
5426  *
5427  */
5428 int
5429 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5430 {
5431         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5432             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5433             0));
5434 }
5435
5436 /*
5437  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5438  *
5439  * Parameters:    p                       (ignored)
5440  *                uap                     User argument descriptor (see below)
5441  *                retval                  (ignored)
5442  *
5443  * Indirect:      uap->path               Path of file to get status from
5444  *                uap->ub                 User buffer (holds file status info)
5445  *                uap->xsecurity          ACL to get (extended security)
5446  *                uap->xsecurity_size     Size of ACL
5447  *
5448  * Returns:        0                      Success
5449  *                !0                      errno value
5450  *
5451  */
5452 int
5453 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5454 {
5455         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5456             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5457             AT_SYMLINK_NOFOLLOW));
5458 }
5459
5460 /*
5461  * Get file status; this version does not follow links.
5462  */
5463 int
5464 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5465 {
5466         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5467             0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5468 }
5469
5470 int
5471 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5472 {
5473         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5474             0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5475 }
5476
5477 /*
5478  * lstat64_extended: Get file status; can handle large inode numbers; does not
5479  * follow links; with extended security (ACL).
5480  *
5481  * Parameters:    p                       (ignored)
5482  *                uap                     User argument descriptor (see below)
5483  *                retval                  (ignored)
5484  *
5485  * Indirect:      uap->path               Path of file to get status from
5486  *                uap->ub                 User buffer (holds file status info)
5487  *                uap->xsecurity          ACL to get (extended security)
5488  *                uap->xsecurity_size     Size of ACL
5489  *
5490  * Returns:        0                      Success
5491  *                !0                      errno value
5492  *
5493  */
5494 int
5495 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5496 {
5497         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5498             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5499             AT_SYMLINK_NOFOLLOW));
5500 }
5501
5502 int
5503 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5504 {
5505         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5506                 return (EINVAL);
5507
5508         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5509             0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5510 }
5511
5512 int
5513 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5514     __unused int32_t *retval)
5515 {
5516         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5517                 return (EINVAL);
5518
5519         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5520             0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5521 }
5522
5523 /*
5524  * Get configurable pathname variables.
5525  *
5526  * Returns:     0                       Success
5527  *      namei:???
5528  *      vn_pathconf:???
5529  *
5530  * Notes:       Global implementation  constants are intended to be
5531  *              implemented in this function directly; all other constants
5532  *              are per-FS implementation, and therefore must be handled in
5533  *              each respective FS, instead.
5534  *
5535  * XXX We implement some things globally right now that should actually be
5536  * XXX per-FS; we will need to deal with this at some point.
5537  */
5538 /* ARGSUSED */
5539 int
5540 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5541 {
5542         int error;
5543         struct nameidata nd;
5544         vfs_context_t ctx = vfs_context_current();
5545
5546         NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5547                 UIO_USERSPACE, uap->path, ctx);
5548         error = namei(&nd);
5549         if (error)
5550                 return (error);
5551
5552         error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5553
5554         vnode_put(nd.ni_vp);
5555         nameidone(&nd);
5556         return (error);
5557 }
5558
5559 /*
5560  * Return target name of a symbolic link.
5561  */
5562 /* ARGSUSED */
5563 static int
5564 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5565     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5566     int *retval)
5567 {
5568         vnode_t vp;
5569         uio_t auio;
5570         int error;
5571         struct nameidata nd;
5572         char uio_buf[ UIO_SIZEOF(1) ];
5573
5574         NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5575             seg, path, ctx);
5576
5577         error = nameiat(&nd, fd);
5578         if (error)
5579                 return (error);
5580         vp = nd.ni_vp;
5581
5582         nameidone(&nd);
5583
5584         auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5585                                     &uio_buf[0], sizeof(uio_buf));
5586         uio_addiov(auio, buf, bufsize);
5587         if (vp->v_type != VLNK) {
5588                 error = EINVAL;
5589         } else {
5590 #if CONFIG_MACF
5591                 error = mac_vnode_check_readlink(ctx, vp);
5592 #endif
5593                 if (error == 0)
5594                         error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5595                                                 ctx);
5596                 if (error == 0)
5597                         error = VNOP_READLINK(vp, auio, ctx);
5598         }
5599         vnode_put(vp);
5600
5601         *retval = bufsize - (int)uio_resid(auio);
5602         return (error);
5603 }
5604
5605 int
5606 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5607 {
5608         enum uio_seg procseg;
5609
5610         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5611         return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5612             CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5613             uap->count, procseg, retval));
5614 }
5615
5616 int
5617 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5618 {
5619         enum uio_seg procseg;
5620
5621         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5622         return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5623             procseg, uap->buf, uap->bufsize, procseg, retval));
5624 }
5625
5626 /*
5627  * Change file flags.
5628  */
5629 static int
5630 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5631 {
5632         struct vnode_attr va;
5633         kauth_action_t action;
5634         int error;
5635
5636         VATTR_INIT(&va);
5637         VATTR_SET(&va, va_flags, flags);
5638
5639 #if CONFIG_MACF
5640         error = mac_vnode_check_setflags(ctx, vp, flags);
5641         if (error)
5642                 goto out;
5643 #endif
5644
5645         /* request authorisation, disregard immutability */
5646         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5647                 goto out;
5648         /*
5649          * Request that the auth layer disregard those file flags it's allowed to when
5650          * authorizing this operation; we need to do this in order to be able to
5651          * clear immutable flags.
5652          */
5653         if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5654                 goto out;
5655         error = vnode_setattr(vp, &va, ctx);
5656
5657         if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5658                 error = ENOTSUP;
5659         }
5660 out:
5661         vnode_put(vp);
5662         return(error);
5663 }
5664
5665 /*
5666  * Change flags of a file given a path name.
5667  */
5668 /* ARGSUSED */
5669 int
5670 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5671 {
5672         vnode_t vp;
5673         vfs_context_t ctx = vfs_context_current();
5674         int error;
5675         struct nameidata nd;
5676
5677         AUDIT_ARG(fflags, uap->flags);
5678         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5679                 UIO_USERSPACE, uap->path, ctx);
5680         error = namei(&nd);
5681         if (error)
5682                 return (error);
5683         vp = nd.ni_vp;
5684         nameidone(&nd);
5685
5686         error = chflags1(vp, uap->flags, ctx);
5687
5688         return(error);
5689 }
5690
5691 /*
5692  * Change flags of a file given a file descriptor.
5693  */
5694 /* ARGSUSED */
5695 int
5696 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5697 {
5698         vnode_t vp;
5699         int error;
5700
5701         AUDIT_ARG(fd, uap->fd);
5702         AUDIT_ARG(fflags, uap->flags);
5703         if ( (error = file_vnode(uap->fd, &vp)) )
5704                 return (error);
5705
5706         if ((error = vnode_getwithref(vp))) {
5707                 file_drop(uap->fd);
5708                 return(error);
5709         }
5710
5711         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5712
5713         error = chflags1(vp, uap->flags, vfs_context_current());
5714
5715         file_drop(uap->fd);
5716         return (error);
5717 }
5718
5719 /*
5720  * Change security information on a filesystem object.
5721  *
5722  * Returns:     0                       Success
5723  *              EPERM                   Operation not permitted
5724  *              vnode_authattr:???      [anything vnode_authattr can return]
5725  *              vnode_authorize:???     [anything vnode_authorize can return]
5726  *              vnode_setattr:???       [anything vnode_setattr can return]
5727  *
5728  * Notes:       If vnode_authattr or vnode_authorize return EACCES, it will be
5729  *              translated to EPERM before being returned.
5730  */
5731 static int
5732 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5733 {
5734         kauth_action_t action;
5735         int error;
5736
5737         AUDIT_ARG(mode, vap->va_mode);
5738         /* XXX audit new args */
5739
5740 #if NAMEDSTREAMS
5741         /* chmod calls are not allowed for resource forks. */
5742         if (vp->v_flag & VISNAMEDSTREAM) {
5743                 return (EPERM);
5744         }
5745 #endif
5746
5747 #if CONFIG_MACF
5748         if (VATTR_IS_ACTIVE(vap, va_mode) &&
5749             (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5750                 return (error);
5751 #endif
5752
5753         /* make sure that the caller is allowed to set this security information */
5754         if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5755             ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5756                 if (error == EACCES)
5757                         error = EPERM;
5758                 return(error);
5759         }
5760
5761         error = vnode_setattr(vp, vap, ctx);
5762
5763         return (error);
5764 }
5765
5766
5767 /*
5768  * Change mode of a file given a path name.
5769  *
5770  * Returns:     0                       Success
5771  *              namei:???               [anything namei can return]
5772  *              chmod_vnode:???         [anything chmod_vnode can return]
5773  */
5774 static int
5775 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5776     int fd, int flag, enum uio_seg segflg)
5777 {
5778         struct nameidata nd;
5779         int follow, error;
5780
5781         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5782         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5783             segflg, path, ctx);
5784         if ((error = nameiat(&nd, fd)))
5785                 return (error);
5786         error = chmod_vnode(ctx, nd.ni_vp, vap);
5787         vnode_put(nd.ni_vp);
5788         nameidone(&nd);
5789         return(error);
5790 }
5791
5792 /*
5793  * chmod_extended: Change the mode of a file given a path name; with extended
5794  * argument list (including extended security (ACL)).
5795  *
5796  * Parameters:  p                       Process requesting the open
5797  *              uap                     User argument descriptor (see below)
5798  *              retval                  (ignored)
5799  *
5800  * Indirect:    uap->path               Path to object (same as 'chmod')
5801  *              uap->uid                UID to set
5802  *              uap->gid                GID to set
5803  *              uap->mode               File mode to set (same as 'chmod')
5804  *              uap->xsecurity          ACL to set (or delete)
5805  *
5806  * Returns:     0                       Success
5807  *              !0                      errno value
5808  *
5809  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
5810  *
5811  * XXX:         We should enummerate the possible errno values here, and where
5812  *              in the code they originated.
5813  */
5814 int
5815 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
5816 {
5817         int error;
5818         struct vnode_attr va;
5819         kauth_filesec_t xsecdst;
5820
5821         AUDIT_ARG(owner, uap->uid, uap->gid);
5822
5823         VATTR_INIT(&va);
5824         if (uap->mode != -1)
5825                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5826         if (uap->uid != KAUTH_UID_NONE)
5827                 VATTR_SET(&va, va_uid, uap->uid);
5828         if (uap->gid != KAUTH_GID_NONE)
5829                 VATTR_SET(&va, va_gid, uap->gid);
5830
5831         xsecdst = NULL;
5832         switch(uap->xsecurity) {
5833                 /* explicit remove request */
5834         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
5835                 VATTR_SET(&va, va_acl, NULL);
5836                 break;
5837                 /* not being set */
5838         case USER_ADDR_NULL:
5839                 break;
5840         default:
5841                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5842                         return(error);
5843                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5844                 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
5845         }
5846
5847         error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
5848             UIO_USERSPACE);
5849
5850         if (xsecdst != NULL)
5851                 kauth_filesec_free(xsecdst);
5852         return(error);
5853 }
5854
5855 /*
5856  * Returns:     0                       Success
5857  *              chmodat:???             [anything chmodat can return]
5858  */
5859 static int
5860 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
5861     int flag, enum uio_seg segflg)
5862 {
5863         struct vnode_attr va;
5864
5865         VATTR_INIT(&va);
5866         VATTR_SET(&va, va_mode, mode & ALLPERMS);
5867
5868         return (chmodat(ctx, path, &va, fd, flag, segflg));
5869 }
5870
5871 int
5872 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
5873 {
5874         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5875             AT_FDCWD, 0, UIO_USERSPACE));
5876 }
5877
5878 int
5879 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
5880 {
5881         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5882                 return (EINVAL);
5883
5884         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5885             uap->fd, uap->flag, UIO_USERSPACE));
5886 }
5887
5888 /*
5889  * Change mode of a file given a file descriptor.
5890  */
5891 static int
5892 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
5893 {
5894         vnode_t vp;
5895         int error;
5896
5897         AUDIT_ARG(fd, fd);
5898
5899         if ((error = file_vnode(fd, &vp)) != 0)
5900                 return (error);
5901         if ((error = vnode_getwithref(vp)) != 0) {
5902                 file_drop(fd);
5903                 return(error);
5904         }
5905         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5906
5907         error = chmod_vnode(vfs_context_current(), vp, vap);
5908         (void)vnode_put(vp);
5909         file_drop(fd);
5910
5911         return (error);
5912 }
5913
5914 /*
5915  * fchmod_extended: Change mode of a file given a file descriptor; with
5916  * extended argument list (including extended security (ACL)).
5917  *
5918  * Parameters:    p                       Process requesting to change file mode
5919  *                uap                     User argument descriptor (see below)
5920  *                retval                  (ignored)
5921  *
5922  * Indirect:      uap->mode               File mode to set (same as 'chmod')
5923  *                uap->uid                UID to set
5924  *                uap->gid                GID to set
5925  *                uap->xsecurity          ACL to set (or delete)
5926  *                uap->fd                 File descriptor of file to change mode
5927  *
5928  * Returns:        0                      Success
5929  *                !0                      errno value
5930  *
5931  */
5932 int
5933 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
5934 {
5935         int error;
5936         struct vnode_attr va;
5937         kauth_filesec_t xsecdst;
5938
5939         AUDIT_ARG(owner, uap->uid, uap->gid);
5940
5941         VATTR_INIT(&va);
5942         if (uap->mode != -1)
5943                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5944         if (uap->uid != KAUTH_UID_NONE)
5945                 VATTR_SET(&va, va_uid, uap->uid);
5946         if (uap->gid != KAUTH_GID_NONE)
5947                 VATTR_SET(&va, va_gid, uap->gid);
5948
5949         xsecdst = NULL;
5950         switch(uap->xsecurity) {
5951         case USER_ADDR_NULL:
5952                 VATTR_SET(&va, va_acl, NULL);
5953                 break;
5954         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
5955                 VATTR_SET(&va, va_acl, NULL);
5956                 break;
5957                 /* not being set */
5958         case CAST_USER_ADDR_T(-1):
5959                 break;
5960         default:
5961                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5962                         return(error);
5963                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5964         }
5965
5966         error = fchmod1(p, uap->fd, &va);
5967
5968
5969         switch(uap->xsecurity) {
5970         case USER_ADDR_NULL:
5971         case CAST_USER_ADDR_T(-1):
5972                 break;
5973         default:
5974                 if (xsecdst != NULL)
5975                         kauth_filesec_free(xsecdst);
5976         }
5977         return(error);
5978 }
5979
5980 int
5981 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
5982 {
5983         struct vnode_attr va;
5984
5985         VATTR_INIT(&va);
5986         VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5987
5988         return(fchmod1(p, uap->fd, &va));
5989 }
5990
5991
5992 /*
5993  * Set ownership given a path name.
5994  */
5995 /* ARGSUSED */
5996 static int
5997 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
5998    gid_t gid, int flag, enum uio_seg segflg)
5999 {
6000         vnode_t vp;
6001         struct vnode_attr va;
6002         int error;
6003         struct nameidata nd;
6004         int follow;
6005         kauth_action_t action;
6006
6007         AUDIT_ARG(owner, uid, gid);
6008
6009         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6010         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6011             path, ctx);
6012         error = nameiat(&nd, fd);
6013         if (error)
6014                 return (error);
6015         vp = nd.ni_vp;
6016
6017         nameidone(&nd);
6018
6019         VATTR_INIT(&va);
6020         if (uid != (uid_t)VNOVAL)
6021                 VATTR_SET(&va, va_uid, uid);
6022         if (gid != (gid_t)VNOVAL)
6023                 VATTR_SET(&va, va_gid, gid);
6024
6025 #if CONFIG_MACF
6026         error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6027         if (error)
6028                 goto out;
6029 #endif
6030
6031         /* preflight and authorize attribute changes */
6032         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6033                 goto out;
6034         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6035                 goto out;
6036         error = vnode_setattr(vp, &va, ctx);
6037
6038 out:
6039         /*
6040          * EACCES is only allowed from namei(); permissions failure should
6041          * return EPERM, so we need to translate the error code.
6042          */
6043         if (error == EACCES)
6044                 error = EPERM;
6045
6046         vnode_put(vp);
6047         return (error);
6048 }
6049
6050 int
6051 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6052 {
6053         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6054             uap->uid, uap->gid, 0, UIO_USERSPACE));
6055 }
6056
6057 int
6058 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6059 {
6060         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6061             uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6062 }
6063
6064 int
6065 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6066 {
6067         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6068                 return (EINVAL);
6069
6070         return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6071             uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6072 }
6073
6074 /*
6075  * Set ownership given a file descriptor.
6076  */
6077 /* ARGSUSED */
6078 int
6079 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6080 {
6081         struct vnode_attr va;
6082         vfs_context_t ctx = vfs_context_current();
6083         vnode_t vp;
6084         int error;
6085         kauth_action_t action;
6086
6087         AUDIT_ARG(owner, uap->uid, uap->gid);
6088         AUDIT_ARG(fd, uap->fd);
6089
6090         if ( (error = file_vnode(uap->fd, &vp)) )
6091                 return (error);
6092
6093         if ( (error = vnode_getwithref(vp)) ) {
6094                 file_drop(uap->fd);
6095                 return(error);
6096         }
6097         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6098
6099         VATTR_INIT(&va);
6100         if (uap->uid != VNOVAL)
6101                 VATTR_SET(&va, va_uid, uap->uid);
6102         if (uap->gid != VNOVAL)
6103                 VATTR_SET(&va, va_gid, uap->gid);
6104
6105 #if NAMEDSTREAMS
6106         /* chown calls are not allowed for resource forks. */
6107         if (vp->v_flag & VISNAMEDSTREAM) {
6108                 error = EPERM;
6109                 goto out;
6110         }
6111 #endif
6112
6113 #if CONFIG_MACF
6114         error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6115         if (error)
6116                 goto out;
6117 #endif
6118
6119         /* preflight and authorize attribute changes */
6120         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6121                 goto out;
6122         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6123                 if (error == EACCES)
6124                         error = EPERM;
6125                 goto out;
6126         }
6127         error = vnode_setattr(vp, &va, ctx);
6128
6129 out:
6130         (void)vnode_put(vp);
6131         file_drop(uap->fd);
6132         return (error);
6133 }
6134
6135 static int
6136 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6137 {
6138         int error;
6139
6140         if (usrtvp == USER_ADDR_NULL) {
6141                 struct timeval old_tv;
6142                 /* XXX Y2038 bug because of microtime argument */
6143                 microtime(&old_tv);
6144                 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6145                 tsp[1] = tsp[0];
6146         } else {
6147                 if (IS_64BIT_PROCESS(current_proc())) {
6148                         struct user64_timeval tv[2];
6149                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6150                         if (error)
6151                                 return (error);
6152                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6153                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6154                 } else {
6155                         struct user32_timeval tv[2];
6156                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6157                         if (error)
6158                                 return (error);
6159                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6160                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6161                 }
6162         }
6163         return 0;
6164 }
6165
6166 static int
6167 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6168         int nullflag)
6169 {
6170         int error;
6171         struct vnode_attr va;
6172         kauth_action_t action;
6173
6174         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6175
6176         VATTR_INIT(&va);
6177         VATTR_SET(&va, va_access_time, ts[0]);
6178         VATTR_SET(&va, va_modify_time, ts[1]);
6179         if (nullflag)
6180                 va.va_vaflags |= VA_UTIMES_NULL;
6181
6182 #if NAMEDSTREAMS
6183         /* utimes calls are not allowed for resource forks. */
6184         if (vp->v_flag & VISNAMEDSTREAM) {
6185                 error = EPERM;
6186                 goto out;
6187         }
6188 #endif
6189
6190 #if CONFIG_MACF
6191         error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6192         if (error)
6193                 goto out;
6194 #endif
6195         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6196                 if (!nullflag && error == EACCES)
6197                         error = EPERM;
6198                 goto out;
6199         }
6200
6201         /* since we may not need to auth anything, check here */
6202         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6203                 if (!nullflag && error == EACCES)
6204                         error = EPERM;
6205                 goto out;
6206         }
6207         error = vnode_setattr(vp, &va, ctx);
6208
6209 out:
6210         return error;
6211 }
6212
6213 /*
6214  * Set the access and modification times of a file.
6215  */
6216 /* ARGSUSED */
6217 int
6218 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6219 {
6220         struct timespec ts[2];
6221         user_addr_t usrtvp;
6222         int error;
6223         struct nameidata nd;
6224         vfs_context_t ctx = vfs_context_current();
6225
6226         /*
6227          * AUDIT: Needed to change the order of operations to do the
6228          * name lookup first because auditing wants the path.
6229          */
6230         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6231                 UIO_USERSPACE, uap->path, ctx);
6232         error = namei(&nd);
6233         if (error)
6234                 return (error);
6235         nameidone(&nd);
6236
6237         /*
6238          * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
6239          * the current time instead.
6240          */
6241         usrtvp = uap->tptr;
6242         if ((error = getutimes(usrtvp, ts)) != 0)
6243                 goto out;
6244
6245         error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6246
6247 out:
6248         vnode_put(nd.ni_vp);
6249         return (error);
6250 }
6251
6252 /*
6253  * Set the access and modification times of a file.
6254  */
6255 /* ARGSUSED */
6256 int
6257 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6258 {
6259         struct timespec ts[2];
6260         vnode_t vp;
6261         user_addr_t usrtvp;
6262         int error;
6263
6264         AUDIT_ARG(fd, uap->fd);
6265         usrtvp = uap->tptr;
6266         if ((error = getutimes(usrtvp, ts)) != 0)
6267                 return (error);
6268         if ((error = file_vnode(uap->fd, &vp)) != 0)
6269                 return (error);
6270         if((error = vnode_getwithref(vp))) {
6271                 file_drop(uap->fd);
6272                 return(error);
6273         }
6274
6275         error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6276         vnode_put(vp);
6277         file_drop(uap->fd);
6278         return(error);
6279 }
6280
6281 /*
6282  * Truncate a file given its path name.
6283  */
6284 /* ARGSUSED */
6285 int
6286 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6287 {
6288         vnode_t vp;
6289         struct vnode_attr va;
6290         vfs_context_t ctx = vfs_context_current();
6291         int error;
6292         struct nameidata nd;
6293         kauth_action_t action;
6294
6295         if (uap->length < 0)
6296                 return(EINVAL);
6297         NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6298                 UIO_USERSPACE, uap->path, ctx);
6299         if ((error = namei(&nd)))
6300                 return (error);
6301         vp = nd.ni_vp;
6302
6303         nameidone(&nd);
6304
6305         VATTR_INIT(&va);
6306         VATTR_SET(&va, va_data_size, uap->length);
6307
6308 #if CONFIG_MACF
6309         error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6310         if (error)
6311                 goto out;
6312 #endif
6313
6314         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6315                 goto out;
6316         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6317                 goto out;
6318         error = vnode_setattr(vp, &va, ctx);
6319 out:
6320         vnode_put(vp);
6321         return (error);
6322 }
6323
6324 /*
6325  * Truncate a file given a file descriptor.
6326  */
6327 /* ARGSUSED */
6328 int
6329 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6330 {
6331         vfs_context_t ctx = vfs_context_current();
6332         struct vnode_attr va;
6333         vnode_t vp;
6334         struct fileproc *fp;
6335         int error ;
6336         int fd = uap->fd;
6337
6338         AUDIT_ARG(fd, uap->fd);
6339         if (uap->length < 0)
6340                 return(EINVAL);
6341
6342         if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6343                 return(error);
6344         }
6345
6346         switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6347         case DTYPE_PSXSHM:
6348                 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6349                 goto out;
6350         case DTYPE_VNODE:
6351                 break;
6352         default:
6353                 error = EINVAL;
6354                 goto out;
6355         }
6356
6357         vp = (vnode_t)fp->f_fglob->fg_data;
6358
6359         if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6360                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6361                 error = EINVAL;
6362                 goto out;
6363         }
6364
6365         if ((error = vnode_getwithref(vp)) != 0) {
6366                 goto out;
6367         }
6368
6369         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6370
6371 #if CONFIG_MACF
6372         error = mac_vnode_check_truncate(ctx,
6373             fp->f_fglob->fg_cred, vp);
6374         if (error) {
6375                 (void)vnode_put(vp);
6376                 goto out;
6377         }
6378 #endif
6379         VATTR_INIT(&va);
6380         VATTR_SET(&va, va_data_size, uap->length);
6381         error = vnode_setattr(vp, &va, ctx);
6382         (void)vnode_put(vp);
6383 out:
6384         file_drop(fd);
6385         return (error);
6386 }
6387
6388
6389 /*
6390  * Sync an open file with synchronized I/O _file_ integrity completion
6391  */
6392 /* ARGSUSED */
6393 int
6394 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6395 {
6396         __pthread_testcancel(1);
6397         return(fsync_common(p, uap, MNT_WAIT));
6398 }
6399
6400
6401 /*
6402  * Sync an open file with synchronized I/O _file_ integrity completion
6403  *
6404  * Notes:       This is a legacy support function that does not test for
6405  *              thread cancellation points.
6406  */
6407 /* ARGSUSED */
6408 int
6409 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6410 {
6411         return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6412 }
6413
6414
6415 /*
6416  * Sync an open file with synchronized I/O _data_ integrity completion
6417  */
6418 /* ARGSUSED */
6419 int
6420 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6421 {
6422         __pthread_testcancel(1);
6423         return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6424 }
6425
6426
6427 /*
6428  * fsync_common
6429  *
6430  * Common fsync code to support both synchronized I/O file integrity completion
6431  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6432  *
6433  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6434  * will only guarantee that the file data contents are retrievable.  If
6435  * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
6436  * includes additional metadata unnecessary for retrieving the file data
6437  * contents, such as atime, mtime, ctime, etc., also be committed to stable
6438  * storage.
6439  *
6440  * Parameters:  p                               The process
6441  *              uap->fd                         The descriptor to synchronize
6442  *              flags                           The data integrity flags
6443  *
6444  * Returns:     int                             Success
6445  *      fp_getfvp:EBADF                         Bad file descriptor
6446  *      fp_getfvp:ENOTSUP                       fd does not refer to a vnode
6447  *      VNOP_FSYNC:???                          unspecified
6448  *
6449  * Notes:       We use struct fsync_args because it is a short name, and all
6450  *              caller argument structures are otherwise identical.
6451  */
6452 static int
6453 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6454 {
6455         vnode_t vp;
6456         struct fileproc *fp;
6457         vfs_context_t ctx = vfs_context_current();
6458         int error;
6459
6460         AUDIT_ARG(fd, uap->fd);
6461
6462         if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6463                 return (error);
6464         if ( (error = vnode_getwithref(vp)) ) {
6465                 file_drop(uap->fd);
6466                 return(error);
6467         }
6468
6469         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6470
6471         error = VNOP_FSYNC(vp, flags, ctx);
6472
6473 #if NAMEDRSRCFORK
6474         /* Sync resource fork shadow file if necessary. */
6475         if ((error == 0) &&
6476             (vp->v_flag & VISNAMEDSTREAM) &&
6477             (vp->v_parent != NULLVP) &&
6478             vnode_isshadow(vp) &&
6479             (fp->f_flags & FP_WRITTEN)) {
6480                 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6481         }
6482 #endif
6483
6484         (void)vnode_put(vp);
6485         file_drop(uap->fd);
6486         return (error);
6487 }
6488
6489 /*
6490  * Duplicate files.  Source must be a file, target must be a file or
6491  * must not exist.
6492  *
6493  * XXX Copyfile authorisation checking is woefully inadequate, and will not
6494  *     perform inheritance correctly.
6495  */
6496 /* ARGSUSED */
6497 int
6498 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6499 {
6500         vnode_t tvp, fvp, tdvp, sdvp;
6501         struct nameidata fromnd, tond;
6502         int error;
6503         vfs_context_t ctx = vfs_context_current();
6504
6505         /* Check that the flags are valid. */
6506
6507         if (uap->flags & ~CPF_MASK) {
6508                 return(EINVAL);
6509         }
6510
6511         NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6512                 UIO_USERSPACE, uap->from, ctx);
6513         if ((error = namei(&fromnd)))
6514                 return (error);
6515         fvp = fromnd.ni_vp;
6516
6517         NDINIT(&tond, CREATE, OP_LINK,
6518                LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6519                UIO_USERSPACE, uap->to, ctx);
6520         if ((error = namei(&tond))) {
6521                 goto out1;
6522         }
6523         tdvp = tond.ni_dvp;
6524         tvp = tond.ni_vp;
6525
6526         if (tvp != NULL) {
6527                 if (!(uap->flags & CPF_OVERWRITE)) {
6528                         error = EEXIST;
6529                         goto out;
6530                 }
6531         }
6532         if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6533                 error = EISDIR;
6534                 goto out;
6535         }
6536
6537         if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6538                 goto out;
6539
6540         if (fvp == tdvp)
6541                 error = EINVAL;
6542         /*
6543          * If source is the same as the destination (that is the
6544          * same inode number) then there is nothing to do.
6545          * (fixed to have POSIX semantics - CSM 3/2/98)
6546          */
6547         if (fvp == tvp)
6548                 error = -1;
6549         if (!error)
6550                 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6551 out:
6552         sdvp = tond.ni_startdir;
6553         /*
6554          * nameidone has to happen before we vnode_put(tdvp)
6555          * since it may need to release the fs_nodelock on the tdvp
6556          */
6557         nameidone(&tond);
6558
6559         if (tvp)
6560                 vnode_put(tvp);
6561         vnode_put(tdvp);
6562         vnode_put(sdvp);
6563 out1:
6564         vnode_put(fvp);
6565
6566         nameidone(&fromnd);
6567
6568         if (error == -1)
6569                 return (0);
6570         return (error);
6571 }
6572
6573
6574 /*
6575  * Rename files.  Source and destination must either both be directories,
6576  * or both not be directories.  If target is a directory, it must be empty.
6577  */
6578 /* ARGSUSED */
6579 static int
6580 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
6581     int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
6582 {
6583         vnode_t tvp, tdvp;
6584         vnode_t fvp, fdvp;
6585         struct nameidata *fromnd, *tond;
6586         int error;
6587         int do_retry;
6588         int retry_count;
6589         int mntrename;
6590         int need_event;
6591         const char *oname = NULL;
6592         char *from_name = NULL, *to_name = NULL;
6593         int from_len=0, to_len=0;
6594         int holding_mntlock;
6595         mount_t locked_mp = NULL;
6596         vnode_t oparent = NULLVP;
6597 #if CONFIG_FSE
6598         fse_info from_finfo, to_finfo;
6599 #endif
6600         int from_truncated=0, to_truncated;
6601         int batched = 0;
6602         struct vnode_attr *fvap, *tvap;
6603         int continuing = 0;
6604         /* carving out a chunk for structs that are too big to be on stack. */
6605         struct {
6606                 struct nameidata from_node, to_node;
6607                 struct vnode_attr fv_attr, tv_attr;
6608         } * __rename_data;
6609         MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
6610         fromnd = &__rename_data->from_node;
6611         tond = &__rename_data->to_node;
6612
6613         holding_mntlock = 0;
6614         do_retry = 0;
6615         retry_count = 0;
6616 retry:
6617         fvp = tvp = NULL;
6618         fdvp = tdvp = NULL;
6619         fvap = tvap = NULL;
6620         mntrename = FALSE;
6621
6622         NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
6623             segflg, from, ctx);
6624         fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
6625
6626         NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6627             segflg, to, ctx);
6628         tond->ni_flag = NAMEI_COMPOUNDRENAME;
6629
6630 continue_lookup:
6631         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6632                 if ( (error = nameiat(fromnd, fromfd)) )
6633                         goto out1;
6634                 fdvp = fromnd->ni_dvp;
6635                 fvp  = fromnd->ni_vp;
6636
6637                 if (fvp && fvp->v_type == VDIR)
6638                         tond->ni_cnd.cn_flags |= WILLBEDIR;
6639         }
6640
6641         if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6642                 if ( (error = nameiat(tond, tofd)) ) {
6643                         /*
6644                          * Translate error code for rename("dir1", "dir2/.").
6645                          */
6646                         if (error == EISDIR && fvp->v_type == VDIR)
6647                                 error = EINVAL;
6648                         goto out1;
6649                 }
6650                 tdvp = tond->ni_dvp;
6651                 tvp  = tond->ni_vp;
6652         }
6653
6654         batched = vnode_compound_rename_available(fdvp);
6655         if (!fvp) {
6656                 /*
6657                  * Claim: this check will never reject a valid rename.
6658                  * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
6659                  * Suppose fdvp and tdvp are not on the same mount.
6660                  * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
6661                  *      then you can't move it to within another dir on the same mountpoint.
6662                  * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
6663                  *
6664                  * If this check passes, then we are safe to pass these vnodes to the same FS.
6665                  */
6666                 if (fdvp->v_mount != tdvp->v_mount) {
6667                         error = EXDEV;
6668                         goto out1;
6669                 }
6670                 goto skipped_lookup;
6671         }
6672
6673         if (!batched) {
6674                 error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL);
6675                 if (error) {
6676                         if (error == ENOENT) {
6677                                 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
6678                                 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6679                                         /*
6680                                          * We encountered a race where after doing the namei, tvp stops
6681                                          * being valid. If so, simply re-drive the rename call from the
6682                                          * top.
6683                                          */
6684                                         do_retry = 1;
6685                                         retry_count += 1;
6686                                 }
6687                         }
6688                         goto out1;
6689                 }
6690         }
6691
6692         /*
6693          * If the source and destination are the same (i.e. they're
6694          * links to the same vnode) and the target file system is
6695          * case sensitive, then there is nothing to do.
6696          *
6697          * XXX Come back to this.
6698          */
6699         if (fvp == tvp) {
6700                 int pathconf_val;
6701
6702                 /*
6703                  * Note: if _PC_CASE_SENSITIVE selector isn't supported,
6704                  * then assume that this file system is case sensitive.
6705                  */
6706                 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
6707                     pathconf_val != 0) {
6708                         goto out1;
6709                 }
6710         }
6711
6712         /*
6713          * Allow the renaming of mount points.
6714          * - target must not exist
6715          * - target must reside in the same directory as source
6716          * - union mounts cannot be renamed
6717          * - "/" cannot be renamed
6718          *
6719          * XXX Handle this in VFS after a continued lookup (if we missed
6720          * in the cache to start off)
6721          */
6722         if ((fvp->v_flag & VROOT) &&
6723             (fvp->v_type == VDIR) &&
6724             (tvp == NULL)  &&
6725             (fvp->v_mountedhere == NULL)  &&
6726             (fdvp == tdvp)  &&
6727             ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
6728             (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
6729                 vnode_t coveredvp;
6730
6731                 /* switch fvp to the covered vnode */
6732                 coveredvp = fvp->v_mount->mnt_vnodecovered;
6733                 if ( (vnode_getwithref(coveredvp)) ) {
6734                         error = ENOENT;
6735                         goto out1;
6736                 }
6737                 vnode_put(fvp);
6738
6739                 fvp = coveredvp;
6740                 mntrename = TRUE;
6741         }
6742         /*
6743          * Check for cross-device rename.
6744          */
6745         if ((fvp->v_mount != tdvp->v_mount) ||
6746             (tvp && (fvp->v_mount != tvp->v_mount))) {
6747                 error = EXDEV;
6748                 goto out1;
6749         }
6750
6751         /*
6752          * If source is the same as the destination (that is the
6753          * same inode number) then there is nothing to do...
6754          * EXCEPT if the underlying file system supports case
6755          * insensitivity and is case preserving.  In this case
6756          * the file system needs to handle the special case of
6757          * getting the same vnode as target (fvp) and source (tvp).
6758          *
6759          * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
6760          * and _PC_CASE_PRESERVING can have this exception, and they need to
6761          * handle the special case of getting the same vnode as target and
6762          * source.  NOTE: Then the target is unlocked going into vnop_rename,
6763          * so not to cause locking problems. There is a single reference on tvp.
6764          *
6765          * NOTE - that fvp == tvp also occurs if they are hard linked and
6766          * that correct behaviour then is just to return success without doing
6767          * anything.
6768          *
6769          * XXX filesystem should take care of this itself, perhaps...
6770          */
6771         if (fvp == tvp && fdvp == tdvp) {
6772                 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
6773                     !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
6774                           fromnd->ni_cnd.cn_namelen)) {
6775                         goto out1;
6776                 }
6777         }
6778
6779         if (holding_mntlock && fvp->v_mount != locked_mp) {
6780                 /*
6781                  * we're holding a reference and lock
6782                  * on locked_mp, but it no longer matches
6783                  * what we want to do... so drop our hold
6784                  */
6785                 mount_unlock_renames(locked_mp);
6786                 mount_drop(locked_mp, 0);
6787                 holding_mntlock = 0;
6788         }
6789         if (tdvp != fdvp && fvp->v_type == VDIR) {
6790                 /*
6791                  * serialize renames that re-shape
6792                  * the tree... if holding_mntlock is
6793                  * set, then we're ready to go...
6794                  * otherwise we
6795                  * first need to drop the iocounts
6796                  * we picked up, second take the
6797                  * lock to serialize the access,
6798                  * then finally start the lookup
6799                  * process over with the lock held
6800                  */
6801                 if (!holding_mntlock) {
6802                         /*
6803                          * need to grab a reference on
6804                          * the mount point before we
6805                          * drop all the iocounts... once
6806                          * the iocounts are gone, the mount
6807                          * could follow
6808                          */
6809                         locked_mp = fvp->v_mount;
6810                         mount_ref(locked_mp, 0);
6811
6812                         /*
6813                          * nameidone has to happen before we vnode_put(tvp)
6814                          * since it may need to release the fs_nodelock on the tvp
6815                          */
6816                         nameidone(tond);
6817
6818                         if (tvp)
6819                                 vnode_put(tvp);
6820                         vnode_put(tdvp);
6821
6822                         /*
6823                          * nameidone has to happen before we vnode_put(fdvp)
6824                          * since it may need to release the fs_nodelock on the fvp
6825                          */
6826                         nameidone(fromnd);
6827
6828                         vnode_put(fvp);
6829                         vnode_put(fdvp);
6830
6831                         mount_lock_renames(locked_mp);
6832                         holding_mntlock = 1;
6833
6834                         goto retry;
6835                 }
6836         } else {
6837                 /*
6838                  * when we dropped the iocounts to take
6839                  * the lock, we allowed the identity of
6840                  * the various vnodes to change... if they did,
6841                  * we may no longer be dealing with a rename
6842                  * that reshapes the tree... once we're holding
6843                  * the iocounts, the vnodes can't change type
6844                  * so we're free to drop the lock at this point
6845                  * and continue on
6846                  */
6847                 if (holding_mntlock) {
6848                         mount_unlock_renames(locked_mp);
6849                         mount_drop(locked_mp, 0);
6850                         holding_mntlock = 0;
6851                 }
6852         }
6853
6854         // save these off so we can later verify that fvp is the same
6855         oname   = fvp->v_name;
6856         oparent = fvp->v_parent;
6857
6858 skipped_lookup:
6859 #if CONFIG_FSE
6860         need_event = need_fsevent(FSE_RENAME, fdvp);
6861         if (need_event) {
6862                 if (fvp) {
6863                         get_fse_info(fvp, &from_finfo, ctx);
6864                 } else {
6865                         error = vfs_get_notify_attributes(&__rename_data->fv_attr);
6866                         if (error) {
6867                                 goto out1;
6868                         }
6869
6870                         fvap = &__rename_data->fv_attr;
6871                 }
6872
6873                 if (tvp) {
6874                         get_fse_info(tvp, &to_finfo, ctx);
6875                 } else if (batched) {
6876                         error = vfs_get_notify_attributes(&__rename_data->tv_attr);
6877                         if (error) {
6878                                 goto out1;
6879                         }
6880
6881                         tvap = &__rename_data->tv_attr;
6882                 }
6883         }
6884 #else
6885         need_event = 0;
6886 #endif /* CONFIG_FSE */
6887
6888         if (need_event || kauth_authorize_fileop_has_listeners()) {
6889                 if (from_name == NULL) {
6890                         GET_PATH(from_name);
6891                         if (from_name == NULL) {
6892                                 error = ENOMEM;
6893                                 goto out1;
6894                         }
6895                 }
6896
6897                 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
6898
6899                 if (to_name == NULL) {
6900                         GET_PATH(to_name);
6901                         if (to_name == NULL) {
6902                                 error = ENOMEM;
6903                                 goto out1;
6904                         }
6905                 }
6906
6907                 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
6908         }
6909 #if CONFIG_SECLUDED_RENAME
6910         if (flags & VFS_SECLUDE_RENAME) {
6911                 fromnd->ni_cnd.cn_flags |=  CN_SECLUDE_RENAME;
6912         }
6913 #else
6914         #pragma unused(flags)
6915 #endif
6916         error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
6917                             tdvp, &tvp, &tond->ni_cnd, tvap,
6918                             0, ctx);
6919
6920         if (holding_mntlock) {
6921                 /*
6922                  * we can drop our serialization
6923                  * lock now
6924                  */
6925                 mount_unlock_renames(locked_mp);
6926                 mount_drop(locked_mp, 0);
6927                 holding_mntlock = 0;
6928         }
6929         if (error) {
6930                 if (error == EKEEPLOOKING) {
6931                         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6932                                 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6933                                         panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
6934                                 }
6935                         }
6936
6937                         fromnd->ni_vp = fvp;
6938                         tond->ni_vp = tvp;
6939
6940                         goto continue_lookup;
6941                 }
6942
6943                 /*
6944                  * We may encounter a race in the VNOP where the destination didn't
6945                  * exist when we did the namei, but it does by the time we go and
6946                  * try to create the entry. In this case, we should re-drive this rename
6947                  * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
6948                  * but other filesystems susceptible to this race could return it, too.
6949                  */
6950                 if (error == ERECYCLE) {
6951                         do_retry = 1;
6952                 }
6953
6954                 /*
6955                  * For compound VNOPs, the authorization callback may return
6956                  * ENOENT in case of racing hardlink lookups hitting the name
6957                  * cache, redrive the lookup.
6958                  */
6959                 if (batched && error == ENOENT) {
6960                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
6961                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6962                                 do_retry = 1;
6963                                 retry_count += 1;
6964                         }
6965                 }
6966
6967                 goto out1;
6968         }
6969
6970         /* call out to allow 3rd party notification of rename.
6971          * Ignore result of kauth_authorize_fileop call.
6972          */
6973         kauth_authorize_fileop(vfs_context_ucred(ctx),
6974                         KAUTH_FILEOP_RENAME,
6975                         (uintptr_t)from_name, (uintptr_t)to_name);
6976
6977 #if CONFIG_FSE
6978         if (from_name != NULL && to_name != NULL) {
6979                 if (from_truncated || to_truncated) {
6980                         // set it here since only the from_finfo gets reported up to user space
6981                         from_finfo.mode |= FSE_TRUNCATED_PATH;
6982                 }
6983
6984                 if (tvap && tvp) {
6985                         vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
6986                 }
6987                 if (fvap) {
6988                         vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
6989                 }
6990
6991                 if (tvp) {
6992                         add_fsevent(FSE_RENAME, ctx,
6993                                     FSE_ARG_STRING, from_len, from_name,
6994                                     FSE_ARG_FINFO, &from_finfo,
6995                                     FSE_ARG_STRING, to_len, to_name,
6996                                     FSE_ARG_FINFO, &to_finfo,
6997                                     FSE_ARG_DONE);
6998                 } else {
6999                         add_fsevent(FSE_RENAME, ctx,
7000                                     FSE_ARG_STRING, from_len, from_name,
7001                                     FSE_ARG_FINFO, &from_finfo,
7002                                     FSE_ARG_STRING, to_len, to_name,
7003                                     FSE_ARG_DONE);
7004                 }
7005         }
7006 #endif /* CONFIG_FSE */
7007
7008         /*
7009          * update filesystem's mount point data
7010          */
7011         if (mntrename) {
7012                 char *cp, *pathend, *mpname;
7013                 char * tobuf;
7014                 struct mount *mp;
7015                 int maxlen;
7016                 size_t len = 0;
7017
7018                 mp = fvp->v_mountedhere;
7019
7020                 if (vfs_busy(mp, LK_NOWAIT)) {
7021                         error = EBUSY;
7022                         goto out1;
7023                 }
7024                 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7025
7026                 if (UIO_SEG_IS_USER_SPACE(segflg))
7027                         error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7028                 else
7029                         error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7030                 if (!error) {
7031                         /* find current mount point prefix */
7032                         pathend = &mp->mnt_vfsstat.f_mntonname[0];
7033                         for (cp = pathend; *cp != '\0'; ++cp) {
7034                                 if (*cp == '/')
7035                                         pathend = cp + 1;
7036                         }
7037                         /* find last component of target name */
7038                         for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7039                                 if (*cp == '/')
7040                                         mpname = cp + 1;
7041                         }
7042                         /* append name to prefix */
7043                         maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7044                         bzero(pathend, maxlen);
7045                         strlcpy(pathend, mpname, maxlen);
7046                 }
7047                 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7048
7049                 vfs_unbusy(mp);
7050         }
7051         /*
7052          * fix up name & parent pointers.  note that we first
7053          * check that fvp has the same name/parent pointers it
7054          * had before the rename call... this is a 'weak' check
7055          * at best...
7056          *
7057          * XXX oparent and oname may not be set in the compound vnop case
7058          */
7059         if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7060                 int update_flags;
7061
7062                 update_flags = VNODE_UPDATE_NAME;
7063
7064                 if (fdvp != tdvp)
7065                         update_flags |= VNODE_UPDATE_PARENT;
7066
7067                 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7068         }
7069 out1:
7070         if (to_name != NULL) {
7071                 RELEASE_PATH(to_name);
7072                 to_name = NULL;
7073         }
7074         if (from_name != NULL) {
7075                 RELEASE_PATH(from_name);
7076                 from_name = NULL;
7077         }
7078         if (holding_mntlock) {
7079                 mount_unlock_renames(locked_mp);
7080                 mount_drop(locked_mp, 0);
7081                 holding_mntlock = 0;
7082         }
7083         if (tdvp) {
7084                 /*
7085                  * nameidone has to happen before we vnode_put(tdvp)
7086                  * since it may need to release the fs_nodelock on the tdvp
7087                  */
7088                 nameidone(tond);
7089
7090                 if (tvp)
7091                         vnode_put(tvp);
7092                 vnode_put(tdvp);
7093         }
7094         if (fdvp) {
7095                 /*
7096                  * nameidone has to happen before we vnode_put(fdvp)
7097                  * since it may need to release the fs_nodelock on the fdvp
7098                  */
7099                 nameidone(fromnd);
7100
7101                 if (fvp)
7102                         vnode_put(fvp);
7103                 vnode_put(fdvp);
7104         }
7105
7106         /*
7107          * If things changed after we did the namei, then we will re-drive
7108          * this rename call from the top.
7109          */
7110         if (do_retry) {
7111                 do_retry = 0;
7112                 goto retry;
7113         }
7114
7115         FREE(__rename_data, M_TEMP);
7116         return (error);
7117 }
7118
7119 int
7120 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7121 {
7122         return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7123             AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7124 }
7125
#if CONFIG_SECLUDED_RENAME
/*
 * rename_ext system call entry point (only built with CONFIG_SECLUDED_RENAME).
 *
 * Same as rename(), except that the caller-supplied uap->flags are passed
 * through to renameat_internal() instead of 0.
 */
int
rename_ext(__unused proc_t p, struct rename_ext_args *uap, __unused int32_t *retval)
{
        vfs_context_t ctx = vfs_context_current();

        return (renameat_internal(ctx, AT_FDCWD, uap->from, AT_FDCWD, uap->to,
            UIO_USERSPACE, uap->flags));
}
#endif /* CONFIG_SECLUDED_RENAME */
7136
7137 int
7138 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7139 {
7140         return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7141             uap->tofd, uap->to, UIO_USERSPACE, 0));
7142 }
7143
7144 /*
7145  * Make a directory file.
7146  *
7147  * Returns:     0                       Success
7148  *              EEXIST
7149  *      namei:???
7150  *      vnode_authorize:???
7151  *      vn_create:???
7152  */
7153 /* ARGSUSED */
7154 static int
7155 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7156     enum uio_seg segflg)
7157 {
7158         vnode_t vp, dvp;
7159         int error;
7160         int update_flags = 0;
7161         int batched;
7162         struct nameidata nd;
7163
7164         AUDIT_ARG(mode, vap->va_mode);
7165         NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7166                path, ctx);
7167         nd.ni_cnd.cn_flags |= WILLBEDIR;
7168         nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7169
7170 continue_lookup:
7171         error = nameiat(&nd, fd);
7172         if (error)
7173                 return (error);
7174         dvp = nd.ni_dvp;
7175         vp = nd.ni_vp;
7176
7177         if (vp != NULL) {
7178                 error = EEXIST;
7179                 goto out;
7180         }
7181
7182         batched = vnode_compound_mkdir_available(dvp);
7183
7184         VATTR_SET(vap, va_type, VDIR);
7185
7186         /*
7187          * XXX
7188          * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7189          * only get EXISTS or EISDIR for existing path components, and not that it could see
7190          * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7191          * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
7192          */
7193         if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7194                 if (error == EACCES || error == EPERM) {
7195                         int error2;
7196
7197                         nameidone(&nd);
7198                         vnode_put(dvp);
7199                         dvp = NULLVP;
7200
7201                         /*
7202                          * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7203                          * rather than EACCESS if the target exists.
7204                          */
7205                         NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7206                                         path, ctx);
7207                         error2 = nameiat(&nd, fd);
7208                         if (error2) {
7209                                 goto out;
7210                         } else {
7211                                 vp = nd.ni_vp;
7212                                 error = EEXIST;
7213                                 goto out;
7214                         }
7215                 }
7216
7217                 goto out;
7218         }
7219
7220         /*
7221          * make the directory
7222          */
7223         if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7224                 if (error == EKEEPLOOKING) {
7225                         nd.ni_vp = vp;
7226                         goto continue_lookup;
7227                 }
7228
7229                 goto out;
7230         }
7231
7232         // Make sure the name & parent pointers are hooked up
7233         if (vp->v_name == NULL)
7234                 update_flags |= VNODE_UPDATE_NAME;
7235         if (vp->v_parent == NULLVP)
7236                 update_flags |= VNODE_UPDATE_PARENT;
7237
7238         if (update_flags)
7239                 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
7240
7241 #if CONFIG_FSE
7242         add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
7243 #endif
7244
7245 out:
7246         /*
7247          * nameidone has to happen before we vnode_put(dvp)
7248          * since it may need to release the fs_nodelock on the dvp
7249          */
7250         nameidone(&nd);
7251
7252         if (vp)
7253                 vnode_put(vp);
7254         if (dvp)
7255                 vnode_put(dvp);
7256
7257         return (error);
7258 }
7259
7260 /*
7261  * mkdir_extended: Create a directory; with extended security (ACL).
7262  *
7263  * Parameters:    p                       Process requesting to create the directory
7264  *                uap                     User argument descriptor (see below)
7265  *                retval                  (ignored)
7266  *
7267  * Indirect:      uap->path               Path of directory to create
7268  *                uap->mode               Access permissions to set
7269  *                uap->xsecurity          ACL to set
7270  *
7271  * Returns:        0                      Success
7272  *                !0                      Not success
7273  *
7274  */
7275 int
7276 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7277 {
7278         int ciferror;
7279         kauth_filesec_t xsecdst;
7280         struct vnode_attr va;
7281
7282         AUDIT_ARG(owner, uap->uid, uap->gid);
7283
7284         xsecdst = NULL;
7285         if ((uap->xsecurity != USER_ADDR_NULL) &&
7286             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7287                 return ciferror;
7288
7289         VATTR_INIT(&va);
7290         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7291         if (xsecdst != NULL)
7292                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7293
7294         ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7295             UIO_USERSPACE);
7296         if (xsecdst != NULL)
7297                 kauth_filesec_free(xsecdst);
7298         return ciferror;
7299 }
7300
7301 int
7302 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7303 {
7304         struct vnode_attr va;
7305
7306         VATTR_INIT(&va);
7307         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7308
7309         return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7310             UIO_USERSPACE));
7311 }
7312
7313 int
7314 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7315 {
7316         struct vnode_attr va;
7317
7318         VATTR_INIT(&va);
7319         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7320
7321         return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7322             UIO_USERSPACE));
7323 }
7324
7325 static int
7326 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
7327     enum uio_seg segflg)
7328 {
7329         vnode_t vp, dvp;
7330         int error;
7331         struct nameidata nd;
7332         char     *path = NULL;
7333         int       len=0;
7334         int has_listeners = 0;
7335         int need_event = 0;
7336         int truncated = 0;
7337 #if CONFIG_FSE
7338         struct vnode_attr va;
7339 #endif /* CONFIG_FSE */
7340         struct vnode_attr *vap = NULL;
7341         int restart_count = 0;
7342         int batched;
7343
7344         int restart_flag;
7345
7346         /*
7347          * This loop exists to restart rmdir in the unlikely case that two
7348          * processes are simultaneously trying to remove the same directory
7349          * containing orphaned appleDouble files.
7350          */
7351         do {
7352                 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
7353                     segflg, dirpath, ctx);
7354                 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
7355 continue_lookup:
7356                 restart_flag = 0;
7357                 vap = NULL;
7358
7359                 error = nameiat(&nd, fd);
7360                 if (error)
7361                         return (error);
7362
7363                 dvp = nd.ni_dvp;
7364                 vp = nd.ni_vp;
7365
7366                 if (vp) {
7367                         batched = vnode_compound_rmdir_available(vp);
7368
7369                         if (vp->v_flag & VROOT) {
7370                                 /*
7371                                  * The root of a mounted filesystem cannot be deleted.
7372                                  */
7373                                 error = EBUSY;
7374                                 goto out;
7375                         }
7376
7377                         /*
7378                          * Removed a check here; we used to abort if vp's vid
7379                          * was not the same as what we'd seen the last time around.
7380                          * I do not think that check was valid, because if we retry
7381                          * and all dirents are gone, the directory could legitimately
7382                          * be recycled but still be present in a situation where we would
7383                          * have had permission to delete.  Therefore, we won't make
7384                          * an effort to preserve that check now that we may not have a
7385                          * vp here.
7386                          */
7387
7388                         if (!batched) {
7389                                 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
7390                                 if (error) {
7391                                         if (error == ENOENT) {
7392                                                 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7393                                                 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7394                                                         restart_flag = 1;
7395                                                         restart_count += 1;
7396                                                 }
7397                                         }
7398                                         goto out;
7399                                 }
7400                         }
7401                 } else {
7402                         batched = 1;
7403
7404                         if (!vnode_compound_rmdir_available(dvp)) {
7405                                 panic("No error, but no compound rmdir?");
7406                         }
7407                 }
7408
7409 #if CONFIG_FSE
7410                 fse_info  finfo;
7411
7412                 need_event = need_fsevent(FSE_DELETE, dvp);
7413                 if (need_event) {
7414                         if (!batched) {
7415                                 get_fse_info(vp, &finfo, ctx);
7416                         } else {
7417                                 error = vfs_get_notify_attributes(&va);
7418                                 if (error) {
7419                                         goto out;
7420                                 }
7421
7422                                 vap = &va;
7423                         }
7424                 }
7425 #endif
7426                 has_listeners = kauth_authorize_fileop_has_listeners();
7427                 if (need_event || has_listeners) {
7428                         if (path == NULL) {
7429                                 GET_PATH(path);
7430                                 if (path == NULL) {
7431                                         error = ENOMEM;
7432                                         goto out;
7433                                 }
7434                         }
7435
7436                         len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
7437 #if CONFIG_FSE
7438                         if (truncated) {
7439                                 finfo.mode |= FSE_TRUNCATED_PATH;
7440                         }
7441 #endif
7442                 }
7443
7444                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7445                 nd.ni_vp = vp;
7446                 if (vp == NULLVP) {
7447                         /* Couldn't find a vnode */
7448                         goto out;
7449                 }
7450
7451                 if (error == EKEEPLOOKING) {
7452                         goto continue_lookup;
7453                 } else if (batched && error == ENOENT) {
7454                         assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7455                         if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7456                                 /*
7457                                  * For compound VNOPs, the authorization callback
7458                                  * may return ENOENT in case of racing hard link lookups
7459                                  * redrive the lookup.
7460                                  */
7461                                 restart_flag = 1;
7462                                 restart_count += 1;
7463                                 goto out;
7464                         }
7465                 }
7466 #if CONFIG_APPLEDOUBLE
7467                 /*
7468                  * Special case to remove orphaned AppleDouble
7469                  * files. I don't like putting this in the kernel,
7470                  * but carbon does not like putting this in carbon either,
7471                  * so here we are.
7472                  */
7473                 if (error == ENOTEMPTY) {
7474                         error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
7475                         if (error == EBUSY) {
7476                                 goto out;
7477                         }
7478
7479
7480                         /*
7481                          * Assuming everything went well, we will try the RMDIR again
7482                          */
7483                         if (!error)
7484                                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7485                 }
7486 #endif /* CONFIG_APPLEDOUBLE */
7487                 /*
7488                  * Call out to allow 3rd party notification of delete.
7489                  * Ignore result of kauth_authorize_fileop call.
7490                  */
7491                 if (!error) {
7492                         if (has_listeners) {
7493                                 kauth_authorize_fileop(vfs_context_ucred(ctx),
7494                                                 KAUTH_FILEOP_DELETE,
7495                                                 (uintptr_t)vp,
7496                                                 (uintptr_t)path);
7497                         }
7498
7499                         if (vp->v_flag & VISHARDLINK) {
7500                                 // see the comment in unlink1() about why we update
7501                                 // the parent of a hard link when it is removed
7502                                 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
7503                         }
7504
7505 #if CONFIG_FSE
7506                         if (need_event) {
7507                                 if (vap) {
7508                                         vnode_get_fse_info_from_vap(vp, &finfo, vap);
7509                                 }
7510                                 add_fsevent(FSE_DELETE, ctx,
7511                                                 FSE_ARG_STRING, len, path,
7512                                                 FSE_ARG_FINFO, &finfo,
7513                                                 FSE_ARG_DONE);
7514                         }
7515 #endif
7516                 }
7517
7518 out:
7519                 if (path != NULL) {
7520                         RELEASE_PATH(path);
7521                         path = NULL;
7522                 }
7523                 /*
7524                  * nameidone has to happen before we vnode_put(dvp)
7525                  * since it may need to release the fs_nodelock on the dvp
7526                  */
7527                 nameidone(&nd);
7528                 vnode_put(dvp);
7529
7530                 if (vp)
7531                         vnode_put(vp);
7532
7533                 if (restart_flag == 0) {
7534                         wakeup_one((caddr_t)vp);
7535                         return (error);
7536                 }
7537                 tsleep(vp, PVFS, "rm AD", 1);
7538
7539         } while (restart_flag != 0);
7540
7541         return (error);
7542
7543 }
7544
7545 /*
7546  * Remove a directory file.
7547  */
7548 /* ARGSUSED */
7549 int
7550 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
7551 {
7552         return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
7553             CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
7554 }
7555
7556 /*
      * Get direntry length padded to 8 byte alignment.
      *
      * Takes sizeof(struct direntry), adds 'namlen' and subtracts
      * (MAXPATHLEN-1), then rounds the result up to a multiple of 8
      * (the "+ 7 ... & ~7").
      * NOTE(review): presumably struct direntry ends in a MAXPATHLEN-byte
      * d_name, so the result covers the fixed header plus namlen + 1 name
      * bytes -- confirm against the struct direntry declaration.
      */
7557 #define DIRENT64_LEN(namlen) \
7558         ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
7559
7560 errno_t
7561 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
7562                 int *numdirent, vfs_context_t ctxp)
7563 {
7564         /* Check if fs natively supports VNODE_READDIR_EXTENDED */
7565         if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
7566                    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0))  {
7567                 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
7568         } else {
7569                 size_t bufsize;
7570                 void * bufptr;
7571                 uio_t auio;
7572                 struct direntry *entry64;
7573                 struct dirent *dep;
7574                 int bytesread;
7575                 int error;
7576
7577                 /*
7578                  * Our kernel buffer needs to be smaller since re-packing
7579                  * will expand each dirent.  The worse case (when the name
7580                  * length is 3) corresponds to a struct direntry size of 32
7581                  * bytes (8-byte aligned) and a struct dirent size of 12 bytes
7582                  * (4-byte aligned).  So having a buffer that is 3/8 the size
7583                  * will prevent us from reading more than we can pack.
7584                  *
7585                  * Since this buffer is wired memory, we will limit the
7586                  * buffer size to a maximum of 32K. We would really like to
7587                  * use 32K in the MIN(), but we use magic number 87371 to
7588                  * prevent uio_resid() * 3 / 8 from overflowing.
7589                  */
7590                 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
7591                 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
7592                 if (bufptr == NULL) {
7593                         return ENOMEM;
7594                 }
7595
7596                 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
7597                 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
7598                 auio->uio_offset = uio->uio_offset;
7599
7600                 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
7601
7602                 dep = (struct dirent *)bufptr;
7603                 bytesread = bufsize - uio_resid(auio);
7604
7605                 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
7606                        M_TEMP, M_WAITOK);
7607                 /*
7608                  * Convert all the entries and copy them out to user's buffer.
7609                  */
7610                 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
7611                         size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
7612
7613                         bzero(entry64, enbufsize);
7614                         /* Convert a dirent to a dirent64. */
7615                         entry64->d_ino = dep->d_ino;
7616                         entry64->d_seekoff = 0;
7617                         entry64->d_reclen = enbufsize;
7618                         entry64->d_namlen = dep->d_namlen;
7619                         entry64->d_type = dep->d_type;
7620                         bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
7621
7622                         /* Move to next entry. */
7623                         dep = (struct dirent *)((char *)dep + dep->d_reclen);
7624
7625                         /* Copy entry64 to user's buffer. */
7626                         error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
7627                 }
7628
7629                 /* Update the real offset using the offset we got from VNOP_READDIR. */
7630                 if (error == 0) {
7631                         uio->uio_offset = auio->uio_offset;
7632                 }
7633                 uio_free(auio);
7634                 FREE(bufptr, M_TEMP);
7635                 FREE(entry64, M_TEMP);
7636                 return (error);
7637         }
7638 }
7639
7640 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
7641
7642 /*
7643  * Read a block of directory entries in a file system independent format.
7644  */
7645 static int
7646 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
7647                      off_t *offset, int flags)
7648 {
7649         vnode_t vp;
7650         struct vfs_context context = *vfs_context_current();    /* local copy */
7651         struct fileproc *fp;
7652         uio_t auio;
7653         int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7654         off_t loff;
7655         int error, eofflag, numdirent;
7656         char uio_buf[ UIO_SIZEOF(1) ];
7657
7658         error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
7659         if (error) {
7660                 return (error);
7661         }
7662         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7663                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7664                 error = EBADF;
7665                 goto out;
7666         }
7667
7668         if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
7669                 bufsize = GETDIRENTRIES_MAXBUFSIZE;
7670
7671 #if CONFIG_MACF
7672         error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
7673         if (error)
7674                 goto out;
7675 #endif
7676         if ( (error = vnode_getwithref(vp)) ) {
7677                 goto out;
7678         }
7679         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7680
7681 unionread:
7682         if (vp->v_type != VDIR) {
7683                 (void)vnode_put(vp);
7684                 error = EINVAL;
7685                 goto out;
7686         }
7687
7688 #if CONFIG_MACF
7689         error = mac_vnode_check_readdir(&context, vp);
7690         if (error != 0) {
7691                 (void)vnode_put(vp);
7692                 goto out;
7693         }
7694 #endif /* MAC */
7695
7696         loff = fp->f_fglob->fg_offset;
7697         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7698         uio_addiov(auio, bufp, bufsize);
7699
7700         if (flags & VNODE_READDIR_EXTENDED) {
7701                 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
7702                 fp->f_fglob->fg_offset = uio_offset(auio);
7703         } else {
7704                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
7705                 fp->f_fglob->fg_offset = uio_offset(auio);
7706         }
7707         if (error) {
7708                 (void)vnode_put(vp);
7709                 goto out;
7710         }
7711
7712         if ((user_ssize_t)bufsize == uio_resid(auio)){
7713                 if (union_dircheckp) {
7714                         error = union_dircheckp(&vp, fp, &context);
7715                         if (error == -1)
7716                                 goto unionread;
7717                         if (error)
7718                                 goto out;
7719                 }
7720
7721                 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
7722                         struct vnode *tvp = vp;
7723                         if (lookup_traverse_union(tvp, &vp, &context) == 0) {
7724                                 vnode_ref(vp);
7725                                 fp->f_fglob->fg_data = (caddr_t) vp;
7726                                 fp->f_fglob->fg_offset = 0;
7727                                 vnode_rele(tvp);
7728                                 vnode_put(tvp);
7729                                 goto unionread;
7730                         }
7731                         vp = tvp;
7732                 }
7733         }
7734
7735         vnode_put(vp);
7736         if (offset) {
7737                 *offset = loff;
7738         }
7739
7740         *bytesread = bufsize - uio_resid(auio);
7741 out:
7742         file_drop(fd);
7743         return (error);
7744 }
7745
7746
7747 int
7748 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
7749 {
7750         off_t offset;
7751         ssize_t bytesread;
7752         int error;
7753
7754         AUDIT_ARG(fd, uap->fd);
7755         error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
7756
7757         if (error == 0) {
7758                 if (proc_is64bit(p)) {
7759                         user64_long_t base = (user64_long_t)offset;
7760                         error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
7761                 } else {
7762                         user32_long_t base = (user32_long_t)offset;
7763                         error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
7764                 }
7765                 *retval = bytesread;
7766         }
7767         return (error);
7768 }
7769
7770 int
7771 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
7772 {
7773         off_t offset;
7774         ssize_t bytesread;
7775         int error;
7776
7777         AUDIT_ARG(fd, uap->fd);
7778         error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
7779
7780         if (error == 0) {
7781                 *retval = bytesread;
7782                 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
7783         }
7784         return (error);
7785 }
7786
7787
7788 /*
7789  * Set the mode mask for creation of filesystem nodes.
7790  * XXX implement xsecurity
7791  */
7792 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
7793 static int
7794 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
7795 {
7796         struct filedesc *fdp;
7797
7798         AUDIT_ARG(mask, newmask);
7799         proc_fdlock(p);
7800         fdp = p->p_fd;
7801         *retval = fdp->fd_cmask;
7802         fdp->fd_cmask = newmask & ALLPERMS;
7803         proc_fdunlock(p);
7804         return (0);
7805 }
7806
7807 /*
7808  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
7809  *
7810  * Parameters:    p                       Process requesting to set the umask
7811  *                uap                     User argument descriptor (see below)
7812  *                retval                  umask of the process (parameter p)
7813  *
7814  * Indirect:      uap->newmask            umask to set
7815  *                uap->xsecurity          ACL to set
7816  *
7817  * Returns:        0                      Success
7818  *                !0                      Not success
7819  *
7820  */
7821 int
7822 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
7823 {
7824         int ciferror;
7825         kauth_filesec_t xsecdst;
7826
7827         xsecdst = KAUTH_FILESEC_NONE;
7828         if (uap->xsecurity != USER_ADDR_NULL) {
7829                 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
7830                         return ciferror;
7831         } else {
7832                 xsecdst = KAUTH_FILESEC_NONE;
7833         }
7834
7835         ciferror = umask1(p, uap->newmask, xsecdst, retval);
7836
7837         if (xsecdst != KAUTH_FILESEC_NONE)
7838                 kauth_filesec_free(xsecdst);
7839         return ciferror;
7840 }
7841
7842 int
7843 umask(proc_t p, struct umask_args *uap, int32_t *retval)
7844 {
7845         return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
7846 }
7847
7848 /*
7849  * Void all references to file by ripping underlying filesystem
7850  * away from vnode.
7851  */
7852 /* ARGSUSED */
7853 int
7854 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
7855 {
7856         vnode_t vp;
7857         struct vnode_attr va;
7858         vfs_context_t ctx = vfs_context_current();
7859         int error;
7860         struct nameidata nd;
7861
7862         NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
7863                uap->path, ctx);
7864         error = namei(&nd);
7865         if (error)
7866                 return (error);
7867         vp = nd.ni_vp;
7868
7869         nameidone(&nd);
7870
7871         if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
7872                 error = ENOTSUP;
7873                 goto out;
7874         }
7875
7876         if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
7877                 error = EBUSY;
7878                 goto out;
7879         }
7880
7881 #if CONFIG_MACF
7882         error = mac_vnode_check_revoke(ctx, vp);
7883         if (error)
7884                 goto out;
7885 #endif
7886
7887         VATTR_INIT(&va);
7888         VATTR_WANTED(&va, va_uid);
7889         if ((error = vnode_getattr(vp, &va, ctx)))
7890                 goto out;
7891         if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
7892             (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
7893                 goto out;
7894         if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
7895                 VNOP_REVOKE(vp, REVOKEALL, ctx);
7896 out:
7897         vnode_put(vp);
7898         return (error);
7899 }
7900
7901
/*
 *  HFS/HFS PLUS SPECIFIC SYSTEM CALLS
 *  The following system calls are designed to support features
 *  which are specific to the HFS and HFS Plus volume formats.
 */
7907
7908
/*
 * getdirentriesattr: Obtain attribute information on objects in a directory
 * while enumerating the directory.
 *
 * Parameters:    p                       Calling process
 *                uap                     User argument descriptor (see below)
 *                retval                  Receives eofflag on success
 *
 * Indirect:      uap->fd                 Descriptor of the directory
 *                uap->alist              User attrlist selecting attributes
 *                uap->buffer             User buffer receiving the results
 *                uap->buffersize         Size of uap->buffer
 *                uap->count              In/out: copied in before the read
 *                                        and copied back out afterwards
 *                uap->basep              Out: descriptor offset at which the
 *                                        read started, held in a 32-bit local
 *                uap->newstate           Out: newstate value filled in by
 *                                        VNOP_READDIRATTR()
 *                uap->options            Only the low 32 bits are passed on
 *
 * Returns:       0                       Success
 *                EBADF                   Descriptor is not open for reading
 *                EINVAL                  Vnode is not a directory
 *                !0                      Error from copyin/copyout,
 *                                        fp_getfvp(), the MAC checks,
 *                                        vnode_getwithref(),
 *                                        vnode_authorize() or
 *                                        VNOP_READDIRATTR()
 */
/* ARGSUSED */
int
getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
        vnode_t vp;
        struct fileproc *fp;
        uio_t auio = NULL;
        int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
        uint32_t count, savecount;
        uint32_t newstate;
        int error, eofflag;
        uint32_t loff;
        struct attrlist attributelist;
        vfs_context_t ctx = vfs_context_current();
        int fd = uap->fd;
        char uio_buf[ UIO_SIZEOF(1) ];
        kauth_action_t action;

        AUDIT_ARG(fd, fd);

        /* Get the attributes into kernel space */
        if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
                return(error);
        }
        if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
                return(error);
        }
        /* Remember the caller's count so it can be restored if we restart on a lower union layer. */
        savecount = count;
        if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
                return (error);
        }
        /* From here on the file reference must be dropped via 'out'. */
        if ((fp->f_fglob->fg_flag & FREAD) == 0) {
                AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
                error = EBADF;
                goto out;
        }


#if CONFIG_MACF
        error = mac_file_check_change_offset(vfs_context_ucred(ctx),
            fp->f_fglob);
        if (error)
                goto out;
#endif


        /* Take an iocount; every later exit has to vnode_put() it before 'out'. */
        if ( (error = vnode_getwithref(vp)) )
                goto out;

        AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
        /* Re-entered with a new vp after traversing to a lower union layer. */
        if (vp->v_type != VDIR) {
                (void)vnode_put(vp);
                error = EINVAL;
                goto out;
        }

#if CONFIG_MACF
        error = mac_vnode_check_readdir(ctx, vp);
        if (error != 0) {
                (void)vnode_put(vp);
                goto out;
        }
#endif /* MAC */

        /* set up the uio structure which will contain the users return buffer */
        loff = fp->f_fglob->fg_offset;
        auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
        uio_addiov(auio, uap->buffer, uap->buffersize);

        /*
         * If the only item requested is file names, we can let that past with
         * just LIST_DIRECTORY.  If they want any other attributes, that means
         * they need SEARCH as well.
         */
        action = KAUTH_VNODE_LIST_DIRECTORY;
        if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
            attributelist.fileattr || attributelist.dirattr)
                action |= KAUTH_VNODE_SEARCH;

        if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {

                /* Believe it or not, uap->options only has 32-bits of valid
                 * info, so truncate before extending again */

                error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
                                (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
        }

        /* Covers both an authorization failure and a failed read. */
        if (error) {
                (void) vnode_put(vp);
                goto out;
        }

        /*
         * If we've got the last entry of a directory in a union mount
         * then reset the eofflag and pretend there's still more to come.
         * The next call will again set eofflag and the buffer will be empty,
         * so traverse to the underlying directory and do the directory
         * read there.
         */
        if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
                if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
                        eofflag = 0;
                } else {                                                // Empty buffer
                        struct vnode *tvp = vp;
                        if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
                                /*
                                 * Re-point the open file at the lower
                                 * directory, then release the references
                                 * held on the upper one and start over.
                                 */
                                vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
                                fp->f_fglob->fg_data = (caddr_t) vp;
                                fp->f_fglob->fg_offset = 0; // reset index for new dir
                                count = savecount;
                                vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
                                vnode_put(tvp);
                                goto unionread;
                        }
                        /* Traversal failed: keep reporting on the original directory. */
                        vp = tvp;
                }
        }

        (void)vnode_put(vp);

        /*
         * NOTE(review): error was already tested after the read above and is
         * not assigned on the way here, so this check looks redundant.
         */
        if (error)
                goto out;
        fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

        /* Report the entry count, the new state and the starting offset to the caller. */
        if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
                goto out;
        if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
                goto out;
        if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
                goto out;

        *retval = eofflag;  /* similar to getdirentries */
        error = 0;
out:
        file_drop(fd);
        return (error); /* any failure was recorded above; on success *retval holds eofflag */

} /* end of getdirentriesattr system call */
8053
/*
 * exchangedata: Exchange data between two files.
 *
 * Parameters:    p                       Calling process (unused)
 *                uap                     User argument descriptor (see below)
 *                retval                  Unused
 *
 * Indirect:      uap->path1              Path of the first file
 *                uap->path2              Path of the second file
 *                uap->options            FSOPT_NOFOLLOW suppresses FOLLOW on
 *                                        both lookups
 *
 * Returns:       0                       Success
 *                EINVAL                  Both paths name the same vnode, or
 *                                        either one is not a regular file
 *                EXDEV                   The files are on different mounts
 *                ENOMEM                  A path buffer could not be obtained
 *                !0                      Error from namei(), the MAC check,
 *                                        vnode_authorize() or VNOP_EXCHANGE()
 */

/* ARGSUSED */
int
exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{

        struct nameidata fnd, snd;
        vfs_context_t ctx = vfs_context_current();
        vnode_t fvp;
        vnode_t svp;
        int error;
        u_int32_t nameiflags;
        char *fpath = NULL;
        char *spath = NULL;
        int   flen=0, slen=0;
        int from_truncated=0, to_truncated=0;
#if CONFIG_FSE
        fse_info f_finfo, s_finfo;
#endif

        nameiflags = 0;
        if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;

        /* Look up the first file. */
        NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
               UIO_USERSPACE, uap->path1, ctx);

        error = namei(&fnd);
        if (error)
                goto out2;

        nameidone(&fnd);
        fvp = fnd.ni_vp;

        /* Look up the second file; the first vnode must be released if this fails. */
        NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
               UIO_USERSPACE, uap->path2, ctx);

        error = namei(&snd);
        if (error) {
                vnode_put(fvp);
                goto out2;
        }
        nameidone(&snd);
        svp = snd.ni_vp;

        /*
         * if the files are the same, return an inval error
         */
        if (svp == fvp) {
                error = EINVAL;
                goto out;
        }

        /*
         * if the files are on different volumes, return an error
         */
        if (svp->v_mount != fvp->v_mount) {
                error = EXDEV;
                goto out;
        }

        /* If they're not files, return an error */
        if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
                error = EINVAL;
                goto out;
        }

#if CONFIG_MACF
        error = mac_vnode_check_exchangedata(ctx,
            fvp, svp);
        if (error)
                goto out;
#endif
        /* The caller needs both read and write access to both files. */
        if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
            ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
                goto out;

        /*
         * The paths are only gathered when something will consume them: an
         * fsevent (when CONFIG_FSE is built in) or a kauth fileop listener.
         */
        if (
#if CONFIG_FSE
        need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
        kauth_authorize_fileop_has_listeners()) {
                GET_PATH(fpath);
                GET_PATH(spath);
                /* Whichever buffer was obtained is released at 'out'. */
                if (fpath == NULL || spath == NULL) {
                        error = ENOMEM;
                        goto out;
                }

                flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
                slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
                get_fse_info(fvp, &f_finfo, ctx);
                get_fse_info(svp, &s_finfo, ctx);
                if (from_truncated || to_truncated) {
                        // set it here since only the f_finfo gets reported up to user space
                        f_finfo.mode |= FSE_TRUNCATED_PATH;
                }
#endif
        }
        /* Ok, make the call */
        error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

        if (error == 0) {
            const char *tmpname;

            if (fpath != NULL && spath != NULL) {
                    /* call out to allow 3rd party notification of exchangedata.
                     * Ignore result of kauth_authorize_fileop call.
                     */
                    kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
                                           (uintptr_t)fpath, (uintptr_t)spath);
            }
            /* Swap the cached name and parent of the two vnodes under the name cache lock. */
            name_cache_lock();

            tmpname     = fvp->v_name;
            fvp->v_name = svp->v_name;
            svp->v_name = tmpname;

            if (fvp->v_parent != svp->v_parent) {
                vnode_t tmp;

                tmp           = fvp->v_parent;
                fvp->v_parent = svp->v_parent;
                svp->v_parent = tmp;
            }
            name_cache_unlock();

#if CONFIG_FSE
            /* The paths and finfo are only valid if they were gathered above. */
            if (fpath != NULL && spath != NULL) {
                    add_fsevent(FSE_EXCHANGE, ctx,
                                FSE_ARG_STRING, flen, fpath,
                                FSE_ARG_FINFO, &f_finfo,
                                FSE_ARG_STRING, slen, spath,
                                FSE_ARG_FINFO, &s_finfo,
                                FSE_ARG_DONE);
            }
#endif
        }

out:
        /* Reached with both vnodes held: release path buffers, then both vnodes. */
        if (fpath != NULL)
                RELEASE_PATH(fpath);
        if (spath != NULL)
                RELEASE_PATH(spath);
        vnode_put(svp);
        vnode_put(fvp);
out2:
        /* Reached directly only when no vnode is held any more. */
        return (error);
}
8207
8208 /*
8209  * Return (in MB) the amount of freespace on the given vnode's volume.
8210  */
8211 uint32_t freespace_mb(vnode_t vp);
8212
8213 uint32_t
8214 freespace_mb(vnode_t vp)
8215 {
8216         vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8217         return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8218                 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
8219 }
8220
8221 #if CONFIG_SEARCHFS
8222
8223 /* ARGSUSED */
8224
8225 int
8226 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8227 {
8228         vnode_t vp, tvp;
8229         int i, error=0;
8230         int fserror = 0;
8231         struct nameidata nd;
8232         struct user64_fssearchblock searchblock;
8233         struct searchstate *state;
8234         struct attrlist *returnattrs;
8235         struct timeval timelimit;
8236         void *searchparams1,*searchparams2;
8237         uio_t auio = NULL;
8238         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8239         uint32_t nummatches;
8240         int mallocsize;
8241         uint32_t nameiflags;
8242         vfs_context_t ctx = vfs_context_current();
8243         char uio_buf[ UIO_SIZEOF(1) ];
8244
8245         /* Start by copying in fsearchblock parameter list */
8246     if (IS_64BIT_PROCESS(p)) {
8247         error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8248         timelimit.tv_sec = searchblock.timelimit.tv_sec;
8249         timelimit.tv_usec = searchblock.timelimit.tv_usec;
8250     }
8251     else {
8252         struct user32_fssearchblock tmp_searchblock;
8253
8254         error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8255         // munge into 64-bit version
8256         searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8257         searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8258         searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8259         searchblock.maxmatches = tmp_searchblock.maxmatches;
8260                 /*
8261                  * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8262                  * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8263                  */
8264         timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8265         timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8266         searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8267         searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8268         searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8269         searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8270         searchblock.searchattrs = tmp_searchblock.searchattrs;
8271     }
8272         if (error)
8273                 return(error);
8274
8275         /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8276          */
8277         if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8278                 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8279                 return(EINVAL);
8280
8281         /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8282         /* It all has to do into local memory and it's not that big so we might as well  put it all together. */
8283         /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8284         /* block.                                                                                             */
8285         /*                                                                                                    */
8286         /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
8287         /*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
8288         /*       assumes the size is still 556 bytes it will continue to work                                 */
8289
8290         mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8291                 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8292
8293         MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8294
8295         /* Now set up the various pointers to the correct place in our newly allocated memory */
8296
8297         searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8298         returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8299         state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8300
8301         /* Now copy in the stuff given our local variables. */
8302
8303         if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8304                 goto freeandexit;
8305
8306         if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8307                 goto freeandexit;
8308
8309         if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8310                 goto freeandexit;
8311
8312         if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8313                 goto freeandexit;
8314
8315         /*
8316          * When searching a union mount, need to set the
8317          * start flag at the first call on each layer to
8318          * reset state for the new volume.
8319          */
8320         if (uap->options & SRCHFS_START)
8321                 state->ss_union_layer = 0;
8322         else
8323                 uap->options |= state->ss_union_flags;
8324         state->ss_union_flags = 0;
8325
8326         /*
8327          * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8328          * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8329          * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8330          * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8331          * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8332          */
8333
8334         if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8335                 attrreference_t* string_ref;
8336                 u_int32_t* start_length;
8337                 user64_size_t param_length;
8338
8339                 /* validate searchparams1 */
8340                 param_length = searchblock.sizeofsearchparams1;
8341                 /* skip the word that specifies length of the buffer */
8342                 start_length= (u_int32_t*) searchparams1;
8343                 start_length= start_length+1;
8344                 string_ref= (attrreference_t*) start_length;
8345
8346                 /* ensure no negative offsets or too big offsets */
8347                 if (string_ref->attr_dataoffset < 0 ) {
8348                         error = EINVAL;
8349                         goto freeandexit;
8350                 }
8351                 if (string_ref->attr_length > MAXPATHLEN) {
8352                         error = EINVAL;
8353                         goto freeandexit;
8354                 }
8355
8356                 /* Check for pointer overflow in the string ref */
8357                 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8358                         error = EINVAL;
8359                         goto freeandexit;
8360                 }
8361
8362                 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8363                         error = EINVAL;
8364                         goto freeandexit;
8365                 }
8366                 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8367                         error = EINVAL;
8368                         goto freeandexit;
8369                 }
8370         }
8371
8372         /* set up the uio structure which will contain the users return buffer */
8373         auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8374         uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8375
8376         nameiflags = 0;
8377         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8378         NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8379                UIO_USERSPACE, uap->path, ctx);
8380
8381         error = namei(&nd);
8382         if (error)
8383                 goto freeandexit;
8384         vp = nd.ni_vp;
8385         nameidone(&nd);
8386
8387         /*
8388          * Switch to the root vnode for the volume
8389          */
8390         error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8391         vnode_put(vp);
8392         if (error)
8393                 goto freeandexit;
8394         vp = tvp;
8395
8396         /*
8397          * If it's a union mount, the path lookup takes
8398          * us to the top layer. But we may need to descend
8399          * to a lower layer. For non-union mounts the layer
8400          * is always zero.
8401          */
8402         for (i = 0; i < (int) state->ss_union_layer; i++) {
8403                 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
8404                         break;
8405                 tvp = vp;
8406                 vp = vp->v_mount->mnt_vnodecovered;
8407                 if (vp == NULL) {
8408                         vnode_put(tvp);
8409                         error = ENOENT;
8410                         goto freeandexit;
8411                 }
8412                 vnode_getwithref(vp);
8413                 vnode_put(tvp);
8414         }
8415
8416 #if CONFIG_MACF
8417         error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
8418         if (error) {
8419                 vnode_put(vp);
8420                 goto freeandexit;
8421         }
8422 #endif
8423
8424
8425         /*
8426          * If searchblock.maxmatches == 0, then skip the search. This has happened
8427          * before and sometimes the underlying code doesnt deal with it well.
8428          */
8429          if (searchblock.maxmatches == 0) {
8430                 nummatches = 0;
8431                 goto saveandexit;
8432          }
8433
8434         /*
8435          * Allright, we have everything we need, so lets make that call.
8436          *
8437          * We keep special track of the return value from the file system:
8438          * EAGAIN is an acceptable error condition that shouldn't keep us
8439          * from copying out any results...
8440          */
8441
8442         fserror = VNOP_SEARCHFS(vp,
8443                 searchparams1,
8444                 searchparams2,
8445                 &searchblock.searchattrs,
8446                 (u_long)searchblock.maxmatches,
8447                 &timelimit,
8448                 returnattrs,
8449                 &nummatches,
8450                 (u_long)uap->scriptcode,
8451                 (u_long)uap->options,
8452                 auio,
8453                 (struct searchstate *) &state->ss_fsstate,
8454                 ctx);
8455
8456         /*
8457          * If it's a union mount we need to be called again
8458          * to search the mounted-on filesystem.
8459          */
8460         if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
8461                 state->ss_union_flags = SRCHFS_START;
8462                 state->ss_union_layer++;        // search next layer down
8463                 fserror = EAGAIN;
8464         }
8465
8466 saveandexit:
8467
8468         vnode_put(vp);
8469
8470         /* Now copy out the stuff that needs copying out. That means the number of matches, the
8471            search state.  Everything was already put into he return buffer by the vop call. */
8472
8473         if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
8474                 goto freeandexit;
8475
8476         if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
8477                 goto freeandexit;
8478
8479         error = fserror;
8480
8481 freeandexit:
8482
8483         FREE(searchparams1,M_TEMP);
8484
8485         return(error);
8486
8487
8488 } /* end of searchfs system call */
8489
8490 #else /* CONFIG_SEARCHFS */
8491
8492 int
8493 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
8494 {
8495         return (ENOTSUP);
8496 }
8497
8498 #endif /* CONFIG_SEARCHFS */
8499
8500
8501 lck_grp_attr_t *  nspace_group_attr;
8502 lck_attr_t *      nspace_lock_attr;
8503 lck_grp_t *       nspace_mutex_group;
8504
8505 lck_mtx_t         nspace_handler_lock;
8506 lck_mtx_t         nspace_handler_exclusion_lock;
8507
8508 time_t snapshot_timestamp=0;
8509 int nspace_allow_virtual_devs=0;
8510
8511 void nspace_handler_init(void);
8512
8513 typedef struct nspace_item_info {
8514         struct vnode *vp;
8515         void         *arg;
8516         uint64_t      op;
8517         uint32_t      vid;
8518         uint32_t      flags;
8519         uint32_t      token;
8520         uint32_t      refcount;
8521 } nspace_item_info;
8522
8523 #define MAX_NSPACE_ITEMS   128
8524 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
8525 uint32_t      nspace_item_idx=0;              // also used as the sleep/wakeup rendezvous address
8526 uint32_t      nspace_token_id=0;
8527 uint32_t      nspace_handler_timeout = 15;    // seconds
8528
8529 #define NSPACE_ITEM_NEW         0x0001
8530 #define NSPACE_ITEM_PROCESSING  0x0002
8531 #define NSPACE_ITEM_DEAD        0x0004
8532 #define NSPACE_ITEM_CANCELLED   0x0008
8533 #define NSPACE_ITEM_DONE        0x0010
8534 #define NSPACE_ITEM_RESET_TIMER 0x0020
8535
8536 #define NSPACE_ITEM_NSPACE_EVENT   0x0040
8537 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
8538
8539 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
8540
8541 //#pragma optimization_level 0
8542
8543 typedef enum {
8544         NSPACE_HANDLER_NSPACE = 0,
8545         NSPACE_HANDLER_SNAPSHOT = 1,
8546
8547         NSPACE_HANDLER_COUNT,
8548 } nspace_type_t;
8549
8550 typedef struct {
8551         uint64_t handler_tid;
8552         struct proc *handler_proc;
8553         int handler_busy;
8554 } nspace_handler_t;
8555
8556 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
8557
8558 /* namespace fsctl functions */
8559 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
8560 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
8561 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
8562 static nspace_type_t nspace_type_for_op(uint64_t op);
8563 static int nspace_is_special_process(struct proc *proc);
8564 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
8565 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
8566 static int validate_namespace_args (int is64bit, int size);
8567 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
8568
8569
8570 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
8571 {
8572         switch(nspace_type) {
8573                 case NSPACE_HANDLER_NSPACE:
8574                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
8575                 case NSPACE_HANDLER_SNAPSHOT:
8576                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
8577                 default:
8578                         printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
8579                         return 0;
8580         }
8581 }
8582
8583 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
8584 {
8585         switch(nspace_type) {
8586                 case NSPACE_HANDLER_NSPACE:
8587                         return NSPACE_ITEM_NSPACE_EVENT;
8588                 case NSPACE_HANDLER_SNAPSHOT:
8589                         return NSPACE_ITEM_SNAPSHOT_EVENT;
8590                 default:
8591                         printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
8592                         return 0;
8593         }
8594 }
8595
8596 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
8597 {
8598         switch(nspace_type) {
8599                 case NSPACE_HANDLER_NSPACE:
8600                         return FREAD | FWRITE | O_EVTONLY;
8601                 case NSPACE_HANDLER_SNAPSHOT:
8602                         return FREAD | O_EVTONLY;
8603                 default:
8604                         printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
8605                         return 0;
8606         }
8607 }
8608
8609 static inline nspace_type_t nspace_type_for_op(uint64_t op)
8610 {
8611         switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
8612                 case NAMESPACE_HANDLER_NSPACE_EVENT:
8613                         return NSPACE_HANDLER_NSPACE;
8614                 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
8615                         return NSPACE_HANDLER_SNAPSHOT;
8616                 default:
8617                         printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
8618                         return NSPACE_HANDLER_NSPACE;
8619         }
8620 }
8621
8622 static inline int nspace_is_special_process(struct proc *proc)
8623 {
8624         int i;
8625         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8626                 if (proc == nspace_handlers[i].handler_proc)
8627                         return 1;
8628         }
8629         return 0;
8630 }
8631
8632 void
8633 nspace_handler_init(void)
8634 {
8635         nspace_lock_attr    = lck_attr_alloc_init();
8636         nspace_group_attr   = lck_grp_attr_alloc_init();
8637         nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
8638         lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
8639         lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
8640         memset(&nspace_items[0], 0, sizeof(nspace_items));
8641 }
8642
8643 void
8644 nspace_proc_exit(struct proc *p)
8645 {
8646         int i, event_mask = 0;
8647
8648         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8649                 if (p == nspace_handlers[i].handler_proc) {
8650                         event_mask |= nspace_item_flags_for_type(i);
8651                         nspace_handlers[i].handler_tid = 0;
8652                         nspace_handlers[i].handler_proc = NULL;
8653                 }
8654         }
8655
8656         if (event_mask == 0) {
8657                 return;
8658         }
8659
8660         if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
8661                 // if this process was the snapshot handler, zero snapshot_timeout
8662                 snapshot_timestamp = 0;
8663         }
8664
8665         //
8666         // unblock anyone that's waiting for the handler that died
8667         //
8668         lck_mtx_lock(&nspace_handler_lock);
8669         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8670                 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
8671
8672                         if ( nspace_items[i].flags & event_mask ) {
8673
8674                                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
8675                                         vnode_lock_spin(nspace_items[i].vp);
8676                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8677                                         vnode_unlock(nspace_items[i].vp);
8678                                 }
8679                                 nspace_items[i].vp = NULL;
8680                                 nspace_items[i].vid = 0;
8681                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
8682                                 nspace_items[i].token = 0;
8683
8684                                 wakeup((caddr_t)&(nspace_items[i].vp));
8685                         }
8686                 }
8687         }
8688
8689         wakeup((caddr_t)&nspace_item_idx);
8690         lck_mtx_unlock(&nspace_handler_lock);
8691 }
8692
8693
8694 int
8695 resolve_nspace_item(struct vnode *vp, uint64_t op)
8696 {
8697         return resolve_nspace_item_ext(vp, op, NULL);
8698 }
8699
8700 int
8701 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
8702 {
8703         int i, error, keep_waiting;
8704         struct timespec ts;
8705         nspace_type_t nspace_type = nspace_type_for_op(op);
8706
8707         // only allow namespace events on regular files, directories and symlinks.
8708         if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
8709                 return 0;
8710         }
8711
8712         //
8713         // if this is a snapshot event and the vnode is on a
8714         // disk image just pretend nothing happened since any
8715         // change to the disk image will cause the disk image
8716         // itself to get backed up and this avoids multi-way
8717         // deadlocks between the snapshot handler and the ever
8718         // popular diskimages-helper process.  the variable
8719         // nspace_allow_virtual_devs allows this behavior to
8720         // be overridden (for use by the Mobile TimeMachine
8721         // testing infrastructure which uses disk images)
8722         //
8723         if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
8724             && (vp->v_mount != NULL)
8725             && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
8726             && !nspace_allow_virtual_devs) {
8727
8728                 return 0;
8729         }
8730
8731         // if (thread_tid(current_thread()) == namespace_handler_tid) {
8732         if (nspace_handlers[nspace_type].handler_proc == NULL) {
8733                 return 0;
8734         }
8735
8736         if (nspace_is_special_process(current_proc())) {
8737                 return EDEADLK;
8738         }
8739
8740         lck_mtx_lock(&nspace_handler_lock);
8741
8742 retry:
8743         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8744                 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
8745                         break;
8746                 }
8747         }
8748
8749         if (i >= MAX_NSPACE_ITEMS) {
8750                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8751                         if (nspace_items[i].flags == 0) {
8752                                 break;
8753                         }
8754                 }
8755         } else {
8756                 nspace_items[i].refcount++;
8757         }
8758
8759         if (i >= MAX_NSPACE_ITEMS) {
8760                 ts.tv_sec = nspace_handler_timeout;
8761                 ts.tv_nsec = 0;
8762
8763                 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
8764                 if (error == 0) {
8765                         // an entry got free'd up, go see if we can get a slot
8766                         goto retry;
8767                 } else {
8768                         lck_mtx_unlock(&nspace_handler_lock);
8769                         return error;
8770                 }
8771         }
8772
8773         //
8774         // if it didn't already exist, add it.  if it did exist
8775         // we'll get woken up when someone does a wakeup() on
8776         // the slot in the nspace_items table.
8777         //
8778         if (vp != nspace_items[i].vp) {
8779                 nspace_items[i].vp = vp;
8780                 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
8781                 nspace_items[i].op = op;
8782                 nspace_items[i].vid = vnode_vid(vp);
8783                 nspace_items[i].flags = NSPACE_ITEM_NEW;
8784                 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
8785                 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
8786                         if (arg) {
8787                                 vnode_lock_spin(vp);
8788                                 vp->v_flag |= VNEEDSSNAPSHOT;
8789                                 vnode_unlock(vp);
8790                         }
8791                 }
8792
8793                 nspace_items[i].token = 0;
8794                 nspace_items[i].refcount = 1;
8795
8796                 wakeup((caddr_t)&nspace_item_idx);
8797         }
8798
8799         //
8800         // Now go to sleep until the handler does a wakeup on this
8801         // slot in the nspace_items table (or we timeout).
8802         //
8803         keep_waiting = 1;
8804         while(keep_waiting) {
8805                 ts.tv_sec = nspace_handler_timeout;
8806                 ts.tv_nsec = 0;
8807                 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
8808
8809                 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
8810                         error = 0;
8811                 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
8812                         error = nspace_items[i].token;
8813                 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
8814                         if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
8815                                 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
8816                                 continue;
8817                         } else {
8818                                 error = ETIMEDOUT;
8819                         }
8820                 } else if (error == 0) {
8821                         // hmmm, why did we get woken up?
8822                         printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
8823                                nspace_items[i].token);
8824                 }
8825
8826                 if (--nspace_items[i].refcount == 0) {
8827                         nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
8828                         nspace_items[i].arg = NULL;
8829                         nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
8830                         nspace_items[i].flags = 0;     // this clears it for re-use
8831                 }
8832                 wakeup(&nspace_token_id);
8833                 keep_waiting = 0;
8834         }
8835
8836         lck_mtx_unlock(&nspace_handler_lock);
8837
8838         return error;
8839 }
8840
8841
8842 int
8843 get_nspace_item_status(struct vnode *vp, int32_t *status)
8844 {
8845         int i;
8846
8847         lck_mtx_lock(&nspace_handler_lock);
8848         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8849                 if (nspace_items[i].vp == vp) {
8850                         break;
8851                 }
8852         }
8853
8854         if (i >= MAX_NSPACE_ITEMS) {
8855                 lck_mtx_unlock(&nspace_handler_lock);
8856                 return ENOENT;
8857         }
8858
8859         *status = nspace_items[i].flags;
8860         lck_mtx_unlock(&nspace_handler_lock);
8861         return 0;
8862 }
8863
8864
8865 #if 0
8866 static int
8867 build_volfs_path(struct vnode *vp, char *path, int *len)
8868 {
8869         struct vnode_attr va;
8870         int ret;
8871
8872         VATTR_INIT(&va);
8873         VATTR_WANTED(&va, va_fsid);
8874         VATTR_WANTED(&va, va_fileid);
8875
8876         if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
8877                 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
8878                 ret = -1;
8879         } else {
8880                 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
8881                 ret = 0;
8882         }
8883
8884         return ret;
8885 }
8886 #endif
8887
8888 //
8889 // Note: this function does NOT check permissions on all of the
8890 // parent directories leading to this vnode.  It should only be
8891 // called on behalf of a root process.  Otherwise a process may
8892 // get access to a file because the file itself is readable even
8893 // though its parent directories would prevent access.
8894 //
8895 static int
8896 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
8897 {
8898         int error, action;
8899
8900         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8901                 return error;
8902         }
8903
8904 #if CONFIG_MACF
8905         error = mac_vnode_check_open(ctx, vp, fmode);
8906         if (error)
8907                 return error;
8908 #endif
8909
8910         /* compute action to be authorized */
8911         action = 0;
8912         if (fmode & FREAD) {
8913                 action |= KAUTH_VNODE_READ_DATA;
8914         }
8915         if (fmode & (FWRITE | O_TRUNC)) {
8916                 /*
8917                  * If we are writing, appending, and not truncating,
8918                  * indicate that we are appending so that if the
8919                  * UF_APPEND or SF_APPEND bits are set, we do not deny
8920                  * the open.
8921                  */
8922                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
8923                         action |= KAUTH_VNODE_APPEND_DATA;
8924                 } else {
8925                         action |= KAUTH_VNODE_WRITE_DATA;
8926                 }
8927         }
8928
8929         if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
8930                 return error;
8931
8932
8933         //
8934         // if the vnode is tagged VOPENEVT and the current process
8935         // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
8936         // flag to the open mode so that this open won't count against
8937         // the vnode when carbon delete() does a vnode_isinuse() to see
8938         // if a file is currently in use.  this allows spotlight
8939         // importers to not interfere with carbon apps that depend on
8940         // the no-delete-if-busy semantics of carbon delete().
8941         //
8942         if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
8943                 fmode |= O_EVTONLY;
8944         }
8945
8946         if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
8947                 return error;
8948         }
8949         if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
8950                 VNOP_CLOSE(vp, fmode, ctx);
8951                 return error;
8952         }
8953
8954         /* Call out to allow 3rd party notification of open.
8955          * Ignore result of kauth_authorize_fileop call.
8956          */
8957 #if CONFIG_MACF
8958         mac_vnode_notify_open(ctx, vp, fmode);
8959 #endif
8960         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
8961                                (uintptr_t)vp, 0);
8962
8963
8964         return 0;
8965 }
8966
8967 static int
8968 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
8969 {
8970         int i, error=0, unblock=0;
8971         task_t curtask;
8972
8973         lck_mtx_lock(&nspace_handler_exclusion_lock);
8974         if (nspace_handlers[nspace_type].handler_busy) {
8975                 lck_mtx_unlock(&nspace_handler_exclusion_lock);
8976                 return EBUSY;
8977         }
8978         nspace_handlers[nspace_type].handler_busy = 1;
8979         lck_mtx_unlock(&nspace_handler_exclusion_lock);
8980
8981         /*
8982          * Any process that gets here will be one of the namespace handlers.
8983          * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
8984          * as we can cause deadlocks to occur, because the namespace handler may prevent
8985          * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
8986          * process.
8987          */
8988         curtask = current_task();
8989         bsd_set_dependency_capable (curtask);
8990
8991         lck_mtx_lock(&nspace_handler_lock);
8992         if (nspace_handlers[nspace_type].handler_proc == NULL) {
8993                 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
8994                 nspace_handlers[nspace_type].handler_proc = current_proc();
8995         }
8996
8997         while (error == 0) {
8998
8999                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9000                         if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9001                                 if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9002                                         continue;
9003                                 }
9004                                 break;
9005                         }
9006                 }
9007
9008                 if (i < MAX_NSPACE_ITEMS) {
9009                         nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
9010                         nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
9011                         nspace_items[i].token  = ++nspace_token_id;
9012
9013                         if (nspace_items[i].vp) {
9014                                 struct fileproc *fp;
9015                                 int32_t indx, fmode;
9016                                 struct proc *p = current_proc();
9017                                 vfs_context_t ctx = vfs_context_current();
9018                                 struct vnode_attr va;
9019
9020
9021                                 /*
9022                                  * Use vnode pointer to acquire a file descriptor for
9023                                  * hand-off to userland
9024                                  */
9025                                 fmode = nspace_open_flags_for_type(nspace_type);
9026                                 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9027                                 if (error) {
9028                                         unblock = 1;
9029                                         break;
9030                                 }
9031                                 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9032                                 if (error) {
9033                                         unblock = 1;
9034                                         vnode_put(nspace_items[i].vp);
9035                                         break;
9036                                 }
9037
9038                                 if ((error = falloc(p, &fp, &indx, ctx))) {
9039                                         vn_close(nspace_items[i].vp, fmode, ctx);
9040                                         vnode_put(nspace_items[i].vp);
9041                                         unblock = 1;
9042                                         break;
9043                                 }
9044
9045                                 fp->f_fglob->fg_flag = fmode;
9046                                 fp->f_fglob->fg_ops = &vnops;
9047                                 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9048
9049                                 proc_fdlock(p);
9050                                 procfdtbl_releasefd(p, indx, NULL);
9051                                 fp_drop(p, indx, fp, 1);
9052                                 proc_fdunlock(p);
9053
9054                                 /*
9055                                  * All variants of the namespace handler struct support these three fields:
9056                                  * token, flags, and the FD pointer
9057                                  */
9058                                 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9059                                 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9060                                 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9061
9062                                 /*
9063                                  * Handle optional fields:
9064                                  * extended version support an info ptr (offset, length), and the
9065                                  *
9066                                  * namedata version supports a unique per-link object ID
9067                                  *
9068                                  */
9069                                 if (nhd->infoptr) {
9070                                         uio_t uio = (uio_t)nspace_items[i].arg;
9071                                         uint64_t u_offset, u_length;
9072
9073                                         if (uio) {
9074                                                 u_offset = uio_offset(uio);
9075                                                 u_length = uio_resid(uio);
9076                                         } else {
9077                                                 u_offset = 0;
9078                                                 u_length = 0;
9079                                         }
9080                                         error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9081                                         error = copyout(&u_length, nhd->infoptr+sizeof(uint64_t), sizeof(uint64_t));
9082                                 }
9083
9084                                 if (nhd->objid) {
9085                                         VATTR_INIT(&va);
9086                                         VATTR_WANTED(&va, va_linkid);
9087                                         error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9088                                         if (error == 0 ) {
9089                                                 uint64_t linkid = 0;
9090                                                 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9091                                                         linkid = (uint64_t)va.va_linkid;
9092                                                 }
9093                                                 error = copyout (&linkid, nhd->objid, sizeof(uint64_t));
9094                                         }
9095                                 }
9096
9097                                 if (error) {
9098                                         vn_close(nspace_items[i].vp, fmode, ctx);
9099                                         fp_free(p, indx, fp);
9100                                         unblock = 1;
9101                                 }
9102
9103                                 vnode_put(nspace_items[i].vp);
9104
9105                                 break;
9106                         } else {
9107                                 printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n",
9108                                        i, nspace_items[i].vp, error, nspace_items[i].vp->v_name);
9109                         }
9110
9111                 } else {
9112                         error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9113                         if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9114                                 error = EINVAL;
9115                                 break;
9116                         }
9117
9118                 }
9119         }
9120
9121         if (unblock) {
9122                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9123                         vnode_lock_spin(nspace_items[i].vp);
9124                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9125                         vnode_unlock(nspace_items[i].vp);
9126                 }
9127                 nspace_items[i].vp = NULL;
9128                 nspace_items[i].vid = 0;
9129                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9130                 nspace_items[i].token = 0;
9131
9132                 wakeup((caddr_t)&(nspace_items[i].vp));
9133         }
9134
9135         if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9136                 // just go through every snapshot event and unblock it immediately.
9137                 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9138                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9139                                 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9140                                         if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9141                                                 nspace_items[i].vp = NULL;
9142                                                 nspace_items[i].vid = 0;
9143                                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9144                                                 nspace_items[i].token = 0;
9145
9146                                                 wakeup((caddr_t)&(nspace_items[i].vp));
9147                                         }
9148                                 }
9149                         }
9150                 }
9151         }
9152
9153         lck_mtx_unlock(&nspace_handler_lock);
9154
9155         lck_mtx_lock(&nspace_handler_exclusion_lock);
9156         nspace_handlers[nspace_type].handler_busy = 0;
9157         lck_mtx_unlock(&nspace_handler_exclusion_lock);
9158
9159         return error;
9160 }
9161
9162 static inline int validate_namespace_args (int is64bit, int size) {
9163
9164         if (is64bit) {
9165                 /* Must be one of these */
9166                 if (size == sizeof(user64_namespace_handler_info)) {
9167                         goto sizeok;
9168                 }
9169                 if (size == sizeof(user64_namespace_handler_info_ext)) {
9170                         goto sizeok;
9171                 }
9172                 if (size == sizeof(user64_namespace_handler_data)) {
9173                         goto sizeok;
9174                 }
9175                 return EINVAL;
9176         }
9177         else {
9178                 /* 32 bit -- must be one of these */
9179                 if (size == sizeof(user32_namespace_handler_info)) {
9180                         goto sizeok;
9181                 }
9182                 if (size == sizeof(user32_namespace_handler_info_ext)) {
9183                         goto sizeok;
9184                 }
9185                 if (size == sizeof(user32_namespace_handler_data)) {
9186                         goto sizeok;
9187                 }
9188                 return EINVAL;
9189         }
9190
9191 sizeok:
9192
9193         return 0;
9194
9195 }
9196
9197 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9198 {
9199         int error = 0;
9200         namespace_handler_data nhd;
9201
9202         bzero (&nhd, sizeof(namespace_handler_data));
9203
9204         if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9205                         (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9206                 return EINVAL;
9207         }
9208
9209         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9210                 return error;
9211         }
9212
9213         error = validate_namespace_args (is64bit, size);
9214         if (error) {
9215                 return error;
9216         }
9217
9218         /* Copy in the userland pointers into our kernel-only struct */
9219
9220         if (is64bit) {
9221                 /* 64 bit userland structures */
9222                 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9223                 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9224                 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
9225
9226                 /* If the size is greater than the standard info struct, add in extra fields */
9227                 if (size > (sizeof(user64_namespace_handler_info))) {
9228                         if (size >= (sizeof(user64_namespace_handler_info_ext))) {
9229                                 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
9230                         }
9231                         if (size == (sizeof(user64_namespace_handler_data))) {
9232                                 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
9233                         }
9234                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9235                 }
9236         }
9237         else {
9238                 /* 32 bit userland structures */
9239                 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
9240                 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
9241                 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
9242
9243                 if (size > (sizeof(user32_namespace_handler_info))) {
9244                         if (size >= (sizeof(user32_namespace_handler_info_ext))) {
9245                                 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
9246                         }
9247                         if (size == (sizeof(user32_namespace_handler_data))) {
9248                                 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
9249                         }
9250                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9251                 }
9252         }
9253
9254         return wait_for_namespace_event(&nhd, nspace_type);
9255 }
9256
9257 /*
9258  * Make a filesystem-specific control call:
9259  */
9260 /* ARGSUSED */
9261 static int
9262 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
9263 {
9264         int error=0;
9265         boolean_t is64bit;
9266         u_int size;
9267 #define STK_PARAMS 128
9268         char stkbuf[STK_PARAMS];
9269         caddr_t data, memp;
9270         vnode_t vp = *arg_vp;
9271
9272         size = IOCPARM_LEN(cmd);
9273         if (size > IOCPARM_MAX) return (EINVAL);
9274
9275         is64bit = proc_is64bit(p);
9276
9277         memp = NULL;
9278
9279
9280         /*
9281          * ensure the buffer is large enough for underlying calls
9282          */
9283 #ifndef HFSIOC_GETPATH
9284         typedef char pn_t[MAXPATHLEN];
9285 #define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
9286 #endif
9287
9288 #ifndef HFS_GETPATH
9289 #define HFS_GETPATH  IOCBASECMD(HFSIOC_GETPATH)
9290 #endif
9291         if (IOCBASECMD(cmd) == HFS_GETPATH) {
9292                 /* Round up to MAXPATHLEN regardless of user input */
9293                 size = MAXPATHLEN;
9294         }
9295
9296         if (size > sizeof (stkbuf)) {
9297                 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
9298                 data = memp;
9299         } else {
9300                 data = &stkbuf[0];
9301         };
9302
9303         if (cmd & IOC_IN) {
9304                 if (size) {
9305                         error = copyin(udata, data, size);
9306                         if (error) {
9307                                 if (memp) {
9308                                         kfree (memp, size);
9309                                 }
9310                                 return error;
9311                         }
9312                 } else {
9313                         if (is64bit) {
9314                                 *(user_addr_t *)data = udata;
9315                         }
9316                         else {
9317                                 *(uint32_t *)data = (uint32_t)udata;
9318                         }
9319                 };
9320         } else if ((cmd & IOC_OUT) && size) {
9321                 /*
9322                  * Zero the buffer so the user always
9323                  * gets back something deterministic.
9324                  */
9325                 bzero(data, size);
9326         } else if (cmd & IOC_VOID) {
9327                 if (is64bit) {
9328                         *(user_addr_t *)data = udata;
9329                 }
9330                 else {
9331                         *(uint32_t *)data = (uint32_t)udata;
9332                 }
9333         }
9334
9335         /* Check to see if it's a generic command */
9336         switch (IOCBASECMD(cmd)) {
9337
9338                 case FSCTL_SYNC_VOLUME: {
9339                         mount_t mp = vp->v_mount;
9340                         int arg = *(uint32_t*)data;
9341
9342                         /* record vid of vp so we can drop it below. */
9343                         uint32_t vvid = vp->v_id;
9344
9345                         /*
9346                          * Then grab mount_iterref so that we can release the vnode.
9347                          * Without this, a thread may call vnode_iterate_prepare then
9348                          * get into a deadlock because we've never released the root vp
9349                          */
9350                         error = mount_iterref (mp, 0);
9351                         if (error)  {
9352                                 break;
9353                         }
9354                         vnode_put(vp);
9355
9356                         /* issue the sync for this volume */
9357                         (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
9358
9359                         /*
9360                          * Then release the mount_iterref once we're done syncing; it's not
9361                          * needed for the VNOP_IOCTL below
9362                          */
9363                         mount_iterdrop(mp);
9364
9365                         if (arg & FSCTL_SYNC_FULLSYNC) {
9366                                 /* re-obtain vnode iocount on the root vp, if possible */
9367                                 error = vnode_getwithvid (vp, vvid);
9368                                 if (error == 0) {
9369                                         error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
9370                                         vnode_put (vp);
9371                                 }
9372                         }
9373                         /* mark the argument VP as having been released */
9374                         *arg_vp = NULL;
9375                 }
9376                 break;
9377
9378                 case FSCTL_SET_PACKAGE_EXTS: {
9379                         user_addr_t ext_strings;
9380                         uint32_t    num_entries;
9381                         uint32_t    max_width;
9382
9383                         if (   (is64bit && size != sizeof(user64_package_ext_info))
9384                                         || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
9385
9386                                 // either you're 64-bit and passed a 64-bit struct or
9387                                 // you're 32-bit and passed a 32-bit struct.  otherwise
9388                                 // it's not ok.
9389                                 error = EINVAL;
9390                                 break;
9391                         }
9392
9393                         if (is64bit) {
9394                                 ext_strings = ((user64_package_ext_info *)data)->strings;
9395                                 num_entries = ((user64_package_ext_info *)data)->num_entries;
9396                                 max_width   = ((user64_package_ext_info *)data)->max_width;
9397                         } else {
9398                                 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
9399                                 num_entries = ((user32_package_ext_info *)data)->num_entries;
9400                                 max_width   = ((user32_package_ext_info *)data)->max_width;
9401                         }
9402                         error = set_package_extensions_table(ext_strings, num_entries, max_width);
9403                 }
9404                 break;
9405
9406                 /* namespace handlers */
9407                 case FSCTL_NAMESPACE_HANDLER_GET: {
9408                         error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
9409                 }
9410                 break;
9411
9412                 /* Snapshot handlers */
9413                 case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
9414                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
9415                 }
9416                 break;
9417
9418                 case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
9419                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
9420                 }
9421                 break;
9422
9423                 case FSCTL_NAMESPACE_HANDLER_UPDATE: {
9424                         uint32_t token, val;
9425                         int i;
9426
9427                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9428                                 break;
9429                         }
9430
9431                         if (!nspace_is_special_process(p)) {
9432                                 error = EINVAL;
9433                                 break;
9434                         }
9435
9436                         token = ((uint32_t *)data)[0];
9437                         val   = ((uint32_t *)data)[1];
9438
9439                         lck_mtx_lock(&nspace_handler_lock);
9440
9441                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9442                                 if (nspace_items[i].token == token) {
9443                                         break;  /* exit for loop, not case stmt */
9444                                 }
9445                         }
9446
9447                         if (i >= MAX_NSPACE_ITEMS) {
9448                                 error = ENOENT;
9449                         } else {
9450                                 //
9451                                 // if this bit is set, when resolve_nspace_item() times out
9452                                 // it will loop and go back to sleep.
9453                                 //
9454                                 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
9455                         }
9456
9457                         lck_mtx_unlock(&nspace_handler_lock);
9458
9459                         if (error) {
9460                                 printf("nspace-handler-update: did not find token %u\n", token);
9461                         }
9462                 }
9463                 break;
9464
9465                 case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
9466                         uint32_t token, val;
9467                         int i;
9468
9469                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9470                                 break;
9471                         }
9472
9473                         if (!nspace_is_special_process(p)) {
9474                                 error = EINVAL;
9475                                 break;
9476                         }
9477
9478                         token = ((uint32_t *)data)[0];
9479                         val   = ((uint32_t *)data)[1];
9480
9481                         lck_mtx_lock(&nspace_handler_lock);
9482
9483                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9484                                 if (nspace_items[i].token == token) {
9485                                         break; /* exit for loop, not case statement */
9486                                 }
9487                         }
9488
9489                         if (i >= MAX_NSPACE_ITEMS) {
9490                                 printf("nspace-handler-unblock: did not find token %u\n", token);
9491                                 error = ENOENT;
9492                         } else {
9493                                 if (val == 0 && nspace_items[i].vp) {
9494                                         vnode_lock_spin(nspace_items[i].vp);
9495                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9496                                         vnode_unlock(nspace_items[i].vp);
9497                                 }
9498
9499                                 nspace_items[i].vp = NULL;
9500                                 nspace_items[i].arg = NULL;
9501                                 nspace_items[i].op = 0;
9502                                 nspace_items[i].vid = 0;
9503                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9504                                 nspace_items[i].token = 0;
9505
9506                                 wakeup((caddr_t)&(nspace_items[i].vp));
9507                         }
9508
9509                         lck_mtx_unlock(&nspace_handler_lock);
9510                 }
9511                 break;
9512
9513                 case FSCTL_NAMESPACE_HANDLER_CANCEL: {
9514                         uint32_t token, val;
9515                         int i;
9516
9517                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9518                                 break;
9519                         }
9520
9521                         if (!nspace_is_special_process(p)) {
9522                                 error = EINVAL;
9523                                 break;
9524                         }
9525
9526                         token = ((uint32_t *)data)[0];
9527                         val   = ((uint32_t *)data)[1];
9528
9529                         lck_mtx_lock(&nspace_handler_lock);
9530
9531                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9532                                 if (nspace_items[i].token == token) {
9533                                         break;  /* exit for loop, not case stmt */
9534                                 }
9535                         }
9536
9537                         if (i >= MAX_NSPACE_ITEMS) {
9538                                 printf("nspace-handler-cancel: did not find token %u\n", token);
9539                                 error = ENOENT;
9540                         } else {
9541                                 if (nspace_items[i].vp) {
9542                                         vnode_lock_spin(nspace_items[i].vp);
9543                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9544                                         vnode_unlock(nspace_items[i].vp);
9545                                 }
9546
9547                                 nspace_items[i].vp = NULL;
9548                                 nspace_items[i].arg = NULL;
9549                                 nspace_items[i].vid = 0;
9550                                 nspace_items[i].token = val;
9551                                 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
9552                                 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
9553
9554                                 wakeup((caddr_t)&(nspace_items[i].vp));
9555                         }
9556
9557                         lck_mtx_unlock(&nspace_handler_lock);
9558                 }
9559                 break;
9560
9561                 case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
9562                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9563                                 break;
9564                         }
9565
9566                         // we explicitly do not do the namespace_handler_proc check here
9567
9568                         lck_mtx_lock(&nspace_handler_lock);
9569                         snapshot_timestamp = ((uint32_t *)data)[0];
9570                         wakeup(&nspace_item_idx);
9571                         lck_mtx_unlock(&nspace_handler_lock);
9572                         printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
9573
9574                 }
9575                 break;
9576
9577                 case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
9578                 {
9579                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9580                                 break;
9581                         }
9582
9583                         lck_mtx_lock(&nspace_handler_lock);
9584                         nspace_allow_virtual_devs = ((uint32_t *)data)[0];
9585                         lck_mtx_unlock(&nspace_handler_lock);
9586                         printf("nspace-snapshot-handler will%s allow events on disk-images\n",
9587                                         nspace_allow_virtual_devs ? "" : " NOT");
9588                         error = 0;
9589
9590                 }
9591                 break;
9592
9593                 case FSCTL_SET_FSTYPENAME_OVERRIDE:
9594                 {
9595                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9596                                 break;
9597                         }
9598                         if (vp->v_mount) {
9599                                 mount_lock(vp->v_mount);
9600                                 if (data[0] != 0) {
9601                                         strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
9602                                         vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
9603                                         if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
9604                                                 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
9605                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
9606                                         }
9607                                 } else {
9608                                         if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
9609                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
9610                                         }
9611                                         vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
9612                                         vp->v_mount->fstypename_override[0] = '\0';
9613                                 }
9614                                 mount_unlock(vp->v_mount);
9615                         }
9616                 }
9617                 break;
9618
9619                 default: {
9620                         /* Invoke the filesystem-specific code */
9621                         error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
9622                 }
9623
9624         } /* end switch stmt */
9625
9626         /*
9627          * if no errors, copy any data to user. Size was
9628          * already set and checked above.
9629          */
9630         if (error == 0 && (cmd & IOC_OUT) && size)
9631                 error = copyout(data, udata, size);
9632
9633         if (memp) {
9634                 kfree(memp, size);
9635         }
9636
9637         return error;
9638 }
9639
9640 /* ARGSUSED */
9641 int
9642 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
9643 {
9644         int error;
9645         struct nameidata nd;
9646         u_long nameiflags;
9647         vnode_t vp = NULL;
9648         vfs_context_t ctx = vfs_context_current();
9649
9650         AUDIT_ARG(cmd, uap->cmd);
9651         AUDIT_ARG(value32, uap->options);
9652         /* Get the vnode for the file we are getting info on:  */
9653         nameiflags = 0;
9654         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9655         NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
9656                UIO_USERSPACE, uap->path, ctx);
9657         if ((error = namei(&nd))) goto done;
9658         vp = nd.ni_vp;
9659         nameidone(&nd);
9660
9661 #if CONFIG_MACF
9662         error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
9663         if (error) {
9664                 goto done;
9665         }
9666 #endif
9667
9668         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9669
9670 done:
9671         if (vp)
9672                 vnode_put(vp);
9673         return error;
9674 }
9675 /* ARGSUSED */
9676 int
9677 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
9678 {
9679         int error;
9680         vnode_t vp = NULL;
9681         vfs_context_t ctx = vfs_context_current();
9682         int fd = -1;
9683
9684         AUDIT_ARG(fd, uap->fd);
9685         AUDIT_ARG(cmd, uap->cmd);
9686         AUDIT_ARG(value32, uap->options);
9687
9688         /* Get the vnode for the file we are getting info on:  */
9689         if ((error = file_vnode(uap->fd, &vp)))
9690                 return error;
9691         fd = uap->fd;
9692         if ((error = vnode_getwithref(vp))) {
9693                 file_drop(fd);
9694                 return error;
9695         }
9696
9697 #if CONFIG_MACF
9698         if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
9699                 file_drop(fd);
9700                 vnode_put(vp);
9701                 return error;
9702         }
9703 #endif
9704
9705         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9706
9707         file_drop(fd);
9708
9709         /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
9710         if (vp) {
9711                 vnode_put(vp);
9712         }
9713
9714         return error;
9715 }
9716 /* end of fsctl system call */
9717
9718 /*
9719  *  Retrieve the data of an extended attribute.
9720  */
9721 int
9722 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
9723 {
9724         vnode_t vp;
9725         struct nameidata nd;
9726         char attrname[XATTR_MAXNAMELEN+1];
9727         vfs_context_t ctx = vfs_context_current();
9728         uio_t auio = NULL;
9729         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9730         size_t attrsize = 0;
9731         size_t namelen;
9732         u_int32_t nameiflags;
9733         int error;
9734         char uio_buf[ UIO_SIZEOF(1) ];
9735
9736         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9737                 return (EINVAL);
9738
9739         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9740         NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
9741         if ((error = namei(&nd))) {
9742                 return (error);
9743         }
9744         vp = nd.ni_vp;
9745         nameidone(&nd);
9746
9747         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9748                 goto out;
9749         }
9750         if (xattr_protected(attrname)) {
9751                 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
9752                         error = EPERM;
9753                         goto out;
9754                 }
9755         }
9756         /*
9757          * the specific check for 0xffffffff is a hack to preserve
9758          * binaray compatibilty in K64 with applications that discovered
9759          * that passing in a buf pointer and a size of -1 resulted in
9760          * just the size of the indicated extended attribute being returned.
9761          * this isn't part of the documented behavior, but because of the
9762          * original implemtation's check for "uap->size > 0", this behavior
9763          * was allowed. In K32 that check turned into a signed comparison
9764          * even though uap->size is unsigned...  in K64, we blow by that
9765          * check because uap->size is unsigned and doesn't get sign smeared
9766          * in the munger for a 32 bit user app.  we also need to add a
9767          * check to limit the maximum size of the buffer being passed in...
9768          * unfortunately, the underlying fileystems seem to just malloc
9769          * the requested size even if the actual extended attribute is tiny.
9770          * because that malloc is for kernel wired memory, we have to put a
9771          * sane limit on it.
9772          *
9773          * U32 running on K64 will yield 0x00000000ffffffff for uap->size
9774          * U64 running on K64 will yield -1 (64 bits wide)
9775          * U32/U64 running on K32 will yield -1 (32 bits wide)
9776          */
9777         if (uap->size == 0xffffffff || uap->size == (size_t)-1)
9778                 goto no_uio;
9779
9780         if (uap->value) {
9781                 if (uap->size > (size_t)XATTR_MAXSIZE)
9782                         uap->size = XATTR_MAXSIZE;
9783
9784                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9785                                             &uio_buf[0], sizeof(uio_buf));
9786                 uio_addiov(auio, uap->value, uap->size);
9787         }
9788 no_uio:
9789         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
9790 out:
9791         vnode_put(vp);
9792
9793         if (auio) {
9794                 *retval = uap->size - uio_resid(auio);
9795         } else {
9796                 *retval = (user_ssize_t)attrsize;
9797         }
9798
9799         return (error);
9800 }
9801
9802 /*
9803  * Retrieve the data of an extended attribute.
9804  */
9805 int
9806 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
9807 {
9808         vnode_t vp;
9809         char attrname[XATTR_MAXNAMELEN+1];
9810         uio_t auio = NULL;
9811         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9812         size_t attrsize = 0;
9813         size_t namelen;
9814         int error;
9815         char uio_buf[ UIO_SIZEOF(1) ];
9816
9817         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9818                 return (EINVAL);
9819
9820         if ( (error = file_vnode(uap->fd, &vp)) ) {
9821                 return (error);
9822         }
9823         if ( (error = vnode_getwithref(vp)) ) {
9824                 file_drop(uap->fd);
9825                 return(error);
9826         }
9827         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9828                 goto out;
9829         }
9830         if (xattr_protected(attrname)) {
9831                 error = EPERM;
9832                 goto out;
9833         }
9834         if (uap->value && uap->size > 0) {
9835                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9836                                             &uio_buf[0], sizeof(uio_buf));
9837                 uio_addiov(auio, uap->value, uap->size);
9838         }
9839
9840         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
9841 out:
9842         (void)vnode_put(vp);
9843         file_drop(uap->fd);
9844
9845         if (auio) {
9846                 *retval = uap->size - uio_resid(auio);
9847         } else {
9848                 *retval = (user_ssize_t)attrsize;
9849         }
9850         return (error);
9851 }
9852
9853 /*
9854  * Set the data of an extended attribute.
9855  */
9856 int
9857 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
9858 {
9859         vnode_t vp;
9860         struct nameidata nd;
9861         char attrname[XATTR_MAXNAMELEN+1];
9862         vfs_context_t ctx = vfs_context_current();
9863         uio_t auio = NULL;
9864         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9865         size_t namelen;
9866         u_int32_t nameiflags;
9867         int error;
9868         char uio_buf[ UIO_SIZEOF(1) ];
9869
9870         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9871                 return (EINVAL);
9872
9873         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9874                 if (error == EPERM) {
9875                         /* if the string won't fit in attrname, copyinstr emits EPERM */
9876                         return (ENAMETOOLONG);
9877                 }
9878                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9879                 return error;
9880         }
9881         if (xattr_protected(attrname))
9882                 return(EPERM);
9883         if (uap->size != 0 && uap->value == 0) {
9884                 return (EINVAL);
9885         }
9886
9887         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9888         NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
9889         if ((error = namei(&nd))) {
9890                 return (error);
9891         }
9892         vp = nd.ni_vp;
9893         nameidone(&nd);
9894
9895         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9896                                     &uio_buf[0], sizeof(uio_buf));
9897         uio_addiov(auio, uap->value, uap->size);
9898
9899         error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
9900 #if CONFIG_FSE
9901         if (error == 0) {
9902                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9903                     FSE_ARG_VNODE, vp,
9904                     FSE_ARG_DONE);
9905         }
9906 #endif
9907         vnode_put(vp);
9908         *retval = 0;
9909         return (error);
9910 }
9911
9912 /*
9913  * Set the data of an extended attribute.
9914  */
9915 int
9916 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
9917 {
9918         vnode_t vp;
9919         char attrname[XATTR_MAXNAMELEN+1];
9920         uio_t auio = NULL;
9921         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9922         size_t namelen;
9923         int error;
9924         char uio_buf[ UIO_SIZEOF(1) ];
9925 #if CONFIG_FSE
9926         vfs_context_t ctx = vfs_context_current();
9927 #endif
9928
9929         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9930                 return (EINVAL);
9931
9932         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9933                 if (error == EPERM) {
9934                         /* if the string won't fit in attrname, copyinstr emits EPERM */
9935                         return (ENAMETOOLONG);
9936                 }
9937                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9938                 return error;
9939         }
9940         if (xattr_protected(attrname))
9941                 return(EPERM);
9942         if (uap->size != 0 && uap->value == 0) {
9943                 return (EINVAL);
9944         }
9945         if ( (error = file_vnode(uap->fd, &vp)) ) {
9946                 return (error);
9947         }
9948         if ( (error = vnode_getwithref(vp)) ) {
9949                 file_drop(uap->fd);
9950                 return(error);
9951         }
9952         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9953                                     &uio_buf[0], sizeof(uio_buf));
9954         uio_addiov(auio, uap->value, uap->size);
9955
9956         error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
9957 #if CONFIG_FSE
9958         if (error == 0) {
9959                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9960                     FSE_ARG_VNODE, vp,
9961                     FSE_ARG_DONE);
9962         }
9963 #endif
9964         vnode_put(vp);
9965         file_drop(uap->fd);
9966         *retval = 0;
9967         return (error);
9968 }
9969
9970 /*
9971  * Remove an extended attribute.
9972  * XXX Code duplication here.
9973  */
9974 int
9975 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
9976 {
9977         vnode_t vp;
9978         struct nameidata nd;
9979         char attrname[XATTR_MAXNAMELEN+1];
9980         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9981         vfs_context_t ctx = vfs_context_current();
9982         size_t namelen;
9983         u_int32_t nameiflags;
9984         int error;
9985
9986         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9987                 return (EINVAL);
9988
9989         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
9990         if (error != 0) {
9991                 return (error);
9992         }
9993         if (xattr_protected(attrname))
9994                 return(EPERM);
9995         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9996         NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
9997         if ((error = namei(&nd))) {
9998                 return (error);
9999         }
10000         vp = nd.ni_vp;
10001         nameidone(&nd);
10002
10003         error = vn_removexattr(vp, attrname, uap->options, ctx);
10004 #if CONFIG_FSE
10005         if (error == 0) {
10006                 add_fsevent(FSE_XATTR_REMOVED, ctx,
10007                     FSE_ARG_VNODE, vp,
10008                     FSE_ARG_DONE);
10009         }
10010 #endif
10011         vnode_put(vp);
10012         *retval = 0;
10013         return (error);
10014 }
10015
10016 /*
10017  * Remove an extended attribute.
10018  * XXX Code duplication here.
10019  */
10020 int
10021 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10022 {
10023         vnode_t vp;
10024         char attrname[XATTR_MAXNAMELEN+1];
10025         size_t namelen;
10026         int error;
10027 #if CONFIG_FSE
10028         vfs_context_t ctx = vfs_context_current();
10029 #endif
10030
10031         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10032                 return (EINVAL);
10033
10034         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10035         if (error != 0) {
10036                 return (error);
10037         }
10038         if (xattr_protected(attrname))
10039                 return(EPERM);
10040         if ( (error = file_vnode(uap->fd, &vp)) ) {
10041                 return (error);
10042         }
10043         if ( (error = vnode_getwithref(vp)) ) {
10044                 file_drop(uap->fd);
10045                 return(error);
10046         }
10047
10048         error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10049 #if CONFIG_FSE
10050         if (error == 0) {
10051                 add_fsevent(FSE_XATTR_REMOVED, ctx,
10052                     FSE_ARG_VNODE, vp,
10053                     FSE_ARG_DONE);
10054         }
10055 #endif
10056         vnode_put(vp);
10057         file_drop(uap->fd);
10058         *retval = 0;
10059         return (error);
10060 }
10061
10062 /*
10063  * Retrieve the list of extended attribute names.
10064  * XXX Code duplication here.
10065  */
10066 int
10067 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10068 {
10069         vnode_t vp;
10070         struct nameidata nd;
10071         vfs_context_t ctx = vfs_context_current();
10072         uio_t auio = NULL;
10073         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10074         size_t attrsize = 0;
10075         u_int32_t nameiflags;
10076         int error;
10077         char uio_buf[ UIO_SIZEOF(1) ];
10078
10079         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10080                 return (EINVAL);
10081
10082         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10083         NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10084         if ((error = namei(&nd))) {
10085                 return (error);
10086         }
10087         vp = nd.ni_vp;
10088         nameidone(&nd);
10089         if (uap->namebuf != 0 && uap->bufsize > 0) {
10090                 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10091                                             &uio_buf[0], sizeof(uio_buf));
10092                 uio_addiov(auio, uap->namebuf, uap->bufsize);
10093         }
10094
10095         error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10096
10097         vnode_put(vp);
10098         if (auio) {
10099                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10100         } else {
10101                 *retval = (user_ssize_t)attrsize;
10102         }
10103         return (error);
10104 }
10105
10106 /*
10107  * Retrieve the list of extended attribute names.
10108  * XXX Code duplication here.
10109  */
10110 int
10111 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10112 {
10113         vnode_t vp;
10114         uio_t auio = NULL;
10115         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10116         size_t attrsize = 0;
10117         int error;
10118         char uio_buf[ UIO_SIZEOF(1) ];
10119
10120         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10121                 return (EINVAL);
10122
10123         if ( (error = file_vnode(uap->fd, &vp)) ) {
10124                 return (error);
10125         }
10126         if ( (error = vnode_getwithref(vp)) ) {
10127                 file_drop(uap->fd);
10128                 return(error);
10129         }
10130         if (uap->namebuf != 0 && uap->bufsize > 0) {
10131                 auio = uio_createwithbuffer(1, 0, spacetype,
10132                                                                           UIO_READ, &uio_buf[0], sizeof(uio_buf));
10133                 uio_addiov(auio, uap->namebuf, uap->bufsize);
10134         }
10135
10136         error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10137
10138         vnode_put(vp);
10139         file_drop(uap->fd);
10140         if (auio) {
10141                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10142         } else {
10143                 *retval = (user_ssize_t)attrsize;
10144         }
10145         return (error);
10146 }
10147
10148 static int fsgetpath_internal(
10149         vfs_context_t ctx, int volfs_id, uint64_t objid,
10150         vm_size_t bufsize, caddr_t buf, int *pathlen)
10151 {
10152         int error;
10153         struct mount *mp = NULL;
10154         vnode_t vp;
10155         int length;
10156         int bpflags;
10157
10158         if (bufsize > PAGE_SIZE) {
10159                 return (EINVAL);
10160         }
10161
10162         if (buf == NULL) {
10163                 return (ENOMEM);
10164         }
10165
10166         if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
10167                 error = ENOTSUP;  /* unexpected failure */
10168                 return ENOTSUP;
10169         }
10170
10171 unionget:
10172         if (objid == 2) {
10173                 error = VFS_ROOT(mp, &vp, ctx);
10174         } else {
10175                 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
10176         }
10177
10178         if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
10179                 /*
10180                  * If the fileid isn't found and we're in a union
10181                  * mount volume, then see if the fileid is in the
10182                  * mounted-on volume.
10183                  */
10184                 struct mount *tmp = mp;
10185                 mp = vnode_mount(tmp->mnt_vnodecovered);
10186                 vfs_unbusy(tmp);
10187                 if (vfs_busy(mp, LK_NOWAIT) == 0)
10188                         goto unionget;
10189         } else {
10190                 vfs_unbusy(mp);
10191         }
10192
10193         if (error) {
10194                 return error;
10195         }
10196
10197 #if CONFIG_MACF
10198         error = mac_vnode_check_fsgetpath(ctx, vp);
10199         if (error) {
10200                 vnode_put(vp);
10201                 return error;
10202         }
10203 #endif
10204
10205         /* Obtain the absolute path to this vnode. */
10206         bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
10207         bpflags |= BUILDPATH_CHECK_MOVED;
10208         error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
10209         vnode_put(vp);
10210
10211         if (error) {
10212                 goto out;
10213         }
10214
10215         AUDIT_ARG(text, buf);
10216
10217         if (kdebug_enable) {
10218                 long dbg_parms[NUMPARMS];
10219                 int  dbg_namelen;
10220
10221                 dbg_namelen = (int)sizeof(dbg_parms);
10222
10223         if (length < dbg_namelen) {
10224                         memcpy((char *)dbg_parms, buf, length);
10225                         memset((char *)dbg_parms + length, 0, dbg_namelen - length);
10226
10227                         dbg_namelen = length;
10228                 } else {
10229                         memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
10230                 }
10231
10232                 kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
10233         }
10234
10235         *pathlen = (user_ssize_t)length; /* may be superseded by error */
10236
10237 out:
10238         return (error);
10239 }
10240
10241 /*
10242  * Obtain the full pathname of a file system object by id.
10243  *
10244  * This is a private SPI used by the File Manager.
10245  */
10246 __private_extern__
10247 int
10248 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10249 {
10250         vfs_context_t ctx = vfs_context_current();
10251         fsid_t fsid;
10252         char *realpath;
10253         int length;
10254         int error;
10255
10256         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10257                 return (error);
10258         }
10259         AUDIT_ARG(value32, fsid.val[0]);
10260         AUDIT_ARG(value64, uap->objid);
10261         /* Restrict output buffer size for now. */
10262
10263         if (uap->bufsize > PAGE_SIZE) {
10264                 return (EINVAL);
10265         }
10266         MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10267         if (realpath == NULL) {
10268                 return (ENOMEM);
10269         }
10270
10271         error = fsgetpath_internal(
10272                 ctx, fsid.val[0], uap->objid,
10273                 uap->bufsize, realpath, &length);
10274
10275         if (error) {
10276                 goto out;
10277         }
10278
10279         error = copyout((caddr_t)realpath, uap->buf, length);
10280
10281         *retval = (user_ssize_t)length; /* may be superseded by error */
10282 out:
10283         if (realpath) {
10284                 FREE(realpath, M_TEMP);
10285         }
10286         return (error);
10287 }
10288
10289 /*
10290  * Common routine to handle various flavors of statfs data heading out
10291  *      to user space.
10292  *
10293  * Returns:     0                       Success
10294  *              EFAULT
10295  */
10296 static int
10297 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
10298     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
10299     boolean_t partial_copy)
10300 {
10301         int             error;
10302         int             my_size, copy_size;
10303
10304         if (is_64_bit) {
10305                 struct user64_statfs sfs;
10306                 my_size = copy_size = sizeof(sfs);
10307                 bzero(&sfs, my_size);
10308                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10309                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10310                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10311                 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
10312                 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
10313                 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
10314                 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
10315                 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
10316                 sfs.f_files = (user64_long_t)sfsp->f_files;
10317                 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
10318                 sfs.f_fsid = sfsp->f_fsid;
10319                 sfs.f_owner = sfsp->f_owner;
10320                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10321                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10322                 } else {
10323                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10324                 }
10325                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10326                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10327
10328                 if (partial_copy) {
10329                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10330                 }
10331                 error = copyout((caddr_t)&sfs, bufp, copy_size);
10332         }
10333         else {
10334                 struct user32_statfs sfs;
10335
10336                 my_size = copy_size = sizeof(sfs);
10337                 bzero(&sfs, my_size);
10338
10339                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10340                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10341                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10342
10343                 /*
10344                  * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
10345                  * have to fudge the numbers here in that case.   We inflate the blocksize in order
10346                  * to reflect the filesystem size as best we can.
10347                  */
10348                 if ((sfsp->f_blocks > INT_MAX)
10349                         /* Hack for 4061702 . I think the real fix is for Carbon to
10350                          * look for some volume capability and not depend on hidden
10351                          * semantics agreed between a FS and carbon.
10352                          * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
10353                          * for Carbon to set bNoVolumeSizes volume attribute.
10354                          * Without this the webdavfs files cannot be copied onto
10355                          * disk as they look huge. This change should not affect
10356                          * XSAN as they should not setting these to -1..
10357                          */
10358                          && (sfsp->f_blocks != 0xffffffffffffffffULL)
10359                          && (sfsp->f_bfree != 0xffffffffffffffffULL)
10360                          && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
10361                         int             shift;
10362
10363                         /*
10364                          * Work out how far we have to shift the block count down to make it fit.
10365                          * Note that it's possible to have to shift so far that the resulting
10366                          * blocksize would be unreportably large.  At that point, we will clip
10367                          * any values that don't fit.
10368                          *
10369                          * For safety's sake, we also ensure that f_iosize is never reported as
10370                          * being smaller than f_bsize.
10371                          */
10372                         for (shift = 0; shift < 32; shift++) {
10373                                 if ((sfsp->f_blocks >> shift) <= INT_MAX)
10374                                         break;
10375                                 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
10376                                         break;
10377                         }
10378 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
10379                         sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
10380                         sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
10381                         sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
10382 #undef __SHIFT_OR_CLIP
10383                         sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
10384                         sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
10385                 } else {
10386                         /* filesystem is small enough to be reported honestly */
10387                         sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
10388                         sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
10389                         sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
10390                         sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
10391                         sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
10392                 }
10393                 sfs.f_files = (user32_long_t)sfsp->f_files;
10394                 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
10395                 sfs.f_fsid = sfsp->f_fsid;
10396                 sfs.f_owner = sfsp->f_owner;
10397                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10398                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10399                 } else {
10400                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10401                 }
10402                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10403                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10404
10405                 if (partial_copy) {
10406                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10407                 }
10408                 error = copyout((caddr_t)&sfs, bufp, copy_size);
10409         }
10410
10411         if (sizep != NULL) {
10412                 *sizep = my_size;
10413         }
10414         return(error);
10415 }
10416
10417 /*
10418  * copy stat structure into user_stat structure.
10419  */
10420 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
10421 {
10422         bzero(usbp, sizeof(*usbp));
10423
10424         usbp->st_dev = sbp->st_dev;
10425         usbp->st_ino = sbp->st_ino;
10426         usbp->st_mode = sbp->st_mode;
10427         usbp->st_nlink = sbp->st_nlink;
10428         usbp->st_uid = sbp->st_uid;
10429         usbp->st_gid = sbp->st_gid;
10430         usbp->st_rdev = sbp->st_rdev;
10431 #ifndef _POSIX_C_SOURCE
10432         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10433         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10434         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10435         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10436         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10437         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10438 #else
10439         usbp->st_atime = sbp->st_atime;
10440         usbp->st_atimensec = sbp->st_atimensec;
10441         usbp->st_mtime = sbp->st_mtime;
10442         usbp->st_mtimensec = sbp->st_mtimensec;
10443         usbp->st_ctime = sbp->st_ctime;
10444         usbp->st_ctimensec = sbp->st_ctimensec;
10445 #endif
10446         usbp->st_size = sbp->st_size;
10447         usbp->st_blocks = sbp->st_blocks;
10448         usbp->st_blksize = sbp->st_blksize;
10449         usbp->st_flags = sbp->st_flags;
10450         usbp->st_gen = sbp->st_gen;
10451         usbp->st_lspare = sbp->st_lspare;
10452         usbp->st_qspare[0] = sbp->st_qspare[0];
10453         usbp->st_qspare[1] = sbp->st_qspare[1];
10454 }
10455
10456 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
10457 {
10458         bzero(usbp, sizeof(*usbp));
10459
10460         usbp->st_dev = sbp->st_dev;
10461         usbp->st_ino = sbp->st_ino;
10462         usbp->st_mode = sbp->st_mode;
10463         usbp->st_nlink = sbp->st_nlink;
10464         usbp->st_uid = sbp->st_uid;
10465         usbp->st_gid = sbp->st_gid;
10466         usbp->st_rdev = sbp->st_rdev;
10467 #ifndef _POSIX_C_SOURCE
10468         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10469         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10470         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10471         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10472         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10473         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10474 #else
10475         usbp->st_atime = sbp->st_atime;
10476         usbp->st_atimensec = sbp->st_atimensec;
10477         usbp->st_mtime = sbp->st_mtime;
10478         usbp->st_mtimensec = sbp->st_mtimensec;
10479         usbp->st_ctime = sbp->st_ctime;
10480         usbp->st_ctimensec = sbp->st_ctimensec;
10481 #endif
10482         usbp->st_size = sbp->st_size;
10483         usbp->st_blocks = sbp->st_blocks;
10484         usbp->st_blksize = sbp->st_blksize;
10485         usbp->st_flags = sbp->st_flags;
10486         usbp->st_gen = sbp->st_gen;
10487         usbp->st_lspare = sbp->st_lspare;
10488         usbp->st_qspare[0] = sbp->st_qspare[0];
10489         usbp->st_qspare[1] = sbp->st_qspare[1];
10490 }
10491
10492 /*
10493  * copy stat64 structure into user_stat64 structure.
10494  */
10495 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
10496 {
10497         bzero(usbp, sizeof(*usbp));
10498
10499         usbp->st_dev = sbp->st_dev;
10500         usbp->st_ino = sbp->st_ino;
10501         usbp->st_mode = sbp->st_mode;
10502         usbp->st_nlink = sbp->st_nlink;
10503         usbp->st_uid = sbp->st_uid;
10504         usbp->st_gid = sbp->st_gid;
10505         usbp->st_rdev = sbp->st_rdev;
10506 #ifndef _POSIX_C_SOURCE
10507         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10508         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10509         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10510         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10511         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10512         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10513         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10514         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10515 #else
10516         usbp->st_atime = sbp->st_atime;
10517         usbp->st_atimensec = sbp->st_atimensec;
10518         usbp->st_mtime = sbp->st_mtime;
10519         usbp->st_mtimensec = sbp->st_mtimensec;
10520         usbp->st_ctime = sbp->st_ctime;
10521         usbp->st_ctimensec = sbp->st_ctimensec;
10522         usbp->st_birthtime = sbp->st_birthtime;
10523         usbp->st_birthtimensec = sbp->st_birthtimensec;
10524 #endif
10525         usbp->st_size = sbp->st_size;
10526         usbp->st_blocks = sbp->st_blocks;
10527         usbp->st_blksize = sbp->st_blksize;
10528         usbp->st_flags = sbp->st_flags;
10529         usbp->st_gen = sbp->st_gen;
10530         usbp->st_lspare = sbp->st_lspare;
10531         usbp->st_qspare[0] = sbp->st_qspare[0];
10532         usbp->st_qspare[1] = sbp->st_qspare[1];
10533 }
10534
10535 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
10536 {
10537         bzero(usbp, sizeof(*usbp));
10538
10539         usbp->st_dev = sbp->st_dev;
10540         usbp->st_ino = sbp->st_ino;
10541         usbp->st_mode = sbp->st_mode;
10542         usbp->st_nlink = sbp->st_nlink;
10543         usbp->st_uid = sbp->st_uid;
10544         usbp->st_gid = sbp->st_gid;
10545         usbp->st_rdev = sbp->st_rdev;
10546 #ifndef _POSIX_C_SOURCE
10547         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10548         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10549         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10550         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10551         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10552         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10553         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10554         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10555 #else
10556         usbp->st_atime = sbp->st_atime;
10557         usbp->st_atimensec = sbp->st_atimensec;
10558         usbp->st_mtime = sbp->st_mtime;
10559         usbp->st_mtimensec = sbp->st_mtimensec;
10560         usbp->st_ctime = sbp->st_ctime;
10561         usbp->st_ctimensec = sbp->st_ctimensec;
10562         usbp->st_birthtime = sbp->st_birthtime;
10563         usbp->st_birthtimensec = sbp->st_birthtimensec;
10564 #endif
10565         usbp->st_size = sbp->st_size;
10566         usbp->st_blocks = sbp->st_blocks;
10567         usbp->st_blksize = sbp->st_blksize;
10568         usbp->st_flags = sbp->st_flags;
10569         usbp->st_gen = sbp->st_gen;
10570         usbp->st_lspare = sbp->st_lspare;
10571         usbp->st_qspare[0] = sbp->st_qspare[0];
10572         usbp->st_qspare[1] = sbp->st_qspare[1];
10573 }
10574
10575 /*
10576  * Purge buffer cache for simulating cold starts
10577  */
10578 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
10579 {
10580         ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
10581
10582         return VNODE_RETURNED;
10583 }
10584
10585 static int vfs_purge_callback(mount_t mp, __unused void * arg)
10586 {
10587         vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
10588
10589         return VFS_RETURNED;
10590 }
10591
10592 int
10593 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
10594 {
10595         if (!kauth_cred_issuser(kauth_cred_get()))
10596                 return EPERM;
10597
10598         vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
10599
10600         return 0;
10601 }
10602