]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_syscalls.c
a949a717d8753b67a5fdf51f7c589e42883feeec
[apple/xnu.git] / bsd / vfs / vfs_syscalls.c
1 /*
2 * Copyright (c) 1995-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/xattr.h>
98 #include <sys/fcntl.h>
99 #include <sys/fsctl.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/disk.h>
102 #include <sys/content_protection.h>
103 #include <machine/cons.h>
104 #include <machine/limits.h>
105 #include <miscfs/specfs/specdev.h>
106
107 #include <security/audit/audit.h>
108 #include <bsm/audit_kevents.h>
109
110 #include <mach/mach_types.h>
111 #include <kern/kern_types.h>
112 #include <kern/kalloc.h>
113 #include <kern/task.h>
114
115 #include <vm/vm_pageout.h>
116
117 #include <libkern/OSAtomic.h>
118 #include <pexpert/pexpert.h>
119 #include <IOKit/IOBSD.h>
120
121 #if CONFIG_MACF
122 #include <security/mac.h>
123 #include <security/mac_framework.h>
124 #endif
125
126 #if CONFIG_FSE
127 #define GET_PATH(x) \
128 (x) = get_pathbuff();
129 #define RELEASE_PATH(x) \
130 release_pathbuff(x);
131 #else
132 #define GET_PATH(x) \
133 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
134 #define RELEASE_PATH(x) \
135 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
136 #endif /* CONFIG_FSE */
137
138 /* struct for checkdirs iteration */
139 struct cdirargs {
140 vnode_t olddp;
141 vnode_t newdp;
142 };
143 /* callback for checkdirs iteration */
144 static int checkdirs_callback(proc_t p, void * arg);
145
146 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
147 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
148 void enablequotas(struct mount *mp, vfs_context_t ctx);
149 static int getfsstat_callback(mount_t mp, void * arg);
150 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
151 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
152 static int sync_callback(mount_t, void *);
153 static void sync_thread(void *, __unused wait_result_t);
154 static int sync_async(int);
155 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
156 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
157 boolean_t partial_copy);
158 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
159 user_addr_t bufp);
160 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
161 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
162 struct componentname *cnp, user_addr_t fsmountargs,
163 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
164 vfs_context_t ctx);
165 void vfs_notify_mount(vnode_t pdvp);
166
167 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
168
169 struct fd_vn_data * fg_vn_data_alloc(void);
170
171 /*
172 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
173 * Concurrent lookups (or lookups by ids) on hard links can cause the
174 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
175 * does) to return ENOENT as the path cannot be returned from the name cache
176 * alone. We have no option but to retry and hope to get one namei->reverse path
177 * generation done without an intervening lookup, lookup by id on the hard link
178 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
179 * which currently are the MAC hooks for rename, unlink and rmdir.
180 */
181 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
182
183 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
184
185 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
186
187 #ifdef CONFIG_IMGSRC_ACCESS
188 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
189 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
190 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
191 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
192 static void mount_end_update(mount_t mp);
193 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
194 #endif /* CONFIG_IMGSRC_ACCESS */
195
196 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
197
198 __private_extern__
199 int sync_internal(void);
200
201 __private_extern__
202 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
203
204 extern lck_grp_t *fd_vn_lck_grp;
205 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
206 extern lck_attr_t *fd_vn_lck_attr;
207
208 /*
209 * incremented each time a mount or unmount operation occurs
210 * used to invalidate the cached value of the rootvp in the
211 * mount structure utilized by cache_lookup_path
212 */
213 uint32_t mount_generation = 0;
214
215 /* counts number of mount and unmount operations */
216 unsigned int vfs_nummntops=0;
217
218 extern const struct fileops vnops;
219 #if CONFIG_APPLEDOUBLE
220 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
221 #endif /* CONFIG_APPLEDOUBLE */
222
223 typedef uint32_t vfs_rename_flags_t;
224 #if CONFIG_SECLUDED_RENAME
225 enum {
226 VFS_SECLUDE_RENAME = 0x00000001
227 };
228 #endif
229
230 /*
231 * Virtual File System System Calls
232 */
233
234 #if NFSCLIENT || DEVFS
235 /*
236 * Private in-kernel mounting spi (NFS only, not exported)
237 */
238 __private_extern__
239 boolean_t
240 vfs_iskernelmount(mount_t mp)
241 {
242 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
243 }
244
/*
 * kernel_mount:
 * Mount a filesystem on behalf of in-kernel callers (NFS/devfs only;
 * not exported to user space).
 *
 * Parameters:
 *	fstype		filesystem type name (its vfs name)
 *	pvp		parent of the covered vnode, or looked up when vp is NULLVP
 *	vp		vnode to be covered; NULLVP means "resolve from 'path'"
 *	path		kernel-space path to the mount point
 *	data		filesystem-specific mount arguments
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	internal KERNEL_MOUNT_* flags
 *			(NOTE(review): declared __unused but passed through to
 *			mount_common() below — annotation looks stale; confirm)
 *	ctx		caller's vfs context
 *
 * Returns:	0		Success
 *		!0		errno from namei() or mount_common()
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;	/* TRUE iff we took the iocounts via namei() */
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error)
			return (error);
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode and its parent; fill in
		 * only the componentname path fields that mount_common()
		 * consumes (cn_pnbuf/cn_pnlen).
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	/* Only drop references we ourselves acquired through namei(). */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return (error);
}
286 #endif /* NFSCLIENT || DEVFS */
287
288 /*
289 * Mount a file system.
290 */
291 /* ARGSUSED */
292 int
293 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
294 {
295 struct __mac_mount_args muap;
296
297 muap.type = uap->type;
298 muap.path = uap->path;
299 muap.flags = uap->flags;
300 muap.data = uap->data;
301 muap.mac_p = USER_ADDR_NULL;
302 return (__mac_mount(p, &muap, retval));
303 }
304
/*
 * Notify interested parties that a mount has occurred beneath 'pdvp':
 * broadcast a VQ_MOUNT vfs event (no specific mount argument) and post
 * NOTE_WRITE on the parent directory vnode so kqueue/knote watchers of
 * that directory observe the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
311
312 /*
313 * __mac_mount:
314 * Mount a file system taking into account MAC label behavior.
315 * See mount(2) man page for more information
316 *
317 * Parameters: p Process requesting the mount
318 * uap User argument descriptor (see below)
319 * retval (ignored)
320 *
321 * Indirect: uap->type Filesystem type
322 * uap->path Path to mount
323 * uap->data Mount arguments
324 * uap->mac_p MAC info
325 * uap->flags Mount flags
326 *
327 *
328 * Returns: 0 Success
329 * !0 Not success
330 */
331 boolean_t root_fs_upgrade_try = FALSE;
332
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;		/* parent of the covered vnode */
	vnode_t vp = NULL;		/* vnode to be covered by the mount */
	int need_nameidone = 0;		/* nonzero once namei() succeeded */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy=0;
	char *labelstr = NULL;		/* MAC label string copied from user space */
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error)
		return (error);

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * NOTE(review): the final 'by_index' argument repeats the
		 * condition just tested and is therefore always TRUE on this
		 * path — looks intentional (single entry point) but confirm.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.
	 * A struct user_mac is copied in with the layout appropriate to the
	 * caller's word size, then normalized into 'mac'.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		/* copyin failure checked only here, before 'mac' is used */
		if (error)
			goto out;
		/* Label must be at least one character plus NUL, and bounded */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	/* Special handling when the covered vnode is the root of the root FS */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			/* Re-mounting '/' itself is treated as an update */
			flags |= MNT_UPDATE;
		}
		else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#ifdef SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0 ) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, FALSE, ctx);

out:

#if CONFIG_MACF
	if (labelstr)
		FREE(labelstr, M_MACTEMP);
#endif /* CONFIG_MACF */

	/* Drop the iocounts and nameidata state taken by namei(), if any */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return (error);
}
476
477 /*
478 * common mount implementation (final stage of mounting)
 *
480 * Arguments:
481 * fstypename file system type (ie it's vfs name)
482 * pvp parent of covered vnode
483 * vp covered vnode
484 * cnp component name (ie path) of covered vnode
485 * flags generic mount flags
486 * fsmountargs file system specific data
487 * labelstr optional MAC label
488 * kernelmount TRUE for mounts initiated from inside the kernel
489 * ctx caller's context
490 */
491 static int
492 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
493 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
494 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
495 {
496 #if !CONFIG_MACF
497 #pragma unused(labelstr)
498 #endif
499 struct vnode *devvp = NULLVP;
500 struct vnode *device_vnode = NULLVP;
501 #if CONFIG_MACF
502 struct vnode *rvp;
503 #endif
504 struct mount *mp;
505 struct vfstable *vfsp = (struct vfstable *)0;
506 struct proc *p = vfs_context_proc(ctx);
507 int error, flag = 0;
508 user_addr_t devpath = USER_ADDR_NULL;
509 int ronly = 0;
510 int mntalloc = 0;
511 boolean_t vfsp_ref = FALSE;
512 boolean_t is_rwlock_locked = FALSE;
513 boolean_t did_rele = FALSE;
514 boolean_t have_usecount = FALSE;
515
516 /*
517 * Process an update for an existing mount
518 */
519 if (flags & MNT_UPDATE) {
520 if ((vp->v_flag & VROOT) == 0) {
521 error = EINVAL;
522 goto out1;
523 }
524 mp = vp->v_mount;
525
526 /* unmount in progress return error */
527 mount_lock_spin(mp);
528 if (mp->mnt_lflag & MNT_LUNMOUNT) {
529 mount_unlock(mp);
530 error = EBUSY;
531 goto out1;
532 }
533 mount_unlock(mp);
534 lck_rw_lock_exclusive(&mp->mnt_rwlock);
535 is_rwlock_locked = TRUE;
536 /*
537 * We only allow the filesystem to be reloaded if it
538 * is currently mounted read-only.
539 */
540 if ((flags & MNT_RELOAD) &&
541 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
542 error = ENOTSUP;
543 goto out1;
544 }
545
546 /*
547 * If content protection is enabled, update mounts are not
548 * allowed to turn it off.
549 */
550 if ((mp->mnt_flag & MNT_CPROTECT) &&
551 ((flags & MNT_CPROTECT) == 0)) {
552 error = EINVAL;
553 goto out1;
554 }
555
556 #ifdef CONFIG_IMGSRC_ACCESS
557 /* Can't downgrade the backer of the root FS */
558 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
559 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
560 error = ENOTSUP;
561 goto out1;
562 }
563 #endif /* CONFIG_IMGSRC_ACCESS */
564
565 /*
566 * Only root, or the user that did the original mount is
567 * permitted to update it.
568 */
569 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
570 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
571 goto out1;
572 }
573 #if CONFIG_MACF
574 error = mac_mount_check_remount(ctx, mp);
575 if (error != 0) {
576 goto out1;
577 }
578 #endif
579 /*
580 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
581 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
582 */
583 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
584 flags |= MNT_NOSUID | MNT_NODEV;
585 if (mp->mnt_flag & MNT_NOEXEC)
586 flags |= MNT_NOEXEC;
587 }
588 flag = mp->mnt_flag;
589
590
591
592 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
593
594 vfsp = mp->mnt_vtable;
595 goto update;
596 }
597 /*
598 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
599 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
600 */
601 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
602 flags |= MNT_NOSUID | MNT_NODEV;
603 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
604 flags |= MNT_NOEXEC;
605 }
606
607 /* XXXAUDIT: Should we capture the type on the error path as well? */
608 AUDIT_ARG(text, fstypename);
609 mount_list_lock();
610 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
611 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
612 vfsp->vfc_refcount++;
613 vfsp_ref = TRUE;
614 break;
615 }
616 mount_list_unlock();
617 if (vfsp == NULL) {
618 error = ENODEV;
619 goto out1;
620 }
621
622 /*
623 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
624 */
625 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
626 error = EINVAL; /* unsupported request */
627 goto out1;
628 }
629
630 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
631 if (error != 0) {
632 goto out1;
633 }
634
635 /*
636 * Allocate and initialize the filesystem (mount_t)
637 */
638 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
639 M_MOUNT, M_WAITOK);
640 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
641 mntalloc = 1;
642
643 /* Initialize the default IO constraints */
644 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
645 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
646 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
647 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
648 mp->mnt_devblocksize = DEV_BSIZE;
649 mp->mnt_alignmentmask = PAGE_MASK;
650 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
651 mp->mnt_ioscale = 1;
652 mp->mnt_ioflags = 0;
653 mp->mnt_realrootvp = NULLVP;
654 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
655
656 TAILQ_INIT(&mp->mnt_vnodelist);
657 TAILQ_INIT(&mp->mnt_workerqueue);
658 TAILQ_INIT(&mp->mnt_newvnodes);
659 mount_lock_init(mp);
660 lck_rw_lock_exclusive(&mp->mnt_rwlock);
661 is_rwlock_locked = TRUE;
662 mp->mnt_op = vfsp->vfc_vfsops;
663 mp->mnt_vtable = vfsp;
664 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
665 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
666 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
667 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
668 mp->mnt_vnodecovered = vp;
669 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
670 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
671 mp->mnt_devbsdunit = 0;
672
673 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
674 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
675
676 #if NFSCLIENT || DEVFS
677 if (kernelmount)
678 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
679 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
680 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
681 #endif /* NFSCLIENT || DEVFS */
682
683 update:
684 /*
685 * Set the mount level flags.
686 */
687 if (flags & MNT_RDONLY)
688 mp->mnt_flag |= MNT_RDONLY;
689 else if (mp->mnt_flag & MNT_RDONLY) {
690 // disallow read/write upgrades of file systems that
691 // had the TYPENAME_OVERRIDE feature set.
692 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
693 error = EPERM;
694 goto out1;
695 }
696 mp->mnt_kern_flag |= MNTK_WANTRDWR;
697 }
698 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
699 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
700 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
701 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
702 MNT_QUARANTINE | MNT_CPROTECT);
703 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
704 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
705 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
706 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
707 MNT_QUARANTINE | MNT_CPROTECT);
708
709 #if CONFIG_MACF
710 if (flags & MNT_MULTILABEL) {
711 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
712 error = EINVAL;
713 goto out1;
714 }
715 mp->mnt_flag |= MNT_MULTILABEL;
716 }
717 #endif
718 /*
719 * Process device path for local file systems if requested
720 */
721 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
722 if (vfs_context_is64bit(ctx)) {
723 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
724 goto out1;
725 fsmountargs += sizeof(devpath);
726 } else {
727 user32_addr_t tmp;
728 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
729 goto out1;
730 /* munge into LP64 addr */
731 devpath = CAST_USER_ADDR_T(tmp);
732 fsmountargs += sizeof(tmp);
733 }
734
735 /* Lookup device and authorize access to it */
736 if ((devpath)) {
737 struct nameidata nd;
738
739 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
740 if ( (error = namei(&nd)) )
741 goto out1;
742
743 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
744 devvp = nd.ni_vp;
745
746 nameidone(&nd);
747
748 if (devvp->v_type != VBLK) {
749 error = ENOTBLK;
750 goto out2;
751 }
752 if (major(devvp->v_rdev) >= nblkdev) {
753 error = ENXIO;
754 goto out2;
755 }
756 /*
757 * If mount by non-root, then verify that user has necessary
758 * permissions on the device.
759 */
760 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
761 mode_t accessmode = KAUTH_VNODE_READ_DATA;
762
763 if ((mp->mnt_flag & MNT_RDONLY) == 0)
764 accessmode |= KAUTH_VNODE_WRITE_DATA;
765 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
766 goto out2;
767 }
768 }
769 /* On first mount, preflight and open device */
770 if (devpath && ((flags & MNT_UPDATE) == 0)) {
771 if ( (error = vnode_ref(devvp)) )
772 goto out2;
773 /*
774 * Disallow multiple mounts of the same device.
775 * Disallow mounting of a device that is currently in use
776 * (except for root, which might share swap device for miniroot).
777 * Flush out any old buffers remaining from a previous use.
778 */
779 if ( (error = vfs_mountedon(devvp)) )
780 goto out3;
781
782 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
783 error = EBUSY;
784 goto out3;
785 }
786 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
787 error = ENOTBLK;
788 goto out3;
789 }
790 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
791 goto out3;
792
793 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
794 #if CONFIG_MACF
795 error = mac_vnode_check_open(ctx,
796 devvp,
797 ronly ? FREAD : FREAD|FWRITE);
798 if (error)
799 goto out3;
800 #endif /* MAC */
801 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
802 goto out3;
803
804 mp->mnt_devvp = devvp;
805 device_vnode = devvp;
806
807 } else if ((mp->mnt_flag & MNT_RDONLY) &&
808 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
809 (device_vnode = mp->mnt_devvp)) {
810 dev_t dev;
811 int maj;
812 /*
813 * If upgrade to read-write by non-root, then verify
814 * that user has necessary permissions on the device.
815 */
816 vnode_getalways(device_vnode);
817
818 if (suser(vfs_context_ucred(ctx), NULL) &&
819 (error = vnode_authorize(device_vnode, NULL,
820 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
821 ctx)) != 0) {
822 vnode_put(device_vnode);
823 goto out2;
824 }
825
826 /* Tell the device that we're upgrading */
827 dev = (dev_t)device_vnode->v_rdev;
828 maj = major(dev);
829
830 if ((u_int)maj >= (u_int)nblkdev)
831 panic("Volume mounted on a device with invalid major number.");
832
833 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
834 vnode_put(device_vnode);
835 device_vnode = NULLVP;
836 if (error != 0) {
837 goto out2;
838 }
839 }
840 }
841 #if CONFIG_MACF
842 if ((flags & MNT_UPDATE) == 0) {
843 mac_mount_label_init(mp);
844 mac_mount_label_associate(ctx, mp);
845 }
846 if (labelstr) {
847 if ((flags & MNT_UPDATE) != 0) {
848 error = mac_mount_check_label_update(ctx, mp);
849 if (error != 0)
850 goto out3;
851 }
852 }
853 #endif
854 /*
855 * Mount the filesystem.
856 */
857 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
858
859 if (flags & MNT_UPDATE) {
860 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
861 mp->mnt_flag &= ~MNT_RDONLY;
862 mp->mnt_flag &=~
863 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
864 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
865 if (error)
866 mp->mnt_flag = flag; /* restore flag value */
867 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
868 lck_rw_done(&mp->mnt_rwlock);
869 is_rwlock_locked = FALSE;
870 if (!error)
871 enablequotas(mp, ctx);
872 goto exit;
873 }
874
875 /*
876 * Put the new filesystem on the mount list after root.
877 */
878 if (error == 0) {
879 struct vfs_attr vfsattr;
880 #if CONFIG_MACF
881 if (vfs_flags(mp) & MNT_MULTILABEL) {
882 error = VFS_ROOT(mp, &rvp, ctx);
883 if (error) {
884 printf("%s() VFS_ROOT returned %d\n", __func__, error);
885 goto out3;
886 }
887 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
888 /*
889 * drop reference provided by VFS_ROOT
890 */
891 vnode_put(rvp);
892
893 if (error)
894 goto out3;
895 }
896 #endif /* MAC */
897
898 vnode_lock_spin(vp);
899 CLR(vp->v_flag, VMOUNT);
900 vp->v_mountedhere = mp;
901 vnode_unlock(vp);
902
903 /*
904 * taking the name_cache_lock exclusively will
905 * insure that everyone is out of the fast path who
906 * might be trying to use a now stale copy of
907 * vp->v_mountedhere->mnt_realrootvp
908 * bumping mount_generation causes the cached values
909 * to be invalidated
910 */
911 name_cache_lock();
912 mount_generation++;
913 name_cache_unlock();
914
915 error = vnode_ref(vp);
916 if (error != 0) {
917 goto out4;
918 }
919
920 have_usecount = TRUE;
921
922 error = checkdirs(vp, ctx);
923 if (error != 0) {
924 /* Unmount the filesystem as cdir/rdirs cannot be updated */
925 goto out4;
926 }
927 /*
928 * there is no cleanup code here so I have made it void
929 * we need to revisit this
930 */
931 (void)VFS_START(mp, 0, ctx);
932
933 if (mount_list_add(mp) != 0) {
934 /*
935 * The system is shutting down trying to umount
936 * everything, so fail with a plausible errno.
937 */
938 error = EBUSY;
939 goto out4;
940 }
941 lck_rw_done(&mp->mnt_rwlock);
942 is_rwlock_locked = FALSE;
943
944 /* Check if this mounted file system supports EAs or named streams. */
945 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
946 VFSATTR_INIT(&vfsattr);
947 VFSATTR_WANTED(&vfsattr, f_capabilities);
948 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
949 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
950 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
951 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
952 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
953 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
954 }
955 #if NAMEDSTREAMS
956 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
957 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
958 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
959 }
960 #endif
961 /* Check if this file system supports path from id lookups. */
962 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
963 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
964 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
965 } else if (mp->mnt_flag & MNT_DOVOLFS) {
966 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
967 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
968 }
969 }
970 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
971 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
972 }
973 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
974 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
975 }
976 /* increment the operations count */
977 OSAddAtomic(1, &vfs_nummntops);
978 enablequotas(mp, ctx);
979
980 if (device_vnode) {
981 device_vnode->v_specflags |= SI_MOUNTEDON;
982
983 /*
984 * cache the IO attributes for the underlying physical media...
985 * an error return indicates the underlying driver doesn't
986 * support all the queries necessary... however, reasonable
987 * defaults will have been set, so no reason to bail or care
988 */
989 vfs_init_io_attributes(device_vnode, mp);
990 }
991
992 /* Now that mount is setup, notify the listeners */
993 vfs_notify_mount(pvp);
994 IOBSDMountChange(mp, kIOMountChangeMount);
995
996 } else {
997 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
998 if (mp->mnt_vnodelist.tqh_first != NULL) {
999 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1000 mp->mnt_vtable->vfc_name, error);
1001 }
1002
1003 vnode_lock_spin(vp);
1004 CLR(vp->v_flag, VMOUNT);
1005 vnode_unlock(vp);
1006 mount_list_lock();
1007 mp->mnt_vtable->vfc_refcount--;
1008 mount_list_unlock();
1009
1010 if (device_vnode ) {
1011 vnode_rele(device_vnode);
1012 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1013 }
1014 lck_rw_done(&mp->mnt_rwlock);
1015 is_rwlock_locked = FALSE;
1016
1017 /*
1018 * if we get here, we have a mount structure that needs to be freed,
1019 * but since the coveredvp hasn't yet been updated to point at it,
1020 * no need to worry about other threads holding a crossref on this mp
1021 * so it's ok to just free it
1022 */
1023 mount_lock_destroy(mp);
1024 #if CONFIG_MACF
1025 mac_mount_label_destroy(mp);
1026 #endif
1027 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1028 }
1029 exit:
1030 /*
1031 * drop I/O count on the device vp if there was one
1032 */
1033 if (devpath && devvp)
1034 vnode_put(devvp);
1035
1036 return(error);
1037
1038 /* Error condition exits */
1039 out4:
1040 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1041
1042 /*
1043 * If the mount has been placed on the covered vp,
1044 * it may have been discovered by now, so we have
1045 * to treat this just like an unmount
1046 */
1047 mount_lock_spin(mp);
1048 mp->mnt_lflag |= MNT_LDEAD;
1049 mount_unlock(mp);
1050
1051 if (device_vnode != NULLVP) {
1052 vnode_rele(device_vnode);
1053 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1054 ctx);
1055 did_rele = TRUE;
1056 }
1057
1058 vnode_lock_spin(vp);
1059
1060 mp->mnt_crossref++;
1061 vp->v_mountedhere = (mount_t) 0;
1062
1063 vnode_unlock(vp);
1064
1065 if (have_usecount) {
1066 vnode_rele(vp);
1067 }
1068 out3:
1069 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1070 vnode_rele(devvp);
1071 out2:
1072 if (devpath && devvp)
1073 vnode_put(devvp);
1074 out1:
1075 /* Release mnt_rwlock only when it was taken */
1076 if (is_rwlock_locked == TRUE) {
1077 lck_rw_done(&mp->mnt_rwlock);
1078 }
1079
1080 if (mntalloc) {
1081 if (mp->mnt_crossref)
1082 mount_dropcrossref(mp, vp, 0);
1083 else {
1084 mount_lock_destroy(mp);
1085 #if CONFIG_MACF
1086 mac_mount_label_destroy(mp);
1087 #endif
1088 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1089 }
1090 }
1091 if (vfsp_ref) {
1092 mount_list_lock();
1093 vfsp->vfc_refcount--;
1094 mount_list_unlock();
1095 }
1096
1097 return(error);
1098 }
1099
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT on the covered vnode.
 *
 * Called with an iocount held on 'vp'.  On success the vnode is marked
 * VMOUNT so concurrent mount attempts on the same directory lose the
 * race; the caller is responsible for clearing the flag on any later
 * failure.  Returns 0 or an errno (EPERM/ENOTDIR/EBUSY/...).
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		     (!vfs_context_issuser(ctx)))) {
			/* any getattr failure is folded into EPERM here */
			error = EPERM;
			goto out;
		}
	}

	/* push dirty data and invalidate cached buffers before covering vp */
	if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
		goto out;

	if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
		goto out;

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* busy only when another mount has fully landed on this vnode */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0)
		goto out;
#endif

	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1158
1159 #if CONFIG_IMGSRC_ACCESS
1160
1161 #if DEBUG
1162 #define IMGSRC_DEBUG(args...) printf(args)
1163 #else
1164 #define IMGSRC_DEBUG(args...) do { } while(0)
1165 #endif
1166
/*
 * Resolve the user-supplied device path, verify it names the same block
 * device that backs 'mp', and (for non-root callers) authorize
 * read[/write] access to it.
 *
 * On success, f_mntfromname is updated to the new path and *devvpp holds
 * the device vnode with an iocount that the caller must vnode_put().
 * On failure the lookup's iocount is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
	if ( (error = namei(&nd)) ) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;	/* iocount held; transferred to caller on success */

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* the supplied path must name the very device backing the mount */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0)
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	*devvpp = vp;

out1:
	vnode_put(realdevvp);
out:
	nameidone(&nd);
	if (error) {
		/* error paths drop the lookup iocount; success keeps it for the caller */
		vnode_put(vp);
	}

	return error;
}
1236
/*
 * Clear VMOUNT, set v_mountedhere and mnt_vnodecovered, take a usecount
 * on the covered vnode, and call checkdirs() so any process whose
 * cwd/root was 'vp' is moved onto the new mount's root.
 *
 * On failure mnt_vnodecovered is reset, but VMOUNT has already been
 * cleared and v_mountedhere set — so the caller must clean up with
 * undo_place_on_covered_vp()-style logic, not by just clearing VMOUNT.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* usecount keeps the covered vnode alive while the mount sits on it */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1283
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, detach the mount from it, and clear the mount's
 * back-pointer.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* release the usecount taken when the mount was placed */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1294
1295 static int
1296 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1297 {
1298 int error;
1299
1300 /* unmount in progress return error */
1301 mount_lock_spin(mp);
1302 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1303 mount_unlock(mp);
1304 return EBUSY;
1305 }
1306 mount_unlock(mp);
1307 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1308
1309 /*
1310 * We only allow the filesystem to be reloaded if it
1311 * is currently mounted read-only.
1312 */
1313 if ((flags & MNT_RELOAD) &&
1314 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1315 error = ENOTSUP;
1316 goto out;
1317 }
1318
1319 /*
1320 * Only root, or the user that did the original mount is
1321 * permitted to update it.
1322 */
1323 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1324 (!vfs_context_issuser(ctx))) {
1325 error = EPERM;
1326 goto out;
1327 }
1328 #if CONFIG_MACF
1329 error = mac_mount_check_remount(ctx, mp);
1330 if (error != 0) {
1331 goto out;
1332 }
1333 #endif
1334
1335 out:
1336 if (error) {
1337 lck_rw_done(&mp->mnt_rwlock);
1338 }
1339
1340 return error;
1341 }
1342
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1348
1349 static int
1350 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1351 {
1352 vnode_t vp;
1353
1354 if (height >= MAX_IMAGEBOOT_NESTING) {
1355 return EINVAL;
1356 }
1357
1358 vp = imgsrc_rootvnodes[height];
1359 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1360 *rvpp = vp;
1361 return 0;
1362 } else {
1363 return ENOENT;
1364 }
1365 }
1366
/*
 * Relocate the imageboot source filesystem (recorded at boot in
 * imgsrc_rootvnodes[]) so that it covers 'vp' instead of its original
 * mount point.  Root-only.  A given mount may be moved at most once,
 * enforced via MNTK_HAS_MOVED under the mount rwlock.
 *
 * 'by_index' selects the new-style mnt_imgsrc_args layout (nesting
 * height + flags + device path); the legacy path assumes height 0 and
 * reads only a device-path pointer from 'fsmountargs'.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
		const char *fsname, vfs_context_t ctx,
		boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	vnode_t devvp = NULLVP;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
				return error;
		} else {
			user32_addr_t tmp;
			if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
				return error;

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got root vnode.\n");

	/* scratch copy so f_mntonname can be restored if we fail late */
	MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp , ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}


	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name.\n");
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* only needed for validation; drop the iocount now */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	FREE(old_mntonname, M_TEMP);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* restore the original mount-on name and re-allow future moves */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	FREE(old_mntonname, M_TEMP);
	return error;
}
1584
1585 #endif /* CONFIG_IMGSRC_ACCESS */
1586
/*
 * Turn on disk quotas for 'mp' if the per-type quota trigger files are
 * present.  Errors are deliberately ignored: quota setup must not
 * interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type=0; type < MAXQUOTAS; type++) {
		/* look for the per-type "quota on" trigger file first */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		       CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0)
			continue; 	    /* option file to trigger quotas is not present */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* then point quotaon at the actual quota data file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
1619
1620
/*
 * proc_iterate() callback for checkdirs(): if this process's current or
 * root directory is the newly-covered vnode (olddp), swap it for the
 * root of the mount now covering it (newdp), fixing refcounts.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs * )arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t tvp;
	vnode_t fdp_cvp;
	vnode_t fdp_rvp;
	int cdir_changed = 0;
	int rdir_changed = 0;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/* snapshot cwd/root under the fd lock */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == (struct filedesc *)0) {
		proc_fdunlock(p);
		return(PROC_RETURNED);
	}
	fdp_cvp = fdp->fd_cdir;
	fdp_rvp = fdp->fd_rdir;
	proc_fdunlock(p);

	if (fdp_cvp == olddp) {
		/* take the new ref before dropping the old one */
		vnode_ref(newdp);
		/*
		 * NOTE(review): fd_cdir is re-read here without the fd lock;
		 * it is presumably still equal to the fdp_cvp snapshot, but
		 * a concurrent chdir could race -- confirm this is benign.
		 */
		tvp = fdp->fd_cdir;
		fdp_cvp = newdp;
		cdir_changed = 1;
		vnode_rele(tvp);
	}
	if (fdp_rvp == olddp) {
		vnode_ref(newdp);
		/* NOTE(review): same unlocked re-read as fd_cdir above. */
		tvp = fdp->fd_rdir;
		fdp_rvp = newdp;
		rdir_changed = 1;
		vnode_rele(tvp);
	}
	if (cdir_changed || rdir_changed) {
		/* publish the updated directories back under the fd lock */
		proc_fdlock(p);
		fdp->fd_cdir = fdp_cvp;
		fdp->fd_rdir = fdp_rvp;
		proc_fdunlock(p);
	}
	return(PROC_RETURNED);
}
1672
1673
1674
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * 'olddp' is the covered vnode; the replacement is the root vnode of
 * olddp->v_mountedhere.  Also migrates the system-wide rootvnode.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* usecount of 1 means nothing but the mount references olddp */
	if (olddp->v_usecount == 1)
		return(0);
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return(err);
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* if the system root itself was covered, move it too */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return(0);
}
1714
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return (error);
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return (EINVAL);
	}
	/* swap the lookup's iocount for a mount ref before handing off */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return (safedounmount(mp, uap->flags, ctx));
}
1759
1760 int
1761 vfs_unmountbyfsid(fsid_t * fsid, int flags, vfs_context_t ctx)
1762 {
1763 mount_t mp;
1764
1765 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1766 if (mp == (mount_t)0) {
1767 return(ENOENT);
1768 }
1769 mount_ref(mp, 0);
1770 mount_iterdrop(mp);
1771 /* safedounmount consumes the mount ref */
1772 return(safedounmount(mp, flags, ctx));
1773 }
1774
1775
1776 /*
1777 * The mount struct comes with a mount ref which will be consumed.
1778 * Do the actual file system unmount, prevent some common foot shooting.
1779 */
1780 int
1781 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1782 {
1783 int error;
1784 proc_t p = vfs_context_proc(ctx);
1785
1786 /*
1787 * If the file system is not responding and MNT_NOBLOCK
1788 * is set and not a forced unmount then return EBUSY.
1789 */
1790 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1791 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1792 error = EBUSY;
1793 goto out;
1794 }
1795
1796 /*
1797 * Skip authorization if the mount is tagged as permissive and
1798 * this is not a forced-unmount attempt.
1799 */
1800 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1801 /*
1802 * Only root, or the user that did the original mount is
1803 * permitted to unmount this filesystem.
1804 */
1805 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1806 (error = suser(kauth_cred_get(), &p->p_acflag)))
1807 goto out;
1808 }
1809 /*
1810 * Don't allow unmounting the root file system.
1811 */
1812 if (mp->mnt_flag & MNT_ROOTFS) {
1813 error = EBUSY; /* the root is always busy */
1814 goto out;
1815 }
1816
1817 #ifdef CONFIG_IMGSRC_ACCESS
1818 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1819 error = EBUSY;
1820 goto out;
1821 }
1822 #endif /* CONFIG_IMGSRC_ACCESS */
1823
1824 return (dounmount(mp, flags, 1, ctx));
1825
1826 out:
1827 mount_drop(mp, 0);
1828 return(error);
1829 }
1830
/*
 * Do the actual file system unmount.
 *
 * 'flags' carries the MNT_* unmount flags (MNT_FORCE, MNT_NOBLOCK and
 * the internal MNT_LNOSUB recursion guard); 'withref' says the caller
 * passed in a mount ref to be dropped here.  On success the mount is
 * torn down (freed directly for the rootfs case, otherwise via the
 * crossref machinery).  On failure the mount is returned to service
 * with MNTK_UNMOUNT/MNT_LUNMOUNT/MNT_LFORCE cleared.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (withref != 0)
			mount_drop(mp, 1);
		mount_unlock(mp);
		return (EBUSY);
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* keep this process from hanging on unresponsive remote FSes */
	if (flags & MNT_NOBLOCK && p != kernproc)
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &=~ MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0)
		mount_drop(mp, 0);
#if CONFIG_FSE
	fsevent_unmount(mp); /* has to come first! */
#endif
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: back out the unmount state and bail */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount)
		lflags |= FORCECLOSE;
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* busy vnodes remain: back out the unmount state and bail */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* the filesystem refused: re-enable iteration and back out */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error)
		OSAddAtomic(1, &vfs_nummntops);

	if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);
out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0)
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup)
		wakeup((caddr_t)mp);

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* rootfs has no covered vnode; free the mount here */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
		} else
			panic("dounmount: no coveredvp");
	}
	return (error);
}
2103
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds an array of the fsids of every mount that (transitively) sits
 * on top of 'mp', then unmounts them deepest-first.  Best-effort:
 * allocation failure or individual unmount errors are ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
		count++;
	fsids_sz = count * sizeof(fsid_t);
	/* M_NOWAIT: we hold mount_list_lock and must not sleep */
	MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;	// Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL)
			continue;
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;	// Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* transitively covered by mp: remember it */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	if (fsids)
		FREE(fsids, M_TEMP);
}
2164
/*
 * Drop one crossref held on 'mp' via its covered vnode 'dp'.  When the
 * last crossref goes and the mount is no longer attached to 'dp'
 * (v_mountedhere), the mount structure is destroyed and freed here.
 * 'need_put' additionally drops an iocount on 'dp'.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0)
		panic("mount cross refs -ve");

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {

		if (need_put)
			vnode_put_locked(dp);
		vnode_unlock(dp);

		/* last crossref and detached: tear the mount down */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
		return;
	}
	if (need_put)
		vnode_put_locked(dp);
	vnode_unlock(dp);
}
2191
2192
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, sync paths dump buffer statistics */
#endif

int print_vmpage_stat=0;	/* when set, sync paths report dirty VM page counts */
int sync_timeout = 60; // Sync time limit (sec)
2202
2203 static int
2204 sync_callback(mount_t mp, __unused void *arg)
2205 {
2206 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2207 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2208
2209 mp->mnt_flag &= ~MNT_ASYNC;
2210 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2211 if (asyncflag)
2212 mp->mnt_flag |= MNT_ASYNC;
2213 }
2214
2215 return (VFS_RETURNED);
2216 }
2217
/* ARGSUSED */
/*
 * sync() system call: best-effort, non-blocking sync of every mounted
 * filesystem, plus optional diagnostics.  Always returns 0.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/* NULL arg => MNT_NOWAIT in sync_callback */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
	return 0;
}
2234
/*
 * Body of the helper thread spawned by sync_async(): run the same
 * best-effort sync as sync(), then wake the waiter sleeping on 'arg'
 * (the address of sync_async()'s timeout variable), if any.
 */
static void
sync_thread(void *arg, __unused wait_result_t wr)
{
	int *timeout = (int *) arg;

	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	/* wake the sync_async() caller sleeping on this address */
	if (timeout)
		wakeup((caddr_t) timeout);
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
}
2253
/*
 * Sync in a separate thread so we can time out if it blocks.
 *
 * Spawns sync_thread() and sleeps on &timeout for at most 'timeout'
 * seconds.  The stack address is used purely as a wait channel (never
 * dereferenced by the helper thread), so it is safe even if this frame
 * unwinds before the helper finishes.
 *
 * NOTE(review): sync_thread() calls wakeup() without taking
 * sync_mtx_lck, so a wakeup issued between kernel_thread_start() and
 * msleep() going to sleep could be lost, making the caller wait the
 * full timeout — confirm whether this race is acceptable here.
 *
 * Always returns 0; a timeout is only logged.
 */
static int
sync_async(int timeout)
{
	thread_t thd;
	int error;
	struct timespec ts = {timeout, 0};

	lck_mtx_lock(sync_mtx_lck);
	if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
		printf("sync_thread failed\n");
		lck_mtx_unlock(sync_mtx_lck);
		return (0);
	}

	/* PDROP releases sync_mtx_lck when msleep returns. */
	error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		printf("sync timed out: %d sec\n", timeout);
	}
	thread_deallocate(thd);

	return (0);
}
2279
2280 /*
2281 * An in-kernel sync for power management to call.
2282 */
2283 __private_extern__ int
2284 sync_internal(void)
2285 {
2286 (void) sync_async(sync_timeout);
2287
2288 return 0;
2289 } /* end of sync_internal call */
2290
/*
 * Change filesystem quotas.
 *
 * Looks up uap->path to find the mount, copies in whatever data the
 * quota subcommand needs, calls VFS_QUOTACTL, then copies results back
 * out for the query subcommands.  Note that 'error' is 0 after a
 * successful namei(), so subcommands that do no copyin rely on that
 * value reaching the VFS_QUOTACTL gate below.
 */
#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk;

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	       uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	/* Only the mount is needed; release the vnode immediately. */
	mp = nd.ni_vp->v_mount;
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		/* NOTE(review): kalloc() result is assumed non-NULL here
		 * (blocking allocation) — confirm. */
		datap = kalloc(MAXPATHLEN);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
			if (error == 0) {
				/* Convert the 64-bit user layout to the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		}
		else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-processing: free temporary buffers / copy results out. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL)
			kfree(datap, MAXPATHLEN);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64 = {.dqb_bhardlimit = 0};
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
			}
			else {
				error = copyout(datap, uap->arg, sizeof (struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	return (error);
}
#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return (EOPNOTSUPP);
}
#endif /* QUOTA */
2396
2397 /*
2398 * Get filesystem statistics.
2399 *
2400 * Returns: 0 Success
2401 * namei:???
2402 * vfs_update_vfsstat:???
2403 * munge_statfs:EFAULT
2404 */
2405 /* ARGSUSED */
2406 int
2407 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2408 {
2409 struct mount *mp;
2410 struct vfsstatfs *sp;
2411 int error;
2412 struct nameidata nd;
2413 vfs_context_t ctx = vfs_context_current();
2414 vnode_t vp;
2415
2416 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2417 UIO_USERSPACE, uap->path, ctx);
2418 error = namei(&nd);
2419 if (error)
2420 return (error);
2421 vp = nd.ni_vp;
2422 mp = vp->v_mount;
2423 sp = &mp->mnt_vfsstat;
2424 nameidone(&nd);
2425
2426 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2427 if (error != 0) {
2428 vnode_put(vp);
2429 return (error);
2430 }
2431
2432 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2433 vnode_put(vp);
2434 return (error);
2435 }
2436
2437 /*
2438 * Get filesystem statistics.
2439 */
2440 /* ARGSUSED */
2441 int
2442 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2443 {
2444 vnode_t vp;
2445 struct mount *mp;
2446 struct vfsstatfs *sp;
2447 int error;
2448
2449 AUDIT_ARG(fd, uap->fd);
2450
2451 if ( (error = file_vnode(uap->fd, &vp)) )
2452 return (error);
2453
2454 error = vnode_getwithref(vp);
2455 if (error) {
2456 file_drop(uap->fd);
2457 return (error);
2458 }
2459
2460 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2461
2462 mp = vp->v_mount;
2463 if (!mp) {
2464 error = EBADF;
2465 goto out;
2466 }
2467 sp = &mp->mnt_vfsstat;
2468 if ((error = vfs_update_vfsstat(mp,vfs_context_current(),VFS_USER_EVENT)) != 0) {
2469 goto out;
2470 }
2471
2472 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2473
2474 out:
2475 file_drop(uap->fd);
2476 vnode_put(vp);
2477
2478 return (error);
2479 }
2480
2481 /*
2482 * Common routine to handle copying of statfs64 data to user space
2483 */
2484 static int
2485 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2486 {
2487 int error;
2488 struct statfs64 sfs;
2489
2490 bzero(&sfs, sizeof(sfs));
2491
2492 sfs.f_bsize = sfsp->f_bsize;
2493 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2494 sfs.f_blocks = sfsp->f_blocks;
2495 sfs.f_bfree = sfsp->f_bfree;
2496 sfs.f_bavail = sfsp->f_bavail;
2497 sfs.f_files = sfsp->f_files;
2498 sfs.f_ffree = sfsp->f_ffree;
2499 sfs.f_fsid = sfsp->f_fsid;
2500 sfs.f_owner = sfsp->f_owner;
2501 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2502 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2503 sfs.f_fssubtype = sfsp->f_fssubtype;
2504 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2505 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2506 } else {
2507 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2508 }
2509 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2510 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2511
2512 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2513
2514 return(error);
2515 }
2516
2517 /*
2518 * Get file system statistics in 64-bit mode
2519 */
2520 int
2521 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2522 {
2523 struct mount *mp;
2524 struct vfsstatfs *sp;
2525 int error;
2526 struct nameidata nd;
2527 vfs_context_t ctxp = vfs_context_current();
2528 vnode_t vp;
2529
2530 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2531 UIO_USERSPACE, uap->path, ctxp);
2532 error = namei(&nd);
2533 if (error)
2534 return (error);
2535 vp = nd.ni_vp;
2536 mp = vp->v_mount;
2537 sp = &mp->mnt_vfsstat;
2538 nameidone(&nd);
2539
2540 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2541 if (error != 0) {
2542 vnode_put(vp);
2543 return (error);
2544 }
2545
2546 error = statfs64_common(mp, sp, uap->buf);
2547 vnode_put(vp);
2548
2549 return (error);
2550 }
2551
2552 /*
2553 * Get file system statistics in 64-bit mode
2554 */
2555 int
2556 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2557 {
2558 struct vnode *vp;
2559 struct mount *mp;
2560 struct vfsstatfs *sp;
2561 int error;
2562
2563 AUDIT_ARG(fd, uap->fd);
2564
2565 if ( (error = file_vnode(uap->fd, &vp)) )
2566 return (error);
2567
2568 error = vnode_getwithref(vp);
2569 if (error) {
2570 file_drop(uap->fd);
2571 return (error);
2572 }
2573
2574 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2575
2576 mp = vp->v_mount;
2577 if (!mp) {
2578 error = EBADF;
2579 goto out;
2580 }
2581 sp = &mp->mnt_vfsstat;
2582 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2583 goto out;
2584 }
2585
2586 error = statfs64_common(mp, sp, uap->buf);
2587
2588 out:
2589 file_drop(uap->fd);
2590 vnode_put(vp);
2591
2592 return (error);
2593 }
2594
/*
 * Iteration state shared with the getfsstat*_callback routines while
 * walking the mount list via vfs_iterate().
 */
struct getfsstat_struct {
	user_addr_t sfsp;	/* cursor into the user statfs buffer */
	user_addr_t *mp;	/* optional array of user MAC-label buffers */
	int count;		/* mounts visited so far */
	int maxcount;	/* capacity of the user buffer, in entries */
	int flags;		/* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT from caller */
	int error;		/* first error encountered, if any */
};
2603
2604
/*
 * vfs_iterate() callback for getfsstat(2): copy one mount's statistics
 * (and optionally its MAC label) out to the user buffers tracked in the
 * getfsstat_struct, advancing the cursors.  A failed stat refresh skips
 * the mount; a copyout/label failure stops the iteration with the error
 * recorded in fstp->error.  fstp->count is incremented for every mount
 * regardless, so the caller can report the total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{

	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
			(error = vfs_update_vfsstat(mp, ctx,
			    VFS_USER_EVENT))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip this mount but keep iterating. */
			return(VFS_RETURNED);
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
		/* my_size is the per-entry size munge_statfs actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return(VFS_RETURNED_DONE);
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return(VFS_RETURNED);
}
2651
2652 /*
2653 * Get statistics on all filesystems.
2654 */
2655 int
2656 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2657 {
2658 struct __mac_getfsstat_args muap;
2659
2660 muap.buf = uap->buf;
2661 muap.bufsize = uap->bufsize;
2662 muap.mac = USER_ADDR_NULL;
2663 muap.macsize = 0;
2664 muap.flags = uap->flags;
2665
2666 return (__mac_getfsstat(p, &muap, retval));
2667 }
2668
2669 /*
2670 * __mac_getfsstat: Get MAC-related file system statistics
2671 *
2672 * Parameters: p (ignored)
2673 * uap User argument descriptor (see below)
2674 * retval Count of file system statistics (N stats)
2675 *
2676 * Indirect: uap->bufsize Buffer size
2677 * uap->macsize MAC info size
2678 * uap->buf Buffer where information will be returned
2679 * uap->mac MAC info
2680 * uap->flags File system flags
2681 *
2682 *
2683 * Returns: 0 Success
2684 * !0 Not success
2685 *
2686 */
2687 int
2688 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2689 {
2690 user_addr_t sfsp;
2691 user_addr_t *mp;
2692 size_t count, maxcount, bufsize, macsize;
2693 struct getfsstat_struct fst;
2694
2695 bufsize = (size_t) uap->bufsize;
2696 macsize = (size_t) uap->macsize;
2697
2698 if (IS_64BIT_PROCESS(p)) {
2699 maxcount = bufsize / sizeof(struct user64_statfs);
2700 }
2701 else {
2702 maxcount = bufsize / sizeof(struct user32_statfs);
2703 }
2704 sfsp = uap->buf;
2705 count = 0;
2706
2707 mp = NULL;
2708
2709 #if CONFIG_MACF
2710 if (uap->mac != USER_ADDR_NULL) {
2711 u_int32_t *mp0;
2712 int error;
2713 unsigned int i;
2714
2715 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2716 if (count != maxcount)
2717 return (EINVAL);
2718
2719 /* Copy in the array */
2720 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2721 if (mp0 == NULL) {
2722 return (ENOMEM);
2723 }
2724
2725 error = copyin(uap->mac, mp0, macsize);
2726 if (error) {
2727 FREE(mp0, M_MACTEMP);
2728 return (error);
2729 }
2730
2731 /* Normalize to an array of user_addr_t */
2732 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2733 if (mp == NULL) {
2734 FREE(mp0, M_MACTEMP);
2735 return (ENOMEM);
2736 }
2737
2738 for (i = 0; i < count; i++) {
2739 if (IS_64BIT_PROCESS(p))
2740 mp[i] = ((user_addr_t *)mp0)[i];
2741 else
2742 mp[i] = (user_addr_t)mp0[i];
2743 }
2744 FREE(mp0, M_MACTEMP);
2745 }
2746 #endif
2747
2748
2749 fst.sfsp = sfsp;
2750 fst.mp = mp;
2751 fst.flags = uap->flags;
2752 fst.count = 0;
2753 fst.error = 0;
2754 fst.maxcount = maxcount;
2755
2756
2757 vfs_iterate(0, getfsstat_callback, &fst);
2758
2759 if (mp)
2760 FREE(mp, M_MACTEMP);
2761
2762 if (fst.error ) {
2763 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2764 return(fst.error);
2765 }
2766
2767 if (fst.sfsp && fst.count > fst.maxcount)
2768 *retval = fst.maxcount;
2769 else
2770 *retval = fst.count;
2771 return (0);
2772 }
2773
/*
 * vfs_iterate() callback for getfsstat64(2): copy one mount's statistics
 * out in struct statfs64 form, advancing the user-buffer cursor.  A
 * failed stat refresh skips the mount; a copyout failure stops the
 * iteration with the error recorded in fstp->error.  fstp->count is
 * incremented for every mount regardless, so the caller can report the
 * total number of mounts.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if (((fstp->flags & MNT_NOWAIT) == 0 ||
		     (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip this mount but keep iterating. */
			return(VFS_RETURNED);
		}

		error = statfs64_common(mp, sp, fstp->sfsp);
		if (error) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
		fstp->sfsp += sizeof(struct statfs64);
	}
	fstp->count++;
	return(VFS_RETURNED);
}
2808
2809 /*
2810 * Get statistics on all file systems in 64 bit mode.
2811 */
2812 int
2813 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2814 {
2815 user_addr_t sfsp;
2816 int count, maxcount;
2817 struct getfsstat_struct fst;
2818
2819 maxcount = uap->bufsize / sizeof(struct statfs64);
2820
2821 sfsp = uap->buf;
2822 count = 0;
2823
2824 fst.sfsp = sfsp;
2825 fst.flags = uap->flags;
2826 fst.count = 0;
2827 fst.error = 0;
2828 fst.maxcount = maxcount;
2829
2830 vfs_iterate(0, getfsstat64_callback, &fst);
2831
2832 if (fst.error ) {
2833 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2834 return(fst.error);
2835 }
2836
2837 if (fst.sfsp && fst.count > fst.maxcount)
2838 *retval = fst.maxcount;
2839 else
2840 *retval = fst.count;
2841
2842 return (0);
2843 }
2844
2845 /*
2846 * gets the associated vnode with the file descriptor passed.
2847 * as input
2848 *
2849 * INPUT
2850 * ctx - vfs context of caller
2851 * fd - file descriptor for which vnode is required.
2852 * vpp - Pointer to pointer to vnode to be returned.
2853 *
2854 * The vnode is returned with an iocount so any vnode obtained
2855 * by this call needs a vnode_put
2856 *
2857 */
2858 static int
2859 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2860 {
2861 int error;
2862 vnode_t vp;
2863 struct fileproc *fp;
2864 proc_t p = vfs_context_proc(ctx);
2865
2866 *vpp = NULLVP;
2867
2868 error = fp_getfvp(p, fd, &fp, &vp);
2869 if (error)
2870 return (error);
2871
2872 error = vnode_getwithref(vp);
2873 if (error) {
2874 (void)fp_drop(p, fd, fp, 0);
2875 return (error);
2876 }
2877
2878 (void)fp_drop(p, fd, fp, 0);
2879 *vpp = vp;
2880 return (error);
2881 }
2882
2883 /*
2884 * Wrapper function around namei to start lookup from a directory
2885 * specified by a file descriptor ni_dirfd.
2886 *
2887 * In addition to all the errors returned by namei, this call can
2888 * return ENOTDIR if the file descriptor does not refer to a directory.
2889 * and EBADF if the file descriptor is not valid.
2890 */
2891 int
2892 nameiat(struct nameidata *ndp, int dirfd)
2893 {
2894 if ((dirfd != AT_FDCWD) &&
2895 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
2896 !(ndp->ni_cnd.cn_flags & USEDVP)) {
2897 int error = 0;
2898 char c;
2899
2900 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
2901 error = copyin(ndp->ni_dirp, &c, sizeof(char));
2902 if (error)
2903 return (error);
2904 } else {
2905 c = *((char *)(ndp->ni_dirp));
2906 }
2907
2908 if (c != '/') {
2909 vnode_t dvp_at;
2910
2911 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
2912 &dvp_at);
2913 if (error)
2914 return (error);
2915
2916 if (vnode_vtype(dvp_at) != VDIR) {
2917 vnode_put(dvp_at);
2918 return (ENOTDIR);
2919 }
2920
2921 ndp->ni_dvp = dvp_at;
2922 ndp->ni_cnd.cn_flags |= USEDVP;
2923 error = namei(ndp);
2924 ndp->ni_cnd.cn_flags &= ~USEDVP;
2925 vnode_put(dvp_at);
2926 return (error);
2927 }
2928 }
2929
2930 return (namei(ndp));
2931 }
2932
2933 /*
2934 * Change current working directory to a given file descriptor.
2935 */
2936 /* ARGSUSED */
2937 static int
2938 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
2939 {
2940 struct filedesc *fdp = p->p_fd;
2941 vnode_t vp;
2942 vnode_t tdp;
2943 vnode_t tvp;
2944 struct mount *mp;
2945 int error;
2946 vfs_context_t ctx = vfs_context_current();
2947
2948 AUDIT_ARG(fd, uap->fd);
2949 if (per_thread && uap->fd == -1) {
2950 /*
2951 * Switching back from per-thread to per process CWD; verify we
2952 * in fact have one before proceeding. The only success case
2953 * for this code path is to return 0 preemptively after zapping
2954 * the thread structure contents.
2955 */
2956 thread_t th = vfs_context_thread(ctx);
2957 if (th) {
2958 uthread_t uth = get_bsdthread_info(th);
2959 tvp = uth->uu_cdir;
2960 uth->uu_cdir = NULLVP;
2961 if (tvp != NULLVP) {
2962 vnode_rele(tvp);
2963 return (0);
2964 }
2965 }
2966 return (EBADF);
2967 }
2968
2969 if ( (error = file_vnode(uap->fd, &vp)) )
2970 return(error);
2971 if ( (error = vnode_getwithref(vp)) ) {
2972 file_drop(uap->fd);
2973 return(error);
2974 }
2975
2976 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
2977
2978 if (vp->v_type != VDIR) {
2979 error = ENOTDIR;
2980 goto out;
2981 }
2982
2983 #if CONFIG_MACF
2984 error = mac_vnode_check_chdir(ctx, vp);
2985 if (error)
2986 goto out;
2987 #endif
2988 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
2989 if (error)
2990 goto out;
2991
2992 while (!error && (mp = vp->v_mountedhere) != NULL) {
2993 if (vfs_busy(mp, LK_NOWAIT)) {
2994 error = EACCES;
2995 goto out;
2996 }
2997 error = VFS_ROOT(mp, &tdp, ctx);
2998 vfs_unbusy(mp);
2999 if (error)
3000 break;
3001 vnode_put(vp);
3002 vp = tdp;
3003 }
3004 if (error)
3005 goto out;
3006 if ( (error = vnode_ref(vp)) )
3007 goto out;
3008 vnode_put(vp);
3009
3010 if (per_thread) {
3011 thread_t th = vfs_context_thread(ctx);
3012 if (th) {
3013 uthread_t uth = get_bsdthread_info(th);
3014 tvp = uth->uu_cdir;
3015 uth->uu_cdir = vp;
3016 OSBitOrAtomic(P_THCWD, &p->p_flag);
3017 } else {
3018 vnode_rele(vp);
3019 return (ENOENT);
3020 }
3021 } else {
3022 proc_fdlock(p);
3023 tvp = fdp->fd_cdir;
3024 fdp->fd_cdir = vp;
3025 proc_fdunlock(p);
3026 }
3027
3028 if (tvp)
3029 vnode_rele(tvp);
3030 file_drop(uap->fd);
3031
3032 return (0);
3033 out:
3034 vnode_put(vp);
3035 file_drop(uap->fd);
3036
3037 return(error);
3038 }
3039
/* Change the per-process current working directory to an open fd. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
3045
/* Change the calling thread's private working directory to an open fd. */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
3051
/*
 * Change current working directory (".").
 *
 * With per_thread set, installs the new directory as the calling
 * thread's private cwd instead of the process-wide one.
 *
 * Returns:	0			Success
 *	change_dir:ENOTDIR
 *	change_dir:???
 *	vnode_ref:ENOENT		No such file or directory
 */
/* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error)
		return (error);
	/* Take a usecount so nd.ni_vp stays valid after the put below. */
	if ( (error = vnode_ref(nd.ni_vp)) ) {
		vnode_put(nd.ni_vp);
		return (error);
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(nd.ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = nd.ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(nd.ni_vp);
			return (ENOENT);
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = nd.ni_vp;
		proc_fdunlock(p);
	}

	/* Release the previous cwd, if any. */
	if (tvp)
		vnode_rele(tvp);

	return (0);
}
3107
3108
/*
 * chdir
 *
 * Change current working directory (".") for the entire process
 *
 * Parameters:  p       Process requesting the call
 *              uap     User argument descriptor (see below)
 *              retval  (ignored)
 *
 * Indirect parameters: uap->path   Directory path
 *
 * Returns:     0       Success
 *              common_chdir: ENOTDIR
 *              common_chdir: ENOENT    No such file or directory
 *              common_chdir: ???
 *
 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 0: affects the whole process */
	return common_chdir(p, (void *)uap, 0);
}
3131
/*
 * __pthread_chdir
 *
 * Change current working directory (".") for a single thread
 *
 * Parameters:  p       Process requesting the call
 *              uap     User argument descriptor (see below)
 *              retval  (ignored)
 *
 * Indirect parameters: uap->path   Directory path
 *
 * Returns:     0       Success
 *              common_chdir: ENOTDIR
 *              common_chdir: ENOENT    No such file or directory
 *              common_chdir: ???
 *
 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 1: affects only the calling thread */
	return common_chdir(p, (void *)uap, 1);
}
3154
3155
/*
 * Change notion of root (``/'') directory.
 *
 * Requires superuser.  Takes a usecount on the new root via vnode_ref
 * and installs it as fdp->fd_rdir under the fd lock, releasing the
 * previous root's usecount (if any).
 */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	if ((error = suser(kauth_cred_get(), &p->p_acflag)))
		return (error);

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error)
		return (error);

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return (error);
	}
#endif

	/* Take a usecount so nd.ni_vp stays valid after the put below. */
	if ( (error = vnode_ref(nd.ni_vp)) ) {
		vnode_put(nd.ni_vp);
		return (error);
	}
	vnode_put(nd.ni_vp);

	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);

	/* Release the previous root, if any. */
	if (tvp != NULL)
		vnode_rele(tvp);

	return (0);
}
3204
3205 /*
3206 * Common routine for chroot and chdir.
3207 *
3208 * Returns: 0 Success
3209 * ENOTDIR Not a directory
3210 * namei:??? [anything namei can return]
3211 * vnode_authorize:??? [anything vnode_authorize can return]
3212 */
3213 static int
3214 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3215 {
3216 vnode_t vp;
3217 int error;
3218
3219 if ((error = namei(ndp)))
3220 return (error);
3221 nameidone(ndp);
3222 vp = ndp->ni_vp;
3223
3224 if (vp->v_type != VDIR) {
3225 vnode_put(vp);
3226 return (ENOTDIR);
3227 }
3228
3229 #if CONFIG_MACF
3230 error = mac_vnode_check_chdir(ctx, vp);
3231 if (error) {
3232 vnode_put(vp);
3233 return (error);
3234 }
3235 #endif
3236
3237 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3238 if (error) {
3239 vnode_put(vp);
3240 return (error);
3241 }
3242
3243 return (error);
3244 }
3245
/*
 * Allocate the vnode data (for directories) associated with the file glob.
 * (Previous comment incorrectly said "Free".)
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	       M_FD_VN_DATA, M_WAITOK | M_ZERO);
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3260
3261 /*
3262 * Free the vnode data (for directories) associated with the file glob.
3263 */
3264 void
3265 fg_vn_data_free(void *fgvndata)
3266 {
3267 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3268
3269 if (fvdata->fv_buf)
3270 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3271 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3272 FREE(fvdata, M_FD_VN_DATA);
3273 }
3274
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Returns:	0			Success
 *		EINVAL
 *		EINTR
 *	falloc:ENFILE
 *	falloc:EMFILE
 *	falloc:ENOMEM
 *	vn_open_auth:???
 *	dupfdopen:???
 *	VNOP_ADVLOCK:???
 *	vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* O_ACCMODE fully set (both read and write bits) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE)
		return(EINVAL);

	flags = FFLAGS(uflags);
	/* These are kernel-internal flags callers may not request directly. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return (error);
	}
	/* Encode the reserved index for the fdopen() special case below. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){	/* XXX from fdopen */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return (0);
			}
		}
		if (error == ERESTART)
			error = EINTR;
		fp_free(p, indx, fp);
		return (error);
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the fileglob to the vnode; fg_data holds the usecount. */
	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	if (flags & (O_EXLOCK | O_SHLOCK)) {
		/* Whole-file advisory lock, flock(2)-style, keyed by fileglob. */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0)
			type |= F_WAIT;
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error)
			goto bad;
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
			goto bad;
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
		goto bad;

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 *
	 * NOTE(review): vp is referenced here after the vnode_put above;
	 * presumably the fileglob's fg_data usecount keeps it valid —
	 * confirm.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: apply close-on-exec/fork and release it. */
	proc_fdlock(p);
	if (flags & O_CLOEXEC)
		*fdflags(p, indx) |= UF_EXCLOSE;
	if (flags & O_CLOFORK)
		*fdflags(p, indx) |= UF_FORKCLOSE;
	procfdtbl_releasefd(p, indx, NULL);
	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return (0);
bad:
	/* Back out: undo any lock we took, close the vnode, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
		    vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return (error);
}
3426
/*
 * While most of the *at syscall handlers can call nameiat() which
 * is a wrapper around namei, the use of namei and initialisation
 * of nameidata are far removed and in different functions - namei
 * gets called in vn_open_auth for open1. So we'll just do here what
 * nameiat() does.
 *
 * For a relative path and dirfd != AT_FDCWD, resolves dirfd to a
 * directory vnode, installs it via the USEDVP protocol, and delegates
 * to open1(); otherwise calls open1() directly.
 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to see if it is absolute. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error)
				return (error);
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error)
				return (error);

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return (ENOTDIR);
			}

			/* Start the open's lookup at the fd's directory. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return (error);
		}
	}

	return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
}
3475
3476 /*
3477 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3478 *
3479 * Parameters: p Process requesting the open
3480 * uap User argument descriptor (see below)
3481 * retval Pointer to an area to receive the
3482 * return calue from the system call
3483 *
3484 * Indirect: uap->path Path to open (same as 'open')
3485 * uap->flags Flags to open (same as 'open'
3486 * uap->uid UID to set, if creating
3487 * uap->gid GID to set, if creating
3488 * uap->mode File mode, if creating (same as 'open')
3489 * uap->xsecurity ACL to set, if creating
3490 *
3491 * Returns: 0 Success
3492 * !0 errno value
3493 *
3494 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3495 *
3496 * XXX: We should enummerate the possible errno values here, and where
3497 * in the code they originated.
3498 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/*
	 * Copy in the optional ACL first; it is freed at the end regardless
	 * of whether the open succeeds.
	 */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
		return ciferror;

	VATTR_INIT(&va);
	/* Requested mode filtered through the process umask; sticky bit stripped. */
	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode);
	/* Only apply uid/gid/ACL when the caller actually supplied them. */
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);
	if (xsecdst != NULL)
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	/* The filesec was only needed to seed va_acl; release it now. */
	if (xsecdst != NULL)
		kauth_filesec_free(xsecdst);

	return ciferror;
}
3536
3537 /*
3538 * Go through the data-protected atomically controlled open (2)
3539 *
3540 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3541 */
3542 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3543 int flags = uap->flags;
3544 int class = uap->class;
3545 int dpflags = uap->dpflags;
3546
3547 /*
3548 * Follow the same path as normal open(2)
3549 * Look up the item if it exists, and acquire the vnode.
3550 */
3551 struct filedesc *fdp = p->p_fd;
3552 struct vnode_attr va;
3553 struct nameidata nd;
3554 int cmode;
3555 int error;
3556
3557 VATTR_INIT(&va);
3558 /* Mask off all but regular access permissions */
3559 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3560 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3561
3562 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3563 uap->path, vfs_context_current());
3564
3565 /*
3566 * Initialize the extra fields in vnode_attr to pass down our
3567 * extra fields.
3568 * 1. target cprotect class.
3569 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3570 */
3571 if (flags & O_CREAT) {
3572 /* lower level kernel code validates that the class is valid before applying it. */
3573 if (class != PROTECTION_CLASS_DEFAULT) {
3574 /*
3575 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3576 * file behave the same as open (2)
3577 */
3578 VATTR_SET(&va, va_dataprotect_class, class);
3579 }
3580 }
3581
3582 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3583 if ( flags & (O_RDWR | O_WRONLY)) {
3584 /* Not allowed to write raw encrypted bytes */
3585 return EINVAL;
3586 }
3587 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3588 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3589 }
3590 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3591 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3592 }
3593 }
3594
3595 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3596 fileproc_alloc_init, NULL, retval);
3597
3598 return error;
3599 }
3600
/*
 * Common helper for open(2)/openat(2): builds the vnode_attr (creation
 * mode filtered through the umask) and nameidata, then hands off to
 * open1at() which resolves 'path' relative to 'fd' (or the cwd for
 * AT_FDCWD).  'segflg' says whether 'path' is a user or kernel address.
 */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	VATTR_INIT(&va);
	/* Mask off all but regular access permissions */
	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
	    retval, fd));
}
3621
3622 int
3623 open(proc_t p, struct open_args *uap, int32_t *retval)
3624 {
3625 __pthread_testcancel(1);
3626 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3627 }
3628
3629 int
3630 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3631 int32_t *retval)
3632 {
3633 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3634 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3635 }
3636
3637 int
3638 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3639 int32_t *retval)
3640 {
3641 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3642 uap->mode, uap->fd, UIO_USERSPACE, retval));
3643 }
3644
3645 int
3646 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3647 {
3648 __pthread_testcancel(1);
3649 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3650 }
3651
3652 /*
3653 * openbyid_np: open a file given a file system id and a file system object id
3654 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
3655 * for file systems that don't support object ids, it is a node id (uint64_t).
3656 *
3657 * Parameters: p Process requesting the open
3658 * uap User argument descriptor (see below)
3659 * retval Pointer to an area to receive the
3660 * return value from the system call
3661 *
3662 * Indirect: uap->path Path to open (same as 'open')
3663 *
3664 * uap->fsid id of target file system
3665 * uap->objid id of target file system object
3666 * uap->flags Flags to open (same as 'open')
3667 *
3668 * Returns: 0 Success
3669 * !0 errno value
3670 *
3671 *
3672 * XXX: We should enumerate the possible errno values here, and where
3673 * in the code they originated.
3674 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Copy the target file system id in from user space. */
	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return (error);
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return (error);
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsid, objid*/
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return (ENOMEM);
		}

		error = fsgetpath_internal(
			ctx, fsid.val[0], objid,
			buflen, buf, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
		/*
		 * NOTE(review): on ENOSPC the buffer grows by MAXPATHLEN and
		 * the lookup is retried with no upper bound — presumably path
		 * lengths are bounded in practice; confirm no cap is needed.
		 */
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate at the length fsgetpath_internal reported. */
	buf[pathlen] = 0;

	/* Re-drive as a path-based open; buf is a kernel address (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
3728
3729
3730 /*
3731 * Create a special file.
3732 */
3733 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3734
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t	vp, dvp;

	VATTR_INIT(&va);
	/* Requested mode is filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO)
		return(mkfifo1(ctx, uap->path, &va));

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device special files requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
		return (error);
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files may be created here. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error)
		goto out;
#endif

	/* Authorize adding an entry to the parent directory, then create. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out;

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
		goto out;

	if (vp) {
		int	update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);

	return (error);
}
3827
3828 /*
3829 * Create a named pipe.
3830 *
3831 * Returns: 0 Success
3832 * EEXIST
3833 * namei:???
3834 * vnode_authorize:???
3835 * vn_create:???
3836 */
/*
 * Common FIFO-creation helper for mkfifo(2), mkfifo_extended(2) and
 * mknod(2) with S_IFIFO.  'upath' is always a user-space path; 'vap'
 * carries the caller-prepared creation attributes (mode, uid/gid, ACL).
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t	vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
		goto out;

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);

	return error;
}
3876
3877
3878 /*
3879 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
3880 *
3881 * Parameters: p Process requesting the open
3882 * uap User argument descriptor (see below)
3883 * retval (Ignored)
3884 *
3885 * Indirect: uap->path Path to fifo (same as 'mkfifo')
3886 * uap->uid UID to set
3887 * uap->gid GID to set
3888 * uap->mode File mode to set (same as 'mkfifo')
3889 * uap->xsecurity ACL to set, if creating
3890 *
3891 * Returns: 0 Success
3892 * !0 errno value
3893 *
3894 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3895 *
3896 * XXX: We should enumerate the possible errno values here, and where
3897 * in the code they originated.
3898 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the optional ACL; freed after mkfifo1() regardless of outcome. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return ciferror;
	}

	VATTR_INIT(&va);
	/* Requested mode is filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	/* Only apply uid/gid/ACL when the caller actually supplied them. */
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);
	if (xsecdst != KAUTH_FILESEC_NONE)
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	if (xsecdst != KAUTH_FILESEC_NONE)
		kauth_filesec_free(xsecdst);
	return ciferror;
}
3929
3930 /* ARGSUSED */
3931 int
3932 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
3933 {
3934 struct vnode_attr va;
3935
3936 VATTR_INIT(&va);
3937 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3938
3939 return(mkfifo1(vfs_context_current(), uap->path, &va));
3940 }
3941
3942
/*
 * Return a pointer to the last occurrence of 'ch' in the NUL-terminated
 * string 'p', or NULL if it does not occur.  Like strrchr(3), asking for
 * ch == '\0' yields a pointer to the terminator.
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	do {
		if (*p == ch)
			last = p;
	} while (*p++);

	return (last);
}
3956
3957 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
3958
/*
 * Best-effort construction of the path to 'dvp' (with optional trailing
 * component 'leafname') into 'path' of capacity '_len'.  Sets
 * *truncated_path when the result is incomplete, falling back to an
 * ancestor's path or the mount point when vn_getpath() fails.  Returns
 * the length of the string produced, including the NUL.
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	int ret, len = _len;

	*truncated_path = 0;
	ret = vn_getpath(dvp, path, &len);
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* vn_getpath's len counts the NUL; overwrite it with '/'. */
			path[len-1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp=dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up toward the root until some ancestor's path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			ret = vn_getpath(mydvp, path, &len);
		} while (ret == ENOSPC);
	}

	return len;
}
4017
4018
4019 /*
4020 * Make a hard file link.
4021 *
4022 * Returns: 0 Success
4023 * EPERM
4024 * EEXIST
4025 * EXDEV
4026 * namei:???
4027 * vnode_authorize:???
4028 * VNOP_LINK:???
4029 */
4030 /* ARGSUSED */
/*
 * Common implementation of link(2)/linkat(2): creates a hard link named
 * 'link' (resolved relative to fd2) to the object at 'path' (resolved
 * relative to fd1).  AT_SYMLINK_FOLLOW in 'flag' controls whether a
 * trailing symlink in 'path' is followed.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t	vp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners;
	char *target_path = NULL;
	int truncated=0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}
		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node -- the nameidata is reused for the 2nd lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0)
		goto out;
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
		goto out2;
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
		goto out2;

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out2;

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error)
		goto out2;

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	/* Notification paths are only built when someone will consume them. */
	if (need_event || has_listeners) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len, target_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
			if (vp->v_parent) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, vp->v_parent,
				    FSE_ARG_DONE);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp)
		vnode_put(lvp);
	if (dvp)
		vnode_put(dvp);
	vnode_put(vp);
	return (error);
}
4211
4212 int
4213 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4214 {
4215 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4216 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4217 }
4218
4219 int
4220 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4221 {
4222 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4223 return (EINVAL);
4224
4225 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4226 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4227 }
4228
4229 /*
4230 * Make a symbolic link.
4231 *
4232 * We could add support for ACLs here too...
4233 */
4234 /* ARGSUSED */
/*
 * Common implementation of symlink(2)/symlinkat(2): creates a symlink at
 * 'link' (resolved relative to fd) whose contents are the string at
 * 'path_data'.  'segflg' applies to both addresses; for user space the
 * link contents are copied into a kernel MALLOC_ZONE buffer first.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t	vp, dvp;
	uint32_t dfflags;	// Directory file flags
	size_t dummy=0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		/* Kernel address: use the caller's buffer directly, no copy. */
		path = (char *)path_data;
	}
	if (error)
		goto out;
	AUDIT_ARG(text, path);	/* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error)
		goto out;
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink mode is ACCESSPERMS filtered through the umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

	/*
	 * Handle inheritance of restricted flag
	 */
	error = vnode_flags(dvp, &dfflags, ctx);
	if (error)
		goto skipit;
	if (dfflags & SF_RESTRICTED)
		VATTR_SET(&va, va_flags, SF_RESTRICTED);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0)
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	/* get default ownership, etc. */
	if (error == 0)
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	if (error == 0)
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);

#if CONFIG_MACF
	if (error == 0 && vp)
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
#endif

	/* do fallback attribute handling */
	if (error == 0 && vp)
		error = vnode_setattr_fallback(vp, &va, ctx);

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* VNOP_SYMLINK may not return the vnode; look it up. */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL)
				goto skipit;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL)
				release_pathbuff(new_link_path);
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);
out:
	/* Free the copyin buffer only if we allocated it above. */
	if (path && (path != (char *)path_data))
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);

	return (error);
}
4386
4387 int
4388 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4389 {
4390 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4391 uap->link, UIO_USERSPACE));
4392 }
4393
4394 int
4395 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4396 __unused int32_t *retval)
4397 {
4398 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4399 uap->path2, UIO_USERSPACE));
4400 }
4401
4402 /*
4403 * Delete a whiteout from the filesystem.
4404 * No longer supported.
4405 */
4406 int
4407 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4408 {
4409 return (ENOTSUP);
4410 }
4411
4412 /*
4413 * Delete a name from the filesystem.
4414 */
4415 /* ARGSUSED */
/*
 * Common implementation of unlink(2)/unlinkat(2)/delete(2): removes the
 * name at 'path_arg', resolved relative to 'start_dvp' if supplied,
 * otherwise relative to 'fd'.  'unlink_flags' carries VNODE_REMOVE_*
 * modifiers (Carbon no-delete-busy, audit suppression, namespace-event
 * suppression).  Supports compound-remove filesystems via EKEEPLOOKING
 * continuation and redrives the whole lookup on ENOENT races.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	struct nameidata nd;
	vnode_t	vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	int  len=0;
#if CONFIG_FSE
	fse_info  finfo;
	struct vnode_attr va;
#endif
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
		cn_flags |= AUDITVNPATH1;
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp)
		cn_flags |= USEDVP;

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
	/* Per-attempt state is reset here; 'path' survives across retries. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	vap = NULL;

	NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	nd.ni_dvp = start_dvp;
	nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &nd.ni_cnd;

lookup_continue:
	error = nameiat(&nd, fd);
	if (error)
		return (error);

	dvp = nd.ni_dvp;
	vp = nd.ni_vp;


	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if (vp->v_flag & VROOT) {
			error = EBUSY;
		}

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* Lookup/authorize raced a concurrent remove; retry. */
					assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: the filesystem must support compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, &finfo, ctx);
			}
		} else {
			error = vfs_get_notify_attributes(&va);
			if (error) {
				goto out;
			}

			vap = &va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
			if (path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
	}

#if NAMEDRSRCFORK
	if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	else
#endif
	{
		error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
		vp = nd.ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound-remove FS needs the lookup continued. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto lookup_continue;
		} else if (error == ENOENT && batched) {
			assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, &finfo, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, &finfo, vap);
			}
			if (truncated_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len, path,
			    FSE_ARG_FINFO, &finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL)
		RELEASE_PATH(path);

#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

	return (error);
}
4653
4654 int
4655 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4656 enum uio_seg segflg, int unlink_flags)
4657 {
4658 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4659 unlink_flags));
4660 }
4661
4662 /*
4663 * Delete a name from the filesystem using Carbon semantics.
4664 */
4665 int
4666 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4667 {
4668 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4669 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4670 }
4671
4672 /*
4673 * Delete a name from the filesystem using POSIX semantics.
4674 */
4675 int
4676 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4677 {
4678 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4679 uap->path, UIO_USERSPACE, 0));
4680 }
4681
4682 int
4683 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4684 {
4685 if (uap->flag & ~AT_REMOVEDIR)
4686 return (EINVAL);
4687
4688 if (uap->flag & AT_REMOVEDIR)
4689 return (rmdirat_internal(vfs_context_current(), uap->fd,
4690 uap->path, UIO_USERSPACE));
4691 else
4692 return (unlinkat_internal(vfs_context_current(), uap->fd,
4693 NULLVP, uap->path, UIO_USERSPACE, 0));
4694 }
4695
4696 /*
4697 * Reposition read/write file offset.
4698 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
		/* fp_getfvp returns ENOTSUP for non-vnode fds; map to ESPIPE. */
		if (error == ENOTSUP)
			return (ESPIPE);
		return (error);
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return(ESPIPE);
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* An L_INCR seek by 0 only queries the offset; check "get" not "change". */
	if (uap->whence == L_INCR && uap->offset == 0)
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	else
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	if (error) {
		file_drop(uap->fd);
		return (error);
	}
#endif
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}

	/* L_INCR/L_XTND/L_SET correspond to SEEK_CUR/SEEK_END/SEEK_SET. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->f_fglob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0)
			break;
		offset += file_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->f_fglob->fg_offset = offset;
				*retval = fp->f_fglob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
4781
4782
4783 /*
4784 * Check access permissions.
4785 *
4786 * Returns: 0 Success
4787 * vnode_authorize:???
4788 */
4789 static int
4790 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4791 {
4792 kauth_action_t action;
4793 int error;
4794
4795 /*
4796 * If just the regular access bits, convert them to something
4797 * that vnode_authorize will understand.
4798 */
4799 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4800 action = 0;
4801 if (uflags & R_OK)
4802 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
4803 if (uflags & W_OK) {
4804 if (vnode_isdir(vp)) {
4805 action |= KAUTH_VNODE_ADD_FILE |
4806 KAUTH_VNODE_ADD_SUBDIRECTORY;
4807 /* might want delete rights here too */
4808 } else {
4809 action |= KAUTH_VNODE_WRITE_DATA;
4810 }
4811 }
4812 if (uflags & X_OK) {
4813 if (vnode_isdir(vp)) {
4814 action |= KAUTH_VNODE_SEARCH;
4815 } else {
4816 action |= KAUTH_VNODE_EXECUTE;
4817 }
4818 }
4819 } else {
4820 /* take advantage of definition of uflags */
4821 action = uflags >> 8;
4822 }
4823
4824 #if CONFIG_MACF
4825 error = mac_vnode_check_access(ctx, vp, uflags);
4826 if (error)
4827 return (error);
4828 #endif /* MAC */
4829
4830 /* action == 0 means only check for existence */
4831 if (action != 0) {
4832 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4833 } else {
4834 error = 0;
4835 }
4836
4837 return(error);
4838 }
4839
4840
4841
/*
 * access_extended: Check access permissions in bulk.
 *
 * Description:	uap->entries		Pointer to an array of accessx
 *					descriptor structs, plus one or
 *					more NULL terminated strings (see
 *					"Notes" section below).
 *		uap->size		Size of the area pointed to by
 *					uap->entries.
 *		uap->results		Pointer to the results array.
 *
 * Returns:	0			Success
 *		ENOMEM			Insufficient memory
 *		EINVAL			Invalid arguments
 *		namei:EFAULT		Bad address
 *		namei:ENAMETOOLONG	Filename too long
 *		namei:ENOENT		No such file or directory
 *		namei:ELOOP		Too many levels of symbolic links
 *		namei:EBADF		Bad file descriptor
 *		namei:ENOTDIR		Not a directory
 *		namei:???
 *		access1:
 *
 * Implicit returns:
 *		uap->results		Array contents modified
 *
 * Notes:	The uap->entries are structured as an arbitrary length array
 *		of accessx descriptors, followed by one or more NULL terminated
 *		strings
 *
 *			struct accessx_descriptor[0]
 *			...
 *			struct accessx_descriptor[n]
 *			char name_data[0];
 *
 *		We determine the entry count by walking the buffer containing
 *		the uap->entries argument descriptor.  For each descriptor we
 *		see, the valid values for the offset ad_name_offset will be
 *		in the byte range:
 *
 *			[ uap->entries + sizeof(struct accessx_descriptor) ]
 *						to
 *				[ uap->entries + uap->size - 2 ]
 *
 *		since we must have at least one string, and the string must
 *		be at least one character plus the NULL terminator in length.
 *
 * XXX:		Need to support the check-as uid argument
 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	unsigned int desc_max, desc_actual, i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL cred marks "not yet allocated" for the cleanup path below */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE)
		return(ENOMEM);
	if (uap->size < (sizeof(struct accessx_descriptor) + 2))
		return(EINVAL);
	/* small requests are served from the stack to avoid an allocation */
	if (uap->size <= sizeof (stack_input)) {
		input = stack_input;
	} else {
		MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error)
		goto out;

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual)
			desc_actual = j;
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
				if (input[j].ad_flags & _DELETE_OK)
					wantdelete = 1;

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete)
				niopts |= WANTPARENT;

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			       CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			       &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete)
					dvp = nd.ni_dvp;
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file errors are recorded in
		 * the results array; anything else aborts the whole call.
		 */
		switch(error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input)
		FREE(input, M_TEMP);
	if (result)
		FREE(result, M_TEMP);
	if (vp)
		vnode_put(vp);
	if (dvp)
		vnode_put(dvp);
	if (IS_VALID_CRED(context.vc_ucred))
		kauth_cred_unref(&context.vc_ucred);
	return(error);
}
5112
5113
/*
 * Common implementation of access(2)/faccessat(2).
 *
 * Returns:	0			Success
 *	namei:EFAULT			Bad address
 *	namei:ENAMETOOLONG		Filename too long
 *	namei:ENOENT			No such file or directory
 *	namei:ELOOP			Too many levels of symbolic links
 *	namei:EBADF			Bad file descriptor
 *	namei:ENOTDIR			Not a directory
 *	namei:???
 *	access1:
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS))
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	else
		context.vc_ucred = ctx->vc_ucred;
	context.vc_thread = ctx->vc_thread;


	niopts = FOLLOW | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK)
		niopts |= WANTPARENT;
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	       path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK)
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif
	error = nameiat(&nd, fd);
	if (error)
		goto out;

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* drop the iocounts taken by the lookup; dvp only if WANTPARENT */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK)
		vnode_put(nd.ni_dvp);
	nameidone(&nd);

out:
	/* only unref the cred if we allocated a real-identity copy above */
	if (!(flag & AT_EACCESS))
		kauth_cred_unref(&context.vc_ucred);
	return (error);
}
5197
5198 int
5199 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5200 {
5201 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5202 uap->path, uap->flags, 0, UIO_USERSPACE));
5203 }
5204
5205 int
5206 faccessat(__unused proc_t p, struct faccessat_args *uap,
5207 __unused int32_t *retval)
5208 {
5209 if (uap->flag & ~AT_EACCESS)
5210 return (EINVAL);
5211
5212 return (faccessat_internal(vfs_context_current(), uap->fd,
5213 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5214 }
5215
5216 /*
5217 * Returns: 0 Success
5218 * EFAULT
5219 * copyout:EFAULT
5220 * namei:???
5221 * vn_stat:???
5222 */
5223 static int
5224 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5225 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5226 enum uio_seg segflg, int fd, int flag)
5227 {
5228 struct nameidata nd;
5229 int follow;
5230 union {
5231 struct stat sb;
5232 struct stat64 sb64;
5233 } source;
5234 union {
5235 struct user64_stat user64_sb;
5236 struct user32_stat user32_sb;
5237 struct user64_stat64 user64_sb64;
5238 struct user32_stat64 user32_sb64;
5239 } dest;
5240 caddr_t sbp;
5241 int error, my_size;
5242 kauth_filesec_t fsec;
5243 size_t xsecurity_bufsize;
5244 void * statptr;
5245
5246 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5247 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5248 segflg, path, ctx);
5249
5250 #if NAMEDRSRCFORK
5251 int is_namedstream = 0;
5252 /* stat calls are allowed for resource forks. */
5253 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5254 #endif
5255 error = nameiat(&nd, fd);
5256 if (error)
5257 return (error);
5258 fsec = KAUTH_FILESEC_NONE;
5259
5260 statptr = (void *)&source;
5261
5262 #if NAMEDRSRCFORK
5263 /* Grab reference on the shadow stream file vnode to
5264 * force an inactive on release which will mark it
5265 * for recycle.
5266 */
5267 if (vnode_isnamedstream(nd.ni_vp) &&
5268 (nd.ni_vp->v_parent != NULLVP) &&
5269 vnode_isshadow(nd.ni_vp)) {
5270 is_namedstream = 1;
5271 vnode_ref(nd.ni_vp);
5272 }
5273 #endif
5274
5275 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5276
5277 #if NAMEDRSRCFORK
5278 if (is_namedstream) {
5279 vnode_rele(nd.ni_vp);
5280 }
5281 #endif
5282 vnode_put(nd.ni_vp);
5283 nameidone(&nd);
5284
5285 if (error)
5286 return (error);
5287 /* Zap spare fields */
5288 if (isstat64 != 0) {
5289 source.sb64.st_lspare = 0;
5290 source.sb64.st_qspare[0] = 0LL;
5291 source.sb64.st_qspare[1] = 0LL;
5292 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5293 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5294 my_size = sizeof(dest.user64_sb64);
5295 sbp = (caddr_t)&dest.user64_sb64;
5296 } else {
5297 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5298 my_size = sizeof(dest.user32_sb64);
5299 sbp = (caddr_t)&dest.user32_sb64;
5300 }
5301 /*
5302 * Check if we raced (post lookup) against the last unlink of a file.
5303 */
5304 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5305 source.sb64.st_nlink = 1;
5306 }
5307 } else {
5308 source.sb.st_lspare = 0;
5309 source.sb.st_qspare[0] = 0LL;
5310 source.sb.st_qspare[1] = 0LL;
5311 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5312 munge_user64_stat(&source.sb, &dest.user64_sb);
5313 my_size = sizeof(dest.user64_sb);
5314 sbp = (caddr_t)&dest.user64_sb;
5315 } else {
5316 munge_user32_stat(&source.sb, &dest.user32_sb);
5317 my_size = sizeof(dest.user32_sb);
5318 sbp = (caddr_t)&dest.user32_sb;
5319 }
5320
5321 /*
5322 * Check if we raced (post lookup) against the last unlink of a file.
5323 */
5324 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5325 source.sb.st_nlink = 1;
5326 }
5327 }
5328 if ((error = copyout(sbp, ub, my_size)) != 0)
5329 goto out;
5330
5331 /* caller wants extended security information? */
5332 if (xsecurity != USER_ADDR_NULL) {
5333
5334 /* did we get any? */
5335 if (fsec == KAUTH_FILESEC_NONE) {
5336 if (susize(xsecurity_size, 0) != 0) {
5337 error = EFAULT;
5338 goto out;
5339 }
5340 } else {
5341 /* find the user buffer size */
5342 xsecurity_bufsize = fusize(xsecurity_size);
5343
5344 /* copy out the actual data size */
5345 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5346 error = EFAULT;
5347 goto out;
5348 }
5349
5350 /* if the caller supplied enough room, copy out to it */
5351 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5352 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5353 }
5354 }
5355 out:
5356 if (fsec != KAUTH_FILESEC_NONE)
5357 kauth_filesec_free(fsec);
5358 return (error);
5359 }
5360
5361 /*
5362 * stat_extended: Get file status; with extended security (ACL).
5363 *
5364 * Parameters: p (ignored)
5365 * uap User argument descriptor (see below)
5366 * retval (ignored)
5367 *
5368 * Indirect: uap->path Path of file to get status from
5369 * uap->ub User buffer (holds file status info)
5370 * uap->xsecurity ACL to get (extended security)
5371 * uap->xsecurity_size Size of ACL
5372 *
5373 * Returns: 0 Success
5374 * !0 errno value
5375 *
5376 */
5377 int
5378 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5379 __unused int32_t *retval)
5380 {
5381 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5382 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5383 0));
5384 }
5385
5386 /*
5387 * Returns: 0 Success
5388 * fstatat_internal:??? [see fstatat_internal() in this file]
5389 */
5390 int
5391 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5392 {
5393 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5394 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5395 }
5396
5397 int
5398 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5399 {
5400 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5401 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5402 }
5403
5404 /*
5405 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5406 *
5407 * Parameters: p (ignored)
5408 * uap User argument descriptor (see below)
5409 * retval (ignored)
5410 *
5411 * Indirect: uap->path Path of file to get status from
5412 * uap->ub User buffer (holds file status info)
5413 * uap->xsecurity ACL to get (extended security)
5414 * uap->xsecurity_size Size of ACL
5415 *
5416 * Returns: 0 Success
5417 * !0 errno value
5418 *
5419 */
5420 int
5421 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5422 {
5423 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5424 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5425 0));
5426 }
5427
5428 /*
5429 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5430 *
5431 * Parameters: p (ignored)
5432 * uap User argument descriptor (see below)
5433 * retval (ignored)
5434 *
5435 * Indirect: uap->path Path of file to get status from
5436 * uap->ub User buffer (holds file status info)
5437 * uap->xsecurity ACL to get (extended security)
5438 * uap->xsecurity_size Size of ACL
5439 *
5440 * Returns: 0 Success
5441 * !0 errno value
5442 *
5443 */
5444 int
5445 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5446 {
5447 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5448 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5449 AT_SYMLINK_NOFOLLOW));
5450 }
5451
5452 /*
5453 * Get file status; this version does not follow links.
5454 */
5455 int
5456 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5457 {
5458 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5459 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5460 }
5461
5462 int
5463 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5464 {
5465 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5466 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5467 }
5468
5469 /*
5470 * lstat64_extended: Get file status; can handle large inode numbers; does not
5471 * follow links; with extended security (ACL).
5472 *
5473 * Parameters: p (ignored)
5474 * uap User argument descriptor (see below)
5475 * retval (ignored)
5476 *
5477 * Indirect: uap->path Path of file to get status from
5478 * uap->ub User buffer (holds file status info)
5479 * uap->xsecurity ACL to get (extended security)
5480 * uap->xsecurity_size Size of ACL
5481 *
5482 * Returns: 0 Success
5483 * !0 errno value
5484 *
5485 */
5486 int
5487 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5488 {
5489 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5490 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5491 AT_SYMLINK_NOFOLLOW));
5492 }
5493
5494 int
5495 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5496 {
5497 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5498 return (EINVAL);
5499
5500 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5501 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5502 }
5503
5504 int
5505 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5506 __unused int32_t *retval)
5507 {
5508 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5509 return (EINVAL);
5510
5511 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5512 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5513 }
5514
5515 /*
5516 * Get configurable pathname variables.
5517 *
5518 * Returns: 0 Success
5519 * namei:???
5520 * vn_pathconf:???
5521 *
5522 * Notes: Global implementation constants are intended to be
5523 * implemented in this function directly; all other constants
5524 * are per-FS implementation, and therefore must be handled in
5525 * each respective FS, instead.
5526 *
5527 * XXX We implement some things globally right now that should actually be
5528 * XXX per-FS; we will need to deal with this at some point.
5529 */
5530 /* ARGSUSED */
5531 int
5532 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5533 {
5534 int error;
5535 struct nameidata nd;
5536 vfs_context_t ctx = vfs_context_current();
5537
5538 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5539 UIO_USERSPACE, uap->path, ctx);
5540 error = namei(&nd);
5541 if (error)
5542 return (error);
5543
5544 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5545
5546 vnode_put(nd.ni_vp);
5547 nameidone(&nd);
5548 return (error);
5549 }
5550
/*
 * Return target name of a symbolic link.
 *
 * Reads the link target into 'buf' (at most 'bufsize' bytes, not NUL
 * terminated, per readlink(2) semantics).  On return *retval holds the
 * number of bytes transferred.  NOTE(review): *retval is set even on the
 * error paths (it will be 0 then, since nothing was read) — confirm
 * callers rely only on it when error == 0.
 */
/* ARGSUSED */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* NOFOLLOW: we want the symlink itself, not its target */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	/* safe: we still hold an iocount on vp after nameidone() */
	nameidone(&nd);

	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		/* not a symlink: EINVAL per POSIX */
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0)
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		if (error == 0)
			error = VNOP_READLINK(vp, auio, ctx);
	}
	vnode_put(vp);

	/* bytes actually copied = requested - residual */
	*retval = bufsize - (int)uio_resid(auio);
	return (error);
}
5596
5597 int
5598 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5599 {
5600 enum uio_seg procseg;
5601
5602 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5603 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5604 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5605 uap->count, procseg, retval));
5606 }
5607
5608 int
5609 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5610 {
5611 enum uio_seg procseg;
5612
5613 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5614 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5615 procseg, uap->buf, uap->bufsize, procseg, retval));
5616 }
5617
/*
 * Change file flags.
 *
 * NOTE: vp is expected to carry an iocount from the caller, and this
 * function ALWAYS releases it (vnode_put) before returning — on success
 * and on every error path.  Callers must not touch vp afterwards.
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, flags);
	if (error)
		goto out;
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

	/* filesystem accepted the setattr but does not support flags */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}
out:
	/* consume the caller's iocount unconditionally */
	vnode_put(vp);
	return(error);
}
5656
5657 /*
5658 * Change flags of a file given a path name.
5659 */
5660 /* ARGSUSED */
5661 int
5662 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5663 {
5664 vnode_t vp;
5665 vfs_context_t ctx = vfs_context_current();
5666 int error;
5667 struct nameidata nd;
5668
5669 AUDIT_ARG(fflags, uap->flags);
5670 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5671 UIO_USERSPACE, uap->path, ctx);
5672 error = namei(&nd);
5673 if (error)
5674 return (error);
5675 vp = nd.ni_vp;
5676 nameidone(&nd);
5677
5678 error = chflags1(vp, uap->flags, ctx);
5679
5680 return(error);
5681 }
5682
5683 /*
5684 * Change flags of a file given a file descriptor.
5685 */
5686 /* ARGSUSED */
5687 int
5688 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5689 {
5690 vnode_t vp;
5691 int error;
5692
5693 AUDIT_ARG(fd, uap->fd);
5694 AUDIT_ARG(fflags, uap->flags);
5695 if ( (error = file_vnode(uap->fd, &vp)) )
5696 return (error);
5697
5698 if ((error = vnode_getwithref(vp))) {
5699 file_drop(uap->fd);
5700 return(error);
5701 }
5702
5703 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5704
5705 error = chflags1(vp, uap->flags, vfs_context_current());
5706
5707 file_drop(uap->fd);
5708 return (error);
5709 }
5710
5711 /*
5712 * Change security information on a filesystem object.
5713 *
5714 * Returns: 0 Success
5715 * EPERM Operation not permitted
5716 * vnode_authattr:??? [anything vnode_authattr can return]
5717 * vnode_authorize:??? [anything vnode_authorize can return]
5718 * vnode_setattr:??? [anything vnode_setattr can return]
5719 *
5720 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
5721 * translated to EPERM before being returned.
5722 */
5723 static int
5724 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5725 {
5726 kauth_action_t action;
5727 int error;
5728
5729 AUDIT_ARG(mode, vap->va_mode);
5730 /* XXX audit new args */
5731
5732 #if NAMEDSTREAMS
5733 /* chmod calls are not allowed for resource forks. */
5734 if (vp->v_flag & VISNAMEDSTREAM) {
5735 return (EPERM);
5736 }
5737 #endif
5738
5739 #if CONFIG_MACF
5740 if (VATTR_IS_ACTIVE(vap, va_mode) &&
5741 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5742 return (error);
5743 #endif
5744
5745 /* make sure that the caller is allowed to set this security information */
5746 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5747 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5748 if (error == EACCES)
5749 error = EPERM;
5750 return(error);
5751 }
5752
5753 error = vnode_setattr(vp, vap, ctx);
5754
5755 return (error);
5756 }
5757
5758
5759 /*
5760 * Change mode of a file given a path name.
5761 *
5762 * Returns: 0 Success
5763 * namei:??? [anything namei can return]
5764 * chmod_vnode:??? [anything chmod_vnode can return]
5765 */
5766 static int
5767 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5768 int fd, int flag, enum uio_seg segflg)
5769 {
5770 struct nameidata nd;
5771 int follow, error;
5772
5773 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5774 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5775 segflg, path, ctx);
5776 if ((error = nameiat(&nd, fd)))
5777 return (error);
5778 error = chmod_vnode(ctx, nd.ni_vp, vap);
5779 vnode_put(nd.ni_vp);
5780 nameidone(&nd);
5781 return(error);
5782 }
5783
/*
 * chmod_extended: Change the mode of a file given a path name; with extended
 * argument list (including extended security (ACL)).
 *
 * Parameters:	p			Process requesting the open
 *		uap			User argument descriptor (see below)
 *		retval			(ignored)
 *
 * Indirect:	uap->path		Path to object (same as 'chmod')
 *		uap->uid		UID to set
 *		uap->gid		GID to set
 *		uap->mode		File mode to set (same as 'chmod')
 *		uap->xsecurity		ACL to set (or delete)
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:		We should enummerate the possible errno values here, and where
 *		in the code they originated.
 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* -1 / KAUTH_*_NONE sentinels mean "leave this attribute alone" */
	VATTR_INIT(&va);
	if (uap->mode != -1)
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);

	xsecdst = NULL;
	switch(uap->xsecurity) {
	/* explicit remove request: userspace passes _FILESEC_REMOVE_ACL,
	 * which is the sentinel pointer value 1 */
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* copy the caller's filesec in; xsecdst owns it until freed below */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return(error);
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL)
		kauth_filesec_free(xsecdst);
	return(error);
}
5846
5847 /*
5848 * Returns: 0 Success
5849 * chmodat:??? [anything chmodat can return]
5850 */
5851 static int
5852 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
5853 int flag, enum uio_seg segflg)
5854 {
5855 struct vnode_attr va;
5856
5857 VATTR_INIT(&va);
5858 VATTR_SET(&va, va_mode, mode & ALLPERMS);
5859
5860 return (chmodat(ctx, path, &va, fd, flag, segflg));
5861 }
5862
5863 int
5864 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
5865 {
5866 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5867 AT_FDCWD, 0, UIO_USERSPACE));
5868 }
5869
5870 int
5871 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
5872 {
5873 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5874 return (EINVAL);
5875
5876 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5877 uap->fd, uap->flag, UIO_USERSPACE));
5878 }
5879
5880 /*
5881 * Change mode of a file given a file descriptor.
5882 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Map the descriptor to its vnode; takes a file reference. */
	if ((error = file_vnode(fd, &vp)) != 0)
		return (error);
	/* Take an iocount on the vnode before using it. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return(error);
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = chmod_vnode(vfs_context_current(), vp, vap);
	/* Drop the iocount first, then the file reference taken above. */
	(void)vnode_put(vp);
	file_drop(fd);

	return (error);
}
5905
5906 /*
5907 * fchmod_extended: Change mode of a file given a file descriptor; with
5908 * extended argument list (including extended security (ACL)).
5909 *
5910 * Parameters: p Process requesting to change file mode
5911 * uap User argument descriptor (see below)
5912 * retval (ignored)
5913 *
5914 * Indirect: uap->mode File mode to set (same as 'chmod')
5915 * uap->uid UID to set
5916 * uap->gid GID to set
5917 * uap->xsecurity ACL to set (or delete)
5918 * uap->fd File descriptor of file to change mode
5919 *
5920 * Returns: 0 Success
5921 * !0 errno value
5922 *
5923 */
5924 int
5925 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
5926 {
5927 int error;
5928 struct vnode_attr va;
5929 kauth_filesec_t xsecdst;
5930
5931 AUDIT_ARG(owner, uap->uid, uap->gid);
5932
5933 VATTR_INIT(&va);
5934 if (uap->mode != -1)
5935 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5936 if (uap->uid != KAUTH_UID_NONE)
5937 VATTR_SET(&va, va_uid, uap->uid);
5938 if (uap->gid != KAUTH_GID_NONE)
5939 VATTR_SET(&va, va_gid, uap->gid);
5940
5941 xsecdst = NULL;
5942 switch(uap->xsecurity) {
5943 case USER_ADDR_NULL:
5944 VATTR_SET(&va, va_acl, NULL);
5945 break;
5946 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
5947 VATTR_SET(&va, va_acl, NULL);
5948 break;
5949 /* not being set */
5950 case CAST_USER_ADDR_T(-1):
5951 break;
5952 default:
5953 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5954 return(error);
5955 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5956 }
5957
5958 error = fchmod1(p, uap->fd, &va);
5959
5960
5961 switch(uap->xsecurity) {
5962 case USER_ADDR_NULL:
5963 case CAST_USER_ADDR_T(-1):
5964 break;
5965 default:
5966 if (xsecdst != NULL)
5967 kauth_filesec_free(xsecdst);
5968 }
5969 return(error);
5970 }
5971
5972 int
5973 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
5974 {
5975 struct vnode_attr va;
5976
5977 VATTR_INIT(&va);
5978 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5979
5980 return(fchmod1(p, uap->fd, &va));
5981 }
5982
5983
5984 /*
5985 * Set ownership given a path name.
5986 */
5987 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* AT_SYMLINK_NOFOLLOW gives lchown semantics: act on the link. */
	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL in either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL)
		VATTR_SET(&va, va_uid, uid);
	if (gid != (gid_t)VNOVAL)
		VATTR_SET(&va, va_gid, gid);

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error)
		goto out;
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES)
		error = EPERM;

	vnode_put(vp);
	return (error);
}
6041
6042 int
6043 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6044 {
6045 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6046 uap->uid, uap->gid, 0, UIO_USERSPACE));
6047 }
6048
6049 int
6050 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6051 {
6052 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6053 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6054 }
6055
6056 int
6057 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6058 {
6059 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6060 return (EINVAL);
6061
6062 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6063 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6064 }
6065
6066 /*
6067 * Set ownership given a file descriptor.
6068 */
6069 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Map the descriptor to its vnode; takes a file reference. */
	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	/* Take an iocount before touching the vnode. */
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL in either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != VNOVAL)
		VATTR_SET(&va, va_gid, uap->gid);

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error)
		goto out;
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failure here reports EPERM, not EACCES. */
		if (error == EACCES)
			error = EPERM;
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
6126
6127 static int
6128 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6129 {
6130 int error;
6131
6132 if (usrtvp == USER_ADDR_NULL) {
6133 struct timeval old_tv;
6134 /* XXX Y2038 bug because of microtime argument */
6135 microtime(&old_tv);
6136 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6137 tsp[1] = tsp[0];
6138 } else {
6139 if (IS_64BIT_PROCESS(current_proc())) {
6140 struct user64_timeval tv[2];
6141 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6142 if (error)
6143 return (error);
6144 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6145 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6146 } else {
6147 struct user32_timeval tv[2];
6148 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6149 if (error)
6150 return (error);
6151 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6152 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6153 }
6154 }
6155 return 0;
6156 }
6157
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the access time, ts[1] the modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	/* nullflag: caller passed a NULL tptr (utimes(path, NULL)). */
	if (nullflag)
		va.va_vaflags |= VA_UTIMES_NULL;

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error)
		goto out;
#endif
	/* Explicit times map an auth refusal to EPERM rather than EACCES. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES)
			error = EPERM;
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES)
			error = EPERM;
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

out:
	return error;
}
6204
6205 /*
6206 * Set the access and modification times of a file.
6207 */
6208 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0)
		goto out;

	/* The last argument flags the utimes(path, NULL) "set to now" case. */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	vnode_put(nd.ni_vp);
	return (error);
}
6243
6244 /*
6245 * Set the access and modification times of a file.
6246 */
6247 /* ARGSUSED */
6248 int
6249 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6250 {
6251 struct timespec ts[2];
6252 vnode_t vp;
6253 user_addr_t usrtvp;
6254 int error;
6255
6256 AUDIT_ARG(fd, uap->fd);
6257 usrtvp = uap->tptr;
6258 if ((error = getutimes(usrtvp, ts)) != 0)
6259 return (error);
6260 if ((error = file_vnode(uap->fd, &vp)) != 0)
6261 return (error);
6262 if((error = vnode_getwithref(vp))) {
6263 file_drop(uap->fd);
6264 return(error);
6265 }
6266
6267 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6268 vnode_put(vp);
6269 file_drop(uap->fd);
6270 return(error);
6271 }
6272
6273 /*
6274 * Truncate a file given its path name.
6275 */
6276 /* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	/* Negative lengths are rejected up front. */
	if (uap->length < 0)
		return(EINVAL);
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd)))
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Truncation is expressed as a data-size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error)
		goto out;
#endif

	/* preflight and authorize the size change */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);
out:
	vnode_put(vp);
	return (error);
}
6315
6316 /*
6317 * Truncate a file given a file descriptor.
6318 */
6319 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error ;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	/* Negative lengths are rejected up front. */
	if (uap->length < 0)
		return(EINVAL);

	/* Look up the fileproc; holds a reference until file_drop(). */
	if ( (error = fp_lookup(p,fd,&fp,0)) ) {
		return(error);
	}

	/* POSIX shared memory objects take their own truncate path. */
	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* The descriptor must have been opened for writing. */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* Unlike truncate(2): no vnode_authattr() here, FWRITE suffices. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);
	(void)vnode_put(vp);
out:
	file_drop(fd);
	return (error);
}
6379
6380
6381 /*
6382 * Sync an open file with synchronized I/O _file_ integrity completion
6383 */
6384 /* ARGSUSED */
6385 int
6386 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6387 {
6388 __pthread_testcancel(1);
6389 return(fsync_common(p, uap, MNT_WAIT));
6390 }
6391
6392
6393 /*
6394 * Sync an open file with synchronized I/O _file_ integrity completion
6395 *
6396 * Notes: This is a legacy support function that does not test for
6397 * thread cancellation points.
6398 */
6399 /* ARGSUSED */
6400 int
6401 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6402 {
6403 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6404 }
6405
6406
6407 /*
6408 * Sync an open file with synchronized I/O _data_ integrity completion
6409 */
6410 /* ARGSUSED */
6411 int
6412 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6413 {
6414 __pthread_testcancel(1);
6415 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6416 }
6417
6418
6419 /*
6420 * fsync_common
6421 *
6422 * Common fsync code to support both synchronized I/O file integrity completion
6423 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6424 *
6425 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6426 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
6428 * includes additional metadata unnecessary for retrieving the file data
6429 * contents, such as atime, mtime, ctime, etc., also be committed to stable
6430 * storage.
6431 *
6432 * Parameters: p The process
6433 * uap->fd The descriptor to synchronize
6434 * flags The data integrity flags
6435 *
6436 * Returns: int Success
6437 * fp_getfvp:EBADF Bad file descriptor
6438 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6439 * VNOP_FSYNC:??? unspecified
6440 *
6441 * Notes: We use struct fsync_args because it is a short name, and all
6442 * caller argument structures are otherwise identical.
6443 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve fd to fileproc and vnode; takes a file reference. */
	if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
		return (error);
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (fsync) or MNT_DWAIT (fdatasync). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
6480
6481 /*
6482 * Duplicate files. Source must be a file, target must be a file or
6483 * must not exist.
6484 *
6485 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6486 * perform inheritance correctly.
6487 */
6488 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return(EINVAL);
	}

	/* Look up the source; SAVESTART keeps ni_startdir referenced. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd)))
		return (error);
	fvp = fromnd.ni_vp;

	/* CREATE lookup for the target; parent and leaf both referenced. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only replaced when CPF_OVERWRITE is set. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}
	/* Directories may be neither source nor target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Must be allowed to add an entry to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out;

	if (fvp == tdvp)
		error = EINVAL;
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp)
		error = -1;	/* -1 is translated to success below */
	if (!error)
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp)
		vnode_put(tvp);
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	if (fromnd.ni_startdir)
		vnode_put(fromnd.ni_startdir);
	nameidone(&fromnd);

	/* The fvp == tvp "nothing to do" case reports success. */
	if (error == -1)
		return (0);
	return (error);
}
6566
6567
6568 /*
6569 * Rename files. Source and destination must either both be directories,
6570 * or both not be directories. If target is a directory, it must be empty.
6571 */
6572 /* ARGSUSED */
6573 static int
6574 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
6575 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
6576 {
6577 vnode_t tvp, tdvp;
6578 vnode_t fvp, fdvp;
6579 struct nameidata *fromnd, *tond;
6580 int error;
6581 int do_retry;
6582 int retry_count;
6583 int mntrename;
6584 int need_event;
6585 const char *oname = NULL;
6586 char *from_name = NULL, *to_name = NULL;
6587 int from_len=0, to_len=0;
6588 int holding_mntlock;
6589 mount_t locked_mp = NULL;
6590 vnode_t oparent = NULLVP;
6591 #if CONFIG_FSE
6592 fse_info from_finfo, to_finfo;
6593 #endif
6594 int from_truncated=0, to_truncated;
6595 int batched = 0;
6596 struct vnode_attr *fvap, *tvap;
6597 int continuing = 0;
6598 /* carving out a chunk for structs that are too big to be on stack. */
6599 struct {
6600 struct nameidata from_node, to_node;
6601 struct vnode_attr fv_attr, tv_attr;
6602 } * __rename_data;
6603 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
6604 fromnd = &__rename_data->from_node;
6605 tond = &__rename_data->to_node;
6606
6607 holding_mntlock = 0;
6608 do_retry = 0;
6609 retry_count = 0;
6610 retry:
6611 fvp = tvp = NULL;
6612 fdvp = tdvp = NULL;
6613 fvap = tvap = NULL;
6614 mntrename = FALSE;
6615
6616 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
6617 segflg, from, ctx);
6618 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
6619
6620 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6621 segflg, to, ctx);
6622 tond->ni_flag = NAMEI_COMPOUNDRENAME;
6623
6624 continue_lookup:
6625 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6626 if ( (error = nameiat(fromnd, fromfd)) )
6627 goto out1;
6628 fdvp = fromnd->ni_dvp;
6629 fvp = fromnd->ni_vp;
6630
6631 if (fvp && fvp->v_type == VDIR)
6632 tond->ni_cnd.cn_flags |= WILLBEDIR;
6633 }
6634
6635 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6636 if ( (error = nameiat(tond, tofd)) ) {
6637 /*
6638 * Translate error code for rename("dir1", "dir2/.").
6639 */
6640 if (error == EISDIR && fvp->v_type == VDIR)
6641 error = EINVAL;
6642 goto out1;
6643 }
6644 tdvp = tond->ni_dvp;
6645 tvp = tond->ni_vp;
6646 }
6647
6648 batched = vnode_compound_rename_available(fdvp);
6649 if (!fvp) {
6650 /*
6651 * Claim: this check will never reject a valid rename.
6652 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
6653 * Suppose fdvp and tdvp are not on the same mount.
6654 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
6655 * then you can't move it to within another dir on the same mountpoint.
6656 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
6657 *
6658 * If this check passes, then we are safe to pass these vnodes to the same FS.
6659 */
6660 if (fdvp->v_mount != tdvp->v_mount) {
6661 error = EXDEV;
6662 goto out1;
6663 }
6664 goto skipped_lookup;
6665 }
6666
6667 if (!batched) {
6668 error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL);
6669 if (error) {
6670 if (error == ENOENT) {
6671 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
6672 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6673 /*
6674 * We encountered a race where after doing the namei, tvp stops
6675 * being valid. If so, simply re-drive the rename call from the
6676 * top.
6677 */
6678 do_retry = 1;
6679 retry_count += 1;
6680 }
6681 }
6682 goto out1;
6683 }
6684 }
6685
6686 /*
6687 * If the source and destination are the same (i.e. they're
6688 * links to the same vnode) and the target file system is
6689 * case sensitive, then there is nothing to do.
6690 *
6691 * XXX Come back to this.
6692 */
6693 if (fvp == tvp) {
6694 int pathconf_val;
6695
6696 /*
6697 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
6698 * then assume that this file system is case sensitive.
6699 */
6700 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
6701 pathconf_val != 0) {
6702 goto out1;
6703 }
6704 }
6705
6706 /*
6707 * Allow the renaming of mount points.
6708 * - target must not exist
6709 * - target must reside in the same directory as source
6710 * - union mounts cannot be renamed
6711 * - "/" cannot be renamed
6712 *
6713 * XXX Handle this in VFS after a continued lookup (if we missed
6714 * in the cache to start off)
6715 */
6716 if ((fvp->v_flag & VROOT) &&
6717 (fvp->v_type == VDIR) &&
6718 (tvp == NULL) &&
6719 (fvp->v_mountedhere == NULL) &&
6720 (fdvp == tdvp) &&
6721 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
6722 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
6723 vnode_t coveredvp;
6724
6725 /* switch fvp to the covered vnode */
6726 coveredvp = fvp->v_mount->mnt_vnodecovered;
6727 if ( (vnode_getwithref(coveredvp)) ) {
6728 error = ENOENT;
6729 goto out1;
6730 }
6731 vnode_put(fvp);
6732
6733 fvp = coveredvp;
6734 mntrename = TRUE;
6735 }
6736 /*
6737 * Check for cross-device rename.
6738 */
6739 if ((fvp->v_mount != tdvp->v_mount) ||
6740 (tvp && (fvp->v_mount != tvp->v_mount))) {
6741 error = EXDEV;
6742 goto out1;
6743 }
6744
6745 /*
6746 * If source is the same as the destination (that is the
6747 * same inode number) then there is nothing to do...
6748 * EXCEPT if the underlying file system supports case
6749 * insensitivity and is case preserving. In this case
6750 * the file system needs to handle the special case of
6751 * getting the same vnode as target (fvp) and source (tvp).
6752 *
6753 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
6754 * and _PC_CASE_PRESERVING can have this exception, and they need to
6755 * handle the special case of getting the same vnode as target and
6756 * source. NOTE: Then the target is unlocked going into vnop_rename,
6757 * so not to cause locking problems. There is a single reference on tvp.
6758 *
6759 * NOTE - that fvp == tvp also occurs if they are hard linked and
6760 * that correct behaviour then is just to return success without doing
6761 * anything.
6762 *
6763 * XXX filesystem should take care of this itself, perhaps...
6764 */
6765 if (fvp == tvp && fdvp == tdvp) {
6766 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
6767 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
6768 fromnd->ni_cnd.cn_namelen)) {
6769 goto out1;
6770 }
6771 }
6772
6773 if (holding_mntlock && fvp->v_mount != locked_mp) {
6774 /*
6775 * we're holding a reference and lock
6776 * on locked_mp, but it no longer matches
6777 * what we want to do... so drop our hold
6778 */
6779 mount_unlock_renames(locked_mp);
6780 mount_drop(locked_mp, 0);
6781 holding_mntlock = 0;
6782 }
6783 if (tdvp != fdvp && fvp->v_type == VDIR) {
6784 /*
6785 * serialize renames that re-shape
6786 * the tree... if holding_mntlock is
6787 * set, then we're ready to go...
6788 * otherwise we
6789 * first need to drop the iocounts
6790 * we picked up, second take the
6791 * lock to serialize the access,
6792 * then finally start the lookup
6793 * process over with the lock held
6794 */
6795 if (!holding_mntlock) {
6796 /*
6797 * need to grab a reference on
6798 * the mount point before we
6799 * drop all the iocounts... once
6800 * the iocounts are gone, the mount
6801 * could follow
6802 */
6803 locked_mp = fvp->v_mount;
6804 mount_ref(locked_mp, 0);
6805
6806 /*
6807 * nameidone has to happen before we vnode_put(tvp)
6808 * since it may need to release the fs_nodelock on the tvp
6809 */
6810 nameidone(tond);
6811
6812 if (tvp)
6813 vnode_put(tvp);
6814 vnode_put(tdvp);
6815
6816 /*
6817 * nameidone has to happen before we vnode_put(fdvp)
6818 * since it may need to release the fs_nodelock on the fvp
6819 */
6820 nameidone(fromnd);
6821
6822 vnode_put(fvp);
6823 vnode_put(fdvp);
6824
6825 mount_lock_renames(locked_mp);
6826 holding_mntlock = 1;
6827
6828 goto retry;
6829 }
6830 } else {
6831 /*
6832 * when we dropped the iocounts to take
6833 * the lock, we allowed the identity of
6834 * the various vnodes to change... if they did,
6835 * we may no longer be dealing with a rename
6836 * that reshapes the tree... once we're holding
6837 * the iocounts, the vnodes can't change type
6838 * so we're free to drop the lock at this point
6839 * and continue on
6840 */
6841 if (holding_mntlock) {
6842 mount_unlock_renames(locked_mp);
6843 mount_drop(locked_mp, 0);
6844 holding_mntlock = 0;
6845 }
6846 }
6847
6848 // save these off so we can later verify that fvp is the same
6849 oname = fvp->v_name;
6850 oparent = fvp->v_parent;
6851
6852 skipped_lookup:
6853 #if CONFIG_FSE
6854 need_event = need_fsevent(FSE_RENAME, fdvp);
6855 if (need_event) {
6856 if (fvp) {
6857 get_fse_info(fvp, &from_finfo, ctx);
6858 } else {
6859 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
6860 if (error) {
6861 goto out1;
6862 }
6863
6864 fvap = &__rename_data->fv_attr;
6865 }
6866
6867 if (tvp) {
6868 get_fse_info(tvp, &to_finfo, ctx);
6869 } else if (batched) {
6870 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
6871 if (error) {
6872 goto out1;
6873 }
6874
6875 tvap = &__rename_data->tv_attr;
6876 }
6877 }
6878 #else
6879 need_event = 0;
6880 #endif /* CONFIG_FSE */
6881
6882 if (need_event || kauth_authorize_fileop_has_listeners()) {
6883 if (from_name == NULL) {
6884 GET_PATH(from_name);
6885 if (from_name == NULL) {
6886 error = ENOMEM;
6887 goto out1;
6888 }
6889 }
6890
6891 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
6892
6893 if (to_name == NULL) {
6894 GET_PATH(to_name);
6895 if (to_name == NULL) {
6896 error = ENOMEM;
6897 goto out1;
6898 }
6899 }
6900
6901 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
6902 }
6903 #if CONFIG_SECLUDED_RENAME
6904 if (flags & VFS_SECLUDE_RENAME) {
6905 fromnd->ni_cnd.cn_flags |= CN_SECLUDE_RENAME;
6906 }
6907 #else
6908 #pragma unused(flags)
6909 #endif
6910 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
6911 tdvp, &tvp, &tond->ni_cnd, tvap,
6912 0, ctx);
6913
6914 if (holding_mntlock) {
6915 /*
6916 * we can drop our serialization
6917 * lock now
6918 */
6919 mount_unlock_renames(locked_mp);
6920 mount_drop(locked_mp, 0);
6921 holding_mntlock = 0;
6922 }
6923 if (error) {
6924 if (error == EKEEPLOOKING) {
6925 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6926 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6927 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
6928 }
6929 }
6930
6931 fromnd->ni_vp = fvp;
6932 tond->ni_vp = tvp;
6933
6934 goto continue_lookup;
6935 }
6936
6937 /*
6938 * We may encounter a race in the VNOP where the destination didn't
6939 * exist when we did the namei, but it does by the time we go and
6940 * try to create the entry. In this case, we should re-drive this rename
6941 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
6942 * but other filesystems susceptible to this race could return it, too.
6943 */
6944 if (error == ERECYCLE) {
6945 do_retry = 1;
6946 }
6947
6948 /*
6949 * For compound VNOPs, the authorization callback may return
6950 * ENOENT in case of racing hardlink lookups hitting the name
6951 * cache, redrive the lookup.
6952 */
6953 if (batched && error == ENOENT) {
6954 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
6955 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6956 do_retry = 1;
6957 retry_count += 1;
6958 }
6959 }
6960
6961 goto out1;
6962 }
6963
6964 /* call out to allow 3rd party notification of rename.
6965 * Ignore result of kauth_authorize_fileop call.
6966 */
6967 kauth_authorize_fileop(vfs_context_ucred(ctx),
6968 KAUTH_FILEOP_RENAME,
6969 (uintptr_t)from_name, (uintptr_t)to_name);
6970
6971 #if CONFIG_FSE
6972 if (from_name != NULL && to_name != NULL) {
6973 if (from_truncated || to_truncated) {
6974 // set it here since only the from_finfo gets reported up to user space
6975 from_finfo.mode |= FSE_TRUNCATED_PATH;
6976 }
6977
6978 if (tvap && tvp) {
6979 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
6980 }
6981 if (fvap) {
6982 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
6983 }
6984
6985 if (tvp) {
6986 add_fsevent(FSE_RENAME, ctx,
6987 FSE_ARG_STRING, from_len, from_name,
6988 FSE_ARG_FINFO, &from_finfo,
6989 FSE_ARG_STRING, to_len, to_name,
6990 FSE_ARG_FINFO, &to_finfo,
6991 FSE_ARG_DONE);
6992 } else {
6993 add_fsevent(FSE_RENAME, ctx,
6994 FSE_ARG_STRING, from_len, from_name,
6995 FSE_ARG_FINFO, &from_finfo,
6996 FSE_ARG_STRING, to_len, to_name,
6997 FSE_ARG_DONE);
6998 }
6999 }
7000 #endif /* CONFIG_FSE */
7001
7002 /*
7003 * update filesystem's mount point data
7004 */
7005 if (mntrename) {
7006 char *cp, *pathend, *mpname;
7007 char * tobuf;
7008 struct mount *mp;
7009 int maxlen;
7010 size_t len = 0;
7011
7012 mp = fvp->v_mountedhere;
7013
7014 if (vfs_busy(mp, LK_NOWAIT)) {
7015 error = EBUSY;
7016 goto out1;
7017 }
7018 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7019
7020 if (UIO_SEG_IS_USER_SPACE(segflg))
7021 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7022 else
7023 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7024 if (!error) {
7025 /* find current mount point prefix */
7026 pathend = &mp->mnt_vfsstat.f_mntonname[0];
7027 for (cp = pathend; *cp != '\0'; ++cp) {
7028 if (*cp == '/')
7029 pathend = cp + 1;
7030 }
7031 /* find last component of target name */
7032 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7033 if (*cp == '/')
7034 mpname = cp + 1;
7035 }
7036 /* append name to prefix */
7037 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7038 bzero(pathend, maxlen);
7039 strlcpy(pathend, mpname, maxlen);
7040 }
7041 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7042
7043 vfs_unbusy(mp);
7044 }
7045 /*
7046 * fix up name & parent pointers. note that we first
7047 * check that fvp has the same name/parent pointers it
7048 * had before the rename call... this is a 'weak' check
7049 * at best...
7050 *
7051 * XXX oparent and oname may not be set in the compound vnop case
7052 */
7053 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7054 int update_flags;
7055
7056 update_flags = VNODE_UPDATE_NAME;
7057
7058 if (fdvp != tdvp)
7059 update_flags |= VNODE_UPDATE_PARENT;
7060
7061 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7062 }
7063 out1:
7064 if (to_name != NULL) {
7065 RELEASE_PATH(to_name);
7066 to_name = NULL;
7067 }
7068 if (from_name != NULL) {
7069 RELEASE_PATH(from_name);
7070 from_name = NULL;
7071 }
7072 if (holding_mntlock) {
7073 mount_unlock_renames(locked_mp);
7074 mount_drop(locked_mp, 0);
7075 holding_mntlock = 0;
7076 }
7077 if (tdvp) {
7078 /*
7079 * nameidone has to happen before we vnode_put(tdvp)
7080 * since it may need to release the fs_nodelock on the tdvp
7081 */
7082 nameidone(tond);
7083
7084 if (tvp)
7085 vnode_put(tvp);
7086 vnode_put(tdvp);
7087 }
7088 if (fdvp) {
7089 /*
7090 * nameidone has to happen before we vnode_put(fdvp)
7091 * since it may need to release the fs_nodelock on the fdvp
7092 */
7093 nameidone(fromnd);
7094
7095 if (fvp)
7096 vnode_put(fvp);
7097 vnode_put(fdvp);
7098 }
7099
7100 /*
7101 * If things changed after we did the namei, then we will re-drive
7102 * this rename call from the top.
7103 */
7104 if (do_retry) {
7105 do_retry = 0;
7106 goto retry;
7107 }
7108
7109 FREE(__rename_data, M_TEMP);
7110 return (error);
7111 }
7112
7113 int
7114 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7115 {
7116 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7117 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7118 }
7119
#if CONFIG_SECLUDED_RENAME
/*
 * rename_ext() system call: like rename(), but the caller-supplied flags
 * word is passed through verbatim to renameat_internal().
 */
int rename_ext(__unused proc_t p, struct rename_ext_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();

	return renameat_internal(ctx,
	    AT_FDCWD, uap->from, AT_FDCWD, uap->to,
	    UIO_USERSPACE, uap->flags);
}
#endif
7130
7131 int
7132 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7133 {
7134 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7135 uap->tofd, uap->to, UIO_USERSPACE, 0));
7136 }
7137
/*
 * Make a directory file.
 *
 * The lookup is initiated as a compound VNOP (NAMEI_COMPOUNDMKDIR) so
 * that filesystems which support it can perform lookup + mkdir in one
 * operation; vn_create() may return EKEEPLOOKING, in which case we loop
 * back and continue the partially-completed lookup.
 *
 * Parameters:	ctx		vfs context under which to authorize/create
 *		path		pathname of the directory to create
 *		vap		attributes (mode, etc.); va_type is forced to VDIR
 *		fd		directory fd anchoring a relative path (or AT_FDCWD)
 *		segflg		whether 'path' is a user or kernel address
 *
 * Returns: 0			Success
 *		EEXIST
 *	namei:???
 *	vnode_authorize:???
 *	vn_create:???
 */
/* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t	vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* "batched" == the FS will do authorization + mkdir in one VNOP. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report the original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP wants the lookup continued from where it stopped. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL)
		update_flags |= VNODE_UPDATE_NAME;
	if (vp->v_parent == NULLVP)
		update_flags |= VNODE_UPDATE_PARENT;

	if (update_flags)
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	if (dvp)
		vnode_put(dvp);

	return (error);
}
7253
7254 /*
7255 * mkdir_extended: Create a directory; with extended security (ACL).
7256 *
7257 * Parameters: p Process requesting to create the directory
7258 * uap User argument descriptor (see below)
7259 * retval (ignored)
7260 *
7261 * Indirect: uap->path Path of directory to create
7262 * uap->mode Access permissions to set
7263 * uap->xsecurity ACL to set
7264 *
7265 * Returns: 0 Success
7266 * !0 Not success
7267 *
7268 */
7269 int
7270 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7271 {
7272 int ciferror;
7273 kauth_filesec_t xsecdst;
7274 struct vnode_attr va;
7275
7276 AUDIT_ARG(owner, uap->uid, uap->gid);
7277
7278 xsecdst = NULL;
7279 if ((uap->xsecurity != USER_ADDR_NULL) &&
7280 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7281 return ciferror;
7282
7283 VATTR_INIT(&va);
7284 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7285 if (xsecdst != NULL)
7286 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7287
7288 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7289 UIO_USERSPACE);
7290 if (xsecdst != NULL)
7291 kauth_filesec_free(xsecdst);
7292 return ciferror;
7293 }
7294
7295 int
7296 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7297 {
7298 struct vnode_attr va;
7299
7300 VATTR_INIT(&va);
7301 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7302
7303 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7304 UIO_USERSPACE));
7305 }
7306
7307 int
7308 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7309 {
7310 struct vnode_attr va;
7311
7312 VATTR_INIT(&va);
7313 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7314
7315 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7316 UIO_USERSPACE));
7317 }
7318
/*
 * Remove a directory, shared guts of rmdir()/unlinkat(AT_REMOVEDIR).
 *
 * Uses a compound-VNOP lookup (NAMEI_COMPOUNDRMDIR) where the filesystem
 * supports it, so lookup + authorization + rmdir can happen in one VNOP.
 * The whole operation sits inside a retry loop: the AppleDouble-orphan
 * cleanup and racing compound-authorization ENOENTs can both request a
 * restart via 'restart_flag'.
 *
 * Parameters:	ctx	vfs context for authorization
 *		fd	directory fd anchoring a relative 'dirpath' (or AT_FDCWD)
 *		dirpath	pathname of the directory to remove
 *		segflg	user vs. kernel address space for 'dirpath'
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char *path = NULL;
	int len = 0;
	int has_listeners = 0;	/* kauth fileop listeners registered? */
	int need_event = 0;	/* fsevent watcher wants FSE_DELETE? */
	int truncated = 0;	/* safe_getpath() truncated the path? */
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;	/* bounded by MAX_AUTHORIZE_ENOENT_RETRIES */
	int batched;		/* compound rmdir available on this FS? */

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error)
			return (error);

		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				/* Non-compound path: authorize explicitly before the VNOP. */
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup hit a stale name-cache entry; retry. */
						assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS must resolve+remove in the compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched: ask the FS to fill notify attrs during the VNOP. */
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
#if CONFIG_FSE
			if (truncated) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		/* vn_rmdir() may have produced/changed vp; keep nd in sync. */
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}
#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (error == EBUSY) {
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!error)
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len, path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}
		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp)
			vnode_put(vp);

		if (restart_flag == 0) {
			/*
			 * NOTE(review): vp is used here only as a wakeup/sleep
			 * channel address after its reference was dropped above;
			 * it may also be NULL on some paths — confirm wakeup_one/
			 * tsleep tolerate both.
			 */
			wakeup_one((caddr_t)vp);
			return (error);
		}
		/* Brief sleep before retrying (AppleDouble / race restart). */
		tsleep(vp, PVFS, "rm AD", 1);

	} while (restart_flag != 0);

	return (error);

}
7538
7539 /*
7540 * Remove a directory file.
7541 */
7542 /* ARGSUSED */
7543 int
7544 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
7545 {
7546 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
7547 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
7548 }
7549
/* Get direntry length padded to 8 byte alignment */
/* (struct direntry embeds a MAXPATHLEN-sized name; this computes the
 * size actually occupied by a record with a 'namlen'-byte name.) */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Read directory entries from 'vp' in the extended (64-bit inode/offset,
 * struct direntry) format.
 *
 * If the filesystem natively supports VNODE_READDIR_EXTENDED (and the
 * mount does not deny it), the request is passed straight through to
 * VNOP_READDIR.  Otherwise, legacy 'struct dirent' records are read into
 * a wired kernel buffer and re-packed one at a time into 'struct
 * direntry' records which are copied out to the caller's uio.
 */
errno_t
vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
    int *numdirent, vfs_context_t ctxp)
{
	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
	} else {
		size_t bufsize;
		void * bufptr;
		uio_t auio;
		struct direntry *entry64;
		struct dirent *dep;
		int bytesread;
		int error;

		/*
		 * Our kernel buffer needs to be smaller since re-packing
		 * will expand each dirent. The worse case (when the name
		 * length is 3) corresponds to a struct direntry size of 32
		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
		 * (4-byte aligned). So having a buffer that is 3/8 the size
		 * will prevent us from reading more than we can pack.
		 *
		 * Since this buffer is wired memory, we will limit the
		 * buffer size to a maximum of 32K. We would really like to
		 * use 32K in the MIN(), but we use magic number 87371 to
		 * prevent uio_resid() * 3 / 8 from overflowing.
		 */
		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
		MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
		if (bufptr == NULL) {
			return ENOMEM;
		}

		/* Kernel-space uio aimed at our bounce buffer. */
		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
		auio->uio_offset = uio->uio_offset;

		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);

		dep = (struct dirent *)bufptr;
		bytesread = bufsize - uio_resid(auio);

		MALLOC(entry64, struct direntry *, sizeof(struct direntry),
		    M_TEMP, M_WAITOK);
		/*
		 * Convert all the entries and copy them out to user's buffer.
		 */
		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
			size_t enbufsize = DIRENT64_LEN(dep->d_namlen);

			bzero(entry64, enbufsize);
			/* Convert a dirent to a dirent64. */
			entry64->d_ino = dep->d_ino;
			entry64->d_seekoff = 0;
			entry64->d_reclen = enbufsize;
			entry64->d_namlen = dep->d_namlen;
			entry64->d_type = dep->d_type;
			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);

			/* Move to next entry. */
			/* NOTE(review): a d_reclen of 0 from a misbehaving FS
			 * would loop forever here — confirm filesystems
			 * guarantee d_reclen > 0 for returned entries. */
			dep = (struct dirent *)((char *)dep + dep->d_reclen);

			/* Copy entry64 to user's buffer. */
			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
		}

		/* Update the real offset using the offset we got from VNOP_READDIR. */
		if (error == 0) {
			uio->uio_offset = auio->uio_offset;
		}
		uio_free(auio);
		FREE(bufptr, M_TEMP);
		FREE(entry64, M_TEMP);
		return (error);
	}
}
7633
/* Upper bound on the user buffer size honored in a single call. */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)

/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared implementation for getdirentries() and getdirentries64().
 * 'flags' selects legacy (0) vs. extended (VNODE_READDIR_EXTENDED)
 * record format.  On success, *bytesread is the number of bytes placed
 * in the user buffer and *offset (if non-NULL) is the directory offset
 * at which this read began.  Handles union mounts by transparently
 * descending to the lower directory when the upper one is exhausted.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, eofflag, numdirent;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* Translate the fd to a fileproc + vnode (takes an fd reference). */
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return (error);
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
		bufsize = GETDIRENTRIES_MAXBUFSIZE;

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error)
		goto out;
#endif
	if ( (error = vnode_getwithref(vp)) ) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the file's current offset, into the user buffer. */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/* Empty read: may need to fall through to the lower union layer. */
	if ((user_ssize_t)bufsize == uio_resid(auio)){
		if (union_dircheckp) {
			/* -1 means "switched vnodes, re-read from the new vp". */
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1)
				goto unionread;
			if (error)
				goto out;
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				/* Retarget the open file at the lower directory. */
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return (error);
}
7739
7740
7741 int
7742 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
7743 {
7744 off_t offset;
7745 ssize_t bytesread;
7746 int error;
7747
7748 AUDIT_ARG(fd, uap->fd);
7749 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
7750
7751 if (error == 0) {
7752 if (proc_is64bit(p)) {
7753 user64_long_t base = (user64_long_t)offset;
7754 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
7755 } else {
7756 user32_long_t base = (user32_long_t)offset;
7757 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
7758 }
7759 *retval = bytesread;
7760 }
7761 return (error);
7762 }
7763
7764 int
7765 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
7766 {
7767 off_t offset;
7768 ssize_t bytesread;
7769 int error;
7770
7771 AUDIT_ARG(fd, uap->fd);
7772 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
7773
7774 if (error == 0) {
7775 *retval = bytesread;
7776 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
7777 }
7778 return (error);
7779 }
7780
7781
7782 /*
7783 * Set the mode mask for creation of filesystem nodes.
7784 * XXX implement xsecurity
7785 */
7786 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
7787 static int
7788 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
7789 {
7790 struct filedesc *fdp;
7791
7792 AUDIT_ARG(mask, newmask);
7793 proc_fdlock(p);
7794 fdp = p->p_fd;
7795 *retval = fdp->fd_cmask;
7796 fdp->fd_cmask = newmask & ALLPERMS;
7797 proc_fdunlock(p);
7798 return (0);
7799 }
7800
7801 /*
7802 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
7803 *
7804 * Parameters: p Process requesting to set the umask
7805 * uap User argument descriptor (see below)
7806 * retval umask of the process (parameter p)
7807 *
7808 * Indirect: uap->newmask umask to set
7809 * uap->xsecurity ACL to set
7810 *
7811 * Returns: 0 Success
7812 * !0 Not success
7813 *
7814 */
7815 int
7816 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
7817 {
7818 int ciferror;
7819 kauth_filesec_t xsecdst;
7820
7821 xsecdst = KAUTH_FILESEC_NONE;
7822 if (uap->xsecurity != USER_ADDR_NULL) {
7823 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
7824 return ciferror;
7825 } else {
7826 xsecdst = KAUTH_FILESEC_NONE;
7827 }
7828
7829 ciferror = umask1(p, uap->newmask, xsecdst, retval);
7830
7831 if (xsecdst != KAUTH_FILESEC_NONE)
7832 kauth_filesec_free(xsecdst);
7833 return ciferror;
7834 }
7835
7836 int
7837 umask(proc_t p, struct umask_args *uap, int32_t *retval)
7838 {
7839 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
7840 }
7841
/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * Only character/block special files may be revoked, and the caller
 * must either own the device node or be superuser.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that is currently mounted on. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error)
		goto out;
#endif

	/* Owner-or-superuser check against the node's uid. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx)))
		goto out;
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
		goto out;
	/* Only bother if someone actually holds a use count or an alias. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
		VNOP_REVOKE(vp, REVOKEALL, ctx);
out:
	vnode_put(vp);
	return (error);
}
7894
7895
7896 /*
7897 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
7898 * The following system calls are designed to support features
7899 * which are specific to the HFS & HFS Plus volume formats
7900 */
7901
7902
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads directory entries via VNOP_READDIRATTR, returning for each entry
 * the attributes requested in uap->alist.  *retval is the eof flag
 * (similar to getdirentries); the entry count, a directory state cookie,
 * and the starting offset are copied out through uap->count,
 * uap->newstate and uap->basep respectively.  Union mounts are handled
 * by descending to the lower directory when the upper one is exhausted.
 */
/* ARGSUSED */
int
getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count, savecount;	/* savecount: restored on union fallthrough */
	uint32_t newstate;		/* directory-change cookie from the FS */
	int error, eofflag;
	uint32_t loff;			/* directory offset at which this read began */
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[ UIO_SIZEOF(1) ];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return(error);
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return(error);
	}
	savecount = count;
	if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
		return (error);
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error)
		goto out;
#endif


	if ( (error = vnode_getwithref(vp)) )
		goto out;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr)
		action |= KAUTH_VNODE_SEARCH;

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {

		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Retarget the open file at the lower directory. */
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	if (error)
		goto out;
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
		goto out;
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
		goto out;
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
		goto out;

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return (error); /* return error earlier, an retval of 0 or 1 now */

} /* end of getdirentriesattr system call */
8047
/*
 * Exchange data between two files
 *
 * Atomically swaps the data forks of two regular files on the same
 * volume via VNOP_EXCHANGE, then swaps their cached names/parents in the
 * VFS name cache so the identities follow the swap.  Emits FSE_EXCHANGE
 * fsevents and a KAUTH_FILEOP_EXCHANGE callout when anyone is listening.
 */

/* ARGSUSED */
int
exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{

	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen=0, slen=0;
	int from_truncated=0, to_truncated=0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	/* FSOPT_NOFOLLOW suppresses symlink traversal on both paths. */
	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error)
		goto out2;

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error)
		goto out;
#endif
	/* Caller needs both read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
		goto out;

	/* Gather paths/finfo up front only if someone will consume them. */
	if (
#if CONFIG_FSE
	    need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
	    kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/* Swap cached names and parents so identity follows the data. */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL)
		RELEASE_PATH(fpath);
	if (spath != NULL)
		RELEASE_PATH(spath);
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return (error);
}
8201
8202 /*
8203 * Return (in MB) the amount of freespace on the given vnode's volume.
8204 */
8205 uint32_t freespace_mb(vnode_t vp);
8206
8207 uint32_t
8208 freespace_mb(vnode_t vp)
8209 {
8210 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8211 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8212 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
8213 }
8214
8215 #if CONFIG_SEARCHFS
8216
8217 /* ARGSUSED */
8218
8219 int
8220 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8221 {
8222 vnode_t vp, tvp;
8223 int i, error=0;
8224 int fserror = 0;
8225 struct nameidata nd;
8226 struct user64_fssearchblock searchblock;
8227 struct searchstate *state;
8228 struct attrlist *returnattrs;
8229 struct timeval timelimit;
8230 void *searchparams1,*searchparams2;
8231 uio_t auio = NULL;
8232 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8233 uint32_t nummatches;
8234 int mallocsize;
8235 uint32_t nameiflags;
8236 vfs_context_t ctx = vfs_context_current();
8237 char uio_buf[ UIO_SIZEOF(1) ];
8238
8239 /* Start by copying in fsearchblock parameter list */
8240 if (IS_64BIT_PROCESS(p)) {
8241 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8242 timelimit.tv_sec = searchblock.timelimit.tv_sec;
8243 timelimit.tv_usec = searchblock.timelimit.tv_usec;
8244 }
8245 else {
8246 struct user32_fssearchblock tmp_searchblock;
8247
8248 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8249 // munge into 64-bit version
8250 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8251 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8252 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8253 searchblock.maxmatches = tmp_searchblock.maxmatches;
8254 /*
8255 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8256 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8257 */
8258 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8259 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8260 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8261 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8262 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8263 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8264 searchblock.searchattrs = tmp_searchblock.searchattrs;
8265 }
8266 if (error)
8267 return(error);
8268
8269 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8270 */
8271 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8272 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8273 return(EINVAL);
8274
8275 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8276 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
8277 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8278 /* block. */
8279 /* */
8280 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
8281 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
8282 /* assumes the size is still 556 bytes it will continue to work */
8283
8284 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8285 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8286
8287 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8288
8289 /* Now set up the various pointers to the correct place in our newly allocated memory */
8290
8291 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8292 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8293 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8294
8295 /* Now copy in the stuff given our local variables. */
8296
8297 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8298 goto freeandexit;
8299
8300 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8301 goto freeandexit;
8302
8303 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8304 goto freeandexit;
8305
8306 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8307 goto freeandexit;
8308
8309 /*
8310 * When searching a union mount, need to set the
8311 * start flag at the first call on each layer to
8312 * reset state for the new volume.
8313 */
8314 if (uap->options & SRCHFS_START)
8315 state->ss_union_layer = 0;
8316 else
8317 uap->options |= state->ss_union_flags;
8318 state->ss_union_flags = 0;
8319
8320 /*
8321 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8322 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8323 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8324 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8325 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8326 */
8327
8328 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8329 attrreference_t* string_ref;
8330 u_int32_t* start_length;
8331 user64_size_t param_length;
8332
8333 /* validate searchparams1 */
8334 param_length = searchblock.sizeofsearchparams1;
8335 /* skip the word that specifies length of the buffer */
8336 start_length= (u_int32_t*) searchparams1;
8337 start_length= start_length+1;
8338 string_ref= (attrreference_t*) start_length;
8339
8340 /* ensure no negative offsets or too big offsets */
8341 if (string_ref->attr_dataoffset < 0 ) {
8342 error = EINVAL;
8343 goto freeandexit;
8344 }
8345 if (string_ref->attr_length > MAXPATHLEN) {
8346 error = EINVAL;
8347 goto freeandexit;
8348 }
8349
8350 /* Check for pointer overflow in the string ref */
8351 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8352 error = EINVAL;
8353 goto freeandexit;
8354 }
8355
8356 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8357 error = EINVAL;
8358 goto freeandexit;
8359 }
8360 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8361 error = EINVAL;
8362 goto freeandexit;
8363 }
8364 }
8365
8366 /* set up the uio structure which will contain the users return buffer */
8367 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8368 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8369
8370 nameiflags = 0;
8371 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8372 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8373 UIO_USERSPACE, uap->path, ctx);
8374
8375 error = namei(&nd);
8376 if (error)
8377 goto freeandexit;
8378 vp = nd.ni_vp;
8379 nameidone(&nd);
8380
8381 /*
8382 * Switch to the root vnode for the volume
8383 */
8384 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8385 vnode_put(vp);
8386 if (error)
8387 goto freeandexit;
8388 vp = tvp;
8389
8390 /*
8391 * If it's a union mount, the path lookup takes
8392 * us to the top layer. But we may need to descend
8393 * to a lower layer. For non-union mounts the layer
8394 * is always zero.
8395 */
8396 for (i = 0; i < (int) state->ss_union_layer; i++) {
8397 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
8398 break;
8399 tvp = vp;
8400 vp = vp->v_mount->mnt_vnodecovered;
8401 if (vp == NULL) {
8402 vnode_put(tvp);
8403 error = ENOENT;
8404 goto freeandexit;
8405 }
8406 vnode_getwithref(vp);
8407 vnode_put(tvp);
8408 }
8409
8410 #if CONFIG_MACF
8411 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
8412 if (error) {
8413 vnode_put(vp);
8414 goto freeandexit;
8415 }
8416 #endif
8417
8418
8419 /*
8420 * If searchblock.maxmatches == 0, then skip the search. This has happened
8421 * before and sometimes the underlying code doesnt deal with it well.
8422 */
8423 if (searchblock.maxmatches == 0) {
8424 nummatches = 0;
8425 goto saveandexit;
8426 }
8427
8428 /*
8429 * Allright, we have everything we need, so lets make that call.
8430 *
8431 * We keep special track of the return value from the file system:
8432 * EAGAIN is an acceptable error condition that shouldn't keep us
8433 * from copying out any results...
8434 */
8435
8436 fserror = VNOP_SEARCHFS(vp,
8437 searchparams1,
8438 searchparams2,
8439 &searchblock.searchattrs,
8440 (u_long)searchblock.maxmatches,
8441 &timelimit,
8442 returnattrs,
8443 &nummatches,
8444 (u_long)uap->scriptcode,
8445 (u_long)uap->options,
8446 auio,
8447 (struct searchstate *) &state->ss_fsstate,
8448 ctx);
8449
8450 /*
8451 * If it's a union mount we need to be called again
8452 * to search the mounted-on filesystem.
8453 */
8454 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
8455 state->ss_union_flags = SRCHFS_START;
8456 state->ss_union_layer++; // search next layer down
8457 fserror = EAGAIN;
8458 }
8459
8460 saveandexit:
8461
8462 vnode_put(vp);
8463
8464 /* Now copy out the stuff that needs copying out. That means the number of matches, the
8465 search state. Everything was already put into he return buffer by the vop call. */
8466
8467 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
8468 goto freeandexit;
8469
8470 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
8471 goto freeandexit;
8472
8473 error = fserror;
8474
8475 freeandexit:
8476
8477 FREE(searchparams1,M_TEMP);
8478
8479 return(error);
8480
8481
8482 } /* end of searchfs system call */
8483
8484 #else /* CONFIG_SEARCHFS */
8485
8486 int
8487 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
8488 {
8489 return (ENOTSUP);
8490 }
8491
8492 #endif /* CONFIG_SEARCHFS */
8493
8494
/* Lock machinery backing the namespace-handler event table below. */
lck_grp_attr_t * nspace_group_attr;
lck_attr_t * nspace_lock_attr;
lck_grp_t * nspace_mutex_group;

/* Protects nspace_items[] and the sleep/wakeup protocol around it. */
lck_mtx_t nspace_handler_lock;
/* Serializes entry into wait_for_namespace_event() per handler type. */
lck_mtx_t nspace_handler_exclusion_lock;

/* 0 or ~0 are treated by the handler code as "no snapshot handler active". */
time_t snapshot_timestamp=0;
/* When non-zero, snapshot events are delivered even for virtual devices (disk images). */
int nspace_allow_virtual_devs=0;

void nspace_handler_init(void);

/* One pending namespace/snapshot event, keyed by (vp, op). */
typedef struct nspace_item_info {
	struct vnode *vp;	/* vnode the event refers to; NULL once resolved/cleared */
	void *arg;		/* optional payload (e.g. a uio pointer) -- see resolve_nspace_item_ext() */
	uint64_t op;		/* NAMESPACE_HANDLER_* operation bits */
	uint32_t vid;		/* vnode vid captured when the event was queued */
	uint32_t flags;		/* NSPACE_ITEM_* state bits, defined below */
	uint32_t token;		/* id handed to the userland handler for this event */
	uint32_t refcount;	/* number of kernel threads waiting on this slot */
} nspace_item_info;

#define MAX_NSPACE_ITEMS 128
nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address
uint32_t nspace_token_id=0;
uint32_t nspace_handler_timeout = 15; // seconds

/* nspace_item_info.flags -- event life-cycle state bits */
#define NSPACE_ITEM_NEW 0x0001
#define NSPACE_ITEM_PROCESSING 0x0002
#define NSPACE_ITEM_DEAD 0x0004
#define NSPACE_ITEM_CANCELLED 0x0008
#define NSPACE_ITEM_DONE 0x0010
#define NSPACE_ITEM_RESET_TIMER 0x0020

/* nspace_item_info.flags -- which handler type the event targets */
#define NSPACE_ITEM_NSPACE_EVENT 0x0040
#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080

#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)

//#pragma optimization_level 0

/* The kinds of userland namespace handlers this file multiplexes. */
typedef enum {
	NSPACE_HANDLER_NSPACE = 0,
	NSPACE_HANDLER_SNAPSHOT = 1,

	NSPACE_HANDLER_COUNT,
} nspace_type_t;

/* Registration record for one handler process. */
typedef struct {
	uint64_t handler_tid;		/* thread id of the registered handler thread */
	struct proc *handler_proc;	/* process acting as this handler; NULL if none */
	int handler_busy;		/* non-zero while a handler is parked in wait_for_namespace_event() */
} nspace_handler_t;

nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];

/* namespace fsctl functions */
static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
static int nspace_item_flags_for_type(nspace_type_t nspace_type);
static int nspace_open_flags_for_type(nspace_type_t nspace_type);
static nspace_type_t nspace_type_for_op(uint64_t op);
static int nspace_is_special_process(struct proc *proc);
static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
static int validate_namespace_args (int is64bit, int size);
static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
8562
8563
8564 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
8565 {
8566 switch(nspace_type) {
8567 case NSPACE_HANDLER_NSPACE:
8568 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
8569 case NSPACE_HANDLER_SNAPSHOT:
8570 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
8571 default:
8572 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
8573 return 0;
8574 }
8575 }
8576
8577 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
8578 {
8579 switch(nspace_type) {
8580 case NSPACE_HANDLER_NSPACE:
8581 return NSPACE_ITEM_NSPACE_EVENT;
8582 case NSPACE_HANDLER_SNAPSHOT:
8583 return NSPACE_ITEM_SNAPSHOT_EVENT;
8584 default:
8585 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
8586 return 0;
8587 }
8588 }
8589
8590 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
8591 {
8592 switch(nspace_type) {
8593 case NSPACE_HANDLER_NSPACE:
8594 return FREAD | FWRITE | O_EVTONLY;
8595 case NSPACE_HANDLER_SNAPSHOT:
8596 return FREAD | O_EVTONLY;
8597 default:
8598 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
8599 return 0;
8600 }
8601 }
8602
8603 static inline nspace_type_t nspace_type_for_op(uint64_t op)
8604 {
8605 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
8606 case NAMESPACE_HANDLER_NSPACE_EVENT:
8607 return NSPACE_HANDLER_NSPACE;
8608 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
8609 return NSPACE_HANDLER_SNAPSHOT;
8610 default:
8611 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
8612 return NSPACE_HANDLER_NSPACE;
8613 }
8614 }
8615
8616 static inline int nspace_is_special_process(struct proc *proc)
8617 {
8618 int i;
8619 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8620 if (proc == nspace_handlers[i].handler_proc)
8621 return 1;
8622 }
8623 return 0;
8624 }
8625
/*
 * One-time setup for the namespace-handler machinery: allocate the lock
 * attributes/group, initialize both mutexes, and clear the event table.
 */
void
nspace_handler_init(void)
{
	nspace_lock_attr = lck_attr_alloc_init();
	nspace_group_attr = lck_grp_attr_alloc_init();
	nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
	lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
	lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
	memset(&nspace_items[0], 0, sizeof(nspace_items));
}
8636
/*
 * Called when a process exits.  If p was registered as one (or more) of
 * the namespace handler processes, deregister it and wake every kernel
 * thread blocked waiting on an event of the corresponding type(s), so
 * callers of resolve_nspace_item_ext() do not hang on a dead handler.
 */
void
nspace_proc_exit(struct proc *p)
{
	int i, event_mask = 0;

	/* Deregister p from any handler slots it owned and collect their event bits. */
	for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
		if (p == nspace_handlers[i].handler_proc) {
			event_mask |= nspace_item_flags_for_type(i);
			nspace_handlers[i].handler_tid = 0;
			nspace_handlers[i].handler_proc = NULL;
		}
	}

	/* Not a handler process: nothing to clean up. */
	if (event_mask == 0) {
		return;
	}

	if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
		// if this process was the snapshot handler, zero snapshot_timeout
		snapshot_timestamp = 0;
	}

	//
	// unblock anyone that's waiting for the handler that died
	//
	lck_mtx_lock(&nspace_handler_lock);
	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
		if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {

			if ( nspace_items[i].flags & event_mask ) {

				/* drop the pending-snapshot mark before clearing the slot */
				if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
					vnode_lock_spin(nspace_items[i].vp);
					nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
					vnode_unlock(nspace_items[i].vp);
				}
				nspace_items[i].vp = NULL;
				nspace_items[i].vid = 0;
				nspace_items[i].flags = NSPACE_ITEM_DONE;
				nspace_items[i].token = 0;

				/* wake the thread sleeping on this slot in resolve_nspace_item_ext() */
				wakeup((caddr_t)&(nspace_items[i].vp));
			}
		}
	}

	/* also wake anything parked on the table-wide rendezvous address */
	wakeup((caddr_t)&nspace_item_idx);
	lck_mtx_unlock(&nspace_handler_lock);
}
8686
8687
/*
 * Queue a namespace event for vp/op and wait for the userland handler to
 * resolve it.  Convenience wrapper around resolve_nspace_item_ext() with
 * no extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
8693
/*
 * Queue a namespace event for vp/op in nspace_items[] and block until the
 * userland handler marks it done/cancelled, or nspace_handler_timeout
 * expires.
 *
 * Returns 0 when there is nothing to do (unsupported vnode type, snapshot
 * event on a virtual device, or no handler registered), EDEADLK when the
 * caller itself is a handler process, the handler-supplied token value if
 * the event was cancelled, ETIMEDOUT on timeout, or the msleep() error.
 */
int
resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
{
	int i, error, keep_waiting;
	struct timespec ts;
	nspace_type_t nspace_type = nspace_type_for_op(op);

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return 0;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
	    && (vp->v_mount != NULL)
	    && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
	    && !nspace_allow_virtual_devs) {

		return 0;
	}

	// if (thread_tid(current_thread()) == namespace_handler_tid) {
	if (nspace_handlers[nspace_type].handler_proc == NULL) {
		return 0;
	}

	/* a handler waiting on its own event would deadlock */
	if (nspace_is_special_process(current_proc())) {
		return EDEADLK;
	}

	lck_mtx_lock(&nspace_handler_lock);

retry:
	/* first look for an existing entry for this (vp, op) pair */
	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
		if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
			break;
		}
	}

	if (i >= MAX_NSPACE_ITEMS) {
		/* no match: look for a free slot (flags == 0) */
		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].flags == 0) {
				break;
			}
		}
	} else {
		/* join the existing entry as an additional waiter */
		nspace_items[i].refcount++;
	}

	if (i >= MAX_NSPACE_ITEMS) {
		/* table full: wait (bounded) for a slot to free up, then retry */
		ts.tv_sec = nspace_handler_timeout;
		ts.tv_nsec = 0;

		error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
		if (error == 0) {
			// an entry got free'd up, go see if we can get a slot
			goto retry;
		} else {
			lck_mtx_unlock(&nspace_handler_lock);
			return error;
		}
	}

	//
	// if it didn't already exist, add it.  if it did exist
	// we'll get woken up when someone does a wakeup() on
	// the slot in the nspace_items table.
	//
	if (vp != nspace_items[i].vp) {
		nspace_items[i].vp = vp;
		nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
		nspace_items[i].op = op;
		nspace_items[i].vid = vnode_vid(vp);
		nspace_items[i].flags = NSPACE_ITEM_NEW;
		nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
		if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
			if (arg) {
				/* mark the vnode so unwind paths know a snapshot is pending */
				vnode_lock_spin(vp);
				vp->v_flag |= VNEEDSSNAPSHOT;
				vnode_unlock(vp);
			}
		}

		nspace_items[i].token = 0;
		nspace_items[i].refcount = 1;

		/* notify the handler parked in wait_for_namespace_event() */
		wakeup((caddr_t)&nspace_item_idx);
	}

	//
	// Now go to sleep until the handler does a wakeup on this
	// slot in the nspace_items table (or we timeout).
	//
	keep_waiting = 1;
	while(keep_waiting) {
		ts.tv_sec = nspace_handler_timeout;
		ts.tv_nsec = 0;
		error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);

		if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
			error = 0;
		} else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
			/* on cancel, the handler stores the error to return in .token */
			error = nspace_items[i].token;
		} else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
			if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
				/* handler asked for more time: restart the timed sleep */
				nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
				continue;
			} else {
				error = ETIMEDOUT;
			}
		} else if (error == 0) {
			// hmmm, why did we get woken up?
			printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
			       nspace_items[i].token);
		}

		/* last waiter out clears the slot so it can be reused */
		if (--nspace_items[i].refcount == 0) {
			nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
			nspace_items[i].arg = NULL;
			nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
			nspace_items[i].flags = 0;     // this clears it for re-use
		}
		/* wake any thread stalled above waiting for a free slot */
		wakeup(&nspace_token_id);
		keep_waiting = 0;
	}

	lck_mtx_unlock(&nspace_handler_lock);

	return error;
}
8834
8835
8836 int
8837 get_nspace_item_status(struct vnode *vp, int32_t *status)
8838 {
8839 int i;
8840
8841 lck_mtx_lock(&nspace_handler_lock);
8842 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8843 if (nspace_items[i].vp == vp) {
8844 break;
8845 }
8846 }
8847
8848 if (i >= MAX_NSPACE_ITEMS) {
8849 lck_mtx_unlock(&nspace_handler_lock);
8850 return ENOENT;
8851 }
8852
8853 *status = nspace_items[i].flags;
8854 lck_mtx_unlock(&nspace_handler_lock);
8855 return 0;
8856 }
8857
8858
#if 0
/*
 * Currently compiled out.  Formats a volfs-style "/.vol/<fsid>/<fileid>"
 * path for vp into path.  *len is in/out: buffer capacity on entry,
 * formatted length (including the NUL) on exit.  Returns 0 on success,
 * -1 when the vnode attributes could not be fetched (a placeholder path
 * is written in that case).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
8881
8882 //
8883 // Note: this function does NOT check permissions on all of the
8884 // parent directories leading to this vnode. It should only be
8885 // called on behalf of a root process. Otherwise a process may
8886 // get access to a file because the file itself is readable even
8887 // though its parent directories would prevent access.
8888 //
8889 static int
8890 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
8891 {
8892 int error, action;
8893
8894 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8895 return error;
8896 }
8897
8898 #if CONFIG_MACF
8899 error = mac_vnode_check_open(ctx, vp, fmode);
8900 if (error)
8901 return error;
8902 #endif
8903
8904 /* compute action to be authorized */
8905 action = 0;
8906 if (fmode & FREAD) {
8907 action |= KAUTH_VNODE_READ_DATA;
8908 }
8909 if (fmode & (FWRITE | O_TRUNC)) {
8910 /*
8911 * If we are writing, appending, and not truncating,
8912 * indicate that we are appending so that if the
8913 * UF_APPEND or SF_APPEND bits are set, we do not deny
8914 * the open.
8915 */
8916 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
8917 action |= KAUTH_VNODE_APPEND_DATA;
8918 } else {
8919 action |= KAUTH_VNODE_WRITE_DATA;
8920 }
8921 }
8922
8923 if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
8924 return error;
8925
8926
8927 //
8928 // if the vnode is tagged VOPENEVT and the current process
8929 // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
8930 // flag to the open mode so that this open won't count against
8931 // the vnode when carbon delete() does a vnode_isinuse() to see
8932 // if a file is currently in use. this allows spotlight
8933 // importers to not interfere with carbon apps that depend on
8934 // the no-delete-if-busy semantics of carbon delete().
8935 //
8936 if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
8937 fmode |= O_EVTONLY;
8938 }
8939
8940 if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
8941 return error;
8942 }
8943 if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
8944 VNOP_CLOSE(vp, fmode, ctx);
8945 return error;
8946 }
8947
8948 /* Call out to allow 3rd party notification of open.
8949 * Ignore result of kauth_authorize_fileop call.
8950 */
8951 #if CONFIG_MACF
8952 mac_vnode_notify_open(ctx, vp, fmode);
8953 #endif
8954 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
8955 (uintptr_t)vp, 0);
8956
8957
8958 return 0;
8959 }
8960
8961 static int
8962 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
8963 {
8964 int i, error=0, unblock=0;
8965 task_t curtask;
8966
8967 lck_mtx_lock(&nspace_handler_exclusion_lock);
8968 if (nspace_handlers[nspace_type].handler_busy) {
8969 lck_mtx_unlock(&nspace_handler_exclusion_lock);
8970 return EBUSY;
8971 }
8972 nspace_handlers[nspace_type].handler_busy = 1;
8973 lck_mtx_unlock(&nspace_handler_exclusion_lock);
8974
8975 /*
8976 * Any process that gets here will be one of the namespace handlers.
8977 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
8978 * as we can cause deadlocks to occur, because the namespace handler may prevent
8979 * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE
8980 * process.
8981 */
8982 curtask = current_task();
8983 bsd_set_dependency_capable (curtask);
8984
8985 lck_mtx_lock(&nspace_handler_lock);
8986 if (nspace_handlers[nspace_type].handler_proc == NULL) {
8987 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
8988 nspace_handlers[nspace_type].handler_proc = current_proc();
8989 }
8990
8991 while (error == 0) {
8992
8993 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8994 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
8995 if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
8996 continue;
8997 }
8998 break;
8999 }
9000 }
9001
9002 if (i < MAX_NSPACE_ITEMS) {
9003 nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
9004 nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
9005 nspace_items[i].token = ++nspace_token_id;
9006
9007 if (nspace_items[i].vp) {
9008 struct fileproc *fp;
9009 int32_t indx, fmode;
9010 struct proc *p = current_proc();
9011 vfs_context_t ctx = vfs_context_current();
9012 struct vnode_attr va;
9013
9014
9015 /*
9016 * Use vnode pointer to acquire a file descriptor for
9017 * hand-off to userland
9018 */
9019 fmode = nspace_open_flags_for_type(nspace_type);
9020 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9021 if (error) {
9022 unblock = 1;
9023 break;
9024 }
9025 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9026 if (error) {
9027 unblock = 1;
9028 vnode_put(nspace_items[i].vp);
9029 break;
9030 }
9031
9032 if ((error = falloc(p, &fp, &indx, ctx))) {
9033 vn_close(nspace_items[i].vp, fmode, ctx);
9034 vnode_put(nspace_items[i].vp);
9035 unblock = 1;
9036 break;
9037 }
9038
9039 fp->f_fglob->fg_flag = fmode;
9040 fp->f_fglob->fg_ops = &vnops;
9041 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9042
9043 proc_fdlock(p);
9044 procfdtbl_releasefd(p, indx, NULL);
9045 fp_drop(p, indx, fp, 1);
9046 proc_fdunlock(p);
9047
9048 /*
9049 * All variants of the namespace handler struct support these three fields:
9050 * token, flags, and the FD pointer
9051 */
9052 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9053 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9054 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9055
9056 /*
9057 * Handle optional fields:
9058 * extended version support an info ptr (offset, length), and the
9059 *
9060 * namedata version supports a unique per-link object ID
9061 *
9062 */
9063 if (nhd->infoptr) {
9064 uio_t uio = (uio_t)nspace_items[i].arg;
9065 uint64_t u_offset, u_length;
9066
9067 if (uio) {
9068 u_offset = uio_offset(uio);
9069 u_length = uio_resid(uio);
9070 } else {
9071 u_offset = 0;
9072 u_length = 0;
9073 }
9074 error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9075 error = copyout(&u_length, nhd->infoptr+sizeof(uint64_t), sizeof(uint64_t));
9076 }
9077
9078 if (nhd->objid) {
9079 VATTR_INIT(&va);
9080 VATTR_WANTED(&va, va_linkid);
9081 error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9082 if (error == 0 ) {
9083 uint64_t linkid = 0;
9084 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9085 linkid = (uint64_t)va.va_linkid;
9086 }
9087 error = copyout (&linkid, nhd->objid, sizeof(uint64_t));
9088 }
9089 }
9090
9091 if (error) {
9092 vn_close(nspace_items[i].vp, fmode, ctx);
9093 fp_free(p, indx, fp);
9094 unblock = 1;
9095 }
9096
9097 vnode_put(nspace_items[i].vp);
9098
9099 break;
9100 } else {
9101 printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n",
9102 i, nspace_items[i].vp, error, nspace_items[i].vp->v_name);
9103 }
9104
9105 } else {
9106 error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9107 if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9108 error = EINVAL;
9109 break;
9110 }
9111
9112 }
9113 }
9114
9115 if (unblock) {
9116 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9117 vnode_lock_spin(nspace_items[i].vp);
9118 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9119 vnode_unlock(nspace_items[i].vp);
9120 }
9121 nspace_items[i].vp = NULL;
9122 nspace_items[i].vid = 0;
9123 nspace_items[i].flags = NSPACE_ITEM_DONE;
9124 nspace_items[i].token = 0;
9125
9126 wakeup((caddr_t)&(nspace_items[i].vp));
9127 }
9128
9129 if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9130 // just go through every snapshot event and unblock it immediately.
9131 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9132 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9133 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9134 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9135 nspace_items[i].vp = NULL;
9136 nspace_items[i].vid = 0;
9137 nspace_items[i].flags = NSPACE_ITEM_DONE;
9138 nspace_items[i].token = 0;
9139
9140 wakeup((caddr_t)&(nspace_items[i].vp));
9141 }
9142 }
9143 }
9144 }
9145 }
9146
9147 lck_mtx_unlock(&nspace_handler_lock);
9148
9149 lck_mtx_lock(&nspace_handler_exclusion_lock);
9150 nspace_handlers[nspace_type].handler_busy = 0;
9151 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9152
9153 return error;
9154 }
9155
9156 static inline int validate_namespace_args (int is64bit, int size) {
9157
9158 if (is64bit) {
9159 /* Must be one of these */
9160 if (size == sizeof(user64_namespace_handler_info)) {
9161 goto sizeok;
9162 }
9163 if (size == sizeof(user64_namespace_handler_info_ext)) {
9164 goto sizeok;
9165 }
9166 if (size == sizeof(user64_namespace_handler_data)) {
9167 goto sizeok;
9168 }
9169 return EINVAL;
9170 }
9171 else {
9172 /* 32 bit -- must be one of these */
9173 if (size == sizeof(user32_namespace_handler_info)) {
9174 goto sizeok;
9175 }
9176 if (size == sizeof(user32_namespace_handler_info_ext)) {
9177 goto sizeok;
9178 }
9179 if (size == sizeof(user32_namespace_handler_data)) {
9180 goto sizeok;
9181 }
9182 return EINVAL;
9183 }
9184
9185 sizeok:
9186
9187 return 0;
9188
9189 }
9190
9191 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9192 {
9193 int error = 0;
9194 namespace_handler_data nhd;
9195
9196 bzero (&nhd, sizeof(namespace_handler_data));
9197
9198 if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9199 (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9200 return EINVAL;
9201 }
9202
9203 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9204 return error;
9205 }
9206
9207 error = validate_namespace_args (is64bit, size);
9208 if (error) {
9209 return error;
9210 }
9211
9212 /* Copy in the userland pointers into our kernel-only struct */
9213
9214 if (is64bit) {
9215 /* 64 bit userland structures */
9216 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9217 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9218 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
9219
9220 /* If the size is greater than the standard info struct, add in extra fields */
9221 if (size > (sizeof(user64_namespace_handler_info))) {
9222 if (size >= (sizeof(user64_namespace_handler_info_ext))) {
9223 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
9224 }
9225 if (size == (sizeof(user64_namespace_handler_data))) {
9226 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
9227 }
9228 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9229 }
9230 }
9231 else {
9232 /* 32 bit userland structures */
9233 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
9234 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
9235 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
9236
9237 if (size > (sizeof(user32_namespace_handler_info))) {
9238 if (size >= (sizeof(user32_namespace_handler_info_ext))) {
9239 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
9240 }
9241 if (size == (sizeof(user32_namespace_handler_data))) {
9242 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
9243 }
9244 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9245 }
9246 }
9247
9248 return wait_for_namespace_event(&nhd, nspace_type);
9249 }
9250
/*
 * Make a filesystem-specific control call:
 *
 * Guts of the fsctl()/ffsctl() system calls.  Marshals the ioctl-style
 * argument described by `cmd' into a kernel buffer, handles the generic
 * FSCTL_* commands inline, and forwards anything else to the filesystem
 * through VNOP_IOCTL().
 *
 * NOTE: on return *arg_vp may have been set to NULL if this routine
 * dropped the vnode's iocount (FSCTL_SYNC_VOLUME does this); callers
 * must re-check *arg_vp before calling vnode_put().
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error=0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS];
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* The argument size is encoded in the command word itself. */
	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) return (EINVAL);

	is64bit = proc_is64bit(p);

	memp = NULL;


	/*
	 * ensure the buffer is large enough for underlying calls
	 */
#ifndef HFSIOC_GETPATH
	typedef char pn_t[MAXPATHLEN];
#define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
#endif

#ifndef HFS_GETPATH
#define HFS_GETPATH  IOCBASECMD(HFSIOC_GETPATH)
#endif
	if (IOCBASECMD(cmd) == HFS_GETPATH) {
		/* Round up to MAXPATHLEN regardless of user input */
		size = MAXPATHLEN;
	}

	/* Use the on-stack scratch buffer when it fits, else heap-allocate. */
	if (size > sizeof (stkbuf)) {
		if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Command takes input: pull the argument in from userland. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree (memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: the "argument" is the pointer itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			}
			else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		}
		else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (IOCBASECMD(cmd)) {

	case FSCTL_SYNC_VOLUME: {
		mount_t mp = vp->v_mount;
		int arg = *(uint32_t*)data;

		/* record vid of vp so we can drop it below. */
		uint32_t vvid = vp->v_id;

		/*
		 * Then grab mount_iterref so that we can release the vnode.
		 * Without this, a thread may call vnode_iterate_prepare then
		 * get into a deadlock because we've never released the root vp
		 */
		error = mount_iterref (mp, 0);
		if (error)  {
			break;
		}
		vnode_put(vp);

		/* issue the sync for this volume */
		(void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);

		/*
		 * Then release the mount_iterref once we're done syncing; it's not
		 * needed for the VNOP_IOCTL below
		 */
		mount_iterdrop(mp);

		if (arg & FSCTL_SYNC_FULLSYNC) {
			/* re-obtain vnode iocount on the root vp, if possible */
			error = vnode_getwithvid (vp, vvid);
			if (error == 0) {
				error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
				vnode_put (vp);
			}
		}
		/* mark the argument VP as having been released */
		*arg_vp = NULL;
	}
	break;

	case FSCTL_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		if (   (is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {

			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			ext_strings = ((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	/* namespace handlers */
	case FSCTL_NAMESPACE_HANDLER_GET: {
		error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
	}
	break;

	/* Snapshot handlers */
	case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
		error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
	}
	break;

	case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
		error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
	}
	break;

	case FSCTL_NAMESPACE_HANDLER_UPDATE: {
		/* Re-arm the timeout of a pending namespace event (handler only). */
		uint32_t token, val;
		int i;

		if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
			break;
		}

		if (!nspace_is_special_process(p)) {
			error = EINVAL;
			break;
		}

		token = ((uint32_t *)data)[0];
		val   = ((uint32_t *)data)[1];

		lck_mtx_lock(&nspace_handler_lock);

		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].token == token) {
				break;  /* exit for loop, not case stmt */
			}
		}

		if (i >= MAX_NSPACE_ITEMS) {
			error = ENOENT;
		} else {
			//
			// if this bit is set, when resolve_nspace_item() times out
			// it will loop and go back to sleep.
			//
			nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
		}

		lck_mtx_unlock(&nspace_handler_lock);

		if (error) {
			printf("nspace-handler-update: did not find token %u\n", token);
		}
	}
	break;

	case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
		/* Mark a pending namespace event done and wake its waiter. */
		uint32_t token, val;
		int i;

		if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
			break;
		}

		if (!nspace_is_special_process(p)) {
			error = EINVAL;
			break;
		}

		token = ((uint32_t *)data)[0];
		val   = ((uint32_t *)data)[1];

		lck_mtx_lock(&nspace_handler_lock);

		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].token == token) {
				break; /* exit for loop, not case statement */
			}
		}

		if (i >= MAX_NSPACE_ITEMS) {
			printf("nspace-handler-unblock: did not find token %u\n", token);
			error = ENOENT;
		} else {
			/* val == 0 means the snapshot-needed hint should be cleared too */
			if (val == 0 && nspace_items[i].vp) {
				vnode_lock_spin(nspace_items[i].vp);
				nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
				vnode_unlock(nspace_items[i].vp);
			}

			nspace_items[i].vp = NULL;
			nspace_items[i].arg = NULL;
			nspace_items[i].op = 0;
			nspace_items[i].vid = 0;
			nspace_items[i].flags = NSPACE_ITEM_DONE;
			nspace_items[i].token = 0;

			wakeup((caddr_t)&(nspace_items[i].vp));
		}

		lck_mtx_unlock(&nspace_handler_lock);
	}
	break;

	case FSCTL_NAMESPACE_HANDLER_CANCEL: {
		/* Cancel a pending namespace event; waiter sees NSPACE_ITEM_CANCELLED. */
		uint32_t token, val;
		int i;

		if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
			break;
		}

		if (!nspace_is_special_process(p)) {
			error = EINVAL;
			break;
		}

		token = ((uint32_t *)data)[0];
		val   = ((uint32_t *)data)[1];

		lck_mtx_lock(&nspace_handler_lock);

		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].token == token) {
				break;  /* exit for loop, not case stmt */
			}
		}

		if (i >= MAX_NSPACE_ITEMS) {
			printf("nspace-handler-cancel: did not find token %u\n", token);
			error = ENOENT;
		} else {
			if (nspace_items[i].vp) {
				vnode_lock_spin(nspace_items[i].vp);
				nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
				vnode_unlock(nspace_items[i].vp);
			}

			nspace_items[i].vp = NULL;
			nspace_items[i].arg = NULL;
			nspace_items[i].vid = 0;
			/* note: token is replaced with `val', not zeroed, on cancel */
			nspace_items[i].token = val;
			nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
			nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;

			wakeup((caddr_t)&(nspace_items[i].vp));
		}

		lck_mtx_unlock(&nspace_handler_lock);
	}
	break;

	case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}

		// we explicitly do not do the namespace_handler_proc check here

		lck_mtx_lock(&nspace_handler_lock);
		snapshot_timestamp = ((uint32_t *)data)[0];
		wakeup(&nspace_item_idx);
		lck_mtx_unlock(&nspace_handler_lock);
		printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);

	}
	break;

	case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
	{
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}

		lck_mtx_lock(&nspace_handler_lock);
		nspace_allow_virtual_devs = ((uint32_t *)data)[0];
		lck_mtx_unlock(&nspace_handler_lock);
		printf("nspace-snapshot-handler will%s allow events on disk-images\n",
		       nspace_allow_virtual_devs ? "" : " NOT");
		error = 0;

	}
	break;

	case FSCTL_SET_FSTYPENAME_OVERRIDE:
	{
		/* Override (or clear) the fstype name reported for this mount. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* special-case: read-only "mtmfs" gets extended security */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* empty string clears the override */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
			mount_unlock(vp->v_mount);
		}
	}
	break;

	default: {
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
	}

	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size)
		error = copyout(data, udata, size);

	if (memp) {
		kfree(memp, size);
	}

	return error;
}
9633
9634 /* ARGSUSED */
9635 int
9636 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
9637 {
9638 int error;
9639 struct nameidata nd;
9640 u_long nameiflags;
9641 vnode_t vp = NULL;
9642 vfs_context_t ctx = vfs_context_current();
9643
9644 AUDIT_ARG(cmd, uap->cmd);
9645 AUDIT_ARG(value32, uap->options);
9646 /* Get the vnode for the file we are getting info on: */
9647 nameiflags = 0;
9648 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9649 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
9650 UIO_USERSPACE, uap->path, ctx);
9651 if ((error = namei(&nd))) goto done;
9652 vp = nd.ni_vp;
9653 nameidone(&nd);
9654
9655 #if CONFIG_MACF
9656 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
9657 if (error) {
9658 goto done;
9659 }
9660 #endif
9661
9662 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9663
9664 done:
9665 if (vp)
9666 vnode_put(vp);
9667 return error;
9668 }
9669 /* ARGSUSED */
9670 int
9671 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
9672 {
9673 int error;
9674 vnode_t vp = NULL;
9675 vfs_context_t ctx = vfs_context_current();
9676 int fd = -1;
9677
9678 AUDIT_ARG(fd, uap->fd);
9679 AUDIT_ARG(cmd, uap->cmd);
9680 AUDIT_ARG(value32, uap->options);
9681
9682 /* Get the vnode for the file we are getting info on: */
9683 if ((error = file_vnode(uap->fd, &vp)))
9684 return error;
9685 fd = uap->fd;
9686 if ((error = vnode_getwithref(vp))) {
9687 file_drop(fd);
9688 return error;
9689 }
9690
9691 #if CONFIG_MACF
9692 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
9693 file_drop(fd);
9694 vnode_put(vp);
9695 return error;
9696 }
9697 #endif
9698
9699 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9700
9701 file_drop(fd);
9702
9703 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
9704 if (vp) {
9705 vnode_put(vp);
9706 }
9707
9708 return error;
9709 }
9710 /* end of fsctl system call */
9711
9712 /*
9713 * Retrieve the data of an extended attribute.
9714 */
9715 int
9716 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
9717 {
9718 vnode_t vp;
9719 struct nameidata nd;
9720 char attrname[XATTR_MAXNAMELEN+1];
9721 vfs_context_t ctx = vfs_context_current();
9722 uio_t auio = NULL;
9723 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9724 size_t attrsize = 0;
9725 size_t namelen;
9726 u_int32_t nameiflags;
9727 int error;
9728 char uio_buf[ UIO_SIZEOF(1) ];
9729
9730 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9731 return (EINVAL);
9732
9733 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9734 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
9735 if ((error = namei(&nd))) {
9736 return (error);
9737 }
9738 vp = nd.ni_vp;
9739 nameidone(&nd);
9740
9741 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9742 goto out;
9743 }
9744 if (xattr_protected(attrname)) {
9745 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
9746 error = EPERM;
9747 goto out;
9748 }
9749 }
9750 /*
9751 * the specific check for 0xffffffff is a hack to preserve
9752 * binaray compatibilty in K64 with applications that discovered
9753 * that passing in a buf pointer and a size of -1 resulted in
9754 * just the size of the indicated extended attribute being returned.
9755 * this isn't part of the documented behavior, but because of the
9756 * original implemtation's check for "uap->size > 0", this behavior
9757 * was allowed. In K32 that check turned into a signed comparison
9758 * even though uap->size is unsigned... in K64, we blow by that
9759 * check because uap->size is unsigned and doesn't get sign smeared
9760 * in the munger for a 32 bit user app. we also need to add a
9761 * check to limit the maximum size of the buffer being passed in...
9762 * unfortunately, the underlying fileystems seem to just malloc
9763 * the requested size even if the actual extended attribute is tiny.
9764 * because that malloc is for kernel wired memory, we have to put a
9765 * sane limit on it.
9766 *
9767 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
9768 * U64 running on K64 will yield -1 (64 bits wide)
9769 * U32/U64 running on K32 will yield -1 (32 bits wide)
9770 */
9771 if (uap->size == 0xffffffff || uap->size == (size_t)-1)
9772 goto no_uio;
9773
9774 if (uap->value) {
9775 if (uap->size > (size_t)XATTR_MAXSIZE)
9776 uap->size = XATTR_MAXSIZE;
9777
9778 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9779 &uio_buf[0], sizeof(uio_buf));
9780 uio_addiov(auio, uap->value, uap->size);
9781 }
9782 no_uio:
9783 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
9784 out:
9785 vnode_put(vp);
9786
9787 if (auio) {
9788 *retval = uap->size - uio_resid(auio);
9789 } else {
9790 *retval = (user_ssize_t)attrsize;
9791 }
9792
9793 return (error);
9794 }
9795
9796 /*
9797 * Retrieve the data of an extended attribute.
9798 */
9799 int
9800 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
9801 {
9802 vnode_t vp;
9803 char attrname[XATTR_MAXNAMELEN+1];
9804 uio_t auio = NULL;
9805 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9806 size_t attrsize = 0;
9807 size_t namelen;
9808 int error;
9809 char uio_buf[ UIO_SIZEOF(1) ];
9810
9811 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9812 return (EINVAL);
9813
9814 if ( (error = file_vnode(uap->fd, &vp)) ) {
9815 return (error);
9816 }
9817 if ( (error = vnode_getwithref(vp)) ) {
9818 file_drop(uap->fd);
9819 return(error);
9820 }
9821 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9822 goto out;
9823 }
9824 if (xattr_protected(attrname)) {
9825 error = EPERM;
9826 goto out;
9827 }
9828 if (uap->value && uap->size > 0) {
9829 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9830 &uio_buf[0], sizeof(uio_buf));
9831 uio_addiov(auio, uap->value, uap->size);
9832 }
9833
9834 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
9835 out:
9836 (void)vnode_put(vp);
9837 file_drop(uap->fd);
9838
9839 if (auio) {
9840 *retval = uap->size - uio_resid(auio);
9841 } else {
9842 *retval = (user_ssize_t)attrsize;
9843 }
9844 return (error);
9845 }
9846
9847 /*
9848 * Set the data of an extended attribute.
9849 */
9850 int
9851 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
9852 {
9853 vnode_t vp;
9854 struct nameidata nd;
9855 char attrname[XATTR_MAXNAMELEN+1];
9856 vfs_context_t ctx = vfs_context_current();
9857 uio_t auio = NULL;
9858 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9859 size_t namelen;
9860 u_int32_t nameiflags;
9861 int error;
9862 char uio_buf[ UIO_SIZEOF(1) ];
9863
9864 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9865 return (EINVAL);
9866
9867 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9868 if (error == EPERM) {
9869 /* if the string won't fit in attrname, copyinstr emits EPERM */
9870 return (ENAMETOOLONG);
9871 }
9872 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9873 return error;
9874 }
9875 if (xattr_protected(attrname))
9876 return(EPERM);
9877 if (uap->size != 0 && uap->value == 0) {
9878 return (EINVAL);
9879 }
9880
9881 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9882 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
9883 if ((error = namei(&nd))) {
9884 return (error);
9885 }
9886 vp = nd.ni_vp;
9887 nameidone(&nd);
9888
9889 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9890 &uio_buf[0], sizeof(uio_buf));
9891 uio_addiov(auio, uap->value, uap->size);
9892
9893 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
9894 #if CONFIG_FSE
9895 if (error == 0) {
9896 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9897 FSE_ARG_VNODE, vp,
9898 FSE_ARG_DONE);
9899 }
9900 #endif
9901 vnode_put(vp);
9902 *retval = 0;
9903 return (error);
9904 }
9905
9906 /*
9907 * Set the data of an extended attribute.
9908 */
9909 int
9910 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
9911 {
9912 vnode_t vp;
9913 char attrname[XATTR_MAXNAMELEN+1];
9914 uio_t auio = NULL;
9915 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9916 size_t namelen;
9917 int error;
9918 char uio_buf[ UIO_SIZEOF(1) ];
9919 #if CONFIG_FSE
9920 vfs_context_t ctx = vfs_context_current();
9921 #endif
9922
9923 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9924 return (EINVAL);
9925
9926 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9927 if (error == EPERM) {
9928 /* if the string won't fit in attrname, copyinstr emits EPERM */
9929 return (ENAMETOOLONG);
9930 }
9931 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9932 return error;
9933 }
9934 if (xattr_protected(attrname))
9935 return(EPERM);
9936 if (uap->size != 0 && uap->value == 0) {
9937 return (EINVAL);
9938 }
9939 if ( (error = file_vnode(uap->fd, &vp)) ) {
9940 return (error);
9941 }
9942 if ( (error = vnode_getwithref(vp)) ) {
9943 file_drop(uap->fd);
9944 return(error);
9945 }
9946 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9947 &uio_buf[0], sizeof(uio_buf));
9948 uio_addiov(auio, uap->value, uap->size);
9949
9950 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
9951 #if CONFIG_FSE
9952 if (error == 0) {
9953 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9954 FSE_ARG_VNODE, vp,
9955 FSE_ARG_DONE);
9956 }
9957 #endif
9958 vnode_put(vp);
9959 file_drop(uap->fd);
9960 *retval = 0;
9961 return (error);
9962 }
9963
9964 /*
9965 * Remove an extended attribute.
9966 * XXX Code duplication here.
9967 */
9968 int
9969 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
9970 {
9971 vnode_t vp;
9972 struct nameidata nd;
9973 char attrname[XATTR_MAXNAMELEN+1];
9974 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9975 vfs_context_t ctx = vfs_context_current();
9976 size_t namelen;
9977 u_int32_t nameiflags;
9978 int error;
9979
9980 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9981 return (EINVAL);
9982
9983 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
9984 if (error != 0) {
9985 return (error);
9986 }
9987 if (xattr_protected(attrname))
9988 return(EPERM);
9989 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9990 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
9991 if ((error = namei(&nd))) {
9992 return (error);
9993 }
9994 vp = nd.ni_vp;
9995 nameidone(&nd);
9996
9997 error = vn_removexattr(vp, attrname, uap->options, ctx);
9998 #if CONFIG_FSE
9999 if (error == 0) {
10000 add_fsevent(FSE_XATTR_REMOVED, ctx,
10001 FSE_ARG_VNODE, vp,
10002 FSE_ARG_DONE);
10003 }
10004 #endif
10005 vnode_put(vp);
10006 *retval = 0;
10007 return (error);
10008 }
10009
10010 /*
10011 * Remove an extended attribute.
10012 * XXX Code duplication here.
10013 */
10014 int
10015 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10016 {
10017 vnode_t vp;
10018 char attrname[XATTR_MAXNAMELEN+1];
10019 size_t namelen;
10020 int error;
10021 #if CONFIG_FSE
10022 vfs_context_t ctx = vfs_context_current();
10023 #endif
10024
10025 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10026 return (EINVAL);
10027
10028 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10029 if (error != 0) {
10030 return (error);
10031 }
10032 if (xattr_protected(attrname))
10033 return(EPERM);
10034 if ( (error = file_vnode(uap->fd, &vp)) ) {
10035 return (error);
10036 }
10037 if ( (error = vnode_getwithref(vp)) ) {
10038 file_drop(uap->fd);
10039 return(error);
10040 }
10041
10042 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10043 #if CONFIG_FSE
10044 if (error == 0) {
10045 add_fsevent(FSE_XATTR_REMOVED, ctx,
10046 FSE_ARG_VNODE, vp,
10047 FSE_ARG_DONE);
10048 }
10049 #endif
10050 vnode_put(vp);
10051 file_drop(uap->fd);
10052 *retval = 0;
10053 return (error);
10054 }
10055
10056 /*
10057 * Retrieve the list of extended attribute names.
10058 * XXX Code duplication here.
10059 */
10060 int
10061 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10062 {
10063 vnode_t vp;
10064 struct nameidata nd;
10065 vfs_context_t ctx = vfs_context_current();
10066 uio_t auio = NULL;
10067 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10068 size_t attrsize = 0;
10069 u_int32_t nameiflags;
10070 int error;
10071 char uio_buf[ UIO_SIZEOF(1) ];
10072
10073 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10074 return (EINVAL);
10075
10076 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10077 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10078 if ((error = namei(&nd))) {
10079 return (error);
10080 }
10081 vp = nd.ni_vp;
10082 nameidone(&nd);
10083 if (uap->namebuf != 0 && uap->bufsize > 0) {
10084 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10085 &uio_buf[0], sizeof(uio_buf));
10086 uio_addiov(auio, uap->namebuf, uap->bufsize);
10087 }
10088
10089 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10090
10091 vnode_put(vp);
10092 if (auio) {
10093 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10094 } else {
10095 *retval = (user_ssize_t)attrsize;
10096 }
10097 return (error);
10098 }
10099
10100 /*
10101 * Retrieve the list of extended attribute names.
10102 * XXX Code duplication here.
10103 */
10104 int
10105 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10106 {
10107 vnode_t vp;
10108 uio_t auio = NULL;
10109 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10110 size_t attrsize = 0;
10111 int error;
10112 char uio_buf[ UIO_SIZEOF(1) ];
10113
10114 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10115 return (EINVAL);
10116
10117 if ( (error = file_vnode(uap->fd, &vp)) ) {
10118 return (error);
10119 }
10120 if ( (error = vnode_getwithref(vp)) ) {
10121 file_drop(uap->fd);
10122 return(error);
10123 }
10124 if (uap->namebuf != 0 && uap->bufsize > 0) {
10125 auio = uio_createwithbuffer(1, 0, spacetype,
10126 UIO_READ, &uio_buf[0], sizeof(uio_buf));
10127 uio_addiov(auio, uap->namebuf, uap->bufsize);
10128 }
10129
10130 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10131
10132 vnode_put(vp);
10133 file_drop(uap->fd);
10134 if (auio) {
10135 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10136 } else {
10137 *retval = (user_ssize_t)attrsize;
10138 }
10139 return (error);
10140 }
10141
/*
 * Resolve a <volfs_id, object id> pair to the object's absolute pathname.
 *
 * Looks up the mount by volfs id, gets the vnode by object id (objid 2 is
 * the conventional root-directory id), walks union mounts downward when
 * the id isn't found in the upper layer, and builds the path into `buf'.
 * On success the path length (including the NUL) is stored in *pathlen.
 *
 * Returns 0 on success; EINVAL for oversized buffers, ENOMEM for a NULL
 * buffer, ENOTSUP when the volfs id doesn't resolve, or whatever
 * VFS_ROOT/VFS_VGET/build_path report.
 */
static int fsgetpath_internal(
	vfs_context_t ctx, int volfs_id, uint64_t objid,
	vm_size_t bufsize, caddr_t buf, int *pathlen)
{
	int error;
	struct mount *mp = NULL;
	vnode_t vp;
	int length;
	int bpflags;

	if (bufsize > PAGE_SIZE) {
		return (EINVAL);
	}

	if (buf == NULL) {
		return (ENOMEM);
	}

	/* mount_lookupby_volfsid(…, 1) returns the mount vfs_busy'd. */
	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
		error = ENOTSUP;  /* unexpected failure */
		return ENOTSUP;
	}

unionget:
	/* objid 2 is the conventional root-directory object id. */
	if (objid == 2) {
		error = VFS_ROOT(mp, &vp, ctx);
	} else {
		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
	}

	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
		/*
		 * If the fileid isn't found and we're in a union
		 * mount volume, then see if the fileid is in the
		 * mounted-on volume.
		 */
		struct mount *tmp = mp;
		mp = vnode_mount(tmp->mnt_vnodecovered);
		vfs_unbusy(tmp);
		/* Only retry if we can busy the lower mount without blocking. */
		if (vfs_busy(mp, LK_NOWAIT) == 0)
			goto unionget;
	} else {
		vfs_unbusy(mp);
	}

	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		goto out;
	}

	AUDIT_ARG(text, buf);

	if (kdebug_enable) {
		/* Emit the resolved path into the kdebug trace stream. */
		long dbg_parms[NUMPARMS];
		int  dbg_namelen;

		dbg_namelen = (int)sizeof(dbg_parms);

		if (length < dbg_namelen) {
			memcpy((char *)dbg_parms, buf, length);
			memset((char *)dbg_parms + length, 0, dbg_namelen - length);

			dbg_namelen = length;
		} else {
			/* Path longer than the trace record: keep the tail. */
			memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
		}

		/*
		 * NOTE(review): vp's iocount was dropped above; here it appears
		 * to be used only as an opaque identifying tag for the trace
		 * record, not dereferenced -- confirm against kdebug KPI.
		 */
		kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
	}

	*pathlen = (user_ssize_t)length; /* may be superseded by error */

out:
	return (error);
}
10234
10235 /*
10236 * Obtain the full pathname of a file system object by id.
10237 *
10238 * This is a private SPI used by the File Manager.
10239 */
10240 __private_extern__
10241 int
10242 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10243 {
10244 vfs_context_t ctx = vfs_context_current();
10245 fsid_t fsid;
10246 char *realpath;
10247 int length;
10248 int error;
10249
10250 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10251 return (error);
10252 }
10253 AUDIT_ARG(value32, fsid.val[0]);
10254 AUDIT_ARG(value64, uap->objid);
10255 /* Restrict output buffer size for now. */
10256
10257 if (uap->bufsize > PAGE_SIZE) {
10258 return (EINVAL);
10259 }
10260 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10261 if (realpath == NULL) {
10262 return (ENOMEM);
10263 }
10264
10265 error = fsgetpath_internal(
10266 ctx, fsid.val[0], uap->objid,
10267 uap->bufsize, realpath, &length);
10268
10269 if (error) {
10270 goto out;
10271 }
10272
10273 error = copyout((caddr_t)realpath, uap->buf, length);
10274
10275 *retval = (user_ssize_t)length; /* may be superseded by error */
10276 out:
10277 if (realpath) {
10278 FREE(realpath, M_TEMP);
10279 }
10280 return (error);
10281 }
10282
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills in either a user64_statfs or user32_statfs image of *sfsp and
 * copies it out to bufp.  When partial_copy is set, the trailing
 * f_reserved3/f_reserved4 fields are omitted from the copyout (legacy
 * statfs buffer shape), though *sizep still reports the full size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		/* fssubtype travels in a reserved slot of the user structure */
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* legacy callers get the structure minus the reserved tail */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}
	else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
			/* Hack for 4061702 . I think the real fix is for Carbon to
			 * look for some volume capability and not depend on hidden
			 * semantics agreed between a FS and carbon.
			 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
			 * for Carbon to set bNoVolumeSizes volume attribute.
			 * Without this the webdavfs files cannot be copied onto
			 * disk as they look huge. This change should not affect
			 * XSAN as they should not be setting these to -1..
			 */
			&& (sfsp->f_blocks != 0xffffffffffffffffULL)
			&& (sfsp->f_bfree != 0xffffffffffffffffULL)
			&& (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int		shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				/* shifted block count fits in a 32-bit field: done */
				if ((sfsp->f_blocks >> shift) <= INT_MAX)
					break;
				/* inflated blocksize would itself overflow: stop and clip */
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
					break;
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* inflate the reported blocksize to compensate for the shift */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* legacy callers get the structure minus the reserved tail */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* report the full structure size, even for a partial copy */
		*sizep = my_size;
	}
	return(error);
}
10410
10411 /*
10412 * copy stat structure into user_stat structure.
10413 */
10414 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
10415 {
10416 bzero(usbp, sizeof(*usbp));
10417
10418 usbp->st_dev = sbp->st_dev;
10419 usbp->st_ino = sbp->st_ino;
10420 usbp->st_mode = sbp->st_mode;
10421 usbp->st_nlink = sbp->st_nlink;
10422 usbp->st_uid = sbp->st_uid;
10423 usbp->st_gid = sbp->st_gid;
10424 usbp->st_rdev = sbp->st_rdev;
10425 #ifndef _POSIX_C_SOURCE
10426 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10427 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10428 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10429 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10430 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10431 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10432 #else
10433 usbp->st_atime = sbp->st_atime;
10434 usbp->st_atimensec = sbp->st_atimensec;
10435 usbp->st_mtime = sbp->st_mtime;
10436 usbp->st_mtimensec = sbp->st_mtimensec;
10437 usbp->st_ctime = sbp->st_ctime;
10438 usbp->st_ctimensec = sbp->st_ctimensec;
10439 #endif
10440 usbp->st_size = sbp->st_size;
10441 usbp->st_blocks = sbp->st_blocks;
10442 usbp->st_blksize = sbp->st_blksize;
10443 usbp->st_flags = sbp->st_flags;
10444 usbp->st_gen = sbp->st_gen;
10445 usbp->st_lspare = sbp->st_lspare;
10446 usbp->st_qspare[0] = sbp->st_qspare[0];
10447 usbp->st_qspare[1] = sbp->st_qspare[1];
10448 }
10449
10450 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
10451 {
10452 bzero(usbp, sizeof(*usbp));
10453
10454 usbp->st_dev = sbp->st_dev;
10455 usbp->st_ino = sbp->st_ino;
10456 usbp->st_mode = sbp->st_mode;
10457 usbp->st_nlink = sbp->st_nlink;
10458 usbp->st_uid = sbp->st_uid;
10459 usbp->st_gid = sbp->st_gid;
10460 usbp->st_rdev = sbp->st_rdev;
10461 #ifndef _POSIX_C_SOURCE
10462 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10463 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10464 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10465 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10466 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10467 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10468 #else
10469 usbp->st_atime = sbp->st_atime;
10470 usbp->st_atimensec = sbp->st_atimensec;
10471 usbp->st_mtime = sbp->st_mtime;
10472 usbp->st_mtimensec = sbp->st_mtimensec;
10473 usbp->st_ctime = sbp->st_ctime;
10474 usbp->st_ctimensec = sbp->st_ctimensec;
10475 #endif
10476 usbp->st_size = sbp->st_size;
10477 usbp->st_blocks = sbp->st_blocks;
10478 usbp->st_blksize = sbp->st_blksize;
10479 usbp->st_flags = sbp->st_flags;
10480 usbp->st_gen = sbp->st_gen;
10481 usbp->st_lspare = sbp->st_lspare;
10482 usbp->st_qspare[0] = sbp->st_qspare[0];
10483 usbp->st_qspare[1] = sbp->st_qspare[1];
10484 }
10485
10486 /*
10487 * copy stat64 structure into user_stat64 structure.
10488 */
10489 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
10490 {
10491 bzero(usbp, sizeof(*usbp));
10492
10493 usbp->st_dev = sbp->st_dev;
10494 usbp->st_ino = sbp->st_ino;
10495 usbp->st_mode = sbp->st_mode;
10496 usbp->st_nlink = sbp->st_nlink;
10497 usbp->st_uid = sbp->st_uid;
10498 usbp->st_gid = sbp->st_gid;
10499 usbp->st_rdev = sbp->st_rdev;
10500 #ifndef _POSIX_C_SOURCE
10501 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10502 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10503 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10504 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10505 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10506 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10507 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10508 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10509 #else
10510 usbp->st_atime = sbp->st_atime;
10511 usbp->st_atimensec = sbp->st_atimensec;
10512 usbp->st_mtime = sbp->st_mtime;
10513 usbp->st_mtimensec = sbp->st_mtimensec;
10514 usbp->st_ctime = sbp->st_ctime;
10515 usbp->st_ctimensec = sbp->st_ctimensec;
10516 usbp->st_birthtime = sbp->st_birthtime;
10517 usbp->st_birthtimensec = sbp->st_birthtimensec;
10518 #endif
10519 usbp->st_size = sbp->st_size;
10520 usbp->st_blocks = sbp->st_blocks;
10521 usbp->st_blksize = sbp->st_blksize;
10522 usbp->st_flags = sbp->st_flags;
10523 usbp->st_gen = sbp->st_gen;
10524 usbp->st_lspare = sbp->st_lspare;
10525 usbp->st_qspare[0] = sbp->st_qspare[0];
10526 usbp->st_qspare[1] = sbp->st_qspare[1];
10527 }
10528
10529 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
10530 {
10531 bzero(usbp, sizeof(*usbp));
10532
10533 usbp->st_dev = sbp->st_dev;
10534 usbp->st_ino = sbp->st_ino;
10535 usbp->st_mode = sbp->st_mode;
10536 usbp->st_nlink = sbp->st_nlink;
10537 usbp->st_uid = sbp->st_uid;
10538 usbp->st_gid = sbp->st_gid;
10539 usbp->st_rdev = sbp->st_rdev;
10540 #ifndef _POSIX_C_SOURCE
10541 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10542 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10543 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10544 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10545 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10546 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10547 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10548 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10549 #else
10550 usbp->st_atime = sbp->st_atime;
10551 usbp->st_atimensec = sbp->st_atimensec;
10552 usbp->st_mtime = sbp->st_mtime;
10553 usbp->st_mtimensec = sbp->st_mtimensec;
10554 usbp->st_ctime = sbp->st_ctime;
10555 usbp->st_ctimensec = sbp->st_ctimensec;
10556 usbp->st_birthtime = sbp->st_birthtime;
10557 usbp->st_birthtimensec = sbp->st_birthtimensec;
10558 #endif
10559 usbp->st_size = sbp->st_size;
10560 usbp->st_blocks = sbp->st_blocks;
10561 usbp->st_blksize = sbp->st_blksize;
10562 usbp->st_flags = sbp->st_flags;
10563 usbp->st_gen = sbp->st_gen;
10564 usbp->st_lspare = sbp->st_lspare;
10565 usbp->st_qspare[0] = sbp->st_qspare[0];
10566 usbp->st_qspare[1] = sbp->st_qspare[1];
10567 }
10568
10569 /*
10570 * Purge buffer cache for simulating cold starts
10571 */
10572 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
10573 {
10574 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
10575
10576 return VNODE_RETURNED;
10577 }
10578
10579 static int vfs_purge_callback(mount_t mp, __unused void * arg)
10580 {
10581 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
10582
10583 return VFS_RETURNED;
10584 }
10585
10586 int
10587 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
10588 {
10589 if (!kauth_cred_issuser(kauth_cred_get()))
10590 return EPERM;
10591
10592 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
10593
10594 return 0;
10595 }
10596