bsd/vfs/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1995-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1989, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  * (c) UNIX System Laboratories, Inc.
  32  * All or some portions of this file are derived from material licensed
  33  * to the University of California by American Telephone and Telegraph
  34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  35  * the permission of UNIX System Laboratories, Inc.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)vfs_syscalls.c      8.41 (Berkeley) 6/15/95
  66  */
  67 /*
  68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  69  * support for mandatory and extensible security protections.  This notice
  70  * is included in support of clause 2.2 (b) of the Apple Public License,
  71  * Version 2.0.
  72  */
  73
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/namei.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/kernel.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/stat.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/proc_internal.h>
  84 #include <sys/kauth.h>
  85 #include <sys/uio_internal.h>
  86 #include <sys/malloc.h>
  87 #include <sys/mman.h>
  88 #include <sys/dirent.h>
  89 #include <sys/attr.h>
  90 #include <sys/sysctl.h>
  91 #include <sys/ubc.h>
  92 #include <sys/quota.h>
  93 #include <sys/kdebug.h>
  94 #include <sys/fsevents.h>
  95 #include <sys/imgsrc.h>
  96 #include <sys/sysproto.h>
  97 #include <sys/xattr.h>
  98 #include <sys/fcntl.h>
  99 #include <sys/fsctl.h>
 100 #include <sys/ubc_internal.h>
 101 #include <sys/disk.h>
 102 #include <machine/cons.h>
 103 #include <machine/limits.h>
 104 #include <miscfs/specfs/specdev.h>
 105
 106 #include <security/audit/audit.h>
 107 #include <bsm/audit_kevents.h>
 108
 109 #include <mach/mach_types.h>
 110 #include <kern/kern_types.h>
 111 #include <kern/kalloc.h>
 112 #include <kern/task.h>
 113
 114 #include <vm/vm_pageout.h>
 115
 116 #include <libkern/OSAtomic.h>
 117 #include <pexpert/pexpert.h>
 118
 119 #if CONFIG_MACF
 120 #include <security/mac.h>
 121 #include <security/mac_framework.h>
 122 #endif
 123
 124 #if CONFIG_FSE
 125 #define GET_PATH(x) \
 126         (x) = get_pathbuff();
 127 #define RELEASE_PATH(x) \
 128         release_pathbuff(x);
 129 #else
 130 #define GET_PATH(x)     \
 131         MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
 132 #define RELEASE_PATH(x) \
 133         FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
 134 #endif /* CONFIG_FSE */
 135
 136 /* struct for checkdirs iteration */
 137 struct cdirargs {
 138         vnode_t olddp;
 139         vnode_t newdp;
 140 };
 141 /* callback  for checkdirs iteration */
 142 static int checkdirs_callback(proc_t p, void * arg);
 143
 144 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
 145 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
 146 void enablequotas(struct mount *mp, vfs_context_t ctx);
 147 static int getfsstat_callback(mount_t mp, void * arg);
 148 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
 149 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
 150 static int sync_callback(mount_t, void *);
 151 static void sync_thread(void *, __unused wait_result_t);
 152 static int sync_async(int);
 153 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
 154                         user_addr_t bufp, int *sizep, boolean_t is_64_bit,
 155                                                 boolean_t partial_copy);
 156 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
 157                         user_addr_t bufp);
 158 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
 159 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 160                         struct componentname *cnp, user_addr_t fsmountargs,
 161                         int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
 162                         vfs_context_t ctx);
 163 void vfs_notify_mount(vnode_t pdvp);
 164
 165 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
 166
 167 struct fd_vn_data * fg_vn_data_alloc(void);
 168
 169 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
 170
 171 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
 172
 173 #ifdef CONFIG_IMGSRC_ACCESS
 174 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
 175 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
 176 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
 177 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
 178 static void mount_end_update(mount_t mp);
 179 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
 180 #endif /* CONFIG_IMGSRC_ACCESS */
 181
 182 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
 183
 184 __private_extern__
 185 int sync_internal(void);
 186
 187 __private_extern__
 188 int unlink1(vfs_context_t, struct nameidata *, int);
 189
 190 extern lck_grp_t *fd_vn_lck_grp;
 191 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
 192 extern lck_attr_t *fd_vn_lck_attr;
 193
 194 /*
 195  * incremented each time a mount or unmount operation occurs
 196  * used to invalidate the cached value of the rootvp in the
 197  * mount structure utilized by cache_lookup_path
 198  */
 199 uint32_t mount_generation = 0;
 200
 201 /* counts number of mount and unmount operations */
 202 unsigned int vfs_nummntops=0;
 203
 204 extern const struct fileops vnops;
 205 #if CONFIG_APPLEDOUBLE
 206 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 207 #endif /* CONFIG_APPLEDOUBLE */
 208
 209 typedef uint32_t vfs_rename_flags_t;
 210 #if CONFIG_SECLUDED_RENAME
 211 enum {
 212         VFS_SECLUDE_RENAME              = 0x00000001
 213 };
 214 #endif
 215
 216 /*
 217  * Virtual File System System Calls
 218  */
 219
 220 #if NFSCLIENT || DEVFS
 221 /*
 222  * Private in-kernel mounting spi (NFS only, not exported)
 223  */
 224  __private_extern__
 225 boolean_t
 226 vfs_iskernelmount(mount_t mp)
 227 {
 228         return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
 229 }
 230
 231  __private_extern__
 232 int
 233 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
 234              void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
 235 {
 236         struct nameidata nd;
 237         boolean_t did_namei;
 238         int error;
 239
 240         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 241                UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
 242
 243         /*
 244          * Get the vnode to be covered if it's not supplied
 245          */
 246         if (vp == NULLVP) {
 247                 error = namei(&nd);
 248                 if (error)
 249                         return (error);
 250                 vp = nd.ni_vp;
 251                 pvp = nd.ni_dvp;
 252                 did_namei = TRUE;
 253         } else {
 254                 char *pnbuf = CAST_DOWN(char *, path);
 255
 256                 nd.ni_cnd.cn_pnbuf = pnbuf;
 257                 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
 258                 did_namei = FALSE;
 259         }
 260
 261         error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
 262                              syscall_flags, kern_flags, NULL, TRUE, ctx);
 263
 264         if (did_namei) {
 265                 vnode_put(vp);
 266                 vnode_put(pvp);
 267                 nameidone(&nd);
 268         }
 269
 270         return (error);
 271 }
 272 #endif /* NFSCLIENT || DEVFS */
 273
 274 /*
 275  * Mount a file system.
 276  */
 277 /* ARGSUSED */
 278 int
 279 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
 280 {
 281         struct __mac_mount_args muap;
 282
 283         muap.type = uap->type;
 284         muap.path = uap->path;
 285         muap.flags = uap->flags;
 286         muap.data = uap->data;
 287         muap.mac_p = USER_ADDR_NULL;
 288         return (__mac_mount(p, &muap, retval));
 289 }
 290
 291 void
 292 vfs_notify_mount(vnode_t pdvp)
 293 {
 294         vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
 295         lock_vnode_and_post(pdvp, NOTE_WRITE);
 296 }
 297
 298 /*
 299  * __mac_mount:
 300  *      Mount a file system taking into account MAC label behavior.
 301  *      See mount(2) man page for more information
 302  *
 303  * Parameters:    p                        Process requesting the mount
 304  *                uap                      User argument descriptor (see below)
 305  *                retval                   (ignored)
 306  *
 307  * Indirect:      uap->type                Filesystem type
 308  *                uap->path                Path to mount
 309  *                uap->data                Mount arguments
 310  *                uap->mac_p               MAC info
 311  *                uap->flags               Mount flags
 312  *
 313  *
 314  * Returns:        0                       Success
 315  *                !0                       Not success
 316  */
 317 boolean_t root_fs_upgrade_try = FALSE;
 318
 319 int
 320 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
 321 {
 322         vnode_t pvp = NULL;
 323         vnode_t vp = NULL;
 324         int need_nameidone = 0;
 325         vfs_context_t ctx = vfs_context_current();
 326         char fstypename[MFSNAMELEN];
 327         struct nameidata nd;
 328         size_t dummy=0;
 329         char *labelstr = NULL;
 330         int flags = uap->flags;
 331         int error;
 332 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
 333         boolean_t is_64bit = IS_64BIT_PROCESS(p);
 334 #else
 335 #pragma unused(p)
 336 #endif
 337         /*
 338          * Get the fs type name from user space
 339          */
 340         error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
 341         if (error)
 342                 return (error);
 343
 344         /*
 345          * Get the vnode to be covered
 346          */
 347         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 348                UIO_USERSPACE, uap->path, ctx);
 349         error = namei(&nd);
 350         if (error) {
 351                 goto out;
 352         }
 353         need_nameidone = 1;
 354         vp = nd.ni_vp;
 355         pvp = nd.ni_dvp;
 356
 357 #ifdef CONFIG_IMGSRC_ACCESS
 358         /* Mounting image source cannot be batched with other operations */
 359         if (flags == MNT_IMGSRC_BY_INDEX) {
 360                 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
 361                                                   ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
 362                 goto out;
 363         }
 364 #endif /* CONFIG_IMGSRC_ACCESS */
 365
 366 #if CONFIG_MACF
 367         /*
 368          * Get the label string (if any) from user space
 369          */
 370         if (uap->mac_p != USER_ADDR_NULL) {
 371                 struct user_mac mac;
 372                 size_t ulen = 0;
 373
 374                 if (is_64bit) {
 375                         struct user64_mac mac64;
 376                         error = copyin(uap->mac_p, &mac64, sizeof(mac64));
 377                         mac.m_buflen = mac64.m_buflen;
 378                         mac.m_string = mac64.m_string;
 379                 } else {
 380                         struct user32_mac mac32;
 381                         error = copyin(uap->mac_p, &mac32, sizeof(mac32));
 382                         mac.m_buflen = mac32.m_buflen;
 383                         mac.m_string = mac32.m_string;
 384                 }
 385                 if (error)
 386                         goto out;
 387                 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
 388                     (mac.m_buflen < 2)) {
 389                         error = EINVAL;
 390                         goto out;
 391                 }
 392                 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
 393                 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
 394                 if (error) {
 395                         goto out;
 396                 }
 397                 AUDIT_ARG(mac_string, labelstr);
 398         }
 399 #endif /* CONFIG_MACF */
 400
 401         AUDIT_ARG(fflags, flags);
 402
 403         if ((vp->v_flag & VROOT) &&
 404                         (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
 405                 if (!(flags & MNT_UNION)) {
 406                         flags |= MNT_UPDATE;
 407                 }
 408                 else {
 409                         /*
 410                          * For a union mount on '/', treat it as fresh
 411                          * mount instead of update.
 412                          * Otherwise, union mouting on '/' used to panic the
 413                          * system before, since mnt_vnodecovered was found to
 414                          * be NULL for '/' which is required for unionlookup
 415                          * after it gets ENOENT on union mount.
 416                          */
 417                         flags = (flags & ~(MNT_UPDATE));
 418                 }
 419
 420 #ifdef SECURE_KERNEL
 421                 if ((flags & MNT_RDONLY) == 0) {
 422                         /* Release kernels are not allowed to mount "/" as rw */
 423                         error = EPERM;
 424                         goto out;
 425                 }
 426 #endif
 427                 /*
 428                  * See 7392553 for more details on why this check exists.
 429                  * Suffice to say: If this check is ON and something tries
 430                  * to mount the rootFS RW, we'll turn off the codesign
 431                  * bitmap optimization.
 432                  */
 433 #if CHECK_CS_VALIDATION_BITMAP
 434                 if ((flags & MNT_RDONLY) == 0 ) {
 435                         root_fs_upgrade_try = TRUE;
 436                 }
 437 #endif
 438         }
 439
 440         error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
 441                              labelstr, FALSE, ctx);
 442
 443 out:
 444
 445 #if CONFIG_MACF
 446         if (labelstr)
 447                 FREE(labelstr, M_MACTEMP);
 448 #endif /* CONFIG_MACF */
 449
 450         if (vp) {
 451                 vnode_put(vp);
 452         }
 453         if (pvp) {
 454                 vnode_put(pvp);
 455         }
 456         if (need_nameidone) {
 457                 nameidone(&nd);
 458         }
 459
 460         return (error);
 461 }
 462
 463 /*
 464  * common mount implementation (final stage of mounting)
 465
 466  * Arguments:
  467  *  fstypename  file system type (i.e. its vfs name)
 468  *  pvp         parent of covered vnode
 469  *  vp          covered vnode
 470  *  cnp         component name (ie path) of covered vnode
 471  *  flags       generic mount flags
 472  *  fsmountargs file system specific data
 473  *  labelstr    optional MAC label
 474  *  kernelmount TRUE for mounts initiated from inside the kernel
 475  *  ctx         caller's context
 476  */
 477 static int
 478 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 479              struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
 480              char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
 481 {
 482 #if !CONFIG_MACF
 483 #pragma unused(labelstr)
 484 #endif
 485         struct vnode *devvp = NULLVP;
 486         struct vnode *device_vnode = NULLVP;
 487 #if CONFIG_MACF
 488         struct vnode *rvp;
 489 #endif
 490         struct mount *mp;
 491         struct vfstable *vfsp = (struct vfstable *)0;
 492         struct proc *p = vfs_context_proc(ctx);
 493         int error, flag = 0;
 494         user_addr_t devpath = USER_ADDR_NULL;
 495         int ronly = 0;
 496         int mntalloc = 0;
 497         boolean_t vfsp_ref = FALSE;
 498         boolean_t is_rwlock_locked = FALSE;
 499         boolean_t did_rele = FALSE;
 500         boolean_t have_usecount = FALSE;
 501
 502         /*
 503          * Process an update for an existing mount
 504          */
 505         if (flags & MNT_UPDATE) {
 506                 if ((vp->v_flag & VROOT) == 0) {
 507                         error = EINVAL;
 508                         goto out1;
 509                 }
 510                 mp = vp->v_mount;
 511
 512                 /* unmount in progress return error */
 513                 mount_lock_spin(mp);
 514                 if (mp->mnt_lflag & MNT_LUNMOUNT) {
 515                         mount_unlock(mp);
 516                         error = EBUSY;
 517                         goto out1;
 518                 }
 519                 mount_unlock(mp);
 520                 lck_rw_lock_exclusive(&mp->mnt_rwlock);
 521                 is_rwlock_locked = TRUE;
 522                 /*
 523                  * We only allow the filesystem to be reloaded if it
 524                  * is currently mounted read-only.
 525                  */
 526                 if ((flags & MNT_RELOAD) &&
 527                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 528                         error = ENOTSUP;
 529                         goto out1;
 530                 }
 531
 532                 /*
 533                  * If content protection is enabled, update mounts are not
 534                  * allowed to turn it off.
 535                  */
 536                 if ((mp->mnt_flag & MNT_CPROTECT) &&
 537                            ((flags & MNT_CPROTECT) == 0)) {
 538                         error = EINVAL;
 539                         goto out1;
 540                 }
 541
 542 #ifdef CONFIG_IMGSRC_ACCESS
 543                 /* Can't downgrade the backer of the root FS */
 544                 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
 545                         (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
 546                         error = ENOTSUP;
 547                         goto out1;
 548                 }
 549 #endif /* CONFIG_IMGSRC_ACCESS */
 550
 551                 /*
 552                  * Only root, or the user that did the original mount is
 553                  * permitted to update it.
 554                  */
 555                 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
 556                     (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
 557                         goto out1;
 558                 }
 559 #if CONFIG_MACF
 560                 error = mac_mount_check_remount(ctx, mp);
 561                 if (error != 0) {
 562                         goto out1;
 563                 }
 564 #endif
 565                 /*
 566                  * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
 567                  * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
 568                  */
 569                 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 570                         flags |= MNT_NOSUID | MNT_NODEV;
 571                         if (mp->mnt_flag & MNT_NOEXEC)
 572                                 flags |= MNT_NOEXEC;
 573                 }
 574                 flag = mp->mnt_flag;
 575
 576
 577
 578                 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 579
 580                 vfsp = mp->mnt_vtable;
 581                 goto update;
 582         }
 583         /*
 584          * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
 585          * MNT_NOEXEC if mount point is already MNT_NOEXEC.
 586          */
 587         if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 588                 flags |= MNT_NOSUID | MNT_NODEV;
 589                 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 590                         flags |= MNT_NOEXEC;
 591         }
 592
 593         /* XXXAUDIT: Should we capture the type on the error path as well? */
 594         AUDIT_ARG(text, fstypename);
 595         mount_list_lock();
 596         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 597                 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
 598                         vfsp->vfc_refcount++;
 599                         vfsp_ref = TRUE;
 600                         break;
 601                 }
 602         mount_list_unlock();
 603         if (vfsp == NULL) {
 604                 error = ENODEV;
 605                 goto out1;
 606         }
 607
 608         /*
 609          * VFC_VFSLOCALARGS is not currently supported for kernel mounts
 610          */
 611         if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
 612                 error = EINVAL;  /* unsupported request */
 613                 goto out1;
 614         }
 615
 616         error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
 617         if (error != 0) {
 618                 goto out1;
 619         }
 620
 621         /*
 622          * Allocate and initialize the filesystem (mount_t)
 623          */
 624         MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
 625                 M_MOUNT, M_WAITOK);
 626         bzero((char *)mp, (u_int32_t)sizeof(struct mount));
 627         mntalloc = 1;
 628
 629         /* Initialize the default IO constraints */
 630         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
 631         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
 632         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
 633         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
 634         mp->mnt_devblocksize = DEV_BSIZE;
 635         mp->mnt_alignmentmask = PAGE_MASK;
 636         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
 637         mp->mnt_ioscale = 1;
 638         mp->mnt_ioflags = 0;
 639         mp->mnt_realrootvp = NULLVP;
 640         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
 641
 642         TAILQ_INIT(&mp->mnt_vnodelist);
 643         TAILQ_INIT(&mp->mnt_workerqueue);
 644         TAILQ_INIT(&mp->mnt_newvnodes);
 645         mount_lock_init(mp);
 646         lck_rw_lock_exclusive(&mp->mnt_rwlock);
 647         is_rwlock_locked = TRUE;
 648         mp->mnt_op = vfsp->vfc_vfsops;
 649         mp->mnt_vtable = vfsp;
 650         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
 651         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 652         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
 653         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
 654         mp->mnt_vnodecovered = vp;
 655         mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
 656         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
 657         mp->mnt_devbsdunit = 0;
 658
 659         /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
 660         vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
 661
 662 #if NFSCLIENT || DEVFS
 663         if (kernelmount)
 664                 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
 665         if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
 666                 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
 667 #endif /* NFSCLIENT || DEVFS */
 668
 669 update:
 670         /*
 671          * Set the mount level flags.
 672          */
 673         if (flags & MNT_RDONLY)
 674                 mp->mnt_flag |= MNT_RDONLY;
 675         else if (mp->mnt_flag & MNT_RDONLY) {
 676                 // disallow read/write upgrades of file systems that
 677                 // had the TYPENAME_OVERRIDE feature set.
 678                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
 679                         error = EPERM;
 680                         goto out1;
 681                 }
 682                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
 683         }
 684         mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 685                           MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 686                           MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 687                           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 688                           MNT_QUARANTINE | MNT_CPROTECT);
 689         mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 690                                  MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 691                                  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 692                                  MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 693                                  MNT_QUARANTINE | MNT_CPROTECT);
 694
 695 #if CONFIG_MACF
 696         if (flags & MNT_MULTILABEL) {
 697                 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
 698                         error = EINVAL;
 699                         goto out1;
 700                 }
 701                 mp->mnt_flag |= MNT_MULTILABEL;
 702         }
 703 #endif
 704         /*
 705          * Process device path for local file systems if requested
 706          */
 707         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
 708                 if (vfs_context_is64bit(ctx)) {
 709                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
 710                                 goto out1;
 711                         fsmountargs += sizeof(devpath);
 712                 } else {
 713                         user32_addr_t tmp;
 714                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
 715                                 goto out1;
 716                         /* munge into LP64 addr */
 717                         devpath = CAST_USER_ADDR_T(tmp);
 718                         fsmountargs += sizeof(tmp);
 719                 }
 720
 721                 /* Lookup device and authorize access to it */
 722                 if ((devpath)) {
 723                         struct nameidata nd;
 724
 725                         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
 726                         if ( (error = namei(&nd)) )
 727                                 goto out1;
 728
 729                         strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
 730                         devvp = nd.ni_vp;
 731
 732                         nameidone(&nd);
 733
 734                         if (devvp->v_type != VBLK) {
 735                                 error = ENOTBLK;
 736                                 goto out2;
 737                         }
 738                         if (major(devvp->v_rdev) >= nblkdev) {
 739                                 error = ENXIO;
 740                                 goto out2;
 741                         }
 742                         /*
 743                         * If mount by non-root, then verify that user has necessary
 744                         * permissions on the device.
 745                         */
 746                         if (suser(vfs_context_ucred(ctx), NULL) != 0) {
 747                                 mode_t accessmode = KAUTH_VNODE_READ_DATA;
 748
 749                                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
 750                                         accessmode |= KAUTH_VNODE_WRITE_DATA;
 751                                 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
 752                                         goto out2;
 753                         }
 754                 }
 755                 /* On first mount, preflight and open device */
 756                 if (devpath && ((flags & MNT_UPDATE) == 0)) {
 757                         if ( (error = vnode_ref(devvp)) )
 758                                 goto out2;
 759                         /*
 760                         * Disallow multiple mounts of the same device.
 761                         * Disallow mounting of a device that is currently in use
 762                         * (except for root, which might share swap device for miniroot).
 763                         * Flush out any old buffers remaining from a previous use.
 764                         */
 765                         if ( (error = vfs_mountedon(devvp)) )
 766                                 goto out3;
 767
 768                         if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
 769                                 error = EBUSY;
 770                                 goto out3;
 771                         }
 772                         if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
 773                                 error = ENOTBLK;
 774                                 goto out3;
 775                         }
 776                         if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
 777                                 goto out3;
 778
 779                         ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 780 #if CONFIG_MACF
 781                         error = mac_vnode_check_open(ctx,
 782                             devvp,
 783                             ronly ? FREAD : FREAD|FWRITE);
 784                         if (error)
 785                                 goto out3;
 786 #endif /* MAC */
 787                         if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
 788                                 goto out3;
 789
 790                         mp->mnt_devvp = devvp;
 791                         device_vnode = devvp;
 792
 793                 } else if ((mp->mnt_flag & MNT_RDONLY) &&
 794                            (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
 795                            (device_vnode = mp->mnt_devvp)) {
 796                         dev_t dev;
 797                         int maj;
 798                         /*
 799                          * If upgrade to read-write by non-root, then verify
 800                          * that user has necessary permissions on the device.
 801                          */
 802                         vnode_getalways(device_vnode);
 803
 804                         if (suser(vfs_context_ucred(ctx), NULL) &&
 805                             (error = vnode_authorize(device_vnode, NULL,
 806                              KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
 807                              ctx)) != 0) {
 808                                 vnode_put(device_vnode);
 809                                 goto out2;
 810                         }
 811
 812                         /* Tell the device that we're upgrading */
 813                         dev = (dev_t)device_vnode->v_rdev;
 814                         maj = major(dev);
 815
 816                         if ((u_int)maj >= (u_int)nblkdev)
 817                                 panic("Volume mounted on a device with invalid major number.");
 818
 819                         error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
 820                         vnode_put(device_vnode);
 821                         device_vnode = NULLVP;
 822                         if (error != 0) {
 823                                 goto out2;
 824                         }
 825                 }
 826         }
 827 #if CONFIG_MACF
 828         if ((flags & MNT_UPDATE) == 0) {
 829                 mac_mount_label_init(mp);
 830                 mac_mount_label_associate(ctx, mp);
 831         }
 832         if (labelstr) {
 833                 if ((flags & MNT_UPDATE) != 0) {
 834                         error = mac_mount_check_label_update(ctx, mp);
 835                         if (error != 0)
 836                                 goto out3;
 837                 }
 838         }
 839 #endif
 840         /*
 841          * Mount the filesystem.
 842          */
 843         error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
 844
 845         if (flags & MNT_UPDATE) {
 846                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 847                         mp->mnt_flag &= ~MNT_RDONLY;
 848                 mp->mnt_flag &=~
 849                     (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
 850                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 851                 if (error)
 852                         mp->mnt_flag = flag;  /* restore flag value */
 853                 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
 854                 lck_rw_done(&mp->mnt_rwlock);
 855                 is_rwlock_locked = FALSE;
 856                 if (!error)
 857                         enablequotas(mp, ctx);
 858                 goto exit;
 859         }
 860
 861         /*
 862          * Put the new filesystem on the mount list after root.
 863          */
 864         if (error == 0) {
 865                 struct vfs_attr vfsattr;
 866 #if CONFIG_MACF
 867                 if (vfs_flags(mp) & MNT_MULTILABEL) {
 868                         error = VFS_ROOT(mp, &rvp, ctx);
 869                         if (error) {
 870                                 printf("%s() VFS_ROOT returned %d\n", __func__, error);
 871                                 goto out3;
 872                         }
 873                         error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
 874                         /*
 875                          * drop reference provided by VFS_ROOT
 876                          */
 877                         vnode_put(rvp);
 878
 879                         if (error)
 880                                 goto out3;
 881                 }
 882 #endif  /* MAC */
 883
 884                 vnode_lock_spin(vp);
 885                 CLR(vp->v_flag, VMOUNT);
 886                 vp->v_mountedhere = mp;
 887                 vnode_unlock(vp);
 888
 889                 /*
 890                  * taking the name_cache_lock exclusively will
 891                  * insure that everyone is out of the fast path who
 892                  * might be trying to use a now stale copy of
 893                  * vp->v_mountedhere->mnt_realrootvp
 894                  * bumping mount_generation causes the cached values
 895                  * to be invalidated
 896                  */
 897                 name_cache_lock();
 898                 mount_generation++;
 899                 name_cache_unlock();
 900
 901                 error = vnode_ref(vp);
 902                 if (error != 0) {
 903                         goto out4;
 904                 }
 905
 906                 have_usecount = TRUE;
 907
 908                 error = checkdirs(vp, ctx);
 909                 if (error != 0)  {
 910                         /* Unmount the filesystem as cdir/rdirs cannot be updated */
 911                         goto out4;
 912                 }
 913                 /*
 914                  * there is no cleanup code here so I have made it void
 915                  * we need to revisit this
 916                  */
 917                 (void)VFS_START(mp, 0, ctx);
 918
 919                 if (mount_list_add(mp) != 0) {
 920                         /*
 921                          * The system is shutting down trying to umount
 922                          * everything, so fail with a plausible errno.
 923                          */
 924                         error = EBUSY;
 925                         goto out4;
 926                 }
 927                 lck_rw_done(&mp->mnt_rwlock);
 928                 is_rwlock_locked = FALSE;
 929
 930                 /* Check if this mounted file system supports EAs or named streams. */
 931                 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
 932                 VFSATTR_INIT(&vfsattr);
 933                 VFSATTR_WANTED(&vfsattr, f_capabilities);
 934                 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
 935                     vfs_getattr(mp, &vfsattr, ctx) == 0 &&
 936                     VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
 937                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
 938                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
 939                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
 940                         }
 941 #if NAMEDSTREAMS
 942                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
 943                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
 944                                 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
 945                         }
 946 #endif
 947                         /* Check if this file system supports path from id lookups. */
 948                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
 949                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
 950                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
 951                         } else if (mp->mnt_flag & MNT_DOVOLFS) {
 952                                 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
 953                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
 954                         }
 955                 }
 956                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
 957                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
 958                 }
 959                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
 960                         mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
 961                 }
 962                 /* increment the operations count */
 963                 OSAddAtomic(1, &vfs_nummntops);
 964                 enablequotas(mp, ctx);
 965
 966                 if (device_vnode) {
 967                         device_vnode->v_specflags |= SI_MOUNTEDON;
 968
 969                         /*
 970                          *   cache the IO attributes for the underlying physical media...
 971                          *   an error return indicates the underlying driver doesn't
 972                          *   support all the queries necessary... however, reasonable
 973                          *   defaults will have been set, so no reason to bail or care
 974                          */
 975                         vfs_init_io_attributes(device_vnode, mp);
 976                 }
 977
 978                 /* Now that mount is setup, notify the listeners */
 979                 vfs_notify_mount(pvp);
 980         } else {
 981                 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
 982                 if (mp->mnt_vnodelist.tqh_first != NULL) {
 983                         panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
 984                                         mp->mnt_vtable->vfc_name, error);
 985                 }
 986
 987                 vnode_lock_spin(vp);
 988                 CLR(vp->v_flag, VMOUNT);
 989                 vnode_unlock(vp);
 990                 mount_list_lock();
 991                 mp->mnt_vtable->vfc_refcount--;
 992                 mount_list_unlock();
 993
 994                 if (device_vnode ) {
 995                         vnode_rele(device_vnode);
 996                         VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
 997                 }
 998                 lck_rw_done(&mp->mnt_rwlock);
 999                 is_rwlock_locked = FALSE;
1000
1001                 /*
1002                  * if we get here, we have a mount structure that needs to be freed,
1003                  * but since the coveredvp hasn't yet been updated to point at it,
1004                  * no need to worry about other threads holding a crossref on this mp
1005                  * so it's ok to just free it
1006                  */
1007                 mount_lock_destroy(mp);
1008 #if CONFIG_MACF
1009                 mac_mount_label_destroy(mp);
1010 #endif
1011                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1012         }
1013 exit:
1014         /*
1015          * drop I/O count on the device vp if there was one
1016          */
1017         if (devpath && devvp)
1018                 vnode_put(devvp);
1019
1020         return(error);
1021
1022 /* Error condition exits */
1023 out4:
1024         (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1025
1026         /*
1027          * If the mount has been placed on the covered vp,
1028          * it may have been discovered by now, so we have
1029          * to treat this just like an unmount
1030          */
1031         mount_lock_spin(mp);
1032         mp->mnt_lflag |= MNT_LDEAD;
1033         mount_unlock(mp);
1034
1035         if (device_vnode != NULLVP) {
1036                 vnode_rele(device_vnode);
1037                 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1038                        ctx);
1039                 did_rele = TRUE;
1040         }
1041
1042         vnode_lock_spin(vp);
1043
1044         mp->mnt_crossref++;
1045         vp->v_mountedhere = (mount_t) 0;
1046
1047         vnode_unlock(vp);
1048
1049         if (have_usecount) {
1050                 vnode_rele(vp);
1051         }
1052 out3:
1053         if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1054                 vnode_rele(devvp);
1055 out2:
1056         if (devpath && devvp)
1057                 vnode_put(devvp);
1058 out1:
1059         /* Release mnt_rwlock only when it was taken */
1060         if (is_rwlock_locked == TRUE) {
1061                 lck_rw_done(&mp->mnt_rwlock);
1062         }
1063
1064         if (mntalloc) {
1065                 if (mp->mnt_crossref)
1066                         mount_dropcrossref(mp, vp, 0);
1067                 else {
1068                         mount_lock_destroy(mp);
1069 #if CONFIG_MACF
1070                         mac_mount_label_destroy(mp);
1071 #endif
1072                         FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1073                 }
1074         }
1075         if (vfsp_ref) {
1076                 mount_list_lock();
1077                 vfsp->vfc_refcount--;
1078                 mount_list_unlock();
1079         }
1080
1081         return(error);
1082 }
1083
1084 /*
1085  * Flush in-core data, check for competing mount attempts,
1086  * and set VMOUNT
1087  */
1088 int
1089 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1090 {
1091 #if !CONFIG_MACF
1092 #pragma unused(cnp,fsname)
1093 #endif
1094         struct vnode_attr va;
1095         int error;
1096
1097         if (!skip_auth) {
1098                 /*
1099                  * If the user is not root, ensure that they own the directory
1100                  * onto which we are attempting to mount.
1101                  */
1102                 VATTR_INIT(&va);
1103                 VATTR_WANTED(&va, va_uid);
1104                 if ((error = vnode_getattr(vp, &va, ctx)) ||
1105                                 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1106                                  (!vfs_context_issuser(ctx)))) {
1107                         error = EPERM;
1108                         goto out;
1109                 }
1110         }
1111
1112         if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1113                 goto out;
1114
1115         if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1116                 goto out;
1117
1118         if (vp->v_type != VDIR) {
1119                 error = ENOTDIR;
1120                 goto out;
1121         }
1122
1123         if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1124                 error = EBUSY;
1125                 goto out;
1126         }
1127
1128 #if CONFIG_MACF
1129         error = mac_mount_check_mount(ctx, vp,
1130             cnp, fsname);
1131         if (error != 0)
1132                 goto out;
1133 #endif
1134
1135         vnode_lock_spin(vp);
1136         SET(vp->v_flag, VMOUNT);
1137         vnode_unlock(vp);
1138
1139 out:
1140         return error;
1141 }
1142
1143 #if CONFIG_IMGSRC_ACCESS
1144
/*
 * IMGSRC_DEBUG: printf-style tracing for the imageboot-source relocation
 * code.  Expands to printf() on DEBUG builds and to an empty statement
 * otherwise, so the arguments are not evaluated on non-DEBUG builds.
 */
#if DEBUG
#define IMGSRC_DEBUG(args...) printf(args)
#else
#define IMGSRC_DEBUG(args...) do { } while(0)
#endif
1150
1151 static int
1152 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1153 {
1154         struct nameidata nd;
1155         vnode_t vp, realdevvp;
1156         mode_t accessmode;
1157         int error;
1158
1159         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1160         if ( (error = namei(&nd)) ) {
1161                 IMGSRC_DEBUG("namei() failed with %d\n", error);
1162                 return error;
1163         }
1164
1165         vp = nd.ni_vp;
1166
1167         if (!vnode_isblk(vp)) {
1168                 IMGSRC_DEBUG("Not block device.\n");
1169                 error = ENOTBLK;
1170                 goto out;
1171         }
1172
1173         realdevvp = mp->mnt_devvp;
1174         if (realdevvp == NULLVP) {
1175                 IMGSRC_DEBUG("No device backs the mount.\n");
1176                 error = ENXIO;
1177                 goto out;
1178         }
1179
1180         error = vnode_getwithref(realdevvp);
1181         if (error != 0) {
1182                 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1183                 goto out;
1184         }
1185
1186         if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1187                 IMGSRC_DEBUG("Wrong dev_t.\n");
1188                 error = ENXIO;
1189                 goto out1;
1190         }
1191
1192         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1193
1194         /*
1195          * If mount by non-root, then verify that user has necessary
1196          * permissions on the device.
1197          */
1198         if (!vfs_context_issuser(ctx)) {
1199                 accessmode = KAUTH_VNODE_READ_DATA;
1200                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1201                         accessmode |= KAUTH_VNODE_WRITE_DATA;
1202                 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1203                         IMGSRC_DEBUG("Access denied.\n");
1204                         goto out1;
1205                 }
1206         }
1207
1208         *devvpp = vp;
1209
1210 out1:
1211         vnode_put(realdevvp);
1212 out:
1213         nameidone(&nd);
1214         if (error) {
1215                 vnode_put(vp);
1216         }
1217
1218         return error;
1219 }
1220
1221 /*
1222  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1223  * and call checkdirs()
1224  */
1225 static int
1226 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1227 {
1228         int error;
1229
1230         mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1231
1232         vnode_lock_spin(vp);
1233         CLR(vp->v_flag, VMOUNT);
1234         vp->v_mountedhere = mp;
1235         vnode_unlock(vp);
1236
1237         /*
1238          * taking the name_cache_lock exclusively will
1239          * insure that everyone is out of the fast path who
1240          * might be trying to use a now stale copy of
1241          * vp->v_mountedhere->mnt_realrootvp
1242          * bumping mount_generation causes the cached values
1243          * to be invalidated
1244          */
1245         name_cache_lock();
1246         mount_generation++;
1247         name_cache_unlock();
1248
1249         error = vnode_ref(vp);
1250         if (error != 0) {
1251                 goto out;
1252         }
1253
1254         error = checkdirs(vp, ctx);
1255         if (error != 0)  {
1256                 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1257                 vnode_rele(vp);
1258                 goto out;
1259         }
1260
1261 out:
1262         if (error != 0) {
1263                 mp->mnt_vnodecovered = NULLVP;
1264         }
1265         return error;
1266 }
1267
1268 static void
1269 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1270 {
1271         vnode_rele(vp);
1272         vnode_lock_spin(vp);
1273         vp->v_mountedhere = (mount_t)NULL;
1274         vnode_unlock(vp);
1275
1276         mp->mnt_vnodecovered = NULLVP;
1277 }
1278
1279 static int
1280 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1281 {
1282         int error;
1283
1284         /* unmount in progress return error */
1285         mount_lock_spin(mp);
1286         if (mp->mnt_lflag & MNT_LUNMOUNT) {
1287                 mount_unlock(mp);
1288                 return EBUSY;
1289         }
1290         mount_unlock(mp);
1291         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1292
1293         /*
1294          * We only allow the filesystem to be reloaded if it
1295          * is currently mounted read-only.
1296          */
1297         if ((flags & MNT_RELOAD) &&
1298                         ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1299                 error = ENOTSUP;
1300                 goto out;
1301         }
1302
1303         /*
1304          * Only root, or the user that did the original mount is
1305          * permitted to update it.
1306          */
1307         if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1308                         (!vfs_context_issuser(ctx))) {
1309                 error = EPERM;
1310                 goto out;
1311         }
1312 #if CONFIG_MACF
1313         error = mac_mount_check_remount(ctx, mp);
1314         if (error != 0) {
1315                 goto out;
1316         }
1317 #endif
1318
1319 out:
1320         if (error) {
1321                 lck_rw_done(&mp->mnt_rwlock);
1322         }
1323
1324         return error;
1325 }
1326
1327 static void
1328 mount_end_update(mount_t mp)
1329 {
1330         lck_rw_done(&mp->mnt_rwlock);
1331 }
1332
1333 static int
1334 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1335 {
1336         vnode_t vp;
1337
1338         if (height >= MAX_IMAGEBOOT_NESTING) {
1339                 return EINVAL;
1340         }
1341
1342         vp = imgsrc_rootvnodes[height];
1343         if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1344                 *rvpp = vp;
1345                 return 0;
1346         } else {
1347                 return ENOENT;
1348         }
1349 }
1350
1351 static int
1352 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1353                 const char *fsname, vfs_context_t ctx,
1354                 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1355 {
1356         int error;
1357         mount_t mp;
1358         boolean_t placed = FALSE;
1359         vnode_t devvp = NULLVP;
1360         struct vfstable *vfsp;
1361         user_addr_t devpath;
1362         char *old_mntonname;
1363         vnode_t rvp;
1364         uint32_t height;
1365         uint32_t flags;
1366
1367         /* If we didn't imageboot, nothing to move */
1368         if (imgsrc_rootvnodes[0] == NULLVP) {
1369                 return EINVAL;
1370         }
1371
1372         /* Only root can do this */
1373         if (!vfs_context_issuser(ctx)) {
1374                 return EPERM;
1375         }
1376
1377         IMGSRC_DEBUG("looking for root vnode.\n");
1378
1379         /*
1380          * Get root vnode of filesystem we're moving.
1381          */
1382         if (by_index) {
1383                 if (is64bit) {
1384                         struct user64_mnt_imgsrc_args mia64;
1385                         error = copyin(fsmountargs, &mia64, sizeof(mia64));
1386                         if (error != 0) {
1387                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1388                                 return error;
1389                         }
1390
1391                         height = mia64.mi_height;
1392                         flags = mia64.mi_flags;
1393                         devpath = mia64.mi_devpath;
1394                 } else {
1395                         struct user32_mnt_imgsrc_args mia32;
1396                         error = copyin(fsmountargs, &mia32, sizeof(mia32));
1397                         if (error != 0) {
1398                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1399                                 return error;
1400                         }
1401
1402                         height = mia32.mi_height;
1403                         flags = mia32.mi_flags;
1404                         devpath = mia32.mi_devpath;
1405                 }
1406         } else {
1407                 /*
1408                  * For binary compatibility--assumes one level of nesting.
1409                  */
1410                 if (is64bit) {
1411                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1412                                 return error;
1413                 } else {
1414                         user32_addr_t tmp;
1415                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1416                                 return error;
1417
1418                         /* munge into LP64 addr */
1419                         devpath = CAST_USER_ADDR_T(tmp);
1420                 }
1421
1422                 height = 0;
1423                 flags = 0;
1424         }
1425
1426         if (flags != 0) {
1427                 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1428                 return EINVAL;
1429         }
1430
1431         error = get_imgsrc_rootvnode(height, &rvp);
1432         if (error != 0) {
1433                 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1434                 return error;
1435         }
1436
1437         IMGSRC_DEBUG("got root vnode.\n");
1438
1439         MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1440
1441         /* Can only move once */
1442         mp = vnode_mount(rvp);
1443         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1444                 IMGSRC_DEBUG("Already moved.\n");
1445                 error = EBUSY;
1446                 goto out0;
1447         }
1448
1449         IMGSRC_DEBUG("Starting updated.\n");
1450
1451         /* Get exclusive rwlock on mount, authorize update on mp */
1452         error = mount_begin_update(mp , ctx, 0);
1453         if (error != 0) {
1454                 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1455                 goto out0;
1456         }
1457
1458         /*
1459          * It can only be moved once.  Flag is set under the rwlock,
1460          * so we're now safe to proceed.
1461          */
1462         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1463                 IMGSRC_DEBUG("Already moved [2]\n");
1464                 goto out1;
1465         }
1466
1467
1468         IMGSRC_DEBUG("Preparing coveredvp.\n");
1469
1470         /* Mark covered vnode as mount in progress, authorize placing mount on top */
1471         error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1472         if (error != 0) {
1473                 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1474                 goto out1;
1475         }
1476
1477         IMGSRC_DEBUG("Covered vp OK.\n");
1478
1479         /* Sanity check the name caller has provided */
1480         vfsp = mp->mnt_vtable;
1481         if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1482                 IMGSRC_DEBUG("Wrong fs name.\n");
1483                 error = EINVAL;
1484                 goto out2;
1485         }
1486
1487         /* Check the device vnode and update mount-from name, for local filesystems */
1488         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1489                 IMGSRC_DEBUG("Local, doing device validation.\n");
1490
1491                 if (devpath != USER_ADDR_NULL) {
1492                         error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1493                         if (error) {
1494                                 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1495                                 goto out2;
1496                         }
1497
1498                         vnode_put(devvp);
1499                 }
1500         }
1501
1502         /*
1503          * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1504          * and increment the name cache's mount generation
1505          */
1506
1507         IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1508         error = place_mount_and_checkdirs(mp, vp, ctx);
1509         if (error != 0) {
1510                 goto out2;
1511         }
1512
1513         placed = TRUE;
1514
1515         strncpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1516         strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1517
1518         /* Forbid future moves */
1519         mount_lock(mp);
1520         mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1521         mount_unlock(mp);
1522
1523         /* Finally, add to mount list, completely ready to go */
1524         if (mount_list_add(mp) != 0) {
1525                 /*
1526                  * The system is shutting down trying to umount
1527                  * everything, so fail with a plausible errno.
1528                  */
1529                 error = EBUSY;
1530                 goto out3;
1531         }
1532
1533         mount_end_update(mp);
1534         vnode_put(rvp);
1535         FREE(old_mntonname, M_TEMP);
1536
1537         vfs_notify_mount(pvp);
1538
1539         return 0;
1540 out3:
1541         strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1542
1543         mount_lock(mp);
1544         mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1545         mount_unlock(mp);
1546
1547 out2:
1548         /*
1549          * Placing the mp on the vnode clears VMOUNT,
1550          * so cleanup is different after that point
1551          */
1552         if (placed) {
1553                 /* Rele the vp, clear VMOUNT and v_mountedhere */
1554                 undo_place_on_covered_vp(mp, vp);
1555         } else {
1556                 vnode_lock_spin(vp);
1557                 CLR(vp->v_flag, VMOUNT);
1558                 vnode_unlock(vp);
1559         }
1560 out1:
1561         mount_end_update(mp);
1562
1563 out0:
1564         vnode_put(rvp);
1565         FREE(old_mntonname, M_TEMP);
1566         return error;
1567 }
1568
1569 #endif /* CONFIG_IMGSRC_ACCESS */
1570
1571 void
1572 enablequotas(struct mount *mp, vfs_context_t ctx)
1573 {
1574         struct nameidata qnd;
1575         int type;
1576         char qfpath[MAXPATHLEN];
1577         const char *qfname = QUOTAFILENAME;
1578         const char *qfopsname = QUOTAOPSNAME;
1579         const char *qfextension[] = INITQFNAMES;
1580
1581         /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
1582         if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1583                 return;
1584         }
1585         /*
1586          * Enable filesystem disk quotas if necessary.
1587          * We ignore errors as this should not interfere with final mount
1588          */
1589         for (type=0; type < MAXQUOTAS; type++) {
1590                 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1591                 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1592                        CAST_USER_ADDR_T(qfpath), ctx);
1593                 if (namei(&qnd) != 0)
1594                         continue;           /* option file to trigger quotas is not present */
1595                 vnode_put(qnd.ni_vp);
1596                 nameidone(&qnd);
1597                 snprintf(qfpath, sizeof(qfpath),  "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1598
1599                 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1600         }
1601         return;
1602 }
1603
1604
1605 static int
1606 checkdirs_callback(proc_t p, void * arg)
1607 {
1608         struct cdirargs * cdrp = (struct cdirargs * )arg;
1609         vnode_t olddp = cdrp->olddp;
1610         vnode_t newdp = cdrp->newdp;
1611         struct filedesc *fdp;
1612         vnode_t tvp;
1613         vnode_t fdp_cvp;
1614         vnode_t fdp_rvp;
1615         int cdir_changed = 0;
1616         int rdir_changed = 0;
1617
1618         /*
1619          * XXX Also needs to iterate each thread in the process to see if it
1620          * XXX is using a per-thread current working directory, and, if so,
1621          * XXX update that as well.
1622          */
1623
1624         proc_fdlock(p);
1625         fdp = p->p_fd;
1626         if (fdp == (struct filedesc *)0) {
1627                 proc_fdunlock(p);
1628                 return(PROC_RETURNED);
1629         }
1630         fdp_cvp = fdp->fd_cdir;
1631         fdp_rvp = fdp->fd_rdir;
1632         proc_fdunlock(p);
1633
1634         if (fdp_cvp == olddp) {
1635                 vnode_ref(newdp);
1636                 tvp = fdp->fd_cdir;
1637                 fdp_cvp = newdp;
1638                 cdir_changed = 1;
1639                 vnode_rele(tvp);
1640         }
1641         if (fdp_rvp == olddp) {
1642                 vnode_ref(newdp);
1643                 tvp = fdp->fd_rdir;
1644                 fdp_rvp = newdp;
1645                 rdir_changed = 1;
1646                 vnode_rele(tvp);
1647         }
1648         if (cdir_changed || rdir_changed) {
1649                 proc_fdlock(p);
1650                 fdp->fd_cdir = fdp_cvp;
1651                 fdp->fd_rdir = fdp_rvp;
1652                 proc_fdunlock(p);
1653         }
1654         return(PROC_RETURNED);
1655 }
1656
1657
1658
1659 /*
1660  * Scan all active processes to see if any of them have a current
1661  * or root directory onto which the new filesystem has just been
1662  * mounted. If so, replace them with the new mount point.
1663  */
1664 static int
1665 checkdirs(vnode_t olddp, vfs_context_t ctx)
1666 {
1667         vnode_t newdp;
1668         vnode_t tvp;
1669         int err;
1670         struct cdirargs cdr;
1671
1672         if (olddp->v_usecount == 1)
1673                 return(0);
1674         err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1675
1676         if (err != 0) {
1677 #if DIAGNOSTIC
1678                 panic("mount: lost mount: error %d", err);
1679 #endif
1680                 return(err);
1681         }
1682
1683         cdr.olddp = olddp;
1684         cdr.newdp = newdp;
1685         /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1686         proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1687
1688         if (rootvnode == olddp) {
1689                 vnode_ref(newdp);
1690                 tvp = rootvnode;
1691                 rootvnode = newdp;
1692                 vnode_rele(tvp);
1693         }
1694
1695         vnode_put(newdp);
1696         return(0);
1697 }
1698
1699 /*
1700  * Unmount a file system.
1701  *
1702  * Note: unmount takes a path to the vnode mounted on as argument,
1703  * not special file (as before).
1704  */
1705 /* ARGSUSED */
1706 int
1707 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1708 {
1709         vnode_t vp;
1710         struct mount *mp;
1711         int error;
1712         struct nameidata nd;
1713         vfs_context_t ctx = vfs_context_current();
1714
1715         NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1716                 UIO_USERSPACE, uap->path, ctx);
1717         error = namei(&nd);
1718         if (error)
1719                 return (error);
1720         vp = nd.ni_vp;
1721         mp = vp->v_mount;
1722         nameidone(&nd);
1723
1724 #if CONFIG_MACF
1725         error = mac_mount_check_umount(ctx, mp);
1726         if (error != 0) {
1727                 vnode_put(vp);
1728                 return (error);
1729         }
1730 #endif
1731         /*
1732          * Must be the root of the filesystem
1733          */
1734         if ((vp->v_flag & VROOT) == 0) {
1735                 vnode_put(vp);
1736                 return (EINVAL);
1737         }
1738         mount_ref(mp, 0);
1739         vnode_put(vp);
1740         /* safedounmount consumes the mount ref */
1741         return (safedounmount(mp, uap->flags, ctx));
1742 }
1743
1744 int
1745 vfs_unmountbyfsid(fsid_t * fsid, int flags, vfs_context_t ctx)
1746 {
1747         mount_t mp;
1748
1749         mp = mount_list_lookupby_fsid(fsid, 0, 1);
1750         if (mp == (mount_t)0) {
1751                 return(ENOENT);
1752         }
1753         mount_ref(mp, 0);
1754         mount_iterdrop(mp);
1755         /* safedounmount consumes the mount ref */
1756         return(safedounmount(mp, flags, ctx));
1757 }
1758
1759
1760 /*
1761  * The mount struct comes with a mount ref which will be consumed.
1762  * Do the actual file system unmount, prevent some common foot shooting.
1763  */
1764 int
1765 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1766 {
1767         int error;
1768         proc_t p = vfs_context_proc(ctx);
1769
1770         /*
1771          * If the file system is not responding and MNT_NOBLOCK
1772          * is set and not a forced unmount then return EBUSY.
1773          */
1774         if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1775                 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1776                 error = EBUSY;
1777                 goto out;
1778         }
1779
1780         /*
1781          * Skip authorization if the mount is tagged as permissive and
1782          * this is not a forced-unmount attempt.
1783          */
1784         if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1785                 /*
1786                  * Only root, or the user that did the original mount is
1787                  * permitted to unmount this filesystem.
1788                  */
1789                 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1790                                 (error = suser(kauth_cred_get(), &p->p_acflag)))
1791                         goto out;
1792         }
1793         /*
1794          * Don't allow unmounting the root file system.
1795          */
1796         if (mp->mnt_flag & MNT_ROOTFS) {
1797                 error = EBUSY; /* the root is always busy */
1798                 goto out;
1799         }
1800
1801 #ifdef CONFIG_IMGSRC_ACCESS
1802         if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1803                 error = EBUSY;
1804                 goto out;
1805         }
1806 #endif /* CONFIG_IMGSRC_ACCESS */
1807
1808         return (dounmount(mp, flags, 1, ctx));
1809
1810 out:
1811         mount_drop(mp, 0);
1812         return(error);
1813 }
1814
1815 /*
1816  * Do the actual file system unmount.
1817  */
1818 int
1819 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1820 {
1821         vnode_t coveredvp = (vnode_t)0;
1822         int error;
1823         int needwakeup = 0;
1824         int forcedunmount = 0;
1825         int lflags = 0;
1826         struct vnode *devvp = NULLVP;
1827 #if CONFIG_TRIGGERS
1828         proc_t p = vfs_context_proc(ctx);
1829         int did_vflush = 0;
1830         int pflags_save = 0;
1831 #endif /* CONFIG_TRIGGERS */
1832
1833         mount_lock(mp);
1834
1835         /*
1836          * If already an unmount in progress just return EBUSY.
1837          * Even a forced unmount cannot override.
1838          */
1839         if (mp->mnt_lflag & MNT_LUNMOUNT) {
1840                 if (withref != 0)
1841                         mount_drop(mp, 1);
1842                 mount_unlock(mp);
1843                 return (EBUSY);
1844         }
1845
1846         if (flags & MNT_FORCE) {
1847                 forcedunmount = 1;
1848                 mp->mnt_lflag |= MNT_LFORCE;
1849         }
1850
1851 #if CONFIG_TRIGGERS
1852         if (flags & MNT_NOBLOCK && p != kernproc)
1853                 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1854 #endif
1855
1856         mp->mnt_kern_flag |= MNTK_UNMOUNT;
1857         mp->mnt_lflag |= MNT_LUNMOUNT;
1858         mp->mnt_flag &=~ MNT_ASYNC;
1859         /*
1860          * anyone currently in the fast path that
1861          * trips over the cached rootvp will be
1862          * dumped out and forced into the slow path
1863          * to regenerate a new cached value
1864          */
1865         mp->mnt_realrootvp = NULLVP;
1866         mount_unlock(mp);
1867
1868         if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
1869                 /*
1870                  * Force unmount any mounts in this filesystem.
1871                  * If any unmounts fail - just leave them dangling.
1872                  * Avoids recursion.
1873                  */
1874                 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
1875         }
1876
1877         /*
1878          * taking the name_cache_lock exclusively will
1879          * insure that everyone is out of the fast path who
1880          * might be trying to use a now stale copy of
1881          * vp->v_mountedhere->mnt_realrootvp
1882          * bumping mount_generation causes the cached values
1883          * to be invalidated
1884          */
1885         name_cache_lock();
1886         mount_generation++;
1887         name_cache_unlock();
1888
1889
1890         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1891         if (withref != 0)
1892                 mount_drop(mp, 0);
1893 #if CONFIG_FSE
1894         fsevent_unmount(mp);  /* has to come first! */
1895 #endif
1896         error = 0;
1897         if (forcedunmount == 0) {
1898                 ubc_umount(mp); /* release cached vnodes */
1899                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1900                         error = VFS_SYNC(mp, MNT_WAIT, ctx);
1901                         if (error) {
1902                                 mount_lock(mp);
1903                                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1904                                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1905                                 mp->mnt_lflag &= ~MNT_LFORCE;
1906                                 goto out;
1907                         }
1908                 }
1909         }
1910
1911 #if CONFIG_TRIGGERS
1912         vfs_nested_trigger_unmounts(mp, flags, ctx);
1913         did_vflush = 1;
1914 #endif
1915         if (forcedunmount)
1916                 lflags |= FORCECLOSE;
1917         error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
1918         if ((forcedunmount == 0) && error) {
1919                 mount_lock(mp);
1920                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1921                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1922                 mp->mnt_lflag &= ~MNT_LFORCE;
1923                 goto out;
1924         }
1925
1926         /* make sure there are no one in the mount iterations or lookup */
1927         mount_iterdrain(mp);
1928
1929         error = VFS_UNMOUNT(mp, flags, ctx);
1930         if (error) {
1931                 mount_iterreset(mp);
1932                 mount_lock(mp);
1933                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1934                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1935                 mp->mnt_lflag &= ~MNT_LFORCE;
1936                 goto out;
1937         }
1938
1939         /* increment the operations count */
1940         if (!error)
1941                 OSAddAtomic(1, &vfs_nummntops);
1942
1943         if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
1944                 /* hold an io reference and drop the usecount before close */
1945                 devvp = mp->mnt_devvp;
1946                 vnode_getalways(devvp);
1947                 vnode_rele(devvp);
1948                 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1949                        ctx);
1950                 vnode_clearmountedon(devvp);
1951                 vnode_put(devvp);
1952         }
1953         lck_rw_done(&mp->mnt_rwlock);
1954         mount_list_remove(mp);
1955         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1956
1957         /* mark the mount point hook in the vp but not drop the ref yet */
1958         if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
1959                 /*
1960                  * The covered vnode needs special handling. Trying to get an
1961                  * iocount must not block here as this may lead to deadlocks
1962                  * if the Filesystem to which the covered vnode belongs is
1963                  * undergoing forced unmounts. Since we hold a usecount, the
1964                  * vnode cannot be reused (it can, however, still be terminated)
1965                  */
1966                 vnode_getalways(coveredvp);
1967                 vnode_lock_spin(coveredvp);
1968
1969                 mp->mnt_crossref++;
1970                 coveredvp->v_mountedhere = (struct mount *)0;
1971                 CLR(coveredvp->v_flag, VMOUNT);
1972
1973                 vnode_unlock(coveredvp);
1974                 vnode_put(coveredvp);
1975         }
1976
1977         mount_list_lock();
1978         mp->mnt_vtable->vfc_refcount--;
1979         mount_list_unlock();
1980
1981         cache_purgevfs(mp);     /* remove cache entries for this file sys */
1982         vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
1983         mount_lock(mp);
1984         mp->mnt_lflag |= MNT_LDEAD;
1985
1986         if (mp->mnt_lflag & MNT_LWAIT) {
1987                 /*
1988                  * do the wakeup here
1989                  * in case we block in mount_refdrain
1990                  * which will drop the mount lock
1991                  * and allow anyone blocked in vfs_busy
1992                  * to wakeup and see the LDEAD state
1993                  */
1994                 mp->mnt_lflag &= ~MNT_LWAIT;
1995                 wakeup((caddr_t)mp);
1996         }
1997         mount_refdrain(mp);
1998 out:
1999         if (mp->mnt_lflag & MNT_LWAIT) {
2000                 mp->mnt_lflag &= ~MNT_LWAIT;
2001                 needwakeup = 1;
2002         }
2003
2004 #if CONFIG_TRIGGERS
2005         if (flags & MNT_NOBLOCK && p != kernproc) {
2006                 // Restore P_NOREMOTEHANG bit to its previous value
2007                 if ((pflags_save & P_NOREMOTEHANG) == 0)
2008                         OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2009         }
2010
2011         /*
2012          * Callback and context are set together under the mount lock, and
2013          * never cleared, so we're safe to examine them here, drop the lock,
2014          * and call out.
2015          */
2016         if (mp->mnt_triggercallback != NULL) {
2017                 mount_unlock(mp);
2018                 if (error == 0) {
2019                         mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2020                 } else if (did_vflush) {
2021                         mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2022                 }
2023         } else {
2024                 mount_unlock(mp);
2025         }
2026 #else
2027         mount_unlock(mp);
2028 #endif /* CONFIG_TRIGGERS */
2029
2030         lck_rw_done(&mp->mnt_rwlock);
2031
2032         if (needwakeup)
2033                 wakeup((caddr_t)mp);
2034
2035         if (!error) {
2036                 if ((coveredvp != NULLVP)) {
2037                         vnode_t pvp = NULLVP;
2038
2039                         /*
2040                          * The covered vnode needs special handling. Trying to
2041                          * get an iocount must not block here as this may lead
2042                          * to deadlocks if the Filesystem to which the covered
2043                          * vnode belongs is undergoing forced unmounts. Since we
2044                          * hold a usecount, the  vnode cannot be reused
2045                          * (it can, however, still be terminated).
2046                          */
2047                         vnode_getalways(coveredvp);
2048
2049                         mount_dropcrossref(mp, coveredvp, 0);
2050                         /*
2051                          * We'll _try_ to detect if this really needs to be
2052                          * done. The coveredvp can only be in termination (or
2053                          * terminated) if the coveredvp's mount point is in a
2054                          * forced unmount (or has been) since we still hold the
2055                          * ref.
2056                          */
2057                         if (!vnode_isrecycled(coveredvp)) {
2058                                 pvp = vnode_getparent(coveredvp);
2059 #if CONFIG_TRIGGERS
2060                                 if (coveredvp->v_resolve) {
2061                                         vnode_trigger_rearm(coveredvp, ctx);
2062                                 }
2063 #endif
2064                         }
2065
2066                         vnode_rele(coveredvp);
2067                         vnode_put(coveredvp);
2068                         coveredvp = NULLVP;
2069
2070                         if (pvp) {
2071                                 lock_vnode_and_post(pvp, NOTE_WRITE);
2072                                 vnode_put(pvp);
2073                         }
2074                 } else if (mp->mnt_flag & MNT_ROOTFS) {
2075                                 mount_lock_destroy(mp);
2076 #if CONFIG_MACF
2077                                 mac_mount_label_destroy(mp);
2078 #endif
2079                                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2080                 } else
2081                         panic("dounmount: no coveredvp");
2082         }
2083         return (error);
2084 }
2085
2086 /*
2087  * Unmount any mounts in this filesystem.
2088  */
2089 void
2090 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2091 {
2092         mount_t smp;
2093         fsid_t *fsids, fsid;
2094         int fsids_sz;
2095         int count = 0, i, m = 0;
2096         vnode_t vp;
2097
2098         mount_list_lock();
2099
2100         // Get an array to hold the submounts fsids.
2101         TAILQ_FOREACH(smp, &mountlist, mnt_list)
2102                 count++;
2103         fsids_sz = count * sizeof(fsid_t);
2104         MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2105         if (fsids == NULL) {
2106                 mount_list_unlock();
2107                 goto out;
2108         }
2109         fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
2110
2111         /*
2112          * Fill the array with submount fsids.
2113          * Since mounts are always added to the tail of the mount list, the
2114          * list is always in mount order.
2115          * For each mount check if the mounted-on vnode belongs to a
2116          * mount that's already added to our array of mounts to be unmounted.
2117          */
2118         for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2119                 vp = smp->mnt_vnodecovered;
2120                 if (vp == NULL)
2121                         continue;
2122                 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
2123                 for (i = 0; i <= m; i++) {
2124                         if (fsids[i].val[0] == fsid.val[0] &&
2125                             fsids[i].val[1] == fsid.val[1]) {
2126                                 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2127                                 break;
2128                         }
2129                 }
2130         }
2131         mount_list_unlock();
2132
2133         // Unmount the submounts in reverse order. Ignore errors.
2134         for (i = m; i > 0; i--) {
2135                 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2136                 if (smp) {
2137                         mount_ref(smp, 0);
2138                         mount_iterdrop(smp);
2139                         (void) dounmount(smp, flags, 1, ctx);
2140                 }
2141         }
2142 out:
2143         if (fsids)
2144                 FREE(fsids, M_TEMP);
2145 }
2146
2147 void
2148 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2149 {
2150         vnode_lock(dp);
2151         mp->mnt_crossref--;
2152
2153         if (mp->mnt_crossref < 0)
2154                 panic("mount cross refs -ve");
2155
2156         if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2157
2158                 if (need_put)
2159                         vnode_put_locked(dp);
2160                 vnode_unlock(dp);
2161
2162                 mount_lock_destroy(mp);
2163 #if CONFIG_MACF
2164                 mac_mount_label_destroy(mp);
2165 #endif
2166                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2167                 return;
2168         }
2169         if (need_put)
2170                 vnode_put_locked(dp);
2171         vnode_unlock(dp);
2172 }
2173
2174
2175 /*
2176  * Sync each mounted filesystem.
2177  */
#if DIAGNOSTIC
/* When non-zero, the sync paths call vfs_bufstats() after syncing. */
int syncprt = 0;
#endif

/* When non-zero, the sync paths call vm_countdirtypages() after syncing. */
int print_vmpage_stat = 0;

/* Sync time limit (sec) used by sync_internal(). */
int sync_timeout = 60;
2184
2185 static int
2186 sync_callback(mount_t mp, __unused void *arg)
2187 {
2188         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2189                 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2190
2191                 mp->mnt_flag &= ~MNT_ASYNC;
2192                 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2193                 if (asyncflag)
2194                         mp->mnt_flag |= MNT_ASYNC;
2195         }
2196
2197         return (VFS_RETURNED);
2198 }
2199
2200 /* ARGSUSED */
2201 int
2202 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2203 {
2204         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2205
2206         if (print_vmpage_stat) {
2207                 vm_countdirtypages();
2208         }
2209
2210 #if DIAGNOSTIC
2211         if (syncprt)
2212                 vfs_bufstats();
2213 #endif /* DIAGNOSTIC */
2214         return 0;
2215 }
2216
2217 static void
2218 sync_thread(void *arg, __unused wait_result_t wr)
2219 {
2220         int *timeout = (int *) arg;
2221
2222         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2223
2224         if (timeout)
2225                 wakeup((caddr_t) timeout);
2226         if (print_vmpage_stat) {
2227                 vm_countdirtypages();
2228         }
2229
2230 #if DIAGNOSTIC
2231         if (syncprt)
2232                 vfs_bufstats();
2233 #endif /* DIAGNOSTIC */
2234 }
2235
2236 /*
2237  * Sync in a separate thread so we can time out if it blocks.
2238  */
2239 static int
2240 sync_async(int timeout)
2241 {
2242         thread_t thd;
2243         int error;
2244         struct timespec ts = {timeout, 0};
2245
2246         lck_mtx_lock(sync_mtx_lck);
2247         if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
2248                 printf("sync_thread failed\n");
2249                 lck_mtx_unlock(sync_mtx_lck);
2250                 return (0);
2251         }
2252
2253         error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2254         if (error) {
2255                 printf("sync timed out: %d sec\n", timeout);
2256         }
2257         thread_deallocate(thd);
2258
2259         return (0);
2260 }
2261
2262 /*
2263  * An in-kernel sync for power management to call.
2264  */
2265 __private_extern__ int
2266 sync_internal(void)
2267 {
2268         (void) sync_async(sync_timeout);
2269
2270         return 0;
2271 } /* end of sync_internal call */
2272
2273 /*
2274  * Change filesystem quotas.
2275  */
2276 #if QUOTA
2277 int
2278 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2279 {
2280         struct mount *mp;
2281         int error, quota_cmd, quota_status;
2282         caddr_t datap;
2283         size_t fnamelen;
2284         struct nameidata nd;
2285         vfs_context_t ctx = vfs_context_current();
2286         struct dqblk my_dqblk;
2287
2288         AUDIT_ARG(uid, uap->uid);
2289         AUDIT_ARG(cmd, uap->cmd);
2290         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2291                uap->path, ctx);
2292         error = namei(&nd);
2293         if (error)
2294                 return (error);
2295         mp = nd.ni_vp->v_mount;
2296         vnode_put(nd.ni_vp);
2297         nameidone(&nd);
2298
2299         /* copyin any data we will need for downstream code */
2300         quota_cmd = uap->cmd >> SUBCMDSHIFT;
2301
2302         switch (quota_cmd) {
2303         case Q_QUOTAON:
2304                 /* uap->arg specifies a file from which to take the quotas */
2305                 fnamelen = MAXPATHLEN;
2306                 datap = kalloc(MAXPATHLEN);
2307                 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2308                 break;
2309         case Q_GETQUOTA:
2310                 /* uap->arg is a pointer to a dqblk structure. */
2311                 datap = (caddr_t) &my_dqblk;
2312                 break;
2313         case Q_SETQUOTA:
2314         case Q_SETUSE:
2315                 /* uap->arg is a pointer to a dqblk structure. */
2316                 datap = (caddr_t) &my_dqblk;
2317                 if (proc_is64bit(p)) {
2318                         struct user_dqblk       my_dqblk64;
2319                         error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2320                         if (error == 0) {
2321                                 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2322                         }
2323                 }
2324                 else {
2325                         error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2326                 }
2327                 break;
2328         case Q_QUOTASTAT:
2329                 /* uap->arg is a pointer to an integer */
2330                 datap = (caddr_t) &quota_status;
2331                 break;
2332         default:
2333                 datap = NULL;
2334                 break;
2335         } /* switch */
2336
2337         if (error == 0) {
2338                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2339         }
2340
2341         switch (quota_cmd) {
2342         case Q_QUOTAON:
2343                 if (datap != NULL)
2344                         kfree(datap, MAXPATHLEN);
2345                 break;
2346         case Q_GETQUOTA:
2347                 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2348                 if (error == 0) {
2349                         if (proc_is64bit(p)) {
2350                                 struct user_dqblk       my_dqblk64 = {.dqb_bhardlimit = 0};
2351                                 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2352                                 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2353                         }
2354                         else {
2355                                 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2356                         }
2357                 }
2358                 break;
2359         case Q_QUOTASTAT:
2360                 /* uap->arg is a pointer to an integer */
2361                 if (error == 0) {
2362                         error = copyout(datap, uap->arg, sizeof(quota_status));
2363                 }
2364                 break;
2365         default:
2366                 break;
2367         } /* switch */
2368
2369         return (error);
2370 }
2371 #else
2372 int
2373 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2374 {
2375         return (EOPNOTSUPP);
2376 }
2377 #endif /* QUOTA */
2378
2379 /*
2380  * Get filesystem statistics.
2381  *
2382  * Returns:     0                       Success
2383  *      namei:???
2384  *      vfs_update_vfsstat:???
2385  *      munge_statfs:EFAULT
2386  */
2387 /* ARGSUSED */
2388 int
2389 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2390 {
2391         struct mount *mp;
2392         struct vfsstatfs *sp;
2393         int error;
2394         struct nameidata nd;
2395         vfs_context_t ctx = vfs_context_current();
2396         vnode_t vp;
2397
2398         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2399                 UIO_USERSPACE, uap->path, ctx);
2400         error = namei(&nd);
2401         if (error)
2402                 return (error);
2403         vp = nd.ni_vp;
2404         mp = vp->v_mount;
2405         sp = &mp->mnt_vfsstat;
2406         nameidone(&nd);
2407
2408         error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2409         if (error != 0) {
2410                 vnode_put(vp);
2411                 return (error);
2412         }
2413
2414         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2415         vnode_put(vp);
2416         return (error);
2417 }
2418
2419 /*
2420  * Get filesystem statistics.
2421  */
2422 /* ARGSUSED */
2423 int
2424 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2425 {
2426         vnode_t vp;
2427         struct mount *mp;
2428         struct vfsstatfs *sp;
2429         int error;
2430
2431         AUDIT_ARG(fd, uap->fd);
2432
2433         if ( (error = file_vnode(uap->fd, &vp)) )
2434                 return (error);
2435
2436         error = vnode_getwithref(vp);
2437         if (error) {
2438                 file_drop(uap->fd);
2439                 return (error);
2440         }
2441
2442         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2443
2444         mp = vp->v_mount;
2445         if (!mp) {
2446                 error = EBADF;
2447                 goto out;
2448         }
2449         sp = &mp->mnt_vfsstat;
2450         if ((error = vfs_update_vfsstat(mp,vfs_context_current(),VFS_USER_EVENT)) != 0) {
2451                 goto out;
2452         }
2453
2454         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2455
2456 out:
2457         file_drop(uap->fd);
2458         vnode_put(vp);
2459
2460         return (error);
2461 }
2462
2463 /*
2464  * Common routine to handle copying of statfs64 data to user space
2465  */
2466 static int
2467 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2468 {
2469         int error;
2470         struct statfs64 sfs;
2471
2472         bzero(&sfs, sizeof(sfs));
2473
2474         sfs.f_bsize = sfsp->f_bsize;
2475         sfs.f_iosize = (int32_t)sfsp->f_iosize;
2476         sfs.f_blocks = sfsp->f_blocks;
2477         sfs.f_bfree = sfsp->f_bfree;
2478         sfs.f_bavail = sfsp->f_bavail;
2479         sfs.f_files = sfsp->f_files;
2480         sfs.f_ffree = sfsp->f_ffree;
2481         sfs.f_fsid = sfsp->f_fsid;
2482         sfs.f_owner = sfsp->f_owner;
2483         sfs.f_type = mp->mnt_vtable->vfc_typenum;
2484         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2485         sfs.f_fssubtype = sfsp->f_fssubtype;
2486         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2487                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2488         } else {
2489                 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2490         }
2491         strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2492         strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2493
2494         error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2495
2496         return(error);
2497 }
2498
2499 /*
2500  * Get file system statistics in 64-bit mode
2501  */
2502 int
2503 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2504 {
2505         struct mount *mp;
2506         struct vfsstatfs *sp;
2507         int error;
2508         struct nameidata nd;
2509         vfs_context_t ctxp = vfs_context_current();
2510         vnode_t vp;
2511
2512         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2513                 UIO_USERSPACE, uap->path, ctxp);
2514         error = namei(&nd);
2515         if (error)
2516                 return (error);
2517         vp = nd.ni_vp;
2518         mp = vp->v_mount;
2519         sp = &mp->mnt_vfsstat;
2520         nameidone(&nd);
2521
2522         error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2523         if (error != 0) {
2524                 vnode_put(vp);
2525                 return (error);
2526         }
2527
2528         error = statfs64_common(mp, sp, uap->buf);
2529         vnode_put(vp);
2530
2531         return (error);
2532 }
2533
2534 /*
2535  * Get file system statistics in 64-bit mode
2536  */
2537 int
2538 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2539 {
2540         struct vnode *vp;
2541         struct mount *mp;
2542         struct vfsstatfs *sp;
2543         int error;
2544
2545         AUDIT_ARG(fd, uap->fd);
2546
2547         if ( (error = file_vnode(uap->fd, &vp)) )
2548                 return (error);
2549
2550         error = vnode_getwithref(vp);
2551         if (error) {
2552                 file_drop(uap->fd);
2553                 return (error);
2554         }
2555
2556         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2557
2558         mp = vp->v_mount;
2559         if (!mp) {
2560                 error = EBADF;
2561                 goto out;
2562         }
2563         sp = &mp->mnt_vfsstat;
2564         if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2565                 goto out;
2566         }
2567
2568         error = statfs64_common(mp, sp, uap->buf);
2569
2570 out:
2571         file_drop(uap->fd);
2572         vnode_put(vp);
2573
2574         return (error);
2575 }
2576
2577 struct getfsstat_struct {
2578         user_addr_t     sfsp;
2579         user_addr_t     *mp;
2580         int             count;
2581         int             maxcount;
2582         int             flags;
2583         int             error;
2584 };
2585
2586
2587 static int
2588 getfsstat_callback(mount_t mp, void * arg)
2589 {
2590
2591         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2592         struct vfsstatfs *sp;
2593         int error, my_size;
2594         vfs_context_t ctx = vfs_context_current();
2595
2596         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2597                 sp = &mp->mnt_vfsstat;
2598                 /*
2599                  * If MNT_NOWAIT is specified, do not refresh the
2600                  * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2601                  */
2602                 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2603                         (error = vfs_update_vfsstat(mp, ctx,
2604                             VFS_USER_EVENT))) {
2605                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2606                         return(VFS_RETURNED);
2607                 }
2608
2609                 /*
2610                  * Need to handle LP64 version of struct statfs
2611                  */
2612                 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2613                 if (error) {
2614                         fstp->error = error;
2615                         return(VFS_RETURNED_DONE);
2616                 }
2617                 fstp->sfsp += my_size;
2618
2619                 if (fstp->mp) {
2620 #if CONFIG_MACF
2621                         error = mac_mount_label_get(mp, *fstp->mp);
2622                         if (error) {
2623                                 fstp->error = error;
2624                                 return(VFS_RETURNED_DONE);
2625                         }
2626 #endif
2627                         fstp->mp++;
2628                 }
2629         }
2630         fstp->count++;
2631         return(VFS_RETURNED);
2632 }
2633
2634 /*
2635  * Get statistics on all filesystems.
2636  */
2637 int
2638 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2639 {
2640         struct __mac_getfsstat_args muap;
2641
2642         muap.buf = uap->buf;
2643         muap.bufsize = uap->bufsize;
2644         muap.mac = USER_ADDR_NULL;
2645         muap.macsize = 0;
2646         muap.flags = uap->flags;
2647
2648         return (__mac_getfsstat(p, &muap, retval));
2649 }
2650
2651 /*
2652  * __mac_getfsstat: Get MAC-related file system statistics
2653  *
2654  * Parameters:    p                        (ignored)
2655  *                uap                      User argument descriptor (see below)
2656  *                retval                   Count of file system statistics (N stats)
2657  *
2658  * Indirect:      uap->bufsize             Buffer size
2659  *                uap->macsize             MAC info size
2660  *                uap->buf                 Buffer where information will be returned
2661  *                uap->mac                 MAC info
2662  *                uap->flags               File system flags
2663  *
2664  *
2665  * Returns:        0                       Success
2666  *                !0                       Not success
2667  *
2668  */
2669 int
2670 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2671 {
2672         user_addr_t sfsp;
2673         user_addr_t *mp;
2674         size_t count, maxcount, bufsize, macsize;
2675         struct getfsstat_struct fst;
2676
2677         bufsize = (size_t) uap->bufsize;
2678         macsize = (size_t) uap->macsize;
2679
2680         if (IS_64BIT_PROCESS(p)) {
2681                 maxcount = bufsize / sizeof(struct user64_statfs);
2682         }
2683         else {
2684                 maxcount = bufsize / sizeof(struct user32_statfs);
2685         }
2686         sfsp = uap->buf;
2687         count = 0;
2688
2689         mp = NULL;
2690
2691 #if CONFIG_MACF
2692         if (uap->mac != USER_ADDR_NULL) {
2693                 u_int32_t *mp0;
2694                 int error;
2695                 unsigned int i;
2696
2697                 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2698                 if (count != maxcount)
2699                         return (EINVAL);
2700
2701                 /* Copy in the array */
2702                 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2703                 if (mp0 == NULL) {
2704                         return (ENOMEM);
2705                 }
2706
2707                 error = copyin(uap->mac, mp0, macsize);
2708                 if (error) {
2709                         FREE(mp0, M_MACTEMP);
2710                         return (error);
2711                 }
2712
2713                 /* Normalize to an array of user_addr_t */
2714                 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2715                 if (mp == NULL) {
2716                         FREE(mp0, M_MACTEMP);
2717                         return (ENOMEM);
2718                 }
2719
2720                 for (i = 0; i < count; i++) {
2721                         if (IS_64BIT_PROCESS(p))
2722                                 mp[i] = ((user_addr_t *)mp0)[i];
2723                         else
2724                                 mp[i] = (user_addr_t)mp0[i];
2725                 }
2726                 FREE(mp0, M_MACTEMP);
2727         }
2728 #endif
2729
2730
2731         fst.sfsp = sfsp;
2732         fst.mp = mp;
2733         fst.flags = uap->flags;
2734         fst.count = 0;
2735         fst.error = 0;
2736         fst.maxcount = maxcount;
2737
2738
2739         vfs_iterate(0, getfsstat_callback, &fst);
2740
2741         if (mp)
2742                 FREE(mp, M_MACTEMP);
2743
2744         if (fst.error ) {
2745                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2746                 return(fst.error);
2747         }
2748
2749         if (fst.sfsp && fst.count > fst.maxcount)
2750                 *retval = fst.maxcount;
2751         else
2752                 *retval = fst.count;
2753         return (0);
2754 }
2755
2756 static int
2757 getfsstat64_callback(mount_t mp, void * arg)
2758 {
2759         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2760         struct vfsstatfs *sp;
2761         int error;
2762
2763         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2764                 sp = &mp->mnt_vfsstat;
2765                 /*
2766                  * If MNT_NOWAIT is specified, do not refresh the fsstat
2767                  * cache. MNT_WAIT overrides MNT_NOWAIT.
2768                  *
2769                  * We treat MNT_DWAIT as MNT_WAIT for all instances of
2770                  * getfsstat, since the constants are out of the same
2771                  * namespace.
2772                  */
2773                 if (((fstp->flags & MNT_NOWAIT) == 0 ||
2774                      (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2775                     (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2776                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2777                         return(VFS_RETURNED);
2778                 }
2779
2780                 error = statfs64_common(mp, sp, fstp->sfsp);
2781                 if (error) {
2782                         fstp->error = error;
2783                         return(VFS_RETURNED_DONE);
2784                 }
2785                 fstp->sfsp += sizeof(struct statfs64);
2786         }
2787         fstp->count++;
2788         return(VFS_RETURNED);
2789 }
2790
2791 /*
2792  * Get statistics on all file systems in 64 bit mode.
2793  */
2794 int
2795 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2796 {
2797         user_addr_t sfsp;
2798         int count, maxcount;
2799         struct getfsstat_struct fst;
2800
2801         maxcount = uap->bufsize / sizeof(struct statfs64);
2802
2803         sfsp = uap->buf;
2804         count = 0;
2805
2806         fst.sfsp = sfsp;
2807         fst.flags = uap->flags;
2808         fst.count = 0;
2809         fst.error = 0;
2810         fst.maxcount = maxcount;
2811
2812         vfs_iterate(0, getfsstat64_callback, &fst);
2813
2814         if (fst.error ) {
2815                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2816                 return(fst.error);
2817         }
2818
2819         if (fst.sfsp && fst.count > fst.maxcount)
2820                 *retval = fst.maxcount;
2821         else
2822                 *retval = fst.count;
2823
2824         return (0);
2825 }
2826
2827 /*
2828  * gets the associated vnode with the file descriptor passed.
2829  * as input
2830  *
2831  * INPUT
2832  * ctx - vfs context of caller
2833  * fd - file descriptor for which vnode is required.
2834  * vpp - Pointer to pointer to vnode to be returned.
2835  *
2836  * The vnode is returned with an iocount so any vnode obtained
2837  * by this call needs a vnode_put
2838  *
2839  */
2840 static int
2841 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2842 {
2843         int error;
2844         vnode_t vp;
2845         struct fileproc *fp;
2846         proc_t p = vfs_context_proc(ctx);
2847
2848         *vpp =  NULLVP;
2849
2850         error = fp_getfvp(p, fd, &fp, &vp);
2851         if (error)
2852                 return (error);
2853
2854         error = vnode_getwithref(vp);
2855         if (error) {
2856                 (void)fp_drop(p, fd, fp, 0);
2857                 return (error);
2858         }
2859
2860         (void)fp_drop(p, fd, fp, 0);
2861         *vpp = vp;
2862         return (error);
2863 }
2864
2865 /*
2866  * Wrapper function around namei to start lookup from a directory
2867  * specified by a file descriptor ni_dirfd.
2868  *
2869  * In addition to all the errors returned by namei, this call can
2870  * return ENOTDIR if the file descriptor does not refer to a directory.
2871  * and EBADF if the file descriptor is not valid.
2872  */
2873 int
2874 nameiat(struct nameidata *ndp, int dirfd)
2875 {
2876         if ((dirfd != AT_FDCWD) &&
2877             !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
2878             !(ndp->ni_cnd.cn_flags & USEDVP)) {
2879                 int error = 0;
2880                 char c;
2881
2882                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
2883                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
2884                         if (error)
2885                                 return (error);
2886                 } else {
2887                         c = *((char *)(ndp->ni_dirp));
2888                 }
2889
2890                 if (c != '/') {
2891                         vnode_t dvp_at;
2892
2893                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
2894                             &dvp_at);
2895                         if (error)
2896                                 return (error);
2897
2898                         if (vnode_vtype(dvp_at) != VDIR) {
2899                                 vnode_put(dvp_at);
2900                                 return (ENOTDIR);
2901                         }
2902
2903                         ndp->ni_dvp = dvp_at;
2904                         ndp->ni_cnd.cn_flags |= USEDVP;
2905                         error = namei(ndp);
2906                         ndp->ni_cnd.cn_flags &= ~USEDVP;
2907                         vnode_put(dvp_at);
2908                         return (error);
2909                 }
2910         }
2911
2912         return (namei(ndp));
2913 }
2914
2915 /*
2916  * Change current working directory to a given file descriptor.
2917  */
2918 /* ARGSUSED */
2919 static int
2920 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
2921 {
2922         struct filedesc *fdp = p->p_fd;
2923         vnode_t vp;
2924         vnode_t tdp;
2925         vnode_t tvp;
2926         struct mount *mp;
2927         int error;
2928         vfs_context_t ctx = vfs_context_current();
2929
2930         AUDIT_ARG(fd, uap->fd);
2931         if (per_thread && uap->fd == -1) {
2932                 /*
2933                  * Switching back from per-thread to per process CWD; verify we
2934                  * in fact have one before proceeding.  The only success case
2935                  * for this code path is to return 0 preemptively after zapping
2936                  * the thread structure contents.
2937                  */
2938                 thread_t th = vfs_context_thread(ctx);
2939                 if (th) {
2940                         uthread_t uth = get_bsdthread_info(th);
2941                         tvp = uth->uu_cdir;
2942                         uth->uu_cdir = NULLVP;
2943                         if (tvp != NULLVP) {
2944                                 vnode_rele(tvp);
2945                                 return (0);
2946                         }
2947                 }
2948                 return (EBADF);
2949         }
2950
2951         if ( (error = file_vnode(uap->fd, &vp)) )
2952                 return(error);
2953         if ( (error = vnode_getwithref(vp)) ) {
2954                 file_drop(uap->fd);
2955                 return(error);
2956         }
2957
2958         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
2959
2960         if (vp->v_type != VDIR) {
2961                 error = ENOTDIR;
2962                 goto out;
2963         }
2964
2965 #if CONFIG_MACF
2966         error = mac_vnode_check_chdir(ctx, vp);
2967         if (error)
2968                 goto out;
2969 #endif
2970         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
2971         if (error)
2972                 goto out;
2973
2974         while (!error && (mp = vp->v_mountedhere) != NULL) {
2975                 if (vfs_busy(mp, LK_NOWAIT)) {
2976                         error = EACCES;
2977                         goto out;
2978                 }
2979                 error = VFS_ROOT(mp, &tdp, ctx);
2980                 vfs_unbusy(mp);
2981                 if (error)
2982                         break;
2983                 vnode_put(vp);
2984                 vp = tdp;
2985         }
2986         if (error)
2987                 goto out;
2988         if ( (error = vnode_ref(vp)) )
2989                 goto out;
2990         vnode_put(vp);
2991
2992         if (per_thread) {
2993                 thread_t th = vfs_context_thread(ctx);
2994                 if (th) {
2995                         uthread_t uth = get_bsdthread_info(th);
2996                         tvp = uth->uu_cdir;
2997                         uth->uu_cdir = vp;
2998                         OSBitOrAtomic(P_THCWD, &p->p_flag);
2999                 } else {
3000                         vnode_rele(vp);
3001                         return (ENOENT);
3002                 }
3003         } else {
3004                 proc_fdlock(p);
3005                 tvp = fdp->fd_cdir;
3006                 fdp->fd_cdir = vp;
3007                 proc_fdunlock(p);
3008         }
3009
3010         if (tvp)
3011                 vnode_rele(tvp);
3012         file_drop(uap->fd);
3013
3014         return (0);
3015 out:
3016         vnode_put(vp);
3017         file_drop(uap->fd);
3018
3019         return(error);
3020 }
3021
3022 int
3023 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3024 {
3025         return common_fchdir(p, uap, 0);
3026 }
3027
3028 int
3029 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3030 {
3031         return common_fchdir(p, (void *)uap, 1);
3032 }
3033
3034 /*
3035  * Change current working directory (".").
3036  *
3037  * Returns:     0                       Success
3038  *      change_dir:ENOTDIR
3039  *      change_dir:???
3040  *      vnode_ref:ENOENT                No such file or directory
3041  */
3042 /* ARGSUSED */
3043 static int
3044 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3045 {
3046         struct filedesc *fdp = p->p_fd;
3047         int error;
3048         struct nameidata nd;
3049         vnode_t tvp;
3050         vfs_context_t ctx = vfs_context_current();
3051
3052         NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3053                 UIO_USERSPACE, uap->path, ctx);
3054         error = change_dir(&nd, ctx);
3055         if (error)
3056                 return (error);
3057         if ( (error = vnode_ref(nd.ni_vp)) ) {
3058                 vnode_put(nd.ni_vp);
3059                 return (error);
3060         }
3061         /*
3062          * drop the iocount we picked up in change_dir
3063          */
3064         vnode_put(nd.ni_vp);
3065
3066         if (per_thread) {
3067                 thread_t th = vfs_context_thread(ctx);
3068                 if (th) {
3069                         uthread_t uth = get_bsdthread_info(th);
3070                         tvp = uth->uu_cdir;
3071                         uth->uu_cdir = nd.ni_vp;
3072                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3073                 } else {
3074                         vnode_rele(nd.ni_vp);
3075                         return (ENOENT);
3076                 }
3077         } else {
3078                 proc_fdlock(p);
3079                 tvp = fdp->fd_cdir;
3080                 fdp->fd_cdir = nd.ni_vp;
3081                 proc_fdunlock(p);
3082         }
3083
3084         if (tvp)
3085                 vnode_rele(tvp);
3086
3087         return (0);
3088 }
3089
3090
3091 /*
3092  * chdir
3093  *
3094  * Change current working directory (".") for the entire process
3095  *
3096  * Parameters:  p       Process requesting the call
3097  *              uap     User argument descriptor (see below)
3098  *              retval  (ignored)
3099  *
3100  * Indirect parameters: uap->path       Directory path
3101  *
3102  * Returns:     0                       Success
3103  *              common_chdir: ENOTDIR
3104  *              common_chdir: ENOENT    No such file or directory
3105  *              common_chdir: ???
3106  *
3107  */
3108 int
3109 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3110 {
3111         return common_chdir(p, (void *)uap, 0);
3112 }
3113
3114 /*
3115  * __pthread_chdir
3116  *
3117  * Change current working directory (".") for a single thread
3118  *
3119  * Parameters:  p       Process requesting the call
3120  *              uap     User argument descriptor (see below)
3121  *              retval  (ignored)
3122  *
3123  * Indirect parameters: uap->path       Directory path
3124  *
3125  * Returns:     0                       Success
3126  *              common_chdir: ENOTDIR
3127  *              common_chdir: ENOENT    No such file or directory
3128  *              common_chdir: ???
3129  *
3130  */
3131 int
3132 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3133 {
3134         return common_chdir(p, (void *)uap, 1);
3135 }
3136
3137
3138 /*
3139  * Change notion of root (``/'') directory.
3140  */
3141 /* ARGSUSED */
3142 int
3143 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3144 {
3145         struct filedesc *fdp = p->p_fd;
3146         int error;
3147         struct nameidata nd;
3148         vnode_t tvp;
3149         vfs_context_t ctx = vfs_context_current();
3150
3151         if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3152                 return (error);
3153
3154         NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3155                 UIO_USERSPACE, uap->path, ctx);
3156         error = change_dir(&nd, ctx);
3157         if (error)
3158                 return (error);
3159
3160 #if CONFIG_MACF
3161         error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3162             &nd.ni_cnd);
3163         if (error) {
3164                 vnode_put(nd.ni_vp);
3165                 return (error);
3166         }
3167 #endif
3168
3169         if ( (error = vnode_ref(nd.ni_vp)) ) {
3170                 vnode_put(nd.ni_vp);
3171                 return (error);
3172         }
3173         vnode_put(nd.ni_vp);
3174
3175         proc_fdlock(p);
3176         tvp = fdp->fd_rdir;
3177         fdp->fd_rdir = nd.ni_vp;
3178         fdp->fd_flags |= FD_CHROOT;
3179         proc_fdunlock(p);
3180
3181         if (tvp != NULL)
3182                 vnode_rele(tvp);
3183
3184         return (0);
3185 }
3186
3187 /*
3188  * Common routine for chroot and chdir.
3189  *
3190  * Returns:     0                       Success
3191  *              ENOTDIR                 Not a directory
3192  *              namei:???               [anything namei can return]
3193  *              vnode_authorize:???     [anything vnode_authorize can return]
3194  */
3195 static int
3196 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3197 {
3198         vnode_t vp;
3199         int error;
3200
3201         if ((error = namei(ndp)))
3202                 return (error);
3203         nameidone(ndp);
3204         vp = ndp->ni_vp;
3205
3206         if (vp->v_type != VDIR) {
3207                 vnode_put(vp);
3208                 return (ENOTDIR);
3209         }
3210
3211 #if CONFIG_MACF
3212         error = mac_vnode_check_chdir(ctx, vp);
3213         if (error) {
3214                 vnode_put(vp);
3215                 return (error);
3216         }
3217 #endif
3218
3219         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3220         if (error) {
3221                 vnode_put(vp);
3222                 return (error);
3223         }
3224
3225         return (error);
3226 }
3227
3228 /*
3229  * Free the vnode data (for directories) associated with the file glob.
3230  */
3231 struct fd_vn_data *
3232 fg_vn_data_alloc(void)
3233 {
3234         struct fd_vn_data *fvdata;
3235
3236         /* Allocate per fd vnode data */
3237         MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3238                M_FD_VN_DATA, M_WAITOK | M_ZERO);
3239         lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3240         return fvdata;
3241 }
3242
3243 /*
3244  * Free the vnode data (for directories) associated with the file glob.
3245  */
3246 void
3247 fg_vn_data_free(void *fgvndata)
3248 {
3249         struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3250
3251         if (fvdata->fv_buf)
3252                 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3253         lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3254         FREE(fvdata, M_FD_VN_DATA);
3255 }
3256
3257 /*
3258  * Check permissions, allocate an open file structure,
3259  * and call the device open routine if any.
3260  *
3261  * Returns:     0                       Success
3262  *              EINVAL
3263  *              EINTR
3264  *      falloc:ENFILE
3265  *      falloc:EMFILE
3266  *      falloc:ENOMEM
3267  *      vn_open_auth:???
3268  *      dupfdopen:???
3269  *      VNOP_ADVLOCK:???
3270  *      vnode_setsize:???
3271  *
3272  * XXX Need to implement uid, gid
3273  */
3274 int
3275 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3276     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3277     int32_t *retval)
3278 {
3279         proc_t p = vfs_context_proc(ctx);
3280         uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3281         struct fileproc *fp;
3282         vnode_t vp;
3283         int flags, oflags;
3284         int type, indx, error;
3285         struct flock lf;
3286         int no_controlling_tty = 0;
3287         int deny_controlling_tty = 0;
3288         struct session *sessp = SESSION_NULL;
3289
3290         oflags = uflags;
3291
3292         if ((oflags & O_ACCMODE) == O_ACCMODE)
3293                 return(EINVAL);
3294         flags = FFLAGS(uflags);
3295
3296         AUDIT_ARG(fflags, oflags);
3297         AUDIT_ARG(mode, vap->va_mode);
3298
3299         if ((error = falloc_withalloc(p,
3300             &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3301                 return (error);
3302         }
3303         uu->uu_dupfd = -indx - 1;
3304
3305         if (!(p->p_flag & P_CONTROLT)) {
3306                 sessp = proc_session(p);
3307                 no_controlling_tty = 1;
3308                 /*
3309                  * If conditions would warrant getting a controlling tty if
3310                  * the device being opened is a tty (see ttyopen in tty.c),
3311                  * but the open flags deny it, set a flag in the session to
3312                  * prevent it.
3313                  */
3314                 if (SESS_LEADER(p, sessp) &&
3315                     sessp->s_ttyvp == NULL &&
3316                     (flags & O_NOCTTY)) {
3317                         session_lock(sessp);
3318                         sessp->s_flags |= S_NOCTTY;
3319                         session_unlock(sessp);
3320                         deny_controlling_tty = 1;
3321                 }
3322         }
3323
3324         if ((error = vn_open_auth(ndp, &flags, vap))) {
3325                 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){        /* XXX from fdopen */
3326                         if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3327                                 fp_drop(p, indx, NULL, 0);
3328                                 *retval = indx;
3329                                 if (deny_controlling_tty) {
3330                                         session_lock(sessp);
3331                                         sessp->s_flags &= ~S_NOCTTY;
3332                                         session_unlock(sessp);
3333                                 }
3334                                 if (sessp != SESSION_NULL)
3335                                         session_rele(sessp);
3336                                 return (0);
3337                         }
3338                 }
3339                 if (error == ERESTART)
3340                         error = EINTR;
3341                 fp_free(p, indx, fp);
3342
3343                 if (deny_controlling_tty) {
3344                         session_lock(sessp);
3345                         sessp->s_flags &= ~S_NOCTTY;
3346                         session_unlock(sessp);
3347                 }
3348                 if (sessp != SESSION_NULL)
3349                         session_rele(sessp);
3350                 return (error);
3351         }
3352         uu->uu_dupfd = 0;
3353         vp = ndp->ni_vp;
3354
3355         fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY);
3356         fp->f_fglob->fg_ops = &vnops;
3357         fp->f_fglob->fg_data = (caddr_t)vp;
3358
3359 #if CONFIG_PROTECT
3360         if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) {
3361                 if (vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) {
3362                         fp->f_fglob->fg_flag |= FENCRYPTED;
3363                 }
3364         }
3365 #endif
3366
3367         if (flags & (O_EXLOCK | O_SHLOCK)) {
3368                 lf.l_whence = SEEK_SET;
3369                 lf.l_start = 0;
3370                 lf.l_len = 0;
3371                 if (flags & O_EXLOCK)
3372                         lf.l_type = F_WRLCK;
3373                 else
3374                         lf.l_type = F_RDLCK;
3375                 type = F_FLOCK;
3376                 if ((flags & FNONBLOCK) == 0)
3377                         type |= F_WAIT;
3378 #if CONFIG_MACF
3379                 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3380                     F_SETLK, &lf);
3381                 if (error)
3382                         goto bad;
3383 #endif
3384                 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3385                         goto bad;
3386                 fp->f_fglob->fg_flag |= FHASLOCK;
3387         }
3388
3389         /* try to truncate by setting the size attribute */
3390         if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3391                 goto bad;
3392
3393         /*
3394          * If the open flags denied the acquisition of a controlling tty,
3395          * clear the flag in the session structure that prevented the lower
3396          * level code from assigning one.
3397          */
3398         if (deny_controlling_tty) {
3399                 session_lock(sessp);
3400                 sessp->s_flags &= ~S_NOCTTY;
3401                 session_unlock(sessp);
3402         }
3403
3404         /*
3405          * If a controlling tty was set by the tty line discipline, then we
3406          * want to set the vp of the tty into the session structure.  We have
3407          * a race here because we can't get to the vp for the tp in ttyopen,
3408          * because it's not passed as a parameter in the open path.
3409          */
3410         if (no_controlling_tty && (p->p_flag & P_CONTROLT)) {
3411                 vnode_t ttyvp;
3412
3413                 session_lock(sessp);
3414                 ttyvp = sessp->s_ttyvp;
3415                 sessp->s_ttyvp = vp;
3416                 sessp->s_ttyvid = vnode_vid(vp);
3417                 session_unlock(sessp);
3418         }
3419
3420         /*
3421          * For directories we hold some additional information in the fd.
3422          */
3423         if (vnode_vtype(vp) == VDIR) {
3424                 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3425         } else {
3426                 fp->f_fglob->fg_vn_data = NULL;
3427         }
3428
3429         vnode_put(vp);
3430
3431         proc_fdlock(p);
3432         if (flags & O_CLOEXEC)
3433                 *fdflags(p, indx) |= UF_EXCLOSE;
3434         if (flags & O_CLOFORK)
3435                 *fdflags(p, indx) |= UF_FORKCLOSE;
3436         procfdtbl_releasefd(p, indx, NULL);
3437         fp_drop(p, indx, fp, 1);
3438         proc_fdunlock(p);
3439
3440         *retval = indx;
3441
3442         if (sessp != SESSION_NULL)
3443                 session_rele(sessp);
3444         return (0);
3445 bad:
3446         if (deny_controlling_tty) {
3447                 session_lock(sessp);
3448                 sessp->s_flags &= ~S_NOCTTY;
3449                 session_unlock(sessp);
3450         }
3451         if (sessp != SESSION_NULL)
3452                 session_rele(sessp);
3453
3454         struct vfs_context context = *vfs_context_current();
3455         context.vc_ucred = fp->f_fglob->fg_cred;
3456
3457         if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3458             (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3459                 lf.l_whence = SEEK_SET;
3460                 lf.l_start = 0;
3461                 lf.l_len = 0;
3462                 lf.l_type = F_UNLCK;
3463
3464                 (void)VNOP_ADVLOCK(
3465                         vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3466         }
3467
3468         vn_close(vp, fp->f_fglob->fg_flag, &context);
3469         vnode_put(vp);
3470         fp_free(p, indx, fp);
3471
3472         return (error);
3473 }
3474
3475 /*
3476  * While most of the *at syscall handlers can call nameiat() which
3477  * is a wrapper around namei, the use of namei and initialisation
3478  * of nameidata are far removed and in different functions  - namei
3479  * gets called in vn_open_auth for open1. So we'll just do here what
3480  * nameiat() does.
      *
      * When dirfd is not AT_FDCWD, the caller has not already set USEDVP and
      * the path does not start with '/', the vnode behind dirfd is installed
      * as the lookup start directory (ni_dvp + USEDVP) for the open1() call.
      * In every other case this is a plain call to open1() with the same
      * arguments.
      *
      * Returns:     0                       Success
      *              ENOTDIR                 dirfd does not refer to a directory
      *      copyin:???
      *      vnode_getfromfd:???
      *      open1:???
3481  */
3482 static int
3483 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3484     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3485     int dirfd)
3486 {
3487         if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3488                 int error;
3489                 char c;
3490
                     /* Fetch the first byte of the path, from user or kernel space */
3491                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3492                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
3493                         if (error)
3494                                 return (error);
3495                 } else {
3496                         c = *((char *)(ndp->ni_dirp));
3497                 }
3498
                     /* dirfd only matters for paths that do not start with '/' */
3499                 if (c != '/') {
3500                         vnode_t dvp_at;
3501
3502                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3503                             &dvp_at);
3504                         if (error)
3505                                 return (error);
3506
3507                         if (vnode_vtype(dvp_at) != VDIR) {
3508                                 vnode_put(dvp_at);
3509                                 return (ENOTDIR);
3510                         }
3511
                             /*
                              * Hand the directory to the lookup via ni_dvp/USEDVP and
                              * drop our reference on it once open1() has returned.
                              */
3512                         ndp->ni_dvp = dvp_at;
3513                         ndp->ni_cnd.cn_flags |= USEDVP;
3514                         error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3515                             retval);
3516                         vnode_put(dvp_at);
3517                         return (error);
3518                 }
3519         }
3520
3521         return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3522 }
3523
3524 /*
3525  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3526  *
3527  * Parameters:  p                       Process requesting the open
3528  *              uap                     User argument descriptor (see below)
3529  *              retval                  Pointer to an area to receive the
3530  *                                      return value from the system call
3531  *
3532  * Indirect:    uap->path               Path to open (same as 'open')
3533  *              uap->flags              Flags to open (same as 'open')
3534  *              uap->uid                UID to set, if creating
3535  *              uap->gid                GID to set, if creating
3536  *              uap->mode               File mode, if creating (same as 'open')
3537  *              uap->xsecurity          ACL to set, if creating
3538  *
3539  * Returns:     0                       Success
3540  *              !0                      errno value
3541  *
3542  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3543  *
3544  * XXX:         We should enumerate the possible errno values here, and where
3545  *              in the code they originated.
3546  */
3547 int
3548 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3549 {
3550         struct filedesc *fdp = p->p_fd;
3551         int ciferror;
3552         kauth_filesec_t xsecdst;
3553         struct vnode_attr va;
3554         struct nameidata nd;
3555         int cmode;
3556
3557         AUDIT_ARG(owner, uap->uid, uap->gid);
3558
             /* Copy in the caller's file security information, if any was supplied */
3559         xsecdst = NULL;
3560         if ((uap->xsecurity != USER_ADDR_NULL) &&
3561             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3562                 return ciferror;
3563
3564         VATTR_INIT(&va);
             /* Apply the process creation mask, keep ALLPERMS bits, clear S_ISTXT */
3565         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3566         VATTR_SET(&va, va_mode, cmode);
             /* Owner, group and ACL are only requested when the caller supplied them */
3567         if (uap->uid != KAUTH_UID_NONE)
3568                 VATTR_SET(&va, va_uid, uap->uid);
3569         if (uap->gid != KAUTH_GID_NONE)
3570                 VATTR_SET(&va, va_gid, uap->gid);
3571         if (xsecdst != NULL)
3572                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3573
3574         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3575                uap->path, vfs_context_current());
3576
3577         ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3578                          fileproc_alloc_init, NULL, retval);
             /* va_acl points into xsecdst, so it is only freed after open1() returns */
3579         if (xsecdst != NULL)
3580                 kauth_filesec_free(xsecdst);
3581
3582         return ciferror;
3583 }
3584
3585 /*
3586  * Go through the data-protected atomically controlled open (2)
3587  *
3588  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
      *
      * Parameters:  p                       Process requesting the open; its
      *                                      p_fd->fd_cmask is applied to the mode
      *              uap                     User argument descriptor
      *              retval                  Passed through to open1()
      *
      * Indirect:    uap->path               Path to open
      *              uap->flags              Open flags
      *              uap->class              Protection class, used only with O_CREAT
      *              uap->dpflags            O_DP_GETRAWENCRYPTED requests raw
      *                                      encrypted access (not allowed together
      *                                      with O_RDWR or O_WRONLY)
      *              uap->mode               File mode, if creating
      *
      * Returns:     0                       Success
      *              EINVAL                  O_DP_GETRAWENCRYPTED with write access
      *      open1:???
3589  */
3590 int open_dprotected_np (proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3591         int flags = uap->flags;
3592         int class = uap->class;
3593         int dpflags = uap->dpflags;
3594
3595         /*
3596          * Follow the same path as normal open(2)
3597          * Look up the item if it exists, and acquire the vnode.
3598          */
3599         struct filedesc *fdp = p->p_fd;
3600         struct vnode_attr va;
3601         struct nameidata nd;
3602         int cmode;
3603         int error;
3604
3605         VATTR_INIT(&va);
3606         /* Mask off all but regular access permissions */
3607         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3608         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3609
3610         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3611                uap->path, vfs_context_current());
3612
3613         /*
3614          * Initialize the extra fields in vnode_attr to pass down our
3615          * extra fields.
3616          * 1. target cprotect class.
3617          * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3618          */
3619         if (flags & O_CREAT) {
3620                 VATTR_SET(&va, va_dataprotect_class, class);
3621         }
3622
3623         if (dpflags & O_DP_GETRAWENCRYPTED) {
3624                 if ( flags & (O_RDWR | O_WRONLY)) {
3625                         /* Not allowed to write raw encrypted bytes */
3626                         return EINVAL;
3627                 }
3628                 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3629         }
3630
3631         error = open1(vfs_context_current(), &nd, flags, &va,
3632                       fileproc_alloc_init, NULL, retval);
3633
3634         return error;
3635 }
3636
     /*
      * openat_internal: common back end for the open(2)/openat(2) family.
      *
      * Builds the creation attributes (mode filtered through the process
      * fd_cmask and restricted to ACCESSPERMS) and the nameidata for 'path',
      * then hands off to open1at() with 'fd' as the directory descriptor.
      *
      * Parameters:  ctx                     Context of the caller; also supplies
      *                                      the process whose fd_cmask is used
      *              path                    Path to open, in the space named by segflg
      *              flags                   Open flags
      *              mode                    File mode, if creating
      *              fd                      Directory descriptor, or AT_FDCWD
      *              segflg                  Address space 'path' lives in
      *              retval                  Passed through to open1at()
      *
      * Returns:     Whatever open1at() returns.
      */
3637 static int
3638 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3639     int fd, enum uio_seg segflg, int *retval)
3640 {
3641         struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3642         struct vnode_attr va;
3643         struct nameidata nd;
3644         int cmode;
3645
3646         VATTR_INIT(&va);
3647         /* Mask off all but regular access permissions */
3648         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3649         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3650
3651         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3652             segflg, path, ctx);
3653
3654         return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3655             retval, fd));
3656 }
3657
     /*
      * open: calls __pthread_testcancel(1) and then forwards to open_nocancel().
      *
      * NOTE(review): the cast assumes struct open_args and struct
      * open_nocancel_args share the same layout -- confirm against the
      * generated syscall argument structures.
      */
3658 int
3659 open(proc_t p, struct open_args *uap, int32_t *retval)
3660 {
3661         __pthread_testcancel(1);
3662         return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3663 }
3664
     /*
      * open_nocancel: open uap->path (a user-space address) with uap->flags
      * and uap->mode.  Implemented as openat_internal() with AT_FDCWD as the
      * directory descriptor; returns whatever openat_internal() returns.
      */
3665 int
3666 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3667     int32_t *retval)
3668 {
3669         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3670             uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3671 }
3672
     /*
      * openat_nocancel: open uap->path (a user-space address) with uap->flags
      * and uap->mode, passing uap->fd to openat_internal() as the directory
      * descriptor; returns whatever openat_internal() returns.
      */
3673 int
3674 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3675                 int32_t *retval)
3676 {
3677         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3678             uap->mode, uap->fd, UIO_USERSPACE, retval));
3679 }
3680
     /*
      * openat: calls __pthread_testcancel(1) and then forwards to
      * openat_nocancel().
      *
      * NOTE(review): the cast assumes struct openat_args and struct
      * openat_nocancel_args share the same layout -- confirm against the
      * generated syscall argument structures.
      */
3681 int
3682 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3683 {
3684         __pthread_testcancel(1);
3685         return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3686 }
3687
3688 /*
3689  * openbyid_np: open a file given a file system id and a file system object id
3690  *      the hfs file system object id is an fsobj_id_t {uint32, uint32}
3691  *      file systems that don't support object ids it is a node id (uint64_t).
3692  *
3693  * Parameters:  p                       Process requesting the open
3694  *              uap                     User argument descriptor (see below)
3695  *              retval                  Pointer to an area to receive the
3696  *                                      return value from the system call
3697  *
3698  * Indirect:    uap->fsid               User address of the fsid_t of the
3699  *                                      target file system
3700  *              uap->objid              User address of the 64-bit id of the
3701  *                                      target file system object
3702  *              uap->oflags             Flags to open (same as 'open')
3703  *
3704  * Returns:     0                       Success
3705  *              !0                      errno value
3706  *
3707  *
3708  * XXX:         We should enumerate the possible errno values here, and where
3709  *              in the code they originated.
3710  */
3711 int
3712 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
3713 {
3714         fsid_t fsid;
3715         uint64_t objid;
3716         int error;
3717         char *buf = NULL;
3718         int buflen = MAXPATHLEN;
3719         int pathlen = 0;
3720         vfs_context_t ctx = vfs_context_current();
3721
3722         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
3723                 return (error);
3724         }
3725
3726         /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
3727         if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
3728                 return (error);
3729         }
3730
3731         AUDIT_ARG(value32, fsid.val[0]);
3732         AUDIT_ARG(value64, objid);
3733
3734         /*
              * Resolve the path from fsid and objid.  Whenever
              * fsgetpath_internal() reports ENOSPC the buffer is freed and the
              * lookup retried with a buffer MAXPATHLEN bytes larger.  One extra
              * byte is allocated so the path can be NUL terminated below.
              */
3735         do {
3736                 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
3737                 if (buf == NULL) {
3738                         return (ENOMEM);
3739                 }
3740
3741                 error = fsgetpath_internal(
3742                         ctx, fsid.val[0], objid,
3743                         buflen, buf, &pathlen);
3744
3745                 if (error) {
3746                         FREE(buf, M_TEMP);
3747                         buf = NULL;
3748                 }
3749         } while (error == ENOSPC && (buflen += MAXPATHLEN));
3750
             /* On any error the buffer has already been released inside the loop */
3751         if (error) {
3752                 return error;
3753         }
3754
3755         buf[pathlen] = 0;
3756
             /* The resolved path is a kernel buffer, hence UIO_SYSSPACE; mode is 0 */
3757         error = openat_internal(
3758                 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
3759
3760         FREE(buf, M_TEMP);
3761
3762         return error;
3763 }
3764
3765
3766 /*
3767  * Create a special file.
      *
      * A request for a FIFO (S_IFIFO) is forwarded to mkfifo1() before the
      * superuser check is made.  Every other type requires suser() to
      * succeed; only S_IFMT (VBAD), S_IFCHR (VCHR) and S_IFBLK (VBLK) are
      * accepted.
      *
      * Indirect:    uap->path               Path of the node to create
      *              uap->mode               File type and permission bits
      *              uap->dev                Device number stored in va_rdev
      *
      * Returns:     0                       Success
      *              EEXIST                  The path already exists
      *              EINVAL                  Unsupported file type in uap->mode
      *      mkfifo1:???
      *      suser:???
      *      namei:???
      *      mac_vnode_check_create:???
      *      vnode_authorize:???
      *      vn_create:???
3768  */
3769 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3770
3771 int
3772 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
3773 {
3774         struct vnode_attr va;
3775         vfs_context_t ctx = vfs_context_current();
3776         int error;
3777         struct nameidata nd;
3778         vnode_t vp, dvp;
3779
3780         VATTR_INIT(&va);
3781         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3782         VATTR_SET(&va, va_rdev, uap->dev);
3783
3784         /* If it's a mknod() of a FIFO, call mkfifo1() instead */
3785         if ((uap->mode & S_IFMT) == S_IFIFO)
3786                 return(mkfifo1(ctx, uap->path, &va));
3787
3788         AUDIT_ARG(mode, uap->mode);
3789         AUDIT_ARG(value32, uap->dev);
3790
3791         if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
3792                 return (error);
3793         NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
3794                 UIO_USERSPACE, uap->path, ctx);
3795         error = namei(&nd);
3796         if (error)
3797                 return (error);
             /* From here on every exit goes through 'out' to release dvp (and vp) */
3798         dvp = nd.ni_dvp;
3799         vp = nd.ni_vp;
3800
3801         if (vp != NULL) {
3802                 error = EEXIST;
3803                 goto out;
3804         }
3805
3806         switch (uap->mode & S_IFMT) {
3807         case S_IFMT:    /* used by badsect to flag bad sectors */
3808                 VATTR_SET(&va, va_type, VBAD);
3809                 break;
3810         case S_IFCHR:
3811                 VATTR_SET(&va, va_type, VCHR);
3812                 break;
3813         case S_IFBLK:
3814                 VATTR_SET(&va, va_type, VBLK);
3815                 break;
3816         default:
3817                 error = EINVAL;
3818                 goto out;
3819         }
3820
3821 #if CONFIG_MACF
3822         error = mac_vnode_check_create(ctx,
3823             nd.ni_dvp, &nd.ni_cnd, &va);
3824         if (error)
3825                 goto out;
3826 #endif
3827
3828         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3829                 goto out;
3830
3831         if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
3832                 goto out;
3833
3834         if (vp) {
3835                 int     update_flags = 0;
3836
3837                 // Make sure the name & parent pointers are hooked up
3838                 if (vp->v_name == NULL)
3839                         update_flags |= VNODE_UPDATE_NAME;
3840                 if (vp->v_parent == NULLVP)
3841                         update_flags |= VNODE_UPDATE_PARENT;
3842
3843                 if (update_flags)
3844                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3845
3846 #if CONFIG_FSE
3847                 add_fsevent(FSE_CREATE_FILE, ctx,
3848                     FSE_ARG_VNODE, vp,
3849                     FSE_ARG_DONE);
3850 #endif
3851         }
3852
3853 out:
3854         /*
3855          * nameidone has to happen before we vnode_put(dvp)
3856          * since it may need to release the fs_nodelock on the dvp
3857          */
3858         nameidone(&nd);
3859
3860         if (vp)
3861                 vnode_put(vp);
3862         vnode_put(dvp);
3863
3864         return (error);
3865 }
3866
3867 /*
3868  * Create a named pipe.
3869  *
      * Parameters:  ctx                     Context used for lookup, authorization
      *                                      and creation
      *              upath                   User-space path of the fifo to create
      *              vap                     Attributes for the new node; va_type
      *                                      is set to VFIFO here
      *
3870  * Returns:     0                       Success
3871  *              EEXIST
3872  *      namei:???
3873  *      vnode_authorize:???
3874  *      vn_create:???
3875  */
3876 static int
3877 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
3878 {
3879         vnode_t vp, dvp;
3880         int error;
3881         struct nameidata nd;
3882
3883         NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
3884                 UIO_USERSPACE, upath, ctx);
3885         error = namei(&nd);
3886         if (error)
3887                 return (error);
             /* From here on every exit goes through 'out' to release dvp (and vp) */
3888         dvp = nd.ni_dvp;
3889         vp = nd.ni_vp;
3890
3891         /* check that this is a new file and authorize addition */
3892         if (vp != NULL) {
3893                 error = EEXIST;
3894                 goto out;
3895         }
3896         VATTR_SET(vap, va_type, VFIFO);
3897
3898         if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
3899                 goto out;
3900
3901         error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
3902 out:
3903         /*
3904          * nameidone has to happen before we vnode_put(dvp)
3905          * since it may need to release the fs_nodelock on the dvp
3906          */
3907         nameidone(&nd);
3908
3909         if (vp)
3910                 vnode_put(vp);
3911         vnode_put(dvp);
3912
3913         return error;
3914 }
3915
3916
3917 /*
3918  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
3919  *
3920  * Parameters:  p                       Process requesting the mkfifo
3921  *              uap                     User argument descriptor (see below)
3922  *              retval                  (Ignored)
3923  *
3924  * Indirect:    uap->path               Path to fifo (same as 'mkfifo')
3925  *              uap->uid                UID to set
3926  *              uap->gid                GID to set
3927  *              uap->mode               File mode to set (same as 'mkfifo')
3928  *              uap->xsecurity          ACL to set, if creating
3929  *
3930  * Returns:     0                       Success
3931  *              !0                      errno value
3932  *
3933  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3934  *
3935  * XXX:         We should enumerate the possible errno values here, and where
3936  *              in the code they originated.
3937  */
3938 int
3939 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
3940 {
3941         int ciferror;
3942         kauth_filesec_t xsecdst;
3943         struct vnode_attr va;
3944
3945         AUDIT_ARG(owner, uap->uid, uap->gid);
3946
             /* Copy in the caller's file security information, if any was supplied */
3947         xsecdst = KAUTH_FILESEC_NONE;
3948         if (uap->xsecurity != USER_ADDR_NULL) {
3949                 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
3950                         return ciferror;
3951         }
3952
3953         VATTR_INIT(&va);
3954         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
             /* Owner, group and ACL are only requested when the caller supplied them */
3955         if (uap->uid != KAUTH_UID_NONE)
3956                 VATTR_SET(&va, va_uid, uap->uid);
3957         if (uap->gid != KAUTH_GID_NONE)
3958                 VATTR_SET(&va, va_gid, uap->gid);
3959         if (xsecdst != KAUTH_FILESEC_NONE)
3960                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3961
3962         ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
3963
             /* va_acl points into xsecdst, so it is only freed after mkfifo1() returns */
3964         if (xsecdst != KAUTH_FILESEC_NONE)
3965                 kauth_filesec_free(xsecdst);
3966         return ciferror;
3967 }
3968
     /*
      * mkfifo: create a named pipe at uap->path.
      *
      * The mode is uap->mode restricted to ALLPERMS with the bits in the
      * process fd_cmask cleared; everything else is done by mkfifo1(), whose
      * result is returned.
      */
3969 /* ARGSUSED */
3970 int
3971 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
3972 {
3973         struct vnode_attr va;
3974
3975         VATTR_INIT(&va);
3976         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3977
3978         return(mkfifo1(vfs_context_current(), uap->path, &va));
3979 }
3980
3981
     /*
      * my_strrchr: return a pointer to the last character in the NUL
      * terminated string 'p' that compares equal to 'ch', or NULL if there is
      * none.
      *
      * The match is tested before the terminator check, so ch == 0 yields a
      * pointer to the terminating NUL.  'p' must not be NULL.
      */
3982 static char *
3983 my_strrchr(char *p, int ch)
3984 {
3985         char *save;
3986
             /* Single forward pass; 'save' tracks the most recent match */
3987         for (save = NULL;; ++p) {
3988                 if (*p == ch)
3989                         save = p;
3990                 if (!*p)
3991                         return(save);
3992         }
3993         /* NOTREACHED */
3994 }
3995
3996 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
3997
     /*
      * safe_getpath: build the path of 'dvp' (plus "/leafname" when leafname
      * is not NULL) in 'path', falling back to a shorter, still valid path
      * when the full one cannot be produced.
      *
      * Parameters:  dvp                     Directory vnode to get the path of
      *              leafname                Optional last component to append
      *              path                    Output buffer
      *              _len                    Size of 'path' in bytes
      *              truncated_path          Set to 1 when 'path' is not the
      *                                      complete requested path, else 0
      *
      * Returns:     The length of the string left in 'path'.  The code treats
      *              this length as including the terminating NUL (it overwrites
      *              path[len-1] with '/' and uses strlen() + 1).
      *
      * Notes:       If vn_getpath() succeeds but leaves no room for a leaf,
      *              'path' holds the directory path only and is flagged as
      *              truncated.  If appending the leaf overflows, the string is
      *              cut at its last '/'.  If vn_getpath() fails, the parents of
      *              dvp are tried in turn, then the mount point name, then "/".
      */
3998 int
3999 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4000 {
4001         int ret, len = _len;
             /*
              * Never write past the caller's buffer and never build more than
              * MAXPATHLEN bytes; identical to MAXPATHLEN for callers passing a
              * MAXPATHLEN sized buffer.
              */
             int limit = (_len < MAXPATHLEN) ? _len : MAXPATHLEN;
4002
4003         *truncated_path = 0;
4004         ret = vn_getpath(dvp, path, &len);
4005         if (ret == 0 && len < (limit - 1)) {
4006                 if (leafname) {
                             /* replace the terminator with '/' and append the leaf */
4007                         path[len-1] = '/';
4008                         len += strlcpy(&path[len], leafname, limit-len) + 1;
4009                         if (len > limit) {
4010                                 char *ptr;
4011
4012                                 // the string got truncated!
4013                                 *truncated_path = 1;
4014                                 ptr = my_strrchr(path, '/');
4015                                 if (ptr) {
4016                                         *ptr = '\0';   // chop off the string at the last directory component
4017                                 }
4018                                 len = strlen(path) + 1;
4019                         }
4020                 }
4021         } else if (ret == 0) {
                     /* directory path fits but there is no room left for a leaf */
4022                 *truncated_path = 1;
4023         } else {
4024                 struct vnode *mydvp=dvp;
4025
4026                 if (ret != ENOSPC) {
4027                         printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4028                                dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4029                 }
4030                 *truncated_path = 1;
4031
                     /* walk up the parents until one of them yields a path */
4032                 do {
4033                         if (mydvp->v_parent != NULL) {
4034                                 mydvp = mydvp->v_parent;
4035                         } else if (mydvp->v_mount) {
4036                                 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
                                     /* report the length of what was actually copied */
                                     len = strlen(path) + 1;
4037                                 break;
4038                         } else {
4039                                 // no parent and no mount point?  only thing is to punt and say "/" changed
4040                                 strlcpy(path, "/", _len);
4041                                 len = 2;
4042                                 mydvp = NULL;
4043                         }
4044
4045                         if (mydvp == NULL) {
4046                                 break;
4047                         }
4048
4049                         len = _len;
4050                         ret = vn_getpath(mydvp, path, &len);
4051                 } while (ret == ENOSPC);
4052         }
4053
4054         return len;
4055 }
4056
4057
4058 /*
4059  * Make a hard file link.
4060  *
      * Parameters:  ctx                     Context of the caller
      *              fd1                     Directory descriptor for 'path'
      *              path                    Path of the existing object
      *              fd2                     Directory descriptor for 'link'
      *              link                    Path of the link to create
      *              flag                    AT_SYMLINK_FOLLOW to follow a symlink
      *                                      at 'path'; otherwise NOFOLLOW is used
      *              segflg                  Address space of both paths
      *
4061  * Returns:     0                       Success
4062  *              EPERM
4063  *              EEXIST
4064  *              EXDEV
      *              EACCES                  Directory link by a non-owner
      *              ENOMEM                  No path buffer for the notifications
4065  *      namei:???
4066  *      vnode_authorize:???
4067  *      VNOP_LINK:???
4068  */
4069 /* ARGSUSED */
4070 static int
4071 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4072     user_addr_t link, int flag, enum uio_seg segflg)
4073 {
4074         vnode_t vp, dvp, lvp;
4075         struct nameidata nd;
4076         int follow;
4077         int error;
4078 #if CONFIG_FSE
4079         fse_info finfo;
4080 #endif
4081         int need_event, has_listeners;
4082         char *target_path = NULL;
4083         int truncated=0;
4084
4085         vp = dvp = lvp = NULLVP;
4086
4087         /* look up the object we are linking to */
4088         follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4089         NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4090             segflg, path, ctx);
4091
4092         error = nameiat(&nd, fd1);
4093         if (error)
4094                 return (error);
4095         vp = nd.ni_vp;
4096
4097         nameidone(&nd);
4098
4099         /*
4100          * Normally, linking to directories is not supported.
4101          * However, some file systems may have limited support.
4102          */
4103         if (vp->v_type == VDIR) {
4104                 if (!(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
4105                         error = EPERM;   /* POSIX */
4106                         goto out;
4107                 }
4108                 /* Linking to a directory requires ownership. */
4109                 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4110                         struct vnode_attr dva;
4111
4112                         VATTR_INIT(&dva);
4113                         VATTR_WANTED(&dva, va_uid);
4114                         if (vnode_getattr(vp, &dva, ctx) != 0 ||
4115                             !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4116                             (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4117                                 error = EACCES;
4118                                 goto out;
4119                         }
4120                 }
4121         }
4122
4123         /* lookup the target node, reusing nd for a CREATE lookup of 'link' */
4124 #if CONFIG_TRIGGERS
4125         nd.ni_op = OP_LINK;
4126 #endif
4127         nd.ni_cnd.cn_nameiop = CREATE;
4128         nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4129         nd.ni_dirp = link;
4130         error = nameiat(&nd, fd2);
4131         if (error != 0)
4132                 goto out;
             /* From here on failures go to out2 so that nameidone() is called */
4133         dvp = nd.ni_dvp;
4134         lvp = nd.ni_vp;
4135
4136 #if CONFIG_MACF
4137         if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4138                 goto out2;
4139 #endif
4140
4141         /* or to anything that kauth doesn't want us to (eg. immutable items) */
4142         if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4143                 goto out2;
4144
4145         /* target node must not exist */
4146         if (lvp != NULLVP) {
4147                 error = EEXIST;
4148                 goto out2;
4149         }
4150         /* cannot link across mountpoints */
4151         if (vnode_mount(vp) != vnode_mount(dvp)) {
4152                 error = EXDEV;
4153                 goto out2;
4154         }
4155
4156         /* authorize creation of the target node */
4157         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4158                 goto out2;
4159
4160         /* and finally make the link */
4161         error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4162         if (error)
4163                 goto out2;
4164
4165 #if CONFIG_MACF
4166         (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4167 #endif
4168
4169 #if CONFIG_FSE
4170         need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4171 #else
4172         need_event = 0;
4173 #endif
4174         has_listeners = kauth_authorize_fileop_has_listeners();
4175
             /*
              * The link exists at this point; the rest only builds paths for
              * the kauth fileop listeners and for fsevents.
              * NOTE(review): a failed GET_PATH below still returns ENOMEM even
              * though the link has been created.
              */
4176         if (need_event || has_listeners) {
4177                 char *link_to_path = NULL;
4178                 int len, link_name_len;
4179
4180                 /* build the path to the new link file */
4181                 GET_PATH(target_path);
4182                 if (target_path == NULL) {
4183                         error = ENOMEM;
4184                         goto out2;
4185                 }
4186
4187                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4188
4189                 if (has_listeners) {
4190                         /* build the path to file we are linking to */
4191                         GET_PATH(link_to_path);
4192                         if (link_to_path == NULL) {
4193                                 error = ENOMEM;
4194                                 goto out2;
4195                         }
4196
4197                         link_name_len = MAXPATHLEN;
4198                         if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4199                                 /*
4200                                  * Call out to allow 3rd party notification of link.
4201                                  * Ignore result of kauth_authorize_fileop call.
4202                                  */
4203                                 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4204                                                        (uintptr_t)link_to_path,
4205                                                        (uintptr_t)target_path);
4206                         }
4207                         if (link_to_path != NULL) {
4208                                 RELEASE_PATH(link_to_path);
4209                         }
4210                 }
4211 #if CONFIG_FSE
4212                 if (need_event) {
4213                         /* construct fsevent */
4214                         if (get_fse_info(vp, &finfo, ctx) == 0) {
4215                                 if (truncated) {
4216                                         finfo.mode |= FSE_TRUNCATED_PATH;
4217                                 }
4218
4219                                 // build the path to the destination of the link
4220                                 add_fsevent(FSE_CREATE_FILE, ctx,
4221                                             FSE_ARG_STRING, len, target_path,
4222                                             FSE_ARG_FINFO, &finfo,
4223                                             FSE_ARG_DONE);
4224                         }
4225                         if (vp->v_parent) {
4226                             add_fsevent(FSE_STAT_CHANGED, ctx,
4227                                 FSE_ARG_VNODE, vp->v_parent,
4228                                 FSE_ARG_DONE);
4229                         }
4230                 }
4231 #endif
4232         }
4233 out2:
4234         /*
4235          * nameidone has to happen before we vnode_put(dvp)
4236          * since it may need to release the fs_nodelock on the dvp
4237          */
4238         nameidone(&nd);
4239         if (target_path != NULL) {
4240                 RELEASE_PATH(target_path);
4241         }
4242 out:
             /* lvp and dvp are still NULLVP if the second lookup never succeeded */
4243         if (lvp)
4244                 vnode_put(lvp);
4245         if (dvp)
4246                 vnode_put(dvp);
4247         vnode_put(vp);
4248         return (error);
4249 }
4250
     /*
      * link: create a hard link named uap->link to uap->path.
      *
      * Both paths are user-space addresses resolved with AT_FDCWD as the
      * directory descriptor, and AT_SYMLINK_FOLLOW is always passed.
      * Returns whatever linkat_internal() returns.
      */
4251 int
4252 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4253 {
4254         return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4255             AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4256 }
4257
     /*
      * linkat: create a hard link named uap->link (relative to uap->fd2) to
      * uap->path (relative to uap->fd1).
      *
      * Returns:     EINVAL                  uap->flag has a bit other than
      *                                      AT_SYMLINK_FOLLOW set
      *      linkat_internal:???
      */
4258 int
4259 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4260 {
4261         if (uap->flag & ~AT_SYMLINK_FOLLOW)
4262                 return (EINVAL);
4263
4264         return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4265             uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4266 }
4267
4268 /*
4269  * Make a symbolic link.
4270  *
4271  * We could add support for ACLs here too...
4272  */
4273 /* ARGSUSED */
4274 static int
4275 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4276     user_addr_t link, enum uio_seg segflg)
4277 {
4278         struct vnode_attr va;
4279         char *path;
4280         int error;
4281         struct nameidata nd;
4282         vnode_t vp, dvp;
4283         uint32_t dfflags;       // Directory file flags
4284         size_t dummy=0;
4285         proc_t p;
4286
4287         error = 0;
4288         if (UIO_SEG_IS_USER_SPACE(segflg)) {
4289                 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4290                 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4291         } else {
4292                 path = (char *)path_data;
4293         }
4294         if (error)
4295                 goto out;
4296         AUDIT_ARG(text, path);  /* This is the link string */
4297
4298         NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4299             segflg, link, ctx);
4300
4301         error = nameiat(&nd, fd);
4302         if (error)
4303                 goto out;
4304         dvp = nd.ni_dvp;
4305         vp = nd.ni_vp;
4306
4307         p = vfs_context_proc(ctx);
4308         VATTR_INIT(&va);
4309         VATTR_SET(&va, va_type, VLNK);
4310         VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4311
4312         /*
4313          * Handle inheritance of restricted flag
4314          */
4315         error = vnode_flags(dvp, &dfflags, ctx);
4316         if (error)
4317                 goto skipit;
4318         if (dfflags & SF_RESTRICTED)
4319                 VATTR_SET(&va, va_flags, SF_RESTRICTED);
4320
4321 #if CONFIG_MACF
4322         error = mac_vnode_check_create(ctx,
4323                         dvp, &nd.ni_cnd, &va);
4324 #endif
4325         if (error != 0) {
4326             goto skipit;
4327         }
4328
4329         if (vp != NULL) {
4330             error = EEXIST;
4331             goto skipit;
4332         }
4333
4334         /* authorize */
4335         if (error == 0)
4336                 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4337         /* get default ownership, etc. */
4338         if (error == 0)
4339                 error = vnode_authattr_new(dvp, &va, 0, ctx);
4340         if (error == 0)
4341                 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4342
4343 #if CONFIG_MACF
4344         if (error == 0)
4345                 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4346 #endif
4347
4348         /* do fallback attribute handling */
4349         if (error == 0)
4350                 error = vnode_setattr_fallback(vp, &va, ctx);
4351
4352         if (error == 0) {
4353                 int     update_flags = 0;
4354
4355                 if (vp == NULL) {
4356                         nd.ni_cnd.cn_nameiop = LOOKUP;
4357 #if CONFIG_TRIGGERS
4358                         nd.ni_op = OP_LOOKUP;
4359 #endif
4360                         nd.ni_cnd.cn_flags = 0;
4361                         error = nameiat(&nd, fd);
4362                         vp = nd.ni_vp;
4363
4364                         if (vp == NULL)
4365                                 goto skipit;
4366                 }
4367
4368 #if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4369                 /* call out to allow 3rd party notification of rename.
4370                  * Ignore result of kauth_authorize_fileop call.
4371                  */
4372                 if (kauth_authorize_fileop_has_listeners() &&
4373                     namei(&nd) == 0) {
4374                         char *new_link_path = NULL;
4375                         int             len;
4376
4377                         /* build the path to the new link file */
4378                         new_link_path = get_pathbuff();
4379                         len = MAXPATHLEN;
4380                         vn_getpath(dvp, new_link_path, &len);
4381                         if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4382                                 new_link_path[len - 1] = '/';
4383                                 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4384                         }
4385
4386                         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4387                                            (uintptr_t)path, (uintptr_t)new_link_path);
4388                         if (new_link_path != NULL)
4389                                 release_pathbuff(new_link_path);
4390                 }
4391 #endif
4392                 // Make sure the name & parent pointers are hooked up
4393                 if (vp->v_name == NULL)
4394                         update_flags |= VNODE_UPDATE_NAME;
4395                 if (vp->v_parent == NULLVP)
4396                         update_flags |= VNODE_UPDATE_PARENT;
4397
4398                 if (update_flags)
4399                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4400
4401 #if CONFIG_FSE
4402                 add_fsevent(FSE_CREATE_FILE, ctx,
4403                             FSE_ARG_VNODE, vp,
4404                             FSE_ARG_DONE);
4405 #endif
4406         }
4407
4408 skipit:
4409         /*
4410          * nameidone has to happen before we vnode_put(dvp)
4411          * since it may need to release the fs_nodelock on the dvp
4412          */
4413         nameidone(&nd);
4414
4415         if (vp)
4416                 vnode_put(vp);
4417         vnode_put(dvp);
4418 out:
4419         if (path && (path != (char *)path_data))
4420                 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4421
4422         return (error);
4423 }
4424
4425 int
4426 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4427 {
4428         return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4429             uap->link, UIO_USERSPACE));
4430 }
4431
4432 int
4433 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4434     __unused int32_t *retval)
4435 {
4436         return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4437             uap->path2, UIO_USERSPACE));
4438 }
4439
4440 /*
4441  * Delete a whiteout from the filesystem.
4442  * No longer supported.
4443  */
4444 int
4445 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4446 {
4447         return (ENOTSUP);
4448 }
4449
4450 /*
4451  * Delete a name from the filesystem.
4452  */
4453 /* ARGSUSED */
4454 static int
4455 unlink1at(vfs_context_t ctx, struct nameidata *ndp, int unlink_flags, int fd)
4456 {
4457         vnode_t vp, dvp;
4458         int error;
4459         struct componentname *cnp;
4460         char  *path = NULL;
4461         int  len=0;
4462 #if CONFIG_FSE
4463         fse_info  finfo;
4464         struct vnode_attr va;
4465 #endif
4466         int flags = 0;
4467         int need_event = 0;
4468         int has_listeners = 0;
4469         int truncated_path=0;
4470         int batched;
4471         struct vnode_attr *vap = NULL;
4472
4473 #if NAMEDRSRCFORK
4474         /* unlink or delete is allowed on rsrc forks and named streams */
4475         ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
4476 #endif
4477
4478         ndp->ni_cnd.cn_flags |= LOCKPARENT;
4479         ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
4480         cnp = &ndp->ni_cnd;
4481
4482 lookup_continue:
4483         error = nameiat(ndp, fd);
4484         if (error)
4485                 return (error);
4486
4487         dvp = ndp->ni_dvp;
4488         vp = ndp->ni_vp;
4489
4490
4491         /* With Carbon delete semantics, busy files cannot be deleted */
4492         if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4493                 flags |= VNODE_REMOVE_NODELETEBUSY;
4494         }
4495
4496         /* Skip any potential upcalls if told to. */
4497         if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4498                 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4499         }
4500
4501         if (vp) {
4502                 batched = vnode_compound_remove_available(vp);
4503                 /*
4504                  * The root of a mounted filesystem cannot be deleted.
4505                  */
4506                 if (vp->v_flag & VROOT) {
4507                         error = EBUSY;
4508                 }
4509
4510                 if (!batched) {
4511                         error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4512                         if (error) {
4513                                 goto out;
4514                         }
4515                 }
4516         } else {
4517                 batched = 1;
4518
4519                 if (!vnode_compound_remove_available(dvp)) {
4520                         panic("No vp, but no compound remove?");
4521                 }
4522         }
4523
4524 #if CONFIG_FSE
4525         need_event = need_fsevent(FSE_DELETE, dvp);
4526         if (need_event) {
4527                 if (!batched) {
4528                         if ((vp->v_flag & VISHARDLINK) == 0) {
4529                                 /* XXX need to get these data in batched VNOP */
4530                                 get_fse_info(vp, &finfo, ctx);
4531                         }
4532                 } else {
4533                         error = vfs_get_notify_attributes(&va);
4534                         if (error) {
4535                                 goto out;
4536                         }
4537
4538                         vap = &va;
4539                 }
4540         }
4541 #endif
4542         has_listeners = kauth_authorize_fileop_has_listeners();
4543         if (need_event || has_listeners) {
4544                 if (path == NULL) {
4545                         GET_PATH(path);
4546                         if (path == NULL) {
4547                                 error = ENOMEM;
4548                                 goto out;
4549                         }
4550                 }
4551                 len = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4552         }
4553
4554 #if NAMEDRSRCFORK
4555         if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4556                 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4557         else
4558 #endif
4559         {
4560                 error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
4561                 vp = ndp->ni_vp;
4562                 if (error == EKEEPLOOKING) {
4563                         if (!batched) {
4564                                 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4565                         }
4566
4567                         if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
4568                                 panic("EKEEPLOOKING, but continue flag not set?");
4569                         }
4570
4571                         if (vnode_isdir(vp)) {
4572                                 error = EISDIR;
4573                                 goto out;
4574                         }
4575                         goto lookup_continue;
4576                 }
4577         }
4578
4579         /*
4580          * Call out to allow 3rd party notification of delete.
4581          * Ignore result of kauth_authorize_fileop call.
4582          */
4583         if (!error) {
4584                 if (has_listeners) {
4585                         kauth_authorize_fileop(vfs_context_ucred(ctx),
4586                                 KAUTH_FILEOP_DELETE,
4587                                 (uintptr_t)vp,
4588                                 (uintptr_t)path);
4589                 }
4590
4591                 if (vp->v_flag & VISHARDLINK) {
4592                     //
4593                     // if a hardlink gets deleted we want to blow away the
4594                     // v_parent link because the path that got us to this
4595                     // instance of the link is no longer valid.  this will
4596                     // force the next call to get the path to ask the file
4597                     // system instead of just following the v_parent link.
4598                     //
4599                     vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4600                 }
4601
4602 #if CONFIG_FSE
4603                 if (need_event) {
4604                         if (vp->v_flag & VISHARDLINK) {
4605                                 get_fse_info(vp, &finfo, ctx);
4606                         } else if (vap) {
4607                                 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4608                         }
4609                         if (truncated_path) {
4610                                 finfo.mode |= FSE_TRUNCATED_PATH;
4611                         }
4612                         add_fsevent(FSE_DELETE, ctx,
4613                                                 FSE_ARG_STRING, len, path,
4614                                                 FSE_ARG_FINFO, &finfo,
4615                                                 FSE_ARG_DONE);
4616                 }
4617 #endif
4618         }
4619
4620 out:
4621         if (path != NULL)
4622                 RELEASE_PATH(path);
4623
4624 #if NAMEDRSRCFORK
4625         /* recycle the deleted rsrc fork vnode to force a reclaim, which
4626          * will cause its shadow file to go away if necessary.
4627          */
4628          if (vp && (vnode_isnamedstream(vp)) &&
4629                 (vp->v_parent != NULLVP) &&
4630                 vnode_isshadow(vp)) {
4631                         vnode_recycle(vp);
4632          }
4633 #endif
4634         /*
4635          * nameidone has to happen before we vnode_put(dvp)
4636          * since it may need to release the fs_nodelock on the dvp
4637          */
4638         nameidone(ndp);
4639         vnode_put(dvp);
4640         if (vp) {
4641                 vnode_put(vp);
4642         }
4643         return (error);
4644 }
4645
4646 int
4647 unlink1(vfs_context_t ctx, struct nameidata *ndp, int unlink_flags)
4648 {
4649         return (unlink1at(ctx, ndp, unlink_flags, AT_FDCWD));
4650 }
4651
4652 /*
4653  * Delete a name from the filesystem using POSIX semantics.
4654  */
4655 static int
4656 unlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
4657     enum uio_seg segflg)
4658 {
4659         struct nameidata nd;
4660
4661         NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, segflg,
4662                path, ctx);
4663         return (unlink1at(ctx, &nd, 0, fd));
4664 }
4665
4666 int
4667 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4668 {
4669         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4670             UIO_USERSPACE));
4671 }
4672
4673 int
4674 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4675 {
4676         if (uap->flag & ~AT_REMOVEDIR)
4677                 return (EINVAL);
4678
4679         if (uap->flag & AT_REMOVEDIR)
4680                 return (rmdirat_internal(vfs_context_current(), uap->fd,
4681                     uap->path, UIO_USERSPACE));
4682         else
4683                 return (unlinkat_internal(vfs_context_current(), uap->fd,
4684                     uap->path, UIO_USERSPACE));
4685 }
4686
4687 /*
4688  * Delete a name from the filesystem using Carbon semantics.
4689  */
4690 int
4691 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4692 {
4693         struct nameidata nd;
4694         vfs_context_t ctx = vfs_context_current();
4695
4696         NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE,
4697                uap->path, ctx);
4698         return unlink1(ctx, &nd, VNODE_REMOVE_NODELETEBUSY);
4699 }
4700
4701 /*
4702  * Reposition read/write file offset.
4703  */
4704 int
4705 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4706 {
4707         struct fileproc *fp;
4708         vnode_t vp;
4709         struct vfs_context *ctx;
4710         off_t offset = uap->offset, file_size;
4711         int error;
4712
4713         if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4714                 if (error == ENOTSUP)
4715                         return (ESPIPE);
4716                 return (error);
4717         }
4718         if (vnode_isfifo(vp)) {
4719                 file_drop(uap->fd);
4720                 return(ESPIPE);
4721         }
4722
4723
4724         ctx = vfs_context_current();
4725 #if CONFIG_MACF
4726         if (uap->whence == L_INCR && uap->offset == 0)
4727                 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4728                     fp->f_fglob);
4729         else
4730                 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4731                     fp->f_fglob);
4732         if (error) {
4733                 file_drop(uap->fd);
4734                 return (error);
4735         }
4736 #endif
4737         if ( (error = vnode_getwithref(vp)) ) {
4738                 file_drop(uap->fd);
4739                 return(error);
4740         }
4741
4742         switch (uap->whence) {
4743         case L_INCR:
4744                 offset += fp->f_fglob->fg_offset;
4745                 break;
4746         case L_XTND:
4747                 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
4748                         break;
4749                 offset += file_size;
4750                 break;
4751         case L_SET:
4752                 break;
4753         default:
4754                 error = EINVAL;
4755         }
4756         if (error == 0) {
4757                 if (uap->offset > 0 && offset < 0) {
4758                         /* Incremented/relative move past max size */
4759                         error = EOVERFLOW;
4760                 } else {
4761                         /*
4762                          * Allow negative offsets on character devices, per
4763                          * POSIX 1003.1-2001.  Most likely for writing disk
4764                          * labels.
4765                          */
4766                         if (offset < 0 && vp->v_type != VCHR) {
4767                                 /* Decremented/relative move before start */
4768                                 error = EINVAL;
4769                         } else {
4770                                 /* Success */
4771                                 fp->f_fglob->fg_offset = offset;
4772                                 *retval = fp->f_fglob->fg_offset;
4773                         }
4774                 }
4775         }
4776
4777         /*
4778          * An lseek can affect whether data is "available to read."  Use
4779          * hint of NOTE_NONE so no EVFILT_VNODE events fire
4780          */
4781         post_event_if_success(vp, error, NOTE_NONE);
4782         (void)vnode_put(vp);
4783         file_drop(uap->fd);
4784         return (error);
4785 }
4786
4787
4788 /*
4789  * Check access permissions.
4790  *
4791  * Returns:     0                       Success
4792  *              vnode_authorize:???
4793  */
4794 static int
4795 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4796 {
4797         kauth_action_t action;
4798         int error;
4799
4800         /*
4801          * If just the regular access bits, convert them to something
4802          * that vnode_authorize will understand.
4803          */
4804         if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4805                 action = 0;
4806                 if (uflags & R_OK)
4807                         action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
4808                 if (uflags & W_OK) {
4809                         if (vnode_isdir(vp)) {
4810                                 action |= KAUTH_VNODE_ADD_FILE |
4811                                     KAUTH_VNODE_ADD_SUBDIRECTORY;
4812                                 /* might want delete rights here too */
4813                         } else {
4814                                 action |= KAUTH_VNODE_WRITE_DATA;
4815                         }
4816                 }
4817                 if (uflags & X_OK) {
4818                         if (vnode_isdir(vp)) {
4819                                 action |= KAUTH_VNODE_SEARCH;
4820                         } else {
4821                                 action |= KAUTH_VNODE_EXECUTE;
4822                         }
4823                 }
4824         } else {
4825                 /* take advantage of definition of uflags */
4826                 action = uflags >> 8;
4827         }
4828
4829 #if CONFIG_MACF
4830         error = mac_vnode_check_access(ctx, vp, uflags);
4831         if (error)
4832                 return (error);
4833 #endif /* MAC */
4834
4835         /* action == 0 means only check for existence */
4836         if (action != 0) {
4837                 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4838         } else {
4839                 error = 0;
4840         }
4841
4842         return(error);
4843 }
4844
4845
4846
4847 /*
4848  * access_extended: Check access permissions in bulk.
4849  *
4850  * Description: uap->entries            Pointer to an array of accessx
4851  *                                      descriptor structs, plus one or
4852  *                                      more NULL terminated strings (see
4853  *                                      "Notes" section below).
4854  *              uap->size               Size of the area pointed to by
4855  *                                      uap->entries.
4856  *              uap->results            Pointer to the results array.
4857  *
4858  * Returns:     0                       Success
4859  *              ENOMEM                  Insufficient memory
4860  *              EINVAL                  Invalid arguments
4861  *              namei:EFAULT            Bad address
4862  *              namei:ENAMETOOLONG      Filename too long
4863  *              namei:ENOENT            No such file or directory
4864  *              namei:ELOOP             Too many levels of symbolic links
4865  *              namei:EBADF             Bad file descriptor
4866  *              namei:ENOTDIR           Not a directory
4867  *              namei:???
4868  *              access1:
4869  *
4870  * Implicit returns:
4871  *              uap->results            Array contents modified
4872  *
4873  * Notes:       The uap->entries are structured as an arbitrary length array
4874  *              of accessx descriptors, followed by one or more NULL terminated
4875  *              strings
4876  *
4877  *                      struct accessx_descriptor[0]
4878  *                      ...
4879  *                      struct accessx_descriptor[n]
4880  *                      char name_data[0];
4881  *
4882  *              We determine the entry count by walking the buffer containing
4883  *              the uap->entries argument descriptor.  For each descriptor we
4884  *              see, the valid values for the offset ad_name_offset will be
4885  *              in the byte range:
4886  *
4887  *                      [ uap->entries + sizeof(struct accessx_descriptor) ]
4888  *                                              to
4889  *                              [ uap->entries + uap->size - 2 ]
4890  *
4891  *              since we must have at least one string, and the string must
4892  *              be at least one character plus the NULL terminator in length.
4893  *
4894  * XXX:         Need to support the check-as uid argument
4895  */
4896 int
4897 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
4898 {
4899         struct accessx_descriptor *input = NULL;
4900         errno_t *result = NULL;
4901         errno_t error = 0;
4902         int wantdelete = 0;
4903         unsigned int desc_max, desc_actual, i, j;
4904         struct vfs_context context;
4905         struct nameidata nd;
4906         int niopts;
4907         vnode_t vp = NULL;
4908         vnode_t dvp = NULL;
4909 #define ACCESSX_MAX_DESCR_ON_STACK 10
4910         struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
4911
4912         context.vc_ucred = NULL;
4913
4914         /*
4915          * Validate parameters; if valid, copy the descriptor array and string
4916          * arguments into local memory.  Before proceeding, the following
4917          * conditions must have been met:
4918          *
4919          * o    The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
4920          * o    There must be sufficient room in the request for at least one
4921          *      descriptor and a one yte NUL terminated string.
4922          * o    The allocation of local storage must not fail.
4923          */
4924         if (uap->size > ACCESSX_MAX_TABLESIZE)
4925                 return(ENOMEM);
4926         if (uap->size < (sizeof(struct accessx_descriptor) + 2))
4927                 return(EINVAL);
4928         if (uap->size <= sizeof (stack_input)) {
4929                 input = stack_input;
4930         } else {
4931         MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
4932         if (input == NULL) {
4933                 error = ENOMEM;
4934                 goto out;
4935         }
4936         }
4937         error = copyin(uap->entries, input, uap->size);
4938         if (error)
4939                 goto out;
4940
4941         AUDIT_ARG(opaque, input, uap->size);
4942
4943         /*
4944          * Force NUL termination of the copyin buffer to avoid nami() running
4945          * off the end.  If the caller passes us bogus data, they may get a
4946          * bogus result.
4947          */
4948         ((char *)input)[uap->size - 1] = 0;
4949
4950         /*
4951          * Access is defined as checking against the process' real identity,
4952          * even if operations are checking the effective identity.  This
4953          * requires that we use a local vfs context.
4954          */
4955         context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
4956         context.vc_thread = current_thread();
4957
4958         /*
4959          * Find out how many entries we have, so we can allocate the result
4960          * array by walking the list and adjusting the count downward by the
4961          * earliest string offset we see.
4962          */
4963         desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
4964         desc_actual = desc_max;
4965         for (i = 0; i < desc_actual; i++) {
4966                 /*
4967                  * Take the offset to the name string for this entry and
4968                  * convert to an input array index, which would be one off
4969                  * the end of the array if this entry was the lowest-addressed
4970                  * name string.
4971                  */
4972                 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
4973
4974                 /*
4975                  * An offset greater than the max allowable offset is an error.
4976                  * It is also an error for any valid entry to point
4977                  * to a location prior to the end of the current entry, if
4978                  * it's not a reference to the string of the previous entry.
4979                  */
4980                 if (j > desc_max || (j != 0 && j <= i)) {
4981                         error = EINVAL;
4982                         goto out;
4983                 }
4984
4985                 /*
4986                  * An offset of 0 means use the previous descriptor's offset;
4987                  * this is used to chain multiple requests for the same file
4988                  * to avoid multiple lookups.
4989                  */
4990                 if (j == 0) {
4991                         /* This is not valid for the first entry */
4992                         if (i == 0) {
4993                                 error = EINVAL;
4994                                 goto out;
4995                         }
4996                         continue;
4997                 }
4998
4999                 /*
5000                  * If the offset of the string for this descriptor is before
5001                  * what we believe is the current actual last descriptor,
5002                  * then we need to adjust our estimate downward; this permits
5003                  * the string table following the last descriptor to be out
5004                  * of order relative to the descriptor list.
5005                  */
5006                 if (j < desc_actual)
5007                         desc_actual = j;
5008         }
5009
5010         /*
5011          * We limit the actual number of descriptors we are willing to process
5012          * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
5013          * requested does not exceed this limit,
5014          */
5015         if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5016                 error = ENOMEM;
5017                 goto out;
5018         }
5019         MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
5020         if (result == NULL) {
5021                 error = ENOMEM;
5022                 goto out;
5023         }
5024
5025         /*
5026          * Do the work by iterating over the descriptor entries we know to
5027          * at least appear to contain valid data.
5028          */
5029         error = 0;
5030         for (i = 0; i < desc_actual; i++) {
5031                 /*
5032                  * If the ad_name_offset is 0, then we use the previous
5033                  * results to make the check; otherwise, we are looking up
5034                  * a new file name.
5035                  */
5036                 if (input[i].ad_name_offset != 0) {
5037                         /* discard old vnodes */
5038                         if (vp) {
5039                                 vnode_put(vp);
5040                                 vp = NULL;
5041                         }
5042                         if (dvp) {
5043                                 vnode_put(dvp);
5044                                 dvp = NULL;
5045                         }
5046
5047                         /*
5048                          * Scan forward in the descriptor list to see if we
5049                          * need the parent vnode.  We will need it if we are
5050                          * deleting, since we must have rights  to remove
5051                          * entries in the parent directory, as well as the
5052                          * rights to delete the object itself.
5053                          */
5054                         wantdelete = input[i].ad_flags & _DELETE_OK;
5055                         for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5056                                 if (input[j].ad_flags & _DELETE_OK)
5057                                         wantdelete = 1;
5058
5059                         niopts = FOLLOW | AUDITVNPATH1;
5060
5061                         /* need parent for vnode_authorize for deletion test */
5062                         if (wantdelete)
5063                                 niopts |= WANTPARENT;
5064
5065                         /* do the lookup */
5066                         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5067                                CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5068                                &context);
5069                         error = namei(&nd);
5070                         if (!error) {
5071                                 vp = nd.ni_vp;
5072                                 if (wantdelete)
5073                                         dvp = nd.ni_dvp;
5074                         }
5075                         nameidone(&nd);
5076                 }
5077
5078                 /*
5079                  * Handle lookup errors.
5080                  */
5081                 switch(error) {
5082                 case ENOENT:
5083                 case EACCES:
5084                 case EPERM:
5085                 case ENOTDIR:
5086                         result[i] = error;
5087                         break;
5088                 case 0:
5089                         /* run this access check */
5090                         result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5091                         break;
5092                 default:
5093                         /* fatal lookup error */
5094
5095                         goto out;
5096                 }
5097         }
5098
5099         AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5100
5101         /* copy out results */
5102         error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5103
5104 out:
5105         if (input && input != stack_input)
5106                 FREE(input, M_TEMP);
5107         if (result)
5108                 FREE(result, M_TEMP);
5109         if (vp)
5110                 vnode_put(vp);
5111         if (dvp)
5112                 vnode_put(dvp);
5113         if (IS_VALID_CRED(context.vc_ucred))
5114                 kauth_cred_unref(&context.vc_ucred);
5115         return(error);
5116 }
5117
5118
5119 /*
5120  * Returns:     0                       Success
5121  *              namei:EFAULT            Bad address
5122  *              namei:ENAMETOOLONG      Filename too long
5123  *              namei:ENOENT            No such file or directory
5124  *              namei:ELOOP             Too many levels of symbolic links
5125  *              namei:EBADF             Bad file descriptor
5126  *              namei:ENOTDIR           Not a directory
5127  *              namei:???
5128  *              access1:
5129  */
5130 static int
5131 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5132     int flag, enum uio_seg segflg)
5133 {
5134         int error;
5135         struct nameidata nd;
5136         int niopts;
5137         struct vfs_context context;
5138 #if NAMEDRSRCFORK
5139         int is_namedstream = 0;
5140 #endif
5141
5142         /*
5143          * Unless the AT_EACCESS option is used, Access is defined as checking
5144          * against the process' real identity, even if operations are checking
5145          * the effective identity.  So we need to tweak the credential
5146          * in the context for that case.
5147          */
5148         if (!(flag & AT_EACCESS))
5149                 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5150         else
5151                 context.vc_ucred = ctx->vc_ucred;
5152         context.vc_thread = ctx->vc_thread;
5153
5154
5155         niopts = FOLLOW | AUDITVNPATH1;
5156         /* need parent for vnode_authorize for deletion test */
5157         if (amode & _DELETE_OK)
5158                 niopts |= WANTPARENT;
5159         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5160                path, &context);
5161
5162 #if NAMEDRSRCFORK
5163         /* access(F_OK) calls are allowed for resource forks. */
5164         if (amode == F_OK)
5165                 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5166 #endif
5167         error = nameiat(&nd, fd);
5168         if (error)
5169                 goto out;
5170
5171 #if NAMEDRSRCFORK
5172         /* Grab reference on the shadow stream file vnode to
5173          * force an inactive on release which will mark it
5174          * for recycle.
5175          */
5176         if (vnode_isnamedstream(nd.ni_vp) &&
5177             (nd.ni_vp->v_parent != NULLVP) &&
5178             vnode_isshadow(nd.ni_vp)) {
5179                 is_namedstream = 1;
5180                 vnode_ref(nd.ni_vp);
5181         }
5182 #endif
5183
5184         error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5185
5186 #if NAMEDRSRCFORK
5187         if (is_namedstream) {
5188                 vnode_rele(nd.ni_vp);
5189         }
5190 #endif
5191
5192         vnode_put(nd.ni_vp);
5193         if (amode & _DELETE_OK)
5194                 vnode_put(nd.ni_dvp);
5195         nameidone(&nd);
5196
5197 out:
5198         if (!(flag & AT_EACCESS))
5199                 kauth_cred_unref(&context.vc_ucred);
5200         return (error);
5201 }
5202
5203 int
5204 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5205 {
5206         return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5207             uap->path, uap->flags, 0, UIO_USERSPACE));
5208 }
5209
5210 int
5211 faccessat(__unused proc_t p, struct faccessat_args *uap,
5212           __unused int32_t *retval)
5213 {
5214         if (uap->flag & ~AT_EACCESS)
5215                 return (EINVAL);
5216
5217         return (faccessat_internal(vfs_context_current(), uap->fd,
5218             uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5219 }
5220
5221 /*
5222  * Returns:     0                       Success
5223  *              EFAULT
5224  *      copyout:EFAULT
5225  *      namei:???
5226  *      vn_stat:???
5227  */
5228 static int
5229 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5230     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5231     enum uio_seg segflg, int fd, int flag)
5232 {
5233         struct nameidata nd;
5234         int follow;
5235         union {
5236                 struct stat sb;
5237                 struct stat64 sb64;
5238         } source;
5239         union {
5240                 struct user64_stat user64_sb;
5241                 struct user32_stat user32_sb;
5242                 struct user64_stat64 user64_sb64;
5243                 struct user32_stat64 user32_sb64;
5244         } dest;
5245         caddr_t sbp;
5246         int error, my_size;
5247         kauth_filesec_t fsec;
5248         size_t xsecurity_bufsize;
5249         void * statptr;
5250
5251         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5252         NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5253             segflg, path, ctx);
5254
5255 #if NAMEDRSRCFORK
5256         int is_namedstream = 0;
5257         /* stat calls are allowed for resource forks. */
5258         nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5259 #endif
5260         error = nameiat(&nd, fd);
5261         if (error)
5262                 return (error);
5263         fsec = KAUTH_FILESEC_NONE;
5264
5265         statptr = (void *)&source;
5266
5267 #if NAMEDRSRCFORK
5268         /* Grab reference on the shadow stream file vnode to
5269          * force an inactive on release which will mark it
5270          * for recycle.
5271          */
5272         if (vnode_isnamedstream(nd.ni_vp) &&
5273             (nd.ni_vp->v_parent != NULLVP) &&
5274             vnode_isshadow(nd.ni_vp)) {
5275                 is_namedstream = 1;
5276                 vnode_ref(nd.ni_vp);
5277         }
5278 #endif
5279
5280         error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5281
5282 #if NAMEDRSRCFORK
5283         if (is_namedstream) {
5284                 vnode_rele(nd.ni_vp);
5285         }
5286 #endif
5287         vnode_put(nd.ni_vp);
5288         nameidone(&nd);
5289
5290         if (error)
5291                 return (error);
5292         /* Zap spare fields */
5293         if (isstat64 != 0) {
5294                 source.sb64.st_lspare = 0;
5295                 source.sb64.st_qspare[0] = 0LL;
5296                 source.sb64.st_qspare[1] = 0LL;
5297                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5298                         munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5299                         my_size = sizeof(dest.user64_sb64);
5300                         sbp = (caddr_t)&dest.user64_sb64;
5301                 } else {
5302                         munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5303                         my_size = sizeof(dest.user32_sb64);
5304                         sbp = (caddr_t)&dest.user32_sb64;
5305                 }
5306                 /*
5307                  * Check if we raced (post lookup) against the last unlink of a file.
5308                  */
5309                 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5310                         source.sb64.st_nlink = 1;
5311                 }
5312         } else {
5313                 source.sb.st_lspare = 0;
5314                 source.sb.st_qspare[0] = 0LL;
5315                 source.sb.st_qspare[1] = 0LL;
5316                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5317                         munge_user64_stat(&source.sb, &dest.user64_sb);
5318                         my_size = sizeof(dest.user64_sb);
5319                         sbp = (caddr_t)&dest.user64_sb;
5320                 } else {
5321                         munge_user32_stat(&source.sb, &dest.user32_sb);
5322                         my_size = sizeof(dest.user32_sb);
5323                         sbp = (caddr_t)&dest.user32_sb;
5324                 }
5325
5326                 /*
5327                  * Check if we raced (post lookup) against the last unlink of a file.
5328                  */
5329                 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5330                         source.sb.st_nlink = 1;
5331                 }
5332         }
5333         if ((error = copyout(sbp, ub, my_size)) != 0)
5334                 goto out;
5335
5336         /* caller wants extended security information? */
5337         if (xsecurity != USER_ADDR_NULL) {
5338
5339                 /* did we get any? */
5340                 if (fsec == KAUTH_FILESEC_NONE) {
5341                         if (susize(xsecurity_size, 0) != 0) {
5342                                 error = EFAULT;
5343                                 goto out;
5344                         }
5345                 } else {
5346                         /* find the user buffer size */
5347                         xsecurity_bufsize = fusize(xsecurity_size);
5348
5349                         /* copy out the actual data size */
5350                         if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5351                                 error = EFAULT;
5352                                 goto out;
5353                         }
5354
5355                         /* if the caller supplied enough room, copy out to it */
5356                         if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5357                                 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5358                 }
5359         }
5360 out:
5361         if (fsec != KAUTH_FILESEC_NONE)
5362                 kauth_filesec_free(fsec);
5363         return (error);
5364 }
5365
5366 /*
5367  * stat_extended: Get file status; with extended security (ACL).
5368  *
5369  * Parameters:    p                       (ignored)
5370  *                uap                     User argument descriptor (see below)
5371  *                retval                  (ignored)
5372  *
5373  * Indirect:      uap->path               Path of file to get status from
5374  *                uap->ub                 User buffer (holds file status info)
5375  *                uap->xsecurity          ACL to get (extended security)
5376  *                uap->xsecurity_size     Size of ACL
5377  *
5378  * Returns:        0                      Success
5379  *                !0                      errno value
5380  *
5381  */
5382 int
5383 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5384     __unused int32_t *retval)
5385 {
5386         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5387             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5388             0));
5389 }
5390
5391 /*
5392  * Returns:     0                       Success
5393  *      fstatat_internal:???            [see fstatat_internal() in this file]
5394  */
5395 int
5396 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5397 {
5398         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5399             0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5400 }
5401
5402 int
5403 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5404 {
5405         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5406             0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5407 }
5408
5409 /*
5410  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5411  *
5412  * Parameters:    p                       (ignored)
5413  *                uap                     User argument descriptor (see below)
5414  *                retval                  (ignored)
5415  *
5416  * Indirect:      uap->path               Path of file to get status from
5417  *                uap->ub                 User buffer (holds file status info)
5418  *                uap->xsecurity          ACL to get (extended security)
5419  *                uap->xsecurity_size     Size of ACL
5420  *
5421  * Returns:        0                      Success
5422  *                !0                      errno value
5423  *
5424  */
5425 int
5426 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5427 {
5428         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5429             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5430             0));
5431 }
5432
5433 /*
5434  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5435  *
5436  * Parameters:    p                       (ignored)
5437  *                uap                     User argument descriptor (see below)
5438  *                retval                  (ignored)
5439  *
5440  * Indirect:      uap->path               Path of file to get status from
5441  *                uap->ub                 User buffer (holds file status info)
5442  *                uap->xsecurity          ACL to get (extended security)
5443  *                uap->xsecurity_size     Size of ACL
5444  *
5445  * Returns:        0                      Success
5446  *                !0                      errno value
5447  *
5448  */
5449 int
5450 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5451 {
5452         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5453             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5454             AT_SYMLINK_NOFOLLOW));
5455 }
5456
5457 /*
5458  * Get file status; this version does not follow links.
5459  */
5460 int
5461 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5462 {
5463         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5464             0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5465 }
5466
5467 int
5468 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5469 {
5470         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5471             0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5472 }
5473
5474 /*
5475  * lstat64_extended: Get file status; can handle large inode numbers; does not
5476  * follow links; with extended security (ACL).
5477  *
5478  * Parameters:    p                       (ignored)
5479  *                uap                     User argument descriptor (see below)
5480  *                retval                  (ignored)
5481  *
5482  * Indirect:      uap->path               Path of file to get status from
5483  *                uap->ub                 User buffer (holds file status info)
5484  *                uap->xsecurity          ACL to get (extended security)
5485  *                uap->xsecurity_size     Size of ACL
5486  *
5487  * Returns:        0                      Success
5488  *                !0                      errno value
5489  *
5490  */
5491 int
5492 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5493 {
5494         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5495             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5496             AT_SYMLINK_NOFOLLOW));
5497 }
5498
5499 int
5500 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5501 {
5502         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5503                 return (EINVAL);
5504
5505         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5506             0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5507 }
5508
5509 int
5510 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5511     __unused int32_t *retval)
5512 {
5513         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5514                 return (EINVAL);
5515
5516         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5517             0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5518 }
5519
5520 /*
5521  * Get configurable pathname variables.
5522  *
5523  * Returns:     0                       Success
5524  *      namei:???
5525  *      vn_pathconf:???
5526  *
5527  * Notes:       Global implementation  constants are intended to be
5528  *              implemented in this function directly; all other constants
5529  *              are per-FS implementation, and therefore must be handled in
5530  *              each respective FS, instead.
5531  *
5532  * XXX We implement some things globally right now that should actually be
5533  * XXX per-FS; we will need to deal with this at some point.
5534  */
5535 /* ARGSUSED */
5536 int
5537 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5538 {
5539         int error;
5540         struct nameidata nd;
5541         vfs_context_t ctx = vfs_context_current();
5542
5543         NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5544                 UIO_USERSPACE, uap->path, ctx);
5545         error = namei(&nd);
5546         if (error)
5547                 return (error);
5548
5549         error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5550
5551         vnode_put(nd.ni_vp);
5552         nameidone(&nd);
5553         return (error);
5554 }
5555
5556 /*
5557  * Return target name of a symbolic link.
5558  */
5559 /* ARGSUSED */
5560 static int
5561 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5562     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5563     int *retval)
5564 {
5565         vnode_t vp;
5566         uio_t auio;
5567         int error;
5568         struct nameidata nd;
5569         char uio_buf[ UIO_SIZEOF(1) ];
5570
5571         NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5572             seg, path, ctx);
5573
5574         error = nameiat(&nd, fd);
5575         if (error)
5576                 return (error);
5577         vp = nd.ni_vp;
5578
5579         nameidone(&nd);
5580
5581         auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5582                                     &uio_buf[0], sizeof(uio_buf));
5583         uio_addiov(auio, buf, bufsize);
5584         if (vp->v_type != VLNK) {
5585                 error = EINVAL;
5586         } else {
5587 #if CONFIG_MACF
5588                 error = mac_vnode_check_readlink(ctx, vp);
5589 #endif
5590                 if (error == 0)
5591                         error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5592                                                 ctx);
5593                 if (error == 0)
5594                         error = VNOP_READLINK(vp, auio, ctx);
5595         }
5596         vnode_put(vp);
5597
5598         *retval = bufsize - (int)uio_resid(auio);
5599         return (error);
5600 }
5601
5602 int
5603 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5604 {
5605         enum uio_seg procseg;
5606
5607         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5608         return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5609             CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5610             uap->count, procseg, retval));
5611 }
5612
5613 int
5614 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5615 {
5616         enum uio_seg procseg;
5617
5618         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5619         return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5620             procseg, uap->buf, uap->bufsize, procseg, retval));
5621 }
5622
5623 /*
5624  * Change file flags.
5625  */
5626 static int
5627 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5628 {
5629         struct vnode_attr va;
5630         kauth_action_t action;
5631         int error;
5632
5633         VATTR_INIT(&va);
5634         VATTR_SET(&va, va_flags, flags);
5635
5636 #if CONFIG_MACF
5637         error = mac_vnode_check_setflags(ctx, vp, flags);
5638         if (error)
5639                 goto out;
5640 #endif
5641
5642         /* request authorisation, disregard immutability */
5643         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5644                 goto out;
5645         /*
5646          * Request that the auth layer disregard those file flags it's allowed to when
5647          * authorizing this operation; we need to do this in order to be able to
5648          * clear immutable flags.
5649          */
5650         if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5651                 goto out;
5652         error = vnode_setattr(vp, &va, ctx);
5653
5654         if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5655                 error = ENOTSUP;
5656         }
5657 out:
5658         vnode_put(vp);
5659         return(error);
5660 }
5661
5662 /*
5663  * Change flags of a file given a path name.
5664  */
5665 /* ARGSUSED */
5666 int
5667 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5668 {
5669         vnode_t vp;
5670         vfs_context_t ctx = vfs_context_current();
5671         int error;
5672         struct nameidata nd;
5673
5674         AUDIT_ARG(fflags, uap->flags);
5675         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5676                 UIO_USERSPACE, uap->path, ctx);
5677         error = namei(&nd);
5678         if (error)
5679                 return (error);
5680         vp = nd.ni_vp;
5681         nameidone(&nd);
5682
5683         error = chflags1(vp, uap->flags, ctx);
5684
5685         return(error);
5686 }
5687
5688 /*
5689  * Change flags of a file given a file descriptor.
5690  */
5691 /* ARGSUSED */
5692 int
5693 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5694 {
5695         vnode_t vp;
5696         int error;
5697
5698         AUDIT_ARG(fd, uap->fd);
5699         AUDIT_ARG(fflags, uap->flags);
5700         if ( (error = file_vnode(uap->fd, &vp)) )
5701                 return (error);
5702
5703         if ((error = vnode_getwithref(vp))) {
5704                 file_drop(uap->fd);
5705                 return(error);
5706         }
5707
5708         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5709
5710         error = chflags1(vp, uap->flags, vfs_context_current());
5711
5712         file_drop(uap->fd);
5713         return (error);
5714 }
5715
5716 /*
5717  * Change security information on a filesystem object.
5718  *
5719  * Returns:     0                       Success
5720  *              EPERM                   Operation not permitted
5721  *              vnode_authattr:???      [anything vnode_authattr can return]
5722  *              vnode_authorize:???     [anything vnode_authorize can return]
5723  *              vnode_setattr:???       [anything vnode_setattr can return]
5724  *
5725  * Notes:       If vnode_authattr or vnode_authorize return EACCES, it will be
5726  *              translated to EPERM before being returned.
5727  */
5728 static int
5729 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5730 {
5731         kauth_action_t action;
5732         int error;
5733
5734         AUDIT_ARG(mode, vap->va_mode);
5735         /* XXX audit new args */
5736
5737 #if NAMEDSTREAMS
5738         /* chmod calls are not allowed for resource forks. */
5739         if (vp->v_flag & VISNAMEDSTREAM) {
5740                 return (EPERM);
5741         }
5742 #endif
5743
5744 #if CONFIG_MACF
5745         if (VATTR_IS_ACTIVE(vap, va_mode) &&
5746             (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5747                 return (error);
5748 #endif
5749
5750         /* make sure that the caller is allowed to set this security information */
5751         if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5752             ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5753                 if (error == EACCES)
5754                         error = EPERM;
5755                 return(error);
5756         }
5757
5758         error = vnode_setattr(vp, vap, ctx);
5759
5760         return (error);
5761 }
5762
5763
5764 /*
5765  * Change mode of a file given a path name.
5766  *
5767  * Returns:     0                       Success
5768  *              namei:???               [anything namei can return]
5769  *              chmod_vnode:???         [anything chmod_vnode can return]
5770  */
5771 static int
5772 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5773     int fd, int flag, enum uio_seg segflg)
5774 {
5775         struct nameidata nd;
5776         int follow, error;
5777
5778         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5779         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5780             segflg, path, ctx);
5781         if ((error = nameiat(&nd, fd)))
5782                 return (error);
5783         error = chmod_vnode(ctx, nd.ni_vp, vap);
5784         vnode_put(nd.ni_vp);
5785         nameidone(&nd);
5786         return(error);
5787 }
5788
5789 /*
5790  * chmod_extended: Change the mode of a file given a path name; with extended
5791  * argument list (including extended security (ACL)).
5792  *
5793  * Parameters:  p                       Process requesting the open
5794  *              uap                     User argument descriptor (see below)
5795  *              retval                  (ignored)
5796  *
5797  * Indirect:    uap->path               Path to object (same as 'chmod')
5798  *              uap->uid                UID to set
5799  *              uap->gid                GID to set
5800  *              uap->mode               File mode to set (same as 'chmod')
5801  *              uap->xsecurity          ACL to set (or delete)
5802  *
5803  * Returns:     0                       Success
5804  *              !0                      errno value
5805  *
5806  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
5807  *
5808  * XXX:         We should enummerate the possible errno values here, and where
5809  *              in the code they originated.
5810  */
5811 int
5812 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
5813 {
5814         int error;
5815         struct vnode_attr va;
5816         kauth_filesec_t xsecdst;
5817
5818         AUDIT_ARG(owner, uap->uid, uap->gid);
5819
5820         VATTR_INIT(&va);
5821         if (uap->mode != -1)
5822                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5823         if (uap->uid != KAUTH_UID_NONE)
5824                 VATTR_SET(&va, va_uid, uap->uid);
5825         if (uap->gid != KAUTH_GID_NONE)
5826                 VATTR_SET(&va, va_gid, uap->gid);
5827
5828         xsecdst = NULL;
5829         switch(uap->xsecurity) {
5830                 /* explicit remove request */
5831         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
5832                 VATTR_SET(&va, va_acl, NULL);
5833                 break;
5834                 /* not being set */
5835         case USER_ADDR_NULL:
5836                 break;
5837         default:
5838                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5839                         return(error);
5840                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5841                 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
5842         }
5843
5844         error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
5845             UIO_USERSPACE);
5846
5847         if (xsecdst != NULL)
5848                 kauth_filesec_free(xsecdst);
5849         return(error);
5850 }
5851
5852 /*
5853  * Returns:     0                       Success
5854  *              chmodat:???             [anything chmodat can return]
5855  */
5856 static int
5857 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
5858     int flag, enum uio_seg segflg)
5859 {
5860         struct vnode_attr va;
5861
5862         VATTR_INIT(&va);
5863         VATTR_SET(&va, va_mode, mode & ALLPERMS);
5864
5865         return (chmodat(ctx, path, &va, fd, flag, segflg));
5866 }
5867
5868 int
5869 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
5870 {
5871         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5872             AT_FDCWD, 0, UIO_USERSPACE));
5873 }
5874
5875 int
5876 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
5877 {
5878         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5879                 return (EINVAL);
5880
5881         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5882             uap->fd, uap->flag, UIO_USERSPACE));
5883 }
5884
5885 /*
5886  * Change mode of a file given a file descriptor.
5887  */
5888 static int
5889 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
5890 {
5891         vnode_t vp;
5892         int error;
5893
5894         AUDIT_ARG(fd, fd);
5895
5896         if ((error = file_vnode(fd, &vp)) != 0)
5897                 return (error);
5898         if ((error = vnode_getwithref(vp)) != 0) {
5899                 file_drop(fd);
5900                 return(error);
5901         }
5902         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5903
5904         error = chmod_vnode(vfs_context_current(), vp, vap);
5905         (void)vnode_put(vp);
5906         file_drop(fd);
5907
5908         return (error);
5909 }
5910
5911 /*
5912  * fchmod_extended: Change mode of a file given a file descriptor; with
5913  * extended argument list (including extended security (ACL)).
5914  *
5915  * Parameters:    p                       Process requesting to change file mode
5916  *                uap                     User argument descriptor (see below)
5917  *                retval                  (ignored)
5918  *
5919  * Indirect:      uap->mode               File mode to set (same as 'chmod')
5920  *                uap->uid                UID to set
5921  *                uap->gid                GID to set
5922  *                uap->xsecurity          ACL to set (or delete)
5923  *                uap->fd                 File descriptor of file to change mode
5924  *
5925  * Returns:        0                      Success
5926  *                !0                      errno value
5927  *
5928  */
5929 int
5930 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
5931 {
5932         int error;
5933         struct vnode_attr va;
5934         kauth_filesec_t xsecdst;
5935
5936         AUDIT_ARG(owner, uap->uid, uap->gid);
5937
5938         VATTR_INIT(&va);
5939         if (uap->mode != -1)
5940                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5941         if (uap->uid != KAUTH_UID_NONE)
5942                 VATTR_SET(&va, va_uid, uap->uid);
5943         if (uap->gid != KAUTH_GID_NONE)
5944                 VATTR_SET(&va, va_gid, uap->gid);
5945
5946         xsecdst = NULL;
5947         switch(uap->xsecurity) {
5948         case USER_ADDR_NULL:
5949                 VATTR_SET(&va, va_acl, NULL);
5950                 break;
5951         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
5952                 VATTR_SET(&va, va_acl, NULL);
5953                 break;
5954                 /* not being set */
5955         case CAST_USER_ADDR_T(-1):
5956                 break;
5957         default:
5958                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5959                         return(error);
5960                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5961         }
5962
5963         error = fchmod1(p, uap->fd, &va);
5964
5965
5966         switch(uap->xsecurity) {
5967         case USER_ADDR_NULL:
5968         case CAST_USER_ADDR_T(-1):
5969                 break;
5970         default:
5971                 if (xsecdst != NULL)
5972                         kauth_filesec_free(xsecdst);
5973         }
5974         return(error);
5975 }
5976
5977 int
5978 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
5979 {
5980         struct vnode_attr va;
5981
5982         VATTR_INIT(&va);
5983         VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5984
5985         return(fchmod1(p, uap->fd, &va));
5986 }
5987
5988
5989 /*
5990  * Set ownership given a path name.
5991  */
5992 /* ARGSUSED */
5993 static int
5994 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
5995    gid_t gid, int flag, enum uio_seg segflg)
5996 {
5997         vnode_t vp;
5998         struct vnode_attr va;
5999         int error;
6000         struct nameidata nd;
6001         int follow;
6002         kauth_action_t action;
6003
6004         AUDIT_ARG(owner, uid, gid);
6005
6006         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6007         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6008             path, ctx);
6009         error = nameiat(&nd, fd);
6010         if (error)
6011                 return (error);
6012         vp = nd.ni_vp;
6013
6014         nameidone(&nd);
6015
6016         VATTR_INIT(&va);
6017         if (uid != (uid_t)VNOVAL)
6018                 VATTR_SET(&va, va_uid, uid);
6019         if (gid != (gid_t)VNOVAL)
6020                 VATTR_SET(&va, va_gid, gid);
6021
6022 #if CONFIG_MACF
6023         error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6024         if (error)
6025                 goto out;
6026 #endif
6027
6028         /* preflight and authorize attribute changes */
6029         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6030                 goto out;
6031         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6032                 goto out;
6033         error = vnode_setattr(vp, &va, ctx);
6034
6035 out:
6036         /*
6037          * EACCES is only allowed from namei(); permissions failure should
6038          * return EPERM, so we need to translate the error code.
6039          */
6040         if (error == EACCES)
6041                 error = EPERM;
6042
6043         vnode_put(vp);
6044         return (error);
6045 }
6046
6047 int
6048 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6049 {
6050         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6051             uap->uid, uap->gid, 0, UIO_USERSPACE));
6052 }
6053
6054 int
6055 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6056 {
6057         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6058             uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6059 }
6060
6061 int
6062 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6063 {
6064         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6065                 return (EINVAL);
6066
6067         return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6068             uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6069 }
6070
6071 /*
6072  * Set ownership given a file descriptor.
6073  */
6074 /* ARGSUSED */
6075 int
6076 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6077 {
6078         struct vnode_attr va;
6079         vfs_context_t ctx = vfs_context_current();
6080         vnode_t vp;
6081         int error;
6082         kauth_action_t action;
6083
6084         AUDIT_ARG(owner, uap->uid, uap->gid);
6085         AUDIT_ARG(fd, uap->fd);
6086
6087         if ( (error = file_vnode(uap->fd, &vp)) )
6088                 return (error);
6089
6090         if ( (error = vnode_getwithref(vp)) ) {
6091                 file_drop(uap->fd);
6092                 return(error);
6093         }
6094         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6095
6096         VATTR_INIT(&va);
6097         if (uap->uid != VNOVAL)
6098                 VATTR_SET(&va, va_uid, uap->uid);
6099         if (uap->gid != VNOVAL)
6100                 VATTR_SET(&va, va_gid, uap->gid);
6101
6102 #if NAMEDSTREAMS
6103         /* chown calls are not allowed for resource forks. */
6104         if (vp->v_flag & VISNAMEDSTREAM) {
6105                 error = EPERM;
6106                 goto out;
6107         }
6108 #endif
6109
6110 #if CONFIG_MACF
6111         error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6112         if (error)
6113                 goto out;
6114 #endif
6115
6116         /* preflight and authorize attribute changes */
6117         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6118                 goto out;
6119         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6120                 if (error == EACCES)
6121                         error = EPERM;
6122                 goto out;
6123         }
6124         error = vnode_setattr(vp, &va, ctx);
6125
6126 out:
6127         (void)vnode_put(vp);
6128         file_drop(uap->fd);
6129         return (error);
6130 }
6131
6132 static int
6133 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6134 {
6135         int error;
6136
6137         if (usrtvp == USER_ADDR_NULL) {
6138                 struct timeval old_tv;
6139                 /* XXX Y2038 bug because of microtime argument */
6140                 microtime(&old_tv);
6141                 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6142                 tsp[1] = tsp[0];
6143         } else {
6144                 if (IS_64BIT_PROCESS(current_proc())) {
6145                         struct user64_timeval tv[2];
6146                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6147                         if (error)
6148                                 return (error);
6149                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6150                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6151                 } else {
6152                         struct user32_timeval tv[2];
6153                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6154                         if (error)
6155                                 return (error);
6156                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6157                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6158                 }
6159         }
6160         return 0;
6161 }
6162
6163 static int
6164 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6165         int nullflag)
6166 {
6167         int error;
6168         struct vnode_attr va;
6169         kauth_action_t action;
6170
6171         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6172
6173         VATTR_INIT(&va);
6174         VATTR_SET(&va, va_access_time, ts[0]);
6175         VATTR_SET(&va, va_modify_time, ts[1]);
6176         if (nullflag)
6177                 va.va_vaflags |= VA_UTIMES_NULL;
6178
6179 #if NAMEDSTREAMS
6180         /* utimes calls are not allowed for resource forks. */
6181         if (vp->v_flag & VISNAMEDSTREAM) {
6182                 error = EPERM;
6183                 goto out;
6184         }
6185 #endif
6186
6187 #if CONFIG_MACF
6188         error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6189         if (error)
6190                 goto out;
6191 #endif
6192         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6193                 if (!nullflag && error == EACCES)
6194                         error = EPERM;
6195                 goto out;
6196         }
6197
6198         /* since we may not need to auth anything, check here */
6199         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6200                 if (!nullflag && error == EACCES)
6201                         error = EPERM;
6202                 goto out;
6203         }
6204         error = vnode_setattr(vp, &va, ctx);
6205
6206 out:
6207         return error;
6208 }
6209
6210 /*
6211  * Set the access and modification times of a file.
6212  */
6213 /* ARGSUSED */
6214 int
6215 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6216 {
6217         struct timespec ts[2];
6218         user_addr_t usrtvp;
6219         int error;
6220         struct nameidata nd;
6221         vfs_context_t ctx = vfs_context_current();
6222
6223         /*
6224          * AUDIT: Needed to change the order of operations to do the
6225          * name lookup first because auditing wants the path.
6226          */
6227         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6228                 UIO_USERSPACE, uap->path, ctx);
6229         error = namei(&nd);
6230         if (error)
6231                 return (error);
6232         nameidone(&nd);
6233
6234         /*
6235          * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
6236          * the current time instead.
6237          */
6238         usrtvp = uap->tptr;
6239         if ((error = getutimes(usrtvp, ts)) != 0)
6240                 goto out;
6241
6242         error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6243
6244 out:
6245         vnode_put(nd.ni_vp);
6246         return (error);
6247 }
6248
6249 /*
6250  * Set the access and modification times of a file.
6251  */
6252 /* ARGSUSED */
6253 int
6254 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6255 {
6256         struct timespec ts[2];
6257         vnode_t vp;
6258         user_addr_t usrtvp;
6259         int error;
6260
6261         AUDIT_ARG(fd, uap->fd);
6262         usrtvp = uap->tptr;
6263         if ((error = getutimes(usrtvp, ts)) != 0)
6264                 return (error);
6265         if ((error = file_vnode(uap->fd, &vp)) != 0)
6266                 return (error);
6267         if((error = vnode_getwithref(vp))) {
6268                 file_drop(uap->fd);
6269                 return(error);
6270         }
6271
6272         error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6273         vnode_put(vp);
6274         file_drop(uap->fd);
6275         return(error);
6276 }
6277
6278 /*
6279  * Truncate a file given its path name.
6280  */
6281 /* ARGSUSED */
6282 int
6283 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6284 {
6285         vnode_t vp;
6286         struct vnode_attr va;
6287         vfs_context_t ctx = vfs_context_current();
6288         int error;
6289         struct nameidata nd;
6290         kauth_action_t action;
6291
6292         if (uap->length < 0)
6293                 return(EINVAL);
6294         NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6295                 UIO_USERSPACE, uap->path, ctx);
6296         if ((error = namei(&nd)))
6297                 return (error);
6298         vp = nd.ni_vp;
6299
6300         nameidone(&nd);
6301
6302         VATTR_INIT(&va);
6303         VATTR_SET(&va, va_data_size, uap->length);
6304
6305 #if CONFIG_MACF
6306         error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6307         if (error)
6308                 goto out;
6309 #endif
6310
6311         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6312                 goto out;
6313         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6314                 goto out;
6315         error = vnode_setattr(vp, &va, ctx);
6316 out:
6317         vnode_put(vp);
6318         return (error);
6319 }
6320
6321 /*
6322  * Truncate a file given a file descriptor.
6323  */
6324 /* ARGSUSED */
6325 int
6326 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6327 {
6328         vfs_context_t ctx = vfs_context_current();
6329         struct vnode_attr va;
6330         vnode_t vp;
6331         struct fileproc *fp;
6332         int error ;
6333         int fd = uap->fd;
6334
6335         AUDIT_ARG(fd, uap->fd);
6336         if (uap->length < 0)
6337                 return(EINVAL);
6338
6339         if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6340                 return(error);
6341         }
6342
6343         switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6344         case DTYPE_PSXSHM:
6345                 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6346                 goto out;
6347         case DTYPE_VNODE:
6348                 break;
6349         default:
6350                 error = EINVAL;
6351                 goto out;
6352         }
6353
6354         vp = (vnode_t)fp->f_fglob->fg_data;
6355
6356         if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6357                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6358                 error = EINVAL;
6359                 goto out;
6360         }
6361
6362         if ((error = vnode_getwithref(vp)) != 0) {
6363                 goto out;
6364         }
6365
6366         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6367
6368 #if CONFIG_MACF
6369         error = mac_vnode_check_truncate(ctx,
6370             fp->f_fglob->fg_cred, vp);
6371         if (error) {
6372                 (void)vnode_put(vp);
6373                 goto out;
6374         }
6375 #endif
6376         VATTR_INIT(&va);
6377         VATTR_SET(&va, va_data_size, uap->length);
6378         error = vnode_setattr(vp, &va, ctx);
6379         (void)vnode_put(vp);
6380 out:
6381         file_drop(fd);
6382         return (error);
6383 }
6384
6385
6386 /*
6387  * Sync an open file with synchronized I/O _file_ integrity completion
6388  */
6389 /* ARGSUSED */
6390 int
6391 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6392 {
6393         __pthread_testcancel(1);
6394         return(fsync_common(p, uap, MNT_WAIT));
6395 }
6396
6397
6398 /*
6399  * Sync an open file with synchronized I/O _file_ integrity completion
6400  *
6401  * Notes:       This is a legacy support function that does not test for
6402  *              thread cancellation points.
6403  */
6404 /* ARGSUSED */
6405 int
6406 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6407 {
6408         return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6409 }
6410
6411
6412 /*
6413  * Sync an open file with synchronized I/O _data_ integrity completion
6414  */
6415 /* ARGSUSED */
6416 int
6417 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6418 {
6419         __pthread_testcancel(1);
6420         return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6421 }
6422
6423
6424 /*
6425  * fsync_common
6426  *
6427  * Common fsync code to support both synchronized I/O file integrity completion
6428  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6429  *
6430  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6431  * will only guarantee that the file data contents are retrievable.  If
6432  * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
6433  * includes additional metadata unnecessary for retrieving the file data
6434  * contents, such as atime, mtime, ctime, etc., also be committed to stable
6435  * storage.
6436  *
6437  * Parameters:  p                               The process
6438  *              uap->fd                         The descriptor to synchronize
6439  *              flags                           The data integrity flags
6440  *
6441  * Returns:     int                             Success
6442  *      fp_getfvp:EBADF                         Bad file descriptor
6443  *      fp_getfvp:ENOTSUP                       fd does not refer to a vnode
6444  *      VNOP_FSYNC:???                          unspecified
6445  *
6446  * Notes:       We use struct fsync_args because it is a short name, and all
6447  *              caller argument structures are otherwise identical.
6448  */
6449 static int
6450 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6451 {
6452         vnode_t vp;
6453         struct fileproc *fp;
6454         vfs_context_t ctx = vfs_context_current();
6455         int error;
6456
6457         AUDIT_ARG(fd, uap->fd);
6458
6459         if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6460                 return (error);
6461         if ( (error = vnode_getwithref(vp)) ) {
6462                 file_drop(uap->fd);
6463                 return(error);
6464         }
6465
6466         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6467
6468         error = VNOP_FSYNC(vp, flags, ctx);
6469
6470 #if NAMEDRSRCFORK
6471         /* Sync resource fork shadow file if necessary. */
6472         if ((error == 0) &&
6473             (vp->v_flag & VISNAMEDSTREAM) &&
6474             (vp->v_parent != NULLVP) &&
6475             vnode_isshadow(vp) &&
6476             (fp->f_flags & FP_WRITTEN)) {
6477                 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6478         }
6479 #endif
6480
6481         (void)vnode_put(vp);
6482         file_drop(uap->fd);
6483         return (error);
6484 }
6485
6486 /*
6487  * Duplicate files.  Source must be a file, target must be a file or
6488  * must not exist.
6489  *
6490  * XXX Copyfile authorisation checking is woefully inadequate, and will not
6491  *     perform inheritance correctly.
6492  */
6493 /* ARGSUSED */
6494 int
6495 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6496 {
6497         vnode_t tvp, fvp, tdvp, sdvp;
6498         struct nameidata fromnd, tond;
6499         int error;
6500         vfs_context_t ctx = vfs_context_current();
6501
6502         /* Check that the flags are valid. */
6503
6504         if (uap->flags & ~CPF_MASK) {
6505                 return(EINVAL);
6506         }
6507
6508         NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1,
6509                 UIO_USERSPACE, uap->from, ctx);
6510         if ((error = namei(&fromnd)))
6511                 return (error);
6512         fvp = fromnd.ni_vp;
6513
6514         NDINIT(&tond, CREATE, OP_LINK,
6515                LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6516                UIO_USERSPACE, uap->to, ctx);
6517         if ((error = namei(&tond))) {
6518                 goto out1;
6519         }
6520         tdvp = tond.ni_dvp;
6521         tvp = tond.ni_vp;
6522
6523         if (tvp != NULL) {
6524                 if (!(uap->flags & CPF_OVERWRITE)) {
6525                         error = EEXIST;
6526                         goto out;
6527                 }
6528         }
6529         if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6530                 error = EISDIR;
6531                 goto out;
6532         }
6533
6534         if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6535                 goto out;
6536
6537         if (fvp == tdvp)
6538                 error = EINVAL;
6539         /*
6540          * If source is the same as the destination (that is the
6541          * same inode number) then there is nothing to do.
6542          * (fixed to have POSIX semantics - CSM 3/2/98)
6543          */
6544         if (fvp == tvp)
6545                 error = -1;
6546         if (!error)
6547                 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6548 out:
6549         sdvp = tond.ni_startdir;
6550         /*
6551          * nameidone has to happen before we vnode_put(tdvp)
6552          * since it may need to release the fs_nodelock on the tdvp
6553          */
6554         nameidone(&tond);
6555
6556         if (tvp)
6557                 vnode_put(tvp);
6558         vnode_put(tdvp);
6559         vnode_put(sdvp);
6560 out1:
6561         vnode_put(fvp);
6562
6563         if (fromnd.ni_startdir)
6564                 vnode_put(fromnd.ni_startdir);
6565         nameidone(&fromnd);
6566
6567         if (error == -1)
6568                 return (0);
6569         return (error);
6570 }
6571
6572
6573 /*
6574  * Rename files.  Source and destination must either both be directories,
6575  * or both not be directories.  If target is a directory, it must be empty.
6576  */
6577 /* ARGSUSED */
6578 static int
6579 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
6580     int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
6581 {
6582         vnode_t tvp, tdvp;
6583         vnode_t fvp, fdvp;
6584         struct nameidata *fromnd, *tond;
6585         int error;
6586         int do_retry;
6587         int mntrename;
6588         int need_event;
6589         const char *oname = NULL;
6590         char *from_name = NULL, *to_name = NULL;
6591         int from_len=0, to_len=0;
6592         int holding_mntlock;
6593         mount_t locked_mp = NULL;
6594         vnode_t oparent = NULLVP;
6595 #if CONFIG_FSE
6596         fse_info from_finfo, to_finfo;
6597 #endif
6598         int from_truncated=0, to_truncated;
6599         int batched = 0;
6600         struct vnode_attr *fvap, *tvap;
6601         int continuing = 0;
6602         /* carving out a chunk for structs that are too big to be on stack. */
6603         struct {
6604                 struct nameidata from_node, to_node;
6605                 struct vnode_attr fv_attr, tv_attr;
6606         } * __rename_data;
6607         MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
6608         fromnd = &__rename_data->from_node;
6609         tond = &__rename_data->to_node;
6610
6611         holding_mntlock = 0;
6612         do_retry = 0;
6613 retry:
6614         fvp = tvp = NULL;
6615         fdvp = tdvp = NULL;
6616         fvap = tvap = NULL;
6617         mntrename = FALSE;
6618
6619         NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
6620             segflg, from, ctx);
6621         fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
6622
6623         NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6624             segflg, to, ctx);
6625         tond->ni_flag = NAMEI_COMPOUNDRENAME;
6626
6627 continue_lookup:
6628         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6629                 if ( (error = nameiat(fromnd, fromfd)) )
6630                         goto out1;
6631                 fdvp = fromnd->ni_dvp;
6632                 fvp  = fromnd->ni_vp;
6633
6634                 if (fvp && fvp->v_type == VDIR)
6635                         tond->ni_cnd.cn_flags |= WILLBEDIR;
6636         }
6637
6638         if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6639                 if ( (error = nameiat(tond, tofd)) ) {
6640                         /*
6641                          * Translate error code for rename("dir1", "dir2/.").
6642                          */
6643                         if (error == EISDIR && fvp->v_type == VDIR)
6644                                 error = EINVAL;
6645                         goto out1;
6646                 }
6647                 tdvp = tond->ni_dvp;
6648                 tvp  = tond->ni_vp;
6649         }
6650
6651         batched = vnode_compound_rename_available(fdvp);
6652         if (!fvp) {
6653                 /*
6654                  * Claim: this check will never reject a valid rename.
6655                  * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
6656                  * Suppose fdvp and tdvp are not on the same mount.
6657                  * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
6658                  *      then you can't move it to within another dir on the same mountpoint.
6659                  * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
6660                  *
6661                  * If this check passes, then we are safe to pass these vnodes to the same FS.
6662                  */
6663                 if (fdvp->v_mount != tdvp->v_mount) {
6664                         error = EXDEV;
6665                         goto out1;
6666                 }
6667                 goto skipped_lookup;
6668         }
6669
6670         if (!batched) {
6671                 error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL);
6672                 if (error) {
6673                         if (error == ENOENT) {
6674                                 /*
6675                                  * We encountered a race where after doing the namei, tvp stops
6676                                  * being valid. If so, simply re-drive the rename call from the
6677                                  * top.
6678                                  */
6679                                 do_retry = 1;
6680                         }
6681                         goto out1;
6682                 }
6683         }
6684
6685         /*
6686          * If the source and destination are the same (i.e. they're
6687          * links to the same vnode) and the target file system is
6688          * case sensitive, then there is nothing to do.
6689          *
6690          * XXX Come back to this.
6691          */
6692         if (fvp == tvp) {
6693                 int pathconf_val;
6694
6695                 /*
6696                  * Note: if _PC_CASE_SENSITIVE selector isn't supported,
6697                  * then assume that this file system is case sensitive.
6698                  */
6699                 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
6700                     pathconf_val != 0) {
6701                         goto out1;
6702                 }
6703         }
6704
6705         /*
6706          * Allow the renaming of mount points.
6707          * - target must not exist
6708          * - target must reside in the same directory as source
6709          * - union mounts cannot be renamed
6710          * - "/" cannot be renamed
6711          *
6712          * XXX Handle this in VFS after a continued lookup (if we missed
6713          * in the cache to start off)
6714          */
6715         if ((fvp->v_flag & VROOT) &&
6716             (fvp->v_type == VDIR) &&
6717             (tvp == NULL)  &&
6718             (fvp->v_mountedhere == NULL)  &&
6719             (fdvp == tdvp)  &&
6720             ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
6721             (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
6722                 vnode_t coveredvp;
6723
6724                 /* switch fvp to the covered vnode */
6725                 coveredvp = fvp->v_mount->mnt_vnodecovered;
6726                 if ( (vnode_getwithref(coveredvp)) ) {
6727                         error = ENOENT;
6728                         goto out1;
6729                 }
6730                 vnode_put(fvp);
6731
6732                 fvp = coveredvp;
6733                 mntrename = TRUE;
6734         }
6735         /*
6736          * Check for cross-device rename.
6737          */
6738         if ((fvp->v_mount != tdvp->v_mount) ||
6739             (tvp && (fvp->v_mount != tvp->v_mount))) {
6740                 error = EXDEV;
6741                 goto out1;
6742         }
6743
6744         /*
6745          * If source is the same as the destination (that is the
6746          * same inode number) then there is nothing to do...
6747          * EXCEPT if the underlying file system supports case
6748          * insensitivity and is case preserving.  In this case
6749          * the file system needs to handle the special case of
6750          * getting the same vnode as target (fvp) and source (tvp).
6751          *
6752          * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
6753          * and _PC_CASE_PRESERVING can have this exception, and they need to
6754          * handle the special case of getting the same vnode as target and
6755          * source.  NOTE: Then the target is unlocked going into vnop_rename,
6756          * so not to cause locking problems. There is a single reference on tvp.
6757          *
6758          * NOTE - that fvp == tvp also occurs if they are hard linked and
6759          * that correct behaviour then is just to return success without doing
6760          * anything.
6761          *
6762          * XXX filesystem should take care of this itself, perhaps...
6763          */
6764         if (fvp == tvp && fdvp == tdvp) {
6765                 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
6766                     !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
6767                           fromnd->ni_cnd.cn_namelen)) {
6768                         goto out1;
6769                 }
6770         }
6771
6772         if (holding_mntlock && fvp->v_mount != locked_mp) {
6773                 /*
6774                  * we're holding a reference and lock
6775                  * on locked_mp, but it no longer matches
6776                  * what we want to do... so drop our hold
6777                  */
6778                 mount_unlock_renames(locked_mp);
6779                 mount_drop(locked_mp, 0);
6780                 holding_mntlock = 0;
6781         }
6782         if (tdvp != fdvp && fvp->v_type == VDIR) {
6783                 /*
6784                  * serialize renames that re-shape
6785                  * the tree... if holding_mntlock is
6786                  * set, then we're ready to go...
6787                  * otherwise we
6788                  * first need to drop the iocounts
6789                  * we picked up, second take the
6790                  * lock to serialize the access,
6791                  * then finally start the lookup
6792                  * process over with the lock held
6793                  */
6794                 if (!holding_mntlock) {
6795                         /*
6796                          * need to grab a reference on
6797                          * the mount point before we
6798                          * drop all the iocounts... once
6799                          * the iocounts are gone, the mount
6800                          * could follow
6801                          */
6802                         locked_mp = fvp->v_mount;
6803                         mount_ref(locked_mp, 0);
6804
6805                         /*
6806                          * nameidone has to happen before we vnode_put(tvp)
6807                          * since it may need to release the fs_nodelock on the tvp
6808                          */
6809                         nameidone(tond);
6810
6811                         if (tvp)
6812                                 vnode_put(tvp);
6813                         vnode_put(tdvp);
6814
6815                         /*
6816                          * nameidone has to happen before we vnode_put(fdvp)
6817                          * since it may need to release the fs_nodelock on the fvp
6818                          */
6819                         nameidone(fromnd);
6820
6821                         vnode_put(fvp);
6822                         vnode_put(fdvp);
6823
6824                         mount_lock_renames(locked_mp);
6825                         holding_mntlock = 1;
6826
6827                         goto retry;
6828                 }
6829         } else {
6830                 /*
6831                  * when we dropped the iocounts to take
6832                  * the lock, we allowed the identity of
6833                  * the various vnodes to change... if they did,
6834                  * we may no longer be dealing with a rename
6835                  * that reshapes the tree... once we're holding
6836                  * the iocounts, the vnodes can't change type
6837                  * so we're free to drop the lock at this point
6838                  * and continue on
6839                  */
6840                 if (holding_mntlock) {
6841                         mount_unlock_renames(locked_mp);
6842                         mount_drop(locked_mp, 0);
6843                         holding_mntlock = 0;
6844                 }
6845         }
6846
6847         // save these off so we can later verify that fvp is the same
6848         oname   = fvp->v_name;
6849         oparent = fvp->v_parent;
6850
6851 skipped_lookup:
6852 #if CONFIG_FSE
6853         need_event = need_fsevent(FSE_RENAME, fdvp);
6854         if (need_event) {
6855                 if (fvp) {
6856                         get_fse_info(fvp, &from_finfo, ctx);
6857                 } else {
6858                         error = vfs_get_notify_attributes(&__rename_data->fv_attr);
6859                         if (error) {
6860                                 goto out1;
6861                         }
6862
6863                         fvap = &__rename_data->fv_attr;
6864                 }
6865
6866                 if (tvp) {
6867                         get_fse_info(tvp, &to_finfo, ctx);
6868                 } else if (batched) {
6869                         error = vfs_get_notify_attributes(&__rename_data->tv_attr);
6870                         if (error) {
6871                                 goto out1;
6872                         }
6873
6874                         tvap = &__rename_data->tv_attr;
6875                 }
6876         }
6877 #else
6878         need_event = 0;
6879 #endif /* CONFIG_FSE */
6880
6881         if (need_event || kauth_authorize_fileop_has_listeners()) {
6882                 if (from_name == NULL) {
6883                         GET_PATH(from_name);
6884                         if (from_name == NULL) {
6885                                 error = ENOMEM;
6886                                 goto out1;
6887                         }
6888                 }
6889
6890                 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
6891
6892                 if (to_name == NULL) {
6893                         GET_PATH(to_name);
6894                         if (to_name == NULL) {
6895                                 error = ENOMEM;
6896                                 goto out1;
6897                         }
6898                 }
6899
6900                 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
6901         }
6902 #if CONFIG_SECLUDED_RENAME
6903         if (flags & VFS_SECLUDE_RENAME) {
6904                 fromnd->ni_cnd.cn_flags |=  CN_SECLUDE_RENAME;
6905         }
6906 #else
6907         #pragma unused(flags)
6908 #endif
6909         error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
6910                             tdvp, &tvp, &tond->ni_cnd, tvap,
6911                             0, ctx);
6912
6913         if (holding_mntlock) {
6914                 /*
6915                  * we can drop our serialization
6916                  * lock now
6917                  */
6918                 mount_unlock_renames(locked_mp);
6919                 mount_drop(locked_mp, 0);
6920                 holding_mntlock = 0;
6921         }
6922         if (error) {
6923                 if (error == EKEEPLOOKING) {
6924                         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6925                                 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6926                                         panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
6927                                 }
6928                         }
6929
6930                         fromnd->ni_vp = fvp;
6931                         tond->ni_vp = tvp;
6932
6933                         goto continue_lookup;
6934                 }
6935
6936                 /*
6937                  * We may encounter a race in the VNOP where the destination didn't
6938                  * exist when we did the namei, but it does by the time we go and
6939                  * try to create the entry. In this case, we should re-drive this rename
6940                  * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
6941                  * but other filesystems susceptible to this race could return it, too.
6942                  */
6943                 if (error == ERECYCLE) {
6944                         do_retry = 1;
6945                 }
6946
6947                 goto out1;
6948         }
6949
6950         /* call out to allow 3rd party notification of rename.
6951          * Ignore result of kauth_authorize_fileop call.
6952          */
6953         kauth_authorize_fileop(vfs_context_ucred(ctx),
6954                         KAUTH_FILEOP_RENAME,
6955                         (uintptr_t)from_name, (uintptr_t)to_name);
6956
6957 #if CONFIG_FSE
6958         if (from_name != NULL && to_name != NULL) {
6959                 if (from_truncated || to_truncated) {
6960                         // set it here since only the from_finfo gets reported up to user space
6961                         from_finfo.mode |= FSE_TRUNCATED_PATH;
6962                 }
6963
6964                 if (tvap && tvp) {
6965                         vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
6966                 }
6967                 if (fvap) {
6968                         vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
6969                 }
6970
6971                 if (tvp) {
6972                         add_fsevent(FSE_RENAME, ctx,
6973                                     FSE_ARG_STRING, from_len, from_name,
6974                                     FSE_ARG_FINFO, &from_finfo,
6975                                     FSE_ARG_STRING, to_len, to_name,
6976                                     FSE_ARG_FINFO, &to_finfo,
6977                                     FSE_ARG_DONE);
6978                 } else {
6979                         add_fsevent(FSE_RENAME, ctx,
6980                                     FSE_ARG_STRING, from_len, from_name,
6981                                     FSE_ARG_FINFO, &from_finfo,
6982                                     FSE_ARG_STRING, to_len, to_name,
6983                                     FSE_ARG_DONE);
6984                 }
6985         }
6986 #endif /* CONFIG_FSE */
6987
6988         /*
6989          * update filesystem's mount point data
6990          */
6991         if (mntrename) {
6992                 char *cp, *pathend, *mpname;
6993                 char * tobuf;
6994                 struct mount *mp;
6995                 int maxlen;
6996                 size_t len = 0;
6997
6998                 mp = fvp->v_mountedhere;
6999
7000                 if (vfs_busy(mp, LK_NOWAIT)) {
7001                         error = EBUSY;
7002                         goto out1;
7003                 }
7004                 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7005
7006                 if (UIO_SEG_IS_USER_SPACE(segflg))
7007                         error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7008                 else
7009                         error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7010                 if (!error) {
7011                         /* find current mount point prefix */
7012                         pathend = &mp->mnt_vfsstat.f_mntonname[0];
7013                         for (cp = pathend; *cp != '\0'; ++cp) {
7014                                 if (*cp == '/')
7015                                         pathend = cp + 1;
7016                         }
7017                         /* find last component of target name */
7018                         for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7019                                 if (*cp == '/')
7020                                         mpname = cp + 1;
7021                         }
7022                         /* append name to prefix */
7023                         maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7024                         bzero(pathend, maxlen);
7025                         strlcpy(pathend, mpname, maxlen);
7026                 }
7027                 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7028
7029                 vfs_unbusy(mp);
7030         }
7031         /*
7032          * fix up name & parent pointers.  note that we first
7033          * check that fvp has the same name/parent pointers it
7034          * had before the rename call... this is a 'weak' check
7035          * at best...
7036          *
7037          * XXX oparent and oname may not be set in the compound vnop case
7038          */
7039         if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7040                 int update_flags;
7041
7042                 update_flags = VNODE_UPDATE_NAME;
7043
7044                 if (fdvp != tdvp)
7045                         update_flags |= VNODE_UPDATE_PARENT;
7046
7047                 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7048         }
7049 out1:
7050         if (to_name != NULL) {
7051                 RELEASE_PATH(to_name);
7052                 to_name = NULL;
7053         }
7054         if (from_name != NULL) {
7055                 RELEASE_PATH(from_name);
7056                 from_name = NULL;
7057         }
7058         if (holding_mntlock) {
7059                 mount_unlock_renames(locked_mp);
7060                 mount_drop(locked_mp, 0);
7061                 holding_mntlock = 0;
7062         }
7063         if (tdvp) {
7064                 /*
7065                  * nameidone has to happen before we vnode_put(tdvp)
7066                  * since it may need to release the fs_nodelock on the tdvp
7067                  */
7068                 nameidone(tond);
7069
7070                 if (tvp)
7071                         vnode_put(tvp);
7072                 vnode_put(tdvp);
7073         }
7074         if (fdvp) {
7075                 /*
7076                  * nameidone has to happen before we vnode_put(fdvp)
7077                  * since it may need to release the fs_nodelock on the fdvp
7078                  */
7079                 nameidone(fromnd);
7080
7081                 if (fvp)
7082                         vnode_put(fvp);
7083                 vnode_put(fdvp);
7084         }
7085
7086         /*
7087          * If things changed after we did the namei, then we will re-drive
7088          * this rename call from the top.
7089          */
7090         if (do_retry) {
7091                 do_retry = 0;
7092                 goto retry;
7093         }
7094
7095         FREE(__rename_data, M_TEMP);
7096         return (error);
7097 }
7098
7099 int
7100 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7101 {
7102         return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7103             AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7104 }
7105
#if CONFIG_SECLUDED_RENAME
/*
 * rename_ext: like rename(), but additionally forwards the caller's
 * uap->flags word to renameat_internal().  Only compiled when
 * CONFIG_SECLUDED_RENAME is set.
 */
int
rename_ext(__unused proc_t p, struct rename_ext_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();

	return (renameat_internal(ctx, AT_FDCWD, uap->from, AT_FDCWD, uap->to,
	    UIO_USERSPACE, uap->flags));
}
#endif /* CONFIG_SECLUDED_RENAME */
7116
7117 int
7118 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7119 {
7120         return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7121             uap->tofd, uap->to, UIO_USERSPACE, 0));
7122 }
7123
7124 /*
7125  * Make a directory file.
7126  *
7127  * Returns:     0                       Success
7128  *              EEXIST
7129  *      namei:???
7130  *      vnode_authorize:???
7131  *      vn_create:???
7132  */
7133 /* ARGSUSED */
7134 static int
7135 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7136     enum uio_seg segflg)
7137 {
7138         vnode_t vp, dvp;
7139         int error;
7140         int update_flags = 0;
7141         int batched;
7142         struct nameidata nd;
7143
7144         AUDIT_ARG(mode, vap->va_mode);
7145         NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7146                path, ctx);
7147         nd.ni_cnd.cn_flags |= WILLBEDIR;
7148         nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7149
7150 continue_lookup:
7151         error = nameiat(&nd, fd);
7152         if (error)
7153                 return (error);
7154         dvp = nd.ni_dvp;
7155         vp = nd.ni_vp;
7156
7157         if (vp != NULL) {
7158                 error = EEXIST;
7159                 goto out;
7160         }
7161
7162         batched = vnode_compound_mkdir_available(dvp);
7163
7164         VATTR_SET(vap, va_type, VDIR);
7165
7166         /*
7167          * XXX
7168          * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7169          * only get EXISTS or EISDIR for existing path components, and not that it could see
7170          * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7171          * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
7172          */
7173         if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7174                 if (error == EACCES || error == EPERM) {
7175                         int error2;
7176
7177                         nameidone(&nd);
7178                         vnode_put(dvp);
7179                         dvp = NULLVP;
7180
7181                         /*
7182                          * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7183                          * rather than EACCESS if the target exists.
7184                          */
7185                         NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7186                                         path, ctx);
7187                         error2 = nameiat(&nd, fd);
7188                         if (error2) {
7189                                 goto out;
7190                         } else {
7191                                 vp = nd.ni_vp;
7192                                 error = EEXIST;
7193                                 goto out;
7194                         }
7195                 }
7196
7197                 goto out;
7198         }
7199
7200         /*
7201          * make the directory
7202          */
7203         if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7204                 if (error == EKEEPLOOKING) {
7205                         nd.ni_vp = vp;
7206                         goto continue_lookup;
7207                 }
7208
7209                 goto out;
7210         }
7211
7212         // Make sure the name & parent pointers are hooked up
7213         if (vp->v_name == NULL)
7214                 update_flags |= VNODE_UPDATE_NAME;
7215         if (vp->v_parent == NULLVP)
7216                 update_flags |= VNODE_UPDATE_PARENT;
7217
7218         if (update_flags)
7219                 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
7220
7221 #if CONFIG_FSE
7222         add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
7223 #endif
7224
7225 out:
7226         /*
7227          * nameidone has to happen before we vnode_put(dvp)
7228          * since it may need to release the fs_nodelock on the dvp
7229          */
7230         nameidone(&nd);
7231
7232         if (vp)
7233                 vnode_put(vp);
7234         if (dvp)
7235                 vnode_put(dvp);
7236
7237         return (error);
7238 }
7239
7240 /*
7241  * mkdir_extended: Create a directory; with extended security (ACL).
7242  *
7243  * Parameters:    p                       Process requesting to create the directory
7244  *                uap                     User argument descriptor (see below)
7245  *                retval                  (ignored)
7246  *
7247  * Indirect:      uap->path               Path of directory to create
7248  *                uap->mode               Access permissions to set
7249  *                uap->xsecurity          ACL to set
7250  *
7251  * Returns:        0                      Success
7252  *                !0                      Not success
7253  *
7254  */
7255 int
7256 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7257 {
7258         int ciferror;
7259         kauth_filesec_t xsecdst;
7260         struct vnode_attr va;
7261
7262         AUDIT_ARG(owner, uap->uid, uap->gid);
7263
7264         xsecdst = NULL;
7265         if ((uap->xsecurity != USER_ADDR_NULL) &&
7266             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7267                 return ciferror;
7268
7269         VATTR_INIT(&va);
7270         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7271         if (xsecdst != NULL)
7272                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7273
7274         ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7275             UIO_USERSPACE);
7276         if (xsecdst != NULL)
7277                 kauth_filesec_free(xsecdst);
7278         return ciferror;
7279 }
7280
7281 int
7282 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7283 {
7284         struct vnode_attr va;
7285
7286         VATTR_INIT(&va);
7287         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7288
7289         return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7290             UIO_USERSPACE));
7291 }
7292
7293 int
7294 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7295 {
7296         struct vnode_attr va;
7297
7298         VATTR_INIT(&va);
7299         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7300
7301         return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7302             UIO_USERSPACE));
7303 }
7304
7305 static int
7306 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
7307     enum uio_seg segflg)
7308 {
7309         vnode_t vp, dvp;
7310         int error;
7311         struct nameidata nd;
7312         char     *path = NULL;
7313         int       len=0;
7314         int has_listeners = 0;
7315         int need_event = 0;
7316         int truncated = 0;
7317 #if CONFIG_FSE
7318         struct vnode_attr va;
7319 #endif /* CONFIG_FSE */
7320         struct vnode_attr *vap = NULL;
7321         int batched;
7322
7323         int restart_flag;
7324
7325         /*
7326          * This loop exists to restart rmdir in the unlikely case that two
7327          * processes are simultaneously trying to remove the same directory
7328          * containing orphaned appleDouble files.
7329          */
7330         do {
7331                 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
7332                     segflg, dirpath, ctx);
7333                 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
7334 continue_lookup:
7335                 restart_flag = 0;
7336                 vap = NULL;
7337
7338                 error = nameiat(&nd, fd);
7339                 if (error)
7340                         return (error);
7341
7342                 dvp = nd.ni_dvp;
7343                 vp = nd.ni_vp;
7344
7345                 if (vp) {
7346                         batched = vnode_compound_rmdir_available(vp);
7347
7348                         if (vp->v_flag & VROOT) {
7349                                 /*
7350                                  * The root of a mounted filesystem cannot be deleted.
7351                                  */
7352                                 error = EBUSY;
7353                                 goto out;
7354                         }
7355
7356                         /*
7357                          * Removed a check here; we used to abort if vp's vid
7358                          * was not the same as what we'd seen the last time around.
7359                          * I do not think that check was valid, because if we retry
7360                          * and all dirents are gone, the directory could legitimately
7361                          * be recycled but still be present in a situation where we would
7362                          * have had permission to delete.  Therefore, we won't make
7363                          * an effort to preserve that check now that we may not have a
7364                          * vp here.
7365                          */
7366
7367                         if (!batched) {
7368                                 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
7369                                 if (error) {
7370                                         goto out;
7371                                 }
7372                         }
7373                 } else {
7374                         batched = 1;
7375
7376                         if (!vnode_compound_rmdir_available(dvp)) {
7377                                 panic("No error, but no compound rmdir?");
7378                         }
7379                 }
7380
7381 #if CONFIG_FSE
7382                 fse_info  finfo;
7383
7384                 need_event = need_fsevent(FSE_DELETE, dvp);
7385                 if (need_event) {
7386                         if (!batched) {
7387                                 get_fse_info(vp, &finfo, ctx);
7388                         } else {
7389                                 error = vfs_get_notify_attributes(&va);
7390                                 if (error) {
7391                                         goto out;
7392                                 }
7393
7394                                 vap = &va;
7395                         }
7396                 }
7397 #endif
7398                 has_listeners = kauth_authorize_fileop_has_listeners();
7399                 if (need_event || has_listeners) {
7400                         if (path == NULL) {
7401                                 GET_PATH(path);
7402                                 if (path == NULL) {
7403                                         error = ENOMEM;
7404                                         goto out;
7405                                 }
7406                         }
7407
7408                         len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
7409 #if CONFIG_FSE
7410                         if (truncated) {
7411                                 finfo.mode |= FSE_TRUNCATED_PATH;
7412                         }
7413 #endif
7414                 }
7415
7416                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7417                 nd.ni_vp = vp;
7418                 if (vp == NULLVP) {
7419                         /* Couldn't find a vnode */
7420                         goto out;
7421                 }
7422
7423                 if (error == EKEEPLOOKING) {
7424                         goto continue_lookup;
7425                 }
7426 #if CONFIG_APPLEDOUBLE
7427                 /*
7428                  * Special case to remove orphaned AppleDouble
7429                  * files. I don't like putting this in the kernel,
7430                  * but carbon does not like putting this in carbon either,
7431                  * so here we are.
7432                  */
7433                 if (error == ENOTEMPTY) {
7434                         error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
7435                         if (error == EBUSY) {
7436                                 goto out;
7437                         }
7438
7439
7440                         /*
7441                          * Assuming everything went well, we will try the RMDIR again
7442                          */
7443                         if (!error)
7444                                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7445                 }
7446 #endif /* CONFIG_APPLEDOUBLE */
7447                 /*
7448                  * Call out to allow 3rd party notification of delete.
7449                  * Ignore result of kauth_authorize_fileop call.
7450                  */
7451                 if (!error) {
7452                         if (has_listeners) {
7453                                 kauth_authorize_fileop(vfs_context_ucred(ctx),
7454                                                 KAUTH_FILEOP_DELETE,
7455                                                 (uintptr_t)vp,
7456                                                 (uintptr_t)path);
7457                         }
7458
7459                         if (vp->v_flag & VISHARDLINK) {
7460                                 // see the comment in unlink1() about why we update
7461                                 // the parent of a hard link when it is removed
7462                                 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
7463                         }
7464
7465 #if CONFIG_FSE
7466                         if (need_event) {
7467                                 if (vap) {
7468                                         vnode_get_fse_info_from_vap(vp, &finfo, vap);
7469                                 }
7470                                 add_fsevent(FSE_DELETE, ctx,
7471                                                 FSE_ARG_STRING, len, path,
7472                                                 FSE_ARG_FINFO, &finfo,
7473                                                 FSE_ARG_DONE);
7474                         }
7475 #endif
7476                 }
7477
7478 out:
7479                 if (path != NULL) {
7480                         RELEASE_PATH(path);
7481                         path = NULL;
7482                 }
7483                 /*
7484                  * nameidone has to happen before we vnode_put(dvp)
7485                  * since it may need to release the fs_nodelock on the dvp
7486                  */
7487                 nameidone(&nd);
7488                 vnode_put(dvp);
7489
7490                 if (vp)
7491                         vnode_put(vp);
7492
7493                 if (restart_flag == 0) {
7494                         wakeup_one((caddr_t)vp);
7495                         return (error);
7496                 }
7497                 tsleep(vp, PVFS, "rm AD", 1);
7498
7499         } while (restart_flag != 0);
7500
7501         return (error);
7502
7503 }
7504
7505 /*
7506  * Remove a directory file.
7507  */
7508 /* ARGSUSED */
7509 int
7510 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
7511 {
7512         return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
7513             CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
7514 }
7515
7516 /* Get direntry length padded to 8 byte alignment */
7517 #define DIRENT64_LEN(namlen) \
7518         ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
7519
7520 errno_t
7521 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
7522                 int *numdirent, vfs_context_t ctxp)
7523 {
7524         /* Check if fs natively supports VNODE_READDIR_EXTENDED */
7525         if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
7526                    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0))  {
7527                 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
7528         } else {
7529                 size_t bufsize;
7530                 void * bufptr;
7531                 uio_t auio;
7532                 struct direntry *entry64;
7533                 struct dirent *dep;
7534                 int bytesread;
7535                 int error;
7536
7537                 /*
7538                  * Our kernel buffer needs to be smaller since re-packing
7539                  * will expand each dirent.  The worse case (when the name
7540                  * length is 3) corresponds to a struct direntry size of 32
7541                  * bytes (8-byte aligned) and a struct dirent size of 12 bytes
7542                  * (4-byte aligned).  So having a buffer that is 3/8 the size
7543                  * will prevent us from reading more than we can pack.
7544                  *
7545                  * Since this buffer is wired memory, we will limit the
7546                  * buffer size to a maximum of 32K. We would really like to
7547                  * use 32K in the MIN(), but we use magic number 87371 to
7548                  * prevent uio_resid() * 3 / 8 from overflowing.
7549                  */
7550                 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
7551                 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
7552                 if (bufptr == NULL) {
7553                         return ENOMEM;
7554                 }
7555
7556                 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
7557                 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
7558                 auio->uio_offset = uio->uio_offset;
7559
7560                 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
7561
7562                 dep = (struct dirent *)bufptr;
7563                 bytesread = bufsize - uio_resid(auio);
7564
7565                 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
7566                        M_TEMP, M_WAITOK);
7567                 /*
7568                  * Convert all the entries and copy them out to user's buffer.
7569                  */
7570                 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
7571                         size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
7572
7573                         bzero(entry64, enbufsize);
7574                         /* Convert a dirent to a dirent64. */
7575                         entry64->d_ino = dep->d_ino;
7576                         entry64->d_seekoff = 0;
7577                         entry64->d_reclen = enbufsize;
7578                         entry64->d_namlen = dep->d_namlen;
7579                         entry64->d_type = dep->d_type;
7580                         bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
7581
7582                         /* Move to next entry. */
7583                         dep = (struct dirent *)((char *)dep + dep->d_reclen);
7584
7585                         /* Copy entry64 to user's buffer. */
7586                         error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
7587                 }
7588
7589                 /* Update the real offset using the offset we got from VNOP_READDIR. */
7590                 if (error == 0) {
7591                         uio->uio_offset = auio->uio_offset;
7592                 }
7593                 uio_free(auio);
7594                 FREE(bufptr, M_TEMP);
7595                 FREE(entry64, M_TEMP);
7596                 return (error);
7597         }
7598 }
7599
/*
 * Largest buffer size (128 MiB) that getdirentries_common() will use;
 * larger requests are clamped down to this value.
 */
#define GETDIRENTRIES_MAXBUFSIZE	(128U * 1024U * 1024U)
7601
7602 /*
7603  * Read a block of directory entries in a file system independent format.
7604  */
7605 static int
7606 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
7607                      off_t *offset, int flags)
7608 {
7609         vnode_t vp;
7610         struct vfs_context context = *vfs_context_current();    /* local copy */
7611         struct fileproc *fp;
7612         uio_t auio;
7613         int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7614         off_t loff;
7615         int error, eofflag, numdirent;
7616         char uio_buf[ UIO_SIZEOF(1) ];
7617
7618         error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
7619         if (error) {
7620                 return (error);
7621         }
7622         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7623                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7624                 error = EBADF;
7625                 goto out;
7626         }
7627
7628         if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
7629                 bufsize = GETDIRENTRIES_MAXBUFSIZE;
7630
7631 #if CONFIG_MACF
7632         error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
7633         if (error)
7634                 goto out;
7635 #endif
7636         if ( (error = vnode_getwithref(vp)) ) {
7637                 goto out;
7638         }
7639         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7640
7641 unionread:
7642         if (vp->v_type != VDIR) {
7643                 (void)vnode_put(vp);
7644                 error = EINVAL;
7645                 goto out;
7646         }
7647
7648 #if CONFIG_MACF
7649         error = mac_vnode_check_readdir(&context, vp);
7650         if (error != 0) {
7651                 (void)vnode_put(vp);
7652                 goto out;
7653         }
7654 #endif /* MAC */
7655
7656         loff = fp->f_fglob->fg_offset;
7657         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7658         uio_addiov(auio, bufp, bufsize);
7659
7660         if (flags & VNODE_READDIR_EXTENDED) {
7661                 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
7662                 fp->f_fglob->fg_offset = uio_offset(auio);
7663         } else {
7664                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
7665                 fp->f_fglob->fg_offset = uio_offset(auio);
7666         }
7667         if (error) {
7668                 (void)vnode_put(vp);
7669                 goto out;
7670         }
7671
7672         if ((user_ssize_t)bufsize == uio_resid(auio)){
7673                 if (union_dircheckp) {
7674                         error = union_dircheckp(&vp, fp, &context);
7675                         if (error == -1)
7676                                 goto unionread;
7677                         if (error)
7678                                 goto out;
7679                 }
7680
7681                 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
7682                         struct vnode *tvp = vp;
7683                         if (lookup_traverse_union(tvp, &vp, &context) == 0) {
7684                                 vnode_ref(vp);
7685                                 fp->f_fglob->fg_data = (caddr_t) vp;
7686                                 fp->f_fglob->fg_offset = 0;
7687                                 vnode_rele(tvp);
7688                                 vnode_put(tvp);
7689                                 goto unionread;
7690                         }
7691                         vp = tvp;
7692                 }
7693         }
7694
7695         vnode_put(vp);
7696         if (offset) {
7697                 *offset = loff;
7698         }
7699
7700         *bytesread = bufsize - uio_resid(auio);
7701 out:
7702         file_drop(fd);
7703         return (error);
7704 }
7705
7706
     /*
      * getdirentries: read directory entries and report the starting offset.
      *
      * Parameters:    p               Calling process
      *                uap             User argument descriptor (see below)
      *                retval          Number of bytes placed in uap->buf
      *
      * Indirect:      uap->fd         Descriptor of the directory
      *                uap->buf        User buffer for the entries
      *                uap->count      Size of uap->buf
      *                uap->basep      User address receiving the offset the read
      *                                started from, written as a user64_long_t
      *                                or user32_long_t to match the process
      *
      * Returns:       0               Success
      *                !0              Error from getdirentries_common() or copyout()
      */
7707 int
7708 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
7709 {
7710         off_t offset;
7711         ssize_t bytesread;
7712         int error;
7713
7714         AUDIT_ARG(fd, uap->fd);
             /* flags of 0: the non-extended VNOP_READDIR() path is used */
7715         error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
7716
7717         if (error == 0) {
                     /* p is consulted here even though the prototype tags it __unused */
7718                 if (proc_is64bit(p)) {
7719                         user64_long_t base = (user64_long_t)offset;
7720                         error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
7721                 } else {
7722                         user32_long_t base = (user32_long_t)offset;
7723                         error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
7724                 }
                     /* stored whether or not the copyout above succeeded */
7725                 *retval = bytesread;
7726         }
7727         return (error);
7728 }
7729
     /*
      * getdirentries64: read directory entries through the extended readdir path.
      *
      * Parameters:    p               Calling process (not used)
      *                uap             User argument descriptor (see below)
      *                retval          Number of bytes placed in uap->buf
      *
      * Indirect:      uap->fd         Descriptor of the directory
      *                uap->buf        User buffer for the entries
      *                uap->bufsize    Size of uap->buf
      *                uap->position   User address receiving, as an off_t, the
      *                                offset the read started from
      *
      * Returns:       0               Success
      *                !0              Error from getdirentries_common() or copyout()
      */
7730 int
7731 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
7732 {
7733         off_t offset;
7734         ssize_t bytesread;
7735         int error;
7736
7737         AUDIT_ARG(fd, uap->fd);
7738         error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
7739
7740         if (error == 0) {
                     /* the byte count is recorded first; a copyout failure still becomes the return value */
7741                 *retval = bytesread;
7742                 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
7743         }
7744         return (error);
7745 }
7746
7747
7748 /*
7749  * Set the mode mask for creation of filesystem nodes.
      *
      * Common worker for umask() and umask_extended().  With the process fd
      * lock held, the current creation mask is captured and replaced by
      * (newmask & ALLPERMS); the captured value is handed back via retval.
      * The fsec argument is accepted but not used.
      *
      * Returns:       0               Always
      *
7750  * XXX implement xsecurity
7751  */
7752 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
7753 static int
7754 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
7755 {
             struct filedesc *fdesc;
             int old_cmask;
7757
7758         AUDIT_ARG(mask, newmask);

7759         proc_fdlock(p);
             fdesc = p->p_fd;
             old_cmask = fdesc->fd_cmask;
             fdesc->fd_cmask = newmask & ALLPERMS;
7763         proc_fdunlock(p);

             /* report the mask that was in effect before this call */
             *retval = old_cmask;
7764         return (0);
7765 }
7766
7767 /*
7768  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
7769  *
7770  * Parameters:    p                       Process requesting to set the umask
7771  *                uap                     User argument descriptor (see below)
7772  *                retval                  umask of the process (parameter p)
7773  *
7774  * Indirect:      uap->newmask            umask to set
7775  *                uap->xsecurity          ACL to set
7776  *
7777  * Returns:        0                      Success
      *                !0                      Error from kauth_copyinfilesec()
7779  *
      * Notes:         A filesec copied in from uap->xsecurity is freed again
      *                before returning.
7780  */
7781 int
7782 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
7783 {
             kauth_filesec_t filesec = KAUTH_FILESEC_NONE;
             int status;
7786
             /* Only copy in an ACL when the caller actually supplied one. */
7788         if (uap->xsecurity != USER_ADDR_NULL) {
                     status = kauth_copyinfilesec(uap->xsecurity, &filesec);
                     if (status != 0) {
                             return (status);
                     }
7793         }
7794
             status = umask1(p, uap->newmask, filesec, retval);
7796
             if (filesec != KAUTH_FILESEC_NONE) {
                     kauth_filesec_free(filesec);
             }
             return (status);
7800 }
7801
     /*
      * umask: set the mode mask for creation of filesystem nodes.
      *
      * Forwards to umask1() with UMASK_NOXSECURITY in place of a filesec; the
      * previous mask is returned through retval.
      */
7802 int
7803 umask(proc_t p, struct umask_args *uap, int32_t *retval)
7804 {
7805         return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
7806 }
7807
7808 /*
7809  * Void all references to file by ripping underlying filesystem
7810  * away from vnode.
      *
      * Parameters:    p               Calling process (its p_acflag is passed
      *                                to suser())
      *                uap             User argument descriptor; uap->path names
      *                                the device node
      *                retval          Not used
      *
      * Returns:       0               Success
      *                ENOTSUP         Path is neither a character nor a block device
      *                EBUSY           Path is a block device that is mounted on
      *                !0              Error from namei(), the MAC check,
      *                                vnode_getattr() or suser()
7811  */
7812 /* ARGSUSED */
7813 int
7814 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
7815 {
7816         vnode_t vp;
7817         struct vnode_attr va;
7818         vfs_context_t ctx = vfs_context_current();
7819         int error;
7820         struct nameidata nd;
7821
7822         NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
7823                uap->path, ctx);
7824         error = namei(&nd);
7825         if (error)
7826                 return (error);
7827         vp = nd.ni_vp;
7828
7829         nameidone(&nd);
7830
             /* From here on every exit goes through 'out', which releases vp. */
7831         if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
7832                 error = ENOTSUP;
7833                 goto out;
7834         }
7835
7836         if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
7837                 error = EBUSY;
7838                 goto out;
7839         }
7840
7841 #if CONFIG_MACF
7842         error = mac_vnode_check_revoke(ctx, vp);
7843         if (error)
7844                 goto out;
7845 #endif
7846
             /* The caller must own the node or pass the suser() check. */
7847         VATTR_INIT(&va);
7848         VATTR_WANTED(&va, va_uid);
7849         if ((error = vnode_getattr(vp, &va, ctx)))
7850                 goto out;
7851         if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
7852             (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
7853                 goto out;
             /* The result of VNOP_REVOKE() is not captured; error stays 0 on this path. */
7854         if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
7855                 VNOP_REVOKE(vp, REVOKEALL, ctx);
7856 out:
7857         vnode_put(vp);
7858         return (error);
7859 }
7860
7861
7862 /*
7863  *  HFS/HFS PLUS SPECIFIC SYSTEM CALLS
7864  *  The following system calls are designed to support features
7865  *  which are specific to the HFS & HFS Plus volume formats
7866  */
7867
7868
7869 /*
7870  * Obtain attribute information on objects in a directory while enumerating
7871  * the directory.
      *
      * Parameters:    p               Calling process
      *                uap             User argument descriptor (see below)
      *                retval          On success, the eofflag reported by
      *                                VNOP_READDIRATTR() (cleared again when a
      *                                union mount still has a layer to read)
      *
      * Indirect:      uap->fd         Descriptor of the directory
      *                uap->alist      User address of the attrlist requested
      *                uap->buffer     User buffer receiving the results
      *                uap->buffersize Size of uap->buffer
      *                uap->count      In: entry count passed to the file system
      *                                Out: count reported back by it
      *                uap->basep      Out: descriptor offset the read started from
      *                uap->newstate   Out: state value from VNOP_READDIRATTR()
      *                uap->options    Only the low 32 bits are passed on
      *
      * Returns:       0               Success
      *                EBADF           Descriptor is not open for reading
      *                EINVAL          Descriptor does not refer to a directory
      *                !0              Error from copyin(), fp_getfvp(), the MAC
      *                                checks, vnode_getwithref(),
      *                                vnode_authorize(), VNOP_READDIRATTR() or
      *                                copyout()
7872  */
7873 /* ARGSUSED */
7874 int
7875 getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
7876 {
7877         vnode_t vp;
7878         struct fileproc *fp;
7879         uio_t auio = NULL;
7880         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7881         uint32_t count, savecount;
7882         uint32_t newstate;
7883         int error, eofflag;
             /* NOTE(review): 32 bits wide; if fg_offset is wider the value is narrowed on assignment -- confirm */
7884         uint32_t loff;
7885         struct attrlist attributelist;
7886         vfs_context_t ctx = vfs_context_current();
7887         int fd = uap->fd;
7888         char uio_buf[ UIO_SIZEOF(1) ];
7889         kauth_action_t action;
7890
7891         AUDIT_ARG(fd, fd);
7892
7893         /* Get the attributes into kernel space */
7894         if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
7895                 return(error);
7896         }
7897         if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
7898                 return(error);
7899         }
             /* kept so the request can be replayed unchanged on a lower union layer */
7900         savecount = count;
7901         if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
7902                 return (error);
7903         }
7904         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7905                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7906                 error = EBADF;
7907                 goto out;
7908         }
7909
7910
7911 #if CONFIG_MACF
7912         error = mac_file_check_change_offset(vfs_context_ucred(ctx),
7913             fp->f_fglob);
7914         if (error)
7915                 goto out;
7916 #endif
7917
7918
7919         if ( (error = vnode_getwithref(vp)) )
7920                 goto out;
7921
7922         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7923
             /* 'out' only drops the fileproc; exits below must vnode_put(vp) themselves. */
7924 unionread:
7925         if (vp->v_type != VDIR) {
7926                 (void)vnode_put(vp);
7927                 error = EINVAL;
7928                 goto out;
7929         }
7930
7931 #if CONFIG_MACF
7932         error = mac_vnode_check_readdir(ctx, vp);
7933         if (error != 0) {
7934                 (void)vnode_put(vp);
7935                 goto out;
7936         }
7937 #endif /* MAC */
7938
7939         /* set up the uio structure which will contain the users return buffer */
7940         loff = fp->f_fglob->fg_offset;
7941         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7942         uio_addiov(auio, uap->buffer, uap->buffersize);
7943
7944         /*
7945          * If the only item requested is file names, we can let that past with
7946          * just LIST_DIRECTORY.  If they want any other attributes, that means
7947          * they need SEARCH as well.
7948          */
7949         action = KAUTH_VNODE_LIST_DIRECTORY;
7950         if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
7951             attributelist.fileattr || attributelist.dirattr)
7952                 action |= KAUTH_VNODE_SEARCH;
7953
7954         if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
7955
7956                 /* Believe it or not, uap->options only has 32-bits of valid
7957                  * info, so truncate before extending again */
7958
7959                 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
7960                                 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
7961         }
7962
             /* covers both an authorization failure and a failed read */
7963         if (error) {
7964                 (void) vnode_put(vp);
7965                 goto out;
7966         }
7967
7968         /*
7969          * If we've got the last entry of a directory in a union mount
7970          * then reset the eofflag and pretend there's still more to come.
7971          * The next call will again set eofflag and the buffer will be empty,
7972          * so traverse to the underlying directory and do the directory
7973          * read there.
7974          */
7975         if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
7976                 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
7977                         eofflag = 0;
7978                 } else {                                                // Empty buffer
7979                         struct vnode *tvp = vp;
7980                         if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
                                     /* move the descriptor's reference from the upper vnode to the lower one */
7981                                 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
7982                                 fp->f_fglob->fg_data = (caddr_t) vp;
7983                                 fp->f_fglob->fg_offset = 0; // reset index for new dir
7984                                 count = savecount;
7985                                 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
7986                                 vnode_put(tvp);
7987                                 goto unionread;
7988                         }
                             /* no lower layer: keep using the original vnode */
7989                         vp = tvp;
7990                 }
7991         }
7992
7993         (void)vnode_put(vp);
7994
             /* error is always 0 here: the failure case above already jumped to 'out' */
7995         if (error)
7996                 goto out;
7997         fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
7998
7999         if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8000                 goto out;
8001         if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8002                 goto out;
8003         if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8004                 goto out;
8005
8006         *retval = eofflag;  /* similar to getdirentries */
8007         error = 0;
8008 out:
8009         file_drop(fd);
8010         return (error); /* error code, if any; on success *retval already holds the eofflag (0 or 1) */
8011
8012 } /* end of getdirentriesattr system call */
8013
8014 /*
8015  * Exchange data between two files
      *
      * Parameters:    p               Calling process (not used)
      *                uap             User argument descriptor (see below)
      *                retval          Not used
      *
      * Indirect:      uap->path1      Path of the first file
      *                uap->path2      Path of the second file
      *                uap->options    FSOPT_NOFOLLOW suppresses following of
      *                                symbolic links in both lookups
      *
      * Returns:       0               Success
      *                EINVAL          Both paths name the same vnode, or either
      *                                one is not a regular file
      *                EXDEV           The files are on different mounts
      *                ENOMEM          A path buffer could not be obtained
      *                !0              Error from namei(), the MAC check,
      *                                vnode_authorize() or VNOP_EXCHANGE()
8016  */
8017
8018 /* ARGSUSED */
8019 int
8020 exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8021 {
8022
8023         struct nameidata fnd, snd;
8024         vfs_context_t ctx = vfs_context_current();
8025         vnode_t fvp;
8026         vnode_t svp;
8027         int error;
8028         u_int32_t nameiflags;
8029         char *fpath = NULL;
8030         char *spath = NULL;
8031         int   flen=0, slen=0;
8032         int from_truncated=0, to_truncated=0;
8033 #if CONFIG_FSE
8034         fse_info f_finfo, s_finfo;
8035 #endif
8036
8037         nameiflags = 0;
8038         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8039
8040         NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8041                UIO_USERSPACE, uap->path1, ctx);
8042
8043         error = namei(&fnd);
8044         if (error)
8045                 goto out2;
8046
8047         nameidone(&fnd);
8048         fvp = fnd.ni_vp;
8049
8050         NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8051                UIO_USERSPACE, uap->path2, ctx);
8052
8053         error = namei(&snd);
8054         if (error) {
                     /* only the first vnode is held at this point */
8055                 vnode_put(fvp);
8056                 goto out2;
8057         }
8058         nameidone(&snd);
8059         svp = snd.ni_vp;
8060
8061         /*
8062          * if the files are the same, return an inval error
8063          */
8064         if (svp == fvp) {
8065                 error = EINVAL;
8066                 goto out;
8067         }
8068
8069         /*
8070          * if the files are on different volumes, return an error
8071          */
8072         if (svp->v_mount != fvp->v_mount) {
8073                 error = EXDEV;
8074                 goto out;
8075         }
8076
8077         /* If they're not files, return an error */
8078         if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8079                 error = EINVAL;
8080                 goto out;
8081         }
8082
8083 #if CONFIG_MACF
8084         error = mac_vnode_check_exchangedata(ctx,
8085             fvp, svp);
8086         if (error)
8087                 goto out;
8088 #endif
             /* the caller needs both read and write access to each file */
8089         if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8090             ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8091                 goto out;
8092
             /*
              * Paths are only gathered when somebody will consume them: an
              * fsevents watcher (CONFIG_FSE) or a kauth fileop listener.
              */
8093         if (
8094 #if CONFIG_FSE
8095         need_fsevent(FSE_EXCHANGE, fvp) ||
8096 #endif
8097         kauth_authorize_fileop_has_listeners()) {
8098                 GET_PATH(fpath);
8099                 GET_PATH(spath);
8100                 if (fpath == NULL || spath == NULL) {
                             /* whichever buffer was obtained is released at 'out' */
8101                         error = ENOMEM;
8102                         goto out;
8103                 }
8104
8105                 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8106                 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8107
8108 #if CONFIG_FSE
8109                 get_fse_info(fvp, &f_finfo, ctx);
8110                 get_fse_info(svp, &s_finfo, ctx);
8111                 if (from_truncated || to_truncated) {
8112                         // set it here since only the f_finfo gets reported up to user space
8113                         f_finfo.mode |= FSE_TRUNCATED_PATH;
8114                 }
8115 #endif
8116         }
8117         /* Ok, make the call */
8118         error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8119
8120         if (error == 0) {
8121             const char *tmpname;
8122
8123             if (fpath != NULL && spath != NULL) {
8124                     /* call out to allow 3rd party notification of exchangedata.
8125                      * Ignore result of kauth_authorize_fileop call.
8126                      */
8127                     kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8128                                            (uintptr_t)fpath, (uintptr_t)spath);
8129             }
                 /* swap the cached names (and parents, if different) of the two vnodes under the name cache lock */
8130             name_cache_lock();
8131
8132             tmpname     = fvp->v_name;
8133             fvp->v_name = svp->v_name;
8134             svp->v_name = tmpname;
8135
8136             if (fvp->v_parent != svp->v_parent) {
8137                 vnode_t tmp;
8138
8139                 tmp           = fvp->v_parent;
8140                 fvp->v_parent = svp->v_parent;
8141                 svp->v_parent = tmp;
8142             }
8143             name_cache_unlock();
8144
8145 #if CONFIG_FSE
                 /* the paths and finfo were captured before the exchange took place */
8146             if (fpath != NULL && spath != NULL) {
8147                     add_fsevent(FSE_EXCHANGE, ctx,
8148                                 FSE_ARG_STRING, flen, fpath,
8149                                 FSE_ARG_FINFO, &f_finfo,
8150                                 FSE_ARG_STRING, slen, spath,
8151                                 FSE_ARG_FINFO, &s_finfo,
8152                                 FSE_ARG_DONE);
8153             }
8154 #endif
8155         }
8156
             /* 'out' releases both vnodes and any path buffers; 'out2' is for exits that hold neither */
8157 out:
8158         if (fpath != NULL)
8159                 RELEASE_PATH(fpath);
8160         if (spath != NULL)
8161                 RELEASE_PATH(spath);
8162         vnode_put(svp);
8163         vnode_put(fvp);
8164 out2:
8165         return (error);
8166 }
8167
8168 /*
8169  * Return (in MB) the amount of freespace on the given vnode's volume.
      *
      * The mount's cached vfsstat is refreshed first; the result is
      * (f_bavail * f_bsize) computed in 64 bits and shifted right by 20,
      * then narrowed to the 32-bit return type.
8170  */
8171 uint32_t freespace_mb(vnode_t vp);
8172
8173 uint32_t
8174 freespace_mb(vnode_t vp)
8175 {
             uint64_t avail_bytes;

8176         vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);

             /* available blocks times block size, in bytes */
             avail_bytes = (uint64_t)vp->v_mount->mnt_vfsstat.f_bavail;
             avail_bytes *= vp->v_mount->mnt_vfsstat.f_bsize;

             return (avail_bytes >> 20);
8179 }
8180
8181 #if CONFIG_SEARCHFS
8182
8183 /* ARGSUSED */
8184
     /*
      * searchfs: search a volume for objects matching caller-supplied criteria.
      *
      * Parameters:    p               Calling process
      *                uap             User argument descriptor (see below)
      *                retval          Not used
      *
      * Indirect:      uap->path       Path on the volume to search; the search
      *                                itself starts at that volume's root
      *                uap->searchblock User fssearchblock (32- or 64-bit layout
      *                                according to the calling process)
      *                uap->nummatches Out: number of matches found
      *                uap->scriptcode Passed through to VNOP_SEARCHFS()
      *                uap->options    SRCHFS_START, FSOPT_NOFOLLOW, ...
      *                uap->state      In/out: struct searchstate carried
      *                                between calls
      *
      * Returns:       0               Success
      *                EINVAL          A search parameter size exceeds
      *                                SEARCHFS_MAX_SEARCHPARMS, or the name
      *                                reference in searchparams1 is malformed
      *                ENOENT          A union layer has no covered vnode
      *                EAGAIN          More results may follow; also forced when
      *                                a union layer completes so the caller
      *                                comes back for the next layer down
      *                !0              Error from copyin(), namei(), VFS_ROOT(),
      *                                vnode_getwithref(), the MAC check,
      *                                VNOP_SEARCHFS(), copyout() or suulong()
      *
      * Notes:         NOTE(review): only searchparams1 has its ATTR_CMN_NAME
      *                reference validated here; searchparams2 is passed to
      *                the file system unchecked.
      */
8185 int
8186 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8187 {
8188         vnode_t vp, tvp;
8189         int i, error=0;
8190         int fserror = 0;
8191         struct nameidata nd;
8192         struct user64_fssearchblock searchblock;
8193         struct searchstate *state;
8194         struct attrlist *returnattrs;
8195         struct timeval timelimit;
8196         void *searchparams1,*searchparams2;
8197         uio_t auio = NULL;
8198         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8199         uint32_t nummatches;
8200         int mallocsize;
8201         uint32_t nameiflags;
8202         vfs_context_t ctx = vfs_context_current();
8203         char uio_buf[ UIO_SIZEOF(1) ];
8204
8205         /* Start by copying in fsearchblock parameter list */
8206     if (IS_64BIT_PROCESS(p)) {
8207         error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8208         timelimit.tv_sec = searchblock.timelimit.tv_sec;
8209         timelimit.tv_usec = searchblock.timelimit.tv_usec;
8210     }
8211     else {
8212         struct user32_fssearchblock tmp_searchblock;
8213
8214         error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8215         // munge into 64-bit version
8216         searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8217         searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8218         searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8219         searchblock.maxmatches = tmp_searchblock.maxmatches;
8220                 /*
8221                  * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8222                  * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8223                  */
8224         timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8225         timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8226         searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8227         searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8228         searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8229         searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8230         searchblock.searchattrs = tmp_searchblock.searchattrs;
8231     }
             /* a failed copyin is caught here, before anything copied above is acted on */
8232         if (error)
8233                 return(error);
8234
8235         /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8236          */
8237         if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8238                 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8239                 return(EINVAL);
8240
8241         /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8242         /* It all has to go into local memory and it's not that big so we might as well  put it all together. */
8243         /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8244         /* block.                                                                                             */
8245         /*                                                                                                    */
8246         /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
8247         /*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
8248         /*       assumes the size is still 556 bytes it will continue to work                                 */
8249
8250         mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8251                 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8252
8253         MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8254
8255         /* Now set up the various pointers to the correct place in our newly allocated memory */
8256
8257         searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8258         returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8259         state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8260
8261         /* Now copy in the stuff given our local variables. */
8262
8263         if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8264                 goto freeandexit;
8265
8266         if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8267                 goto freeandexit;
8268
8269         if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8270                 goto freeandexit;
8271
8272         if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8273                 goto freeandexit;
8274
8275         /*
8276          * When searching a union mount, need to set the
8277          * start flag at the first call on each layer to
8278          * reset state for the new volume.
8279          */
8280         if (uap->options & SRCHFS_START)
8281                 state->ss_union_layer = 0;
8282         else
8283                 uap->options |= state->ss_union_flags;
8284         state->ss_union_flags = 0;
8285
8286         /*
8287          * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8288          * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8289          * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8290          * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8291          * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8292          */
8293
8294         if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8295                 attrreference_t* string_ref;
8296                 u_int32_t* start_length;
8297                 user64_size_t param_length;
8298
8299                 /* validate searchparams1 */
8300                 param_length = searchblock.sizeofsearchparams1;
8301                 /* skip the word that specifies length of the buffer */
8302                 start_length= (u_int32_t*) searchparams1;
8303                 start_length= start_length+1;
8304                 string_ref= (attrreference_t*) start_length;
8305
8306                 /* ensure no negative offsets or too big offsets */
8307                 if (string_ref->attr_dataoffset < 0 ) {
8308                         error = EINVAL;
8309                         goto freeandexit;
8310                 }
8311                 if (string_ref->attr_length > MAXPATHLEN) {
8312                         error = EINVAL;
8313                         goto freeandexit;
8314                 }
8315
8316                 /* Check for pointer overflow in the string ref */
8317                 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8318                         error = EINVAL;
8319                         goto freeandexit;
8320                 }
8321
                     /* both the start and the end of the referenced string must lie inside searchparams1 */
8322                 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8323                         error = EINVAL;
8324                         goto freeandexit;
8325                 }
8326                 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8327                         error = EINVAL;
8328                         goto freeandexit;
8329                 }
8330         }
8331
8332         /* set up the uio structure which will contain the users return buffer */
8333         auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8334         uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8335
8336         nameiflags = 0;
8337         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8338         NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8339                UIO_USERSPACE, uap->path, ctx);
8340
8341         error = namei(&nd);
8342         if (error)
8343                 goto freeandexit;
8344         vp = nd.ni_vp;
8345         nameidone(&nd);
8346
8347         /*
8348          * Switch to the root vnode for the volume
8349          */
8350         error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8351         vnode_put(vp);
8352         if (error)
8353                 goto freeandexit;
8354         vp = tvp;
8355
8356         /*
8357          * If it's a union mount, the path lookup takes
8358          * us to the top layer. But we may need to descend
8359          * to a lower layer. For non-union mounts the layer
8360          * is always zero.
8361          */
8362         for (i = 0; i < (int) state->ss_union_layer; i++) {
8363                 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
8364                         break;
8365                 tvp = vp;
8366                 vp = vp->v_mount->mnt_vnodecovered;
8367                 if (vp == NULL) {
8368                         vnode_put(tvp);
8369                         error = ENOENT;
8370                         goto freeandexit;
8371                 }
                     /*
                      * The covered vnode may only be used (and later released
                      * with vnode_put()) if an iocount was actually obtained,
                      * so a failure here ends the search.  The upper layer is
                      * released either way.
                      */
8372                 error = vnode_getwithref(vp);
8373                 vnode_put(tvp);
                     if (error)
                             goto freeandexit;
8374         }
8375
8376 #if CONFIG_MACF
8377         error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
8378         if (error) {
8379                 vnode_put(vp);
8380                 goto freeandexit;
8381         }
8382 #endif
8383
8384
8385         /*
8386          * If searchblock.maxmatches == 0, then skip the search. This has happened
8387          * before and sometimes the underlying code doesn't deal with it well.
8388          */
8389          if (searchblock.maxmatches == 0) {
8390                 nummatches = 0;
8391                 goto saveandexit;
8392          }
8393
8394         /*
8395          * All right, we have everything we need, so lets make that call.
8396          *
8397          * We keep special track of the return value from the file system:
8398          * EAGAIN is an acceptable error condition that shouldn't keep us
8399          * from copying out any results...
8400          */
8401
8402         fserror = VNOP_SEARCHFS(vp,
8403                 searchparams1,
8404                 searchparams2,
8405                 &searchblock.searchattrs,
8406                 (u_long)searchblock.maxmatches,
8407                 &timelimit,
8408                 returnattrs,
8409                 &nummatches,
8410                 (u_long)uap->scriptcode,
8411                 (u_long)uap->options,
8412                 auio,
8413                 (struct searchstate *) &state->ss_fsstate,
8414                 ctx);
8415
8416         /*
8417          * If it's a union mount we need to be called again
8418          * to search the mounted-on filesystem.
8419          */
8420         if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
8421                 state->ss_union_flags = SRCHFS_START;
8422                 state->ss_union_layer++;        // search next layer down
8423                 fserror = EAGAIN;
8424         }
8425
8426 saveandexit:
8427
8428         vnode_put(vp);
8429
8430         /* Now copy out the stuff that needs copying out. That means the number of matches, the
8431            search state.  Everything was already put into the return buffer by the vop call. */
8432
8433         if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
8434                 goto freeandexit;
8435
8436         if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
8437                 goto freeandexit;
8438
             /* the file system's own result (possibly EAGAIN) becomes the return value */
8439         error = fserror;
8440
8441 freeandexit:
8442
8443         FREE(searchparams1,M_TEMP);
8444
8445         return(error);
8446
8447
8448 } /* end of searchfs system call */
8449
8450 #else /* CONFIG_SEARCHFS */
8451
8452 int
8453 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
8454 {
8455         return (ENOTSUP);
8456 }
8457
8458 #endif /* CONFIG_SEARCHFS */
8459
8460
8461 lck_grp_attr_t *  nspace_group_attr;
8462 lck_attr_t *      nspace_lock_attr;
8463 lck_grp_t *       nspace_mutex_group;
8464
8465 lck_mtx_t         nspace_handler_lock;
8466 lck_mtx_t         nspace_handler_exclusion_lock;
8467
8468 time_t snapshot_timestamp=0;
8469 int nspace_allow_virtual_devs=0;
8470
8471 void nspace_handler_init(void);
8472
8473 typedef struct nspace_item_info {       // one slot of the nspace_items[] table: a pending event waiting on a handler
8474         struct vnode *vp;               // vnode the event is about; its address doubles as the msleep/wakeup channel for waiters; NULL once unblocked or free
8475         void         *arg;              // optional argument from resolve_nspace_item_ext(); the handler side reads it as a uio to report offset/length
8476         uint64_t      op;               // op value passed by the resolver; copied out to the handler's "flags" address
8477         uint32_t      vid;              // vnode_vid(vp) captured at queue time, used with vnode_getwithvid() before the vnode is opened
8478         uint32_t      flags;            // NSPACE_ITEM_* bits; 0 means the slot is free for re-use
8479         uint32_t      token;            // assigned from ++nspace_token_id when a handler picks the item up; read back as the error value when CANCELLED
8480         uint32_t      refcount;         // number of threads sleeping on this slot in resolve_nspace_item_ext(); slot is cleared when it drops to 0
8481 } nspace_item_info;
8482
8483 #define MAX_NSPACE_ITEMS   128
8484 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
8485 uint32_t      nspace_item_idx=0;              // also used as the sleep/wakeup rendezvous address
8486 uint32_t      nspace_token_id=0;
8487 uint32_t      nspace_handler_timeout = 15;    // seconds
8488
8489 #define NSPACE_ITEM_NEW         0x0001  // queued by resolve_nspace_item_ext(), not yet picked up by a handler
8490 #define NSPACE_ITEM_PROCESSING  0x0002  // a handler took the item in wait_for_namespace_event()
8491 #define NSPACE_ITEM_DEAD        0x0004  // NOTE(review): not referenced in this part of the file - meaning to be confirmed
8492 #define NSPACE_ITEM_CANCELLED   0x0008  // waiter gives up and returns the item's token field as its error
8493 #define NSPACE_ITEM_DONE        0x0010  // event resolved (or handler gone); waiter returns 0
8494 #define NSPACE_ITEM_RESET_TIMER 0x0020  // waiter that timed out clears this bit and sleeps for another timeout period
8495
8496 #define NSPACE_ITEM_NSPACE_EVENT   0x0040       // item belongs to the NSPACE_HANDLER_NSPACE handler
8497 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080       // item belongs to the NSPACE_HANDLER_SNAPSHOT handler
8498
8499 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)     // mask of the two event-type bits above
8500
8501 //#pragma optimization_level 0
8502
8503 typedef enum {                          // kinds of namespace handler; values are used as indices into nspace_handlers[]
8504         NSPACE_HANDLER_NSPACE = 0,      // handler for NSPACE_ITEM_NSPACE_EVENT items
8505         NSPACE_HANDLER_SNAPSHOT = 1,    // handler for NSPACE_ITEM_SNAPSHOT_EVENT items
8506
8507         NSPACE_HANDLER_COUNT,           // number of handler kinds; sizes nspace_handlers[]
8508 } nspace_type_t;
8509
8510 typedef struct {                        // registration record for one handler kind
8511         uint64_t handler_tid;           // thread_tid() of the thread that registered; zeroed in nspace_proc_exit()
8512         struct proc *handler_proc;      // process that registered as handler; NULL means no handler is registered
8513         int handler_busy;               // non-zero while a thread is inside wait_for_namespace_event(); guarded by nspace_handler_exclusion_lock
8514 } nspace_handler_t;
8515
8516 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
8517
8518 /* namespace fsctl functions */
8519 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
8520 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
8521 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
8522 static nspace_type_t nspace_type_for_op(uint64_t op);
8523 static int nspace_is_special_process(struct proc *proc);
8524 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
8525 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
8526 static int validate_namespace_args (int is64bit, int size);
8527 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
8528
8529
8530 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
8531 {
8532         switch(nspace_type) {
8533                 case NSPACE_HANDLER_NSPACE:
8534                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
8535                 case NSPACE_HANDLER_SNAPSHOT:
8536                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
8537                 default:
8538                         printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
8539                         return 0;
8540         }
8541 }
8542
8543 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
8544 {
8545         switch(nspace_type) {
8546                 case NSPACE_HANDLER_NSPACE:
8547                         return NSPACE_ITEM_NSPACE_EVENT;
8548                 case NSPACE_HANDLER_SNAPSHOT:
8549                         return NSPACE_ITEM_SNAPSHOT_EVENT;
8550                 default:
8551                         printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
8552                         return 0;
8553         }
8554 }
8555
8556 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
8557 {
8558         switch(nspace_type) {
8559                 case NSPACE_HANDLER_NSPACE:
8560                         return FREAD | FWRITE | O_EVTONLY;
8561                 case NSPACE_HANDLER_SNAPSHOT:
8562                         return FREAD | O_EVTONLY;
8563                 default:
8564                         printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
8565                         return 0;
8566         }
8567 }
8568
8569 static inline nspace_type_t nspace_type_for_op(uint64_t op)
8570 {
8571         switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
8572                 case NAMESPACE_HANDLER_NSPACE_EVENT:
8573                         return NSPACE_HANDLER_NSPACE;
8574                 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
8575                         return NSPACE_HANDLER_SNAPSHOT;
8576                 default:
8577                         printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
8578                         return NSPACE_HANDLER_NSPACE;
8579         }
8580 }
8581
8582 static inline int nspace_is_special_process(struct proc *proc)
8583 {
8584         int i;
8585         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8586                 if (proc == nspace_handlers[i].handler_proc)
8587                         return 1;
8588         }
8589         return 0;
8590 }
8591
8592 void
8593 nspace_handler_init(void)
8594 {
8595         nspace_lock_attr    = lck_attr_alloc_init();
8596         nspace_group_attr   = lck_grp_attr_alloc_init();
8597         nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
8598         lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
8599         lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
8600         memset(&nspace_items[0], 0, sizeof(nspace_items));
8601 }
8602
8603 void
8604 nspace_proc_exit(struct proc *p)
8605 {
8606         int i, event_mask = 0;
8607
8608         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8609                 if (p == nspace_handlers[i].handler_proc) {
8610                         event_mask |= nspace_item_flags_for_type(i);
8611                         nspace_handlers[i].handler_tid = 0;
8612                         nspace_handlers[i].handler_proc = NULL;
8613                 }
8614         }
8615
8616         if (event_mask == 0) {
8617                 return;
8618         }
8619
8620         if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
8621                 // if this process was the snapshot handler, zero snapshot_timeout
8622                 snapshot_timestamp = 0;
8623         }
8624
8625         //
8626         // unblock anyone that's waiting for the handler that died
8627         //
8628         lck_mtx_lock(&nspace_handler_lock);
8629         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8630                 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
8631
8632                         if ( nspace_items[i].flags & event_mask ) {
8633
8634                                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
8635                                         vnode_lock_spin(nspace_items[i].vp);
8636                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8637                                         vnode_unlock(nspace_items[i].vp);
8638                                 }
8639                                 nspace_items[i].vp = NULL;
8640                                 nspace_items[i].vid = 0;
8641                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
8642                                 nspace_items[i].token = 0;
8643
8644                                 wakeup((caddr_t)&(nspace_items[i].vp));
8645                         }
8646                 }
8647         }
8648
8649         wakeup((caddr_t)&nspace_item_idx);
8650         lck_mtx_unlock(&nspace_handler_lock);
8651 }
8652
8653
/*
 * Convenience form of resolve_nspace_item_ext() for callers that have no
 * extra argument to pass along; returns whatever that function returns.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
        void *no_arg = NULL;

        return resolve_nspace_item_ext(vp, op, no_arg);
}
8659
8660 int
8661 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
8662 {
8663         int i, error, keep_waiting;
8664         struct timespec ts;
8665         nspace_type_t nspace_type = nspace_type_for_op(op);
8666
8667         // only allow namespace events on regular files, directories and symlinks.
8668         if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
8669                 return 0;
8670         }
8671
8672         //
8673         // if this is a snapshot event and the vnode is on a
8674         // disk image just pretend nothing happened since any
8675         // change to the disk image will cause the disk image
8676         // itself to get backed up and this avoids multi-way
8677         // deadlocks between the snapshot handler and the ever
8678         // popular diskimages-helper process.  the variable
8679         // nspace_allow_virtual_devs allows this behavior to
8680         // be overridden (for use by the Mobile TimeMachine
8681         // testing infrastructure which uses disk images)
8682         //
8683         if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
8684             && (vp->v_mount != NULL)
8685             && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
8686             && !nspace_allow_virtual_devs) {
8687
8688                 return 0;
8689         }
8690
8691         // if (thread_tid(current_thread()) == namespace_handler_tid) {
8692         if (nspace_handlers[nspace_type].handler_proc == NULL) {
8693                 return 0;
8694         }
8695
8696         if (nspace_is_special_process(current_proc())) {
8697                 return EDEADLK;
8698         }
8699
8700         lck_mtx_lock(&nspace_handler_lock);
8701
8702 retry:
8703         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8704                 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
8705                         break;
8706                 }
8707         }
8708
8709         if (i >= MAX_NSPACE_ITEMS) {
8710                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8711                         if (nspace_items[i].flags == 0) {
8712                                 break;
8713                         }
8714                 }
8715         } else {
8716                 nspace_items[i].refcount++;
8717         }
8718
8719         if (i >= MAX_NSPACE_ITEMS) {
8720                 ts.tv_sec = nspace_handler_timeout;
8721                 ts.tv_nsec = 0;
8722
8723                 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
8724                 if (error == 0) {
8725                         // an entry got free'd up, go see if we can get a slot
8726                         goto retry;
8727                 } else {
8728                         lck_mtx_unlock(&nspace_handler_lock);
8729                         return error;
8730                 }
8731         }
8732
8733         //
8734         // if it didn't already exist, add it.  if it did exist
8735         // we'll get woken up when someone does a wakeup() on
8736         // the slot in the nspace_items table.
8737         //
8738         if (vp != nspace_items[i].vp) {
8739                 nspace_items[i].vp = vp;
8740                 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
8741                 nspace_items[i].op = op;
8742                 nspace_items[i].vid = vnode_vid(vp);
8743                 nspace_items[i].flags = NSPACE_ITEM_NEW;
8744                 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
8745                 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
8746                         if (arg) {
8747                                 vnode_lock_spin(vp);
8748                                 vp->v_flag |= VNEEDSSNAPSHOT;
8749                                 vnode_unlock(vp);
8750                         }
8751                 }
8752
8753                 nspace_items[i].token = 0;
8754                 nspace_items[i].refcount = 1;
8755
8756                 wakeup((caddr_t)&nspace_item_idx);
8757         }
8758
8759         //
8760         // Now go to sleep until the handler does a wakeup on this
8761         // slot in the nspace_items table (or we timeout).
8762         //
8763         keep_waiting = 1;
8764         while(keep_waiting) {
8765                 ts.tv_sec = nspace_handler_timeout;
8766                 ts.tv_nsec = 0;
8767                 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
8768
8769                 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
8770                         error = 0;
8771                 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
8772                         error = nspace_items[i].token;
8773                 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
8774                         if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
8775                                 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
8776                                 continue;
8777                         } else {
8778                                 error = ETIMEDOUT;
8779                         }
8780                 } else if (error == 0) {
8781                         // hmmm, why did we get woken up?
8782                         printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
8783                                nspace_items[i].token);
8784                 }
8785
8786                 if (--nspace_items[i].refcount == 0) {
8787                         nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
8788                         nspace_items[i].arg = NULL;
8789                         nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
8790                         nspace_items[i].flags = 0;     // this clears it for re-use
8791                 }
8792                 wakeup(&nspace_token_id);
8793                 keep_waiting = 0;
8794         }
8795
8796         lck_mtx_unlock(&nspace_handler_lock);
8797
8798         return error;
8799 }
8800
8801
8802 int
8803 get_nspace_item_status(struct vnode *vp, int32_t *status)
8804 {
8805         int i;
8806
8807         lck_mtx_lock(&nspace_handler_lock);
8808         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8809                 if (nspace_items[i].vp == vp) {
8810                         break;
8811                 }
8812         }
8813
8814         if (i >= MAX_NSPACE_ITEMS) {
8815                 lck_mtx_unlock(&nspace_handler_lock);
8816                 return ENOENT;
8817         }
8818
8819         *status = nspace_items[i].flags;
8820         lck_mtx_unlock(&nspace_handler_lock);
8821         return 0;
8822 }
8823
8824
8825 #if 0
8826 static int
8827 build_volfs_path(struct vnode *vp, char *path, int *len)
8828 {
8829         struct vnode_attr va;
8830         int ret;
8831
8832         VATTR_INIT(&va);
8833         VATTR_WANTED(&va, va_fsid);
8834         VATTR_WANTED(&va, va_fileid);
8835
8836         if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
8837                 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
8838                 ret = -1;
8839         } else {
8840                 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
8841                 ret = 0;
8842         }
8843
8844         return ret;
8845 }
8846 #endif
8847
8848 //
8849 // Note: this function does NOT check permissions on all of the
8850 // parent directories leading to this vnode.  It should only be
8851 // called on behalf of a root process.  Otherwise a process may
8852 // get access to a file because the file itself is readable even
8853 // though its parent directories would prevent access.
8854 //
8855 static int
8856 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
8857 {
8858         int error, action;
8859
8860         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8861                 return error;
8862         }
8863
8864 #if CONFIG_MACF
8865         error = mac_vnode_check_open(ctx, vp, fmode);
8866         if (error)
8867                 return error;
8868 #endif
8869
8870         /* compute action to be authorized */
8871         action = 0;
8872         if (fmode & FREAD) {
8873                 action |= KAUTH_VNODE_READ_DATA;
8874         }
8875         if (fmode & (FWRITE | O_TRUNC)) {
8876                 /*
8877                  * If we are writing, appending, and not truncating,
8878                  * indicate that we are appending so that if the
8879                  * UF_APPEND or SF_APPEND bits are set, we do not deny
8880                  * the open.
8881                  */
8882                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
8883                         action |= KAUTH_VNODE_APPEND_DATA;
8884                 } else {
8885                         action |= KAUTH_VNODE_WRITE_DATA;
8886                 }
8887         }
8888
8889         if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
8890                 return error;
8891
8892
8893         //
8894         // if the vnode is tagged VOPENEVT and the current process
8895         // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
8896         // flag to the open mode so that this open won't count against
8897         // the vnode when carbon delete() does a vnode_isinuse() to see
8898         // if a file is currently in use.  this allows spotlight
8899         // importers to not interfere with carbon apps that depend on
8900         // the no-delete-if-busy semantics of carbon delete().
8901         //
8902         if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
8903                 fmode |= O_EVTONLY;
8904         }
8905
8906         if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
8907                 return error;
8908         }
8909         if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
8910                 VNOP_CLOSE(vp, fmode, ctx);
8911                 return error;
8912         }
8913
8914         /* Call out to allow 3rd party notification of open.
8915          * Ignore result of kauth_authorize_fileop call.
8916          */
8917 #if CONFIG_MACF
8918         mac_vnode_notify_open(ctx, vp, fmode);
8919 #endif
8920         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
8921                                (uintptr_t)vp, 0);
8922
8923
8924         return 0;
8925 }
8926
8927 static int
8928 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
8929 {
8930         int i, error=0, unblock=0;
8931         task_t curtask;
8932
8933         lck_mtx_lock(&nspace_handler_exclusion_lock);
8934         if (nspace_handlers[nspace_type].handler_busy) {
8935                 lck_mtx_unlock(&nspace_handler_exclusion_lock);
8936                 return EBUSY;
8937         }
8938         nspace_handlers[nspace_type].handler_busy = 1;
8939         lck_mtx_unlock(&nspace_handler_exclusion_lock);
8940
8941         /*
8942          * Any process that gets here will be one of the namespace handlers.
8943          * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
8944          * as we can cause deadlocks to occur, because the namespace handler may prevent
8945          * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
8946          * process.
8947          */
8948         curtask = current_task();
8949         bsd_set_dependency_capable (curtask);
8950
8951         lck_mtx_lock(&nspace_handler_lock);
8952         if (nspace_handlers[nspace_type].handler_proc == NULL) {
8953                 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
8954                 nspace_handlers[nspace_type].handler_proc = current_proc();
8955         }
8956
8957         while (error == 0) {
8958
8959                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8960                         if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
8961                                 if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
8962                                         continue;
8963                                 }
8964                                 break;
8965                         }
8966                 }
8967
8968                 if (i < MAX_NSPACE_ITEMS) {
8969                         nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
8970                         nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
8971                         nspace_items[i].token  = ++nspace_token_id;
8972
8973                         if (nspace_items[i].vp) {
8974                                 struct fileproc *fp;
8975                                 int32_t indx, fmode;
8976                                 struct proc *p = current_proc();
8977                                 vfs_context_t ctx = vfs_context_current();
8978                                 struct vnode_attr va;
8979
8980
8981                                 /*
8982                                  * Use vnode pointer to acquire a file descriptor for
8983                                  * hand-off to userland
8984                                  */
8985                                 fmode = nspace_open_flags_for_type(nspace_type);
8986                                 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
8987                                 if (error) {
8988                                         unblock = 1;
8989                                         break;
8990                                 }
8991                                 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
8992                                 if (error) {
8993                                         unblock = 1;
8994                                         vnode_put(nspace_items[i].vp);
8995                                         break;
8996                                 }
8997
8998                                 if ((error = falloc(p, &fp, &indx, ctx))) {
8999                                         vn_close(nspace_items[i].vp, fmode, ctx);
9000                                         vnode_put(nspace_items[i].vp);
9001                                         unblock = 1;
9002                                         break;
9003                                 }
9004
9005                                 fp->f_fglob->fg_flag = fmode;
9006                                 fp->f_fglob->fg_ops = &vnops;
9007                                 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9008
9009                                 proc_fdlock(p);
9010                                 procfdtbl_releasefd(p, indx, NULL);
9011                                 fp_drop(p, indx, fp, 1);
9012                                 proc_fdunlock(p);
9013
9014                                 /*
9015                                  * All variants of the namespace handler struct support these three fields:
9016                                  * token, flags, and the FD pointer
9017                                  */
9018                                 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9019                                 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9020                                 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9021
9022                                 /*
9023                                  * Handle optional fields:
9024                                  * extended version support an info ptr (offset, length), and the
9025                                  *
9026                                  * namedata version supports a unique per-link object ID
9027                                  *
9028                                  */
9029                                 if (nhd->infoptr) {
9030                                         uio_t uio = (uio_t)nspace_items[i].arg;
9031                                         uint64_t u_offset, u_length;
9032
9033                                         if (uio) {
9034                                                 u_offset = uio_offset(uio);
9035                                                 u_length = uio_resid(uio);
9036                                         } else {
9037                                                 u_offset = 0;
9038                                                 u_length = 0;
9039                                         }
9040                                         error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9041                                         error = copyout(&u_length, nhd->infoptr+sizeof(uint64_t), sizeof(uint64_t));
9042                                 }
9043
9044                                 if (nhd->objid) {
9045                                         VATTR_INIT(&va);
9046                                         VATTR_WANTED(&va, va_linkid);
9047                                         error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9048                                         if (error == 0 ) {
9049                                                 uint64_t linkid = 0;
9050                                                 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9051                                                         linkid = (uint64_t)va.va_linkid;
9052                                                 }
9053                                                 error = copyout (&linkid, nhd->objid, sizeof(uint64_t));
9054                                         }
9055                                 }
9056
9057                                 if (error) {
9058                                         vn_close(nspace_items[i].vp, fmode, ctx);
9059                                         fp_free(p, indx, fp);
9060                                         unblock = 1;
9061                                 }
9062
9063                                 vnode_put(nspace_items[i].vp);
9064
9065                                 break;
9066                         } else {
9067                                 printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n",
9068                                        i, nspace_items[i].vp, error, nspace_items[i].vp->v_name);
9069                         }
9070
9071                 } else {
9072                         error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9073                         if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9074                                 error = EINVAL;
9075                                 break;
9076                         }
9077
9078                 }
9079         }
9080
9081         if (unblock) {
9082                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9083                         vnode_lock_spin(nspace_items[i].vp);
9084                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9085                         vnode_unlock(nspace_items[i].vp);
9086                 }
9087                 nspace_items[i].vp = NULL;
9088                 nspace_items[i].vid = 0;
9089                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9090                 nspace_items[i].token = 0;
9091
9092                 wakeup((caddr_t)&(nspace_items[i].vp));
9093         }
9094
9095         if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9096                 // just go through every snapshot event and unblock it immediately.
9097                 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9098                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9099                                 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9100                                         if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9101                                                 nspace_items[i].vp = NULL;
9102                                                 nspace_items[i].vid = 0;
9103                                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9104                                                 nspace_items[i].token = 0;
9105
9106                                                 wakeup((caddr_t)&(nspace_items[i].vp));
9107                                         }
9108                                 }
9109                         }
9110                 }
9111         }
9112
9113         lck_mtx_unlock(&nspace_handler_lock);
9114
9115         lck_mtx_lock(&nspace_handler_exclusion_lock);
9116         nspace_handlers[nspace_type].handler_busy = 0;
9117         lck_mtx_unlock(&nspace_handler_exclusion_lock);
9118
9119         return error;
9120 }
9121
9122 static inline int validate_namespace_args (int is64bit, int size) {
9123
9124         if (is64bit) {
9125                 /* Must be one of these */
9126                 if (size == sizeof(user64_namespace_handler_info)) {
9127                         goto sizeok;
9128                 }
9129                 if (size == sizeof(user64_namespace_handler_info_ext)) {
9130                         goto sizeok;
9131                 }
9132                 if (size == sizeof(user64_namespace_handler_data)) {
9133                         goto sizeok;
9134                 }
9135                 return EINVAL;
9136         }
9137         else {
9138                 /* 32 bit -- must be one of these */
9139                 if (size == sizeof(user32_namespace_handler_info)) {
9140                         goto sizeok;
9141                 }
9142                 if (size == sizeof(user32_namespace_handler_info_ext)) {
9143                         goto sizeok;
9144                 }
9145                 if (size == sizeof(user32_namespace_handler_data)) {
9146                         goto sizeok;
9147                 }
9148                 return EINVAL;
9149         }
9150
9151 sizeok:
9152
9153         return 0;
9154
9155 }
9156
9157 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9158 {
9159         int error = 0;
9160         namespace_handler_data nhd;
9161
9162         bzero (&nhd, sizeof(namespace_handler_data));
9163
9164         if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9165                         (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9166                 return EINVAL;
9167         }
9168
9169         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9170                 return error;
9171         }
9172
9173         error = validate_namespace_args (is64bit, size);
9174         if (error) {
9175                 return error;
9176         }
9177
9178         /* Copy in the userland pointers into our kernel-only struct */
9179
9180         if (is64bit) {
9181                 /* 64 bit userland structures */
9182                 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9183                 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9184                 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
9185
9186                 /* If the size is greater than the standard info struct, add in extra fields */
9187                 if (size > (sizeof(user64_namespace_handler_info))) {
9188                         if (size >= (sizeof(user64_namespace_handler_info_ext))) {
9189                                 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
9190                         }
9191                         if (size == (sizeof(user64_namespace_handler_data))) {
9192                                 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
9193                         }
9194                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9195                 }
9196         }
9197         else {
9198                 /* 32 bit userland structures */
9199                 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
9200                 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
9201                 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
9202
9203                 if (size > (sizeof(user32_namespace_handler_info))) {
9204                         if (size >= (sizeof(user32_namespace_handler_info_ext))) {
9205                                 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
9206                         }
9207                         if (size == (sizeof(user32_namespace_handler_data))) {
9208                                 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
9209                         }
9210                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9211                 }
9212         }
9213
9214         return wait_for_namespace_event(&nhd, nspace_type);
9215 }
9216
9217 /*
9218  * Make a filesystem-specific control call:
9219  */
9220 /* ARGSUSED */
9221 static int
9222 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
9223 {
9224         int error=0;
9225         boolean_t is64bit;
9226         u_int size;
9227 #define STK_PARAMS 128
9228         char stkbuf[STK_PARAMS];
9229         caddr_t data, memp;
9230         vnode_t vp = *arg_vp;
9231
9232         size = IOCPARM_LEN(cmd);
9233         if (size > IOCPARM_MAX) return (EINVAL);
9234
9235         is64bit = proc_is64bit(p);
9236
9237         memp = NULL;
9238         if (size > sizeof (stkbuf)) {
9239                 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
9240                 data = memp;
9241         } else {
9242                 data = &stkbuf[0];
9243         };
9244
9245         if (cmd & IOC_IN) {
9246                 if (size) {
9247                         error = copyin(udata, data, size);
9248                         if (error) {
9249                                 if (memp) {
9250                                         kfree (memp, size);
9251                                 }
9252                                 return error;
9253                         }
9254                 } else {
9255                         if (is64bit) {
9256                                 *(user_addr_t *)data = udata;
9257                         }
9258                         else {
9259                                 *(uint32_t *)data = (uint32_t)udata;
9260                         }
9261                 };
9262         } else if ((cmd & IOC_OUT) && size) {
9263                 /*
9264                  * Zero the buffer so the user always
9265                  * gets back something deterministic.
9266                  */
9267                 bzero(data, size);
9268         } else if (cmd & IOC_VOID) {
9269                 if (is64bit) {
9270                         *(user_addr_t *)data = udata;
9271                 }
9272                 else {
9273                         *(uint32_t *)data = (uint32_t)udata;
9274                 }
9275         }
9276
9277         /* Check to see if it's a generic command */
9278         switch (IOCBASECMD(cmd)) {
9279
9280                 case FSCTL_SYNC_VOLUME: {
9281                         mount_t mp = vp->v_mount;
9282                         int arg = *(uint32_t*)data;
9283
9284                         /* record vid of vp so we can drop it below. */
9285                         uint32_t vvid = vp->v_id;
9286
9287                         /*
9288                          * Then grab mount_iterref so that we can release the vnode.
9289                          * Without this, a thread may call vnode_iterate_prepare then
9290                          * get into a deadlock because we've never released the root vp
9291                          */
9292                         error = mount_iterref (mp, 0);
9293                         if (error)  {
9294                                 break;
9295                         }
9296                         vnode_put(vp);
9297
9298                         /* issue the sync for this volume */
9299                         (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
9300
9301                         /*
9302                          * Then release the mount_iterref once we're done syncing; it's not
9303                          * needed for the VNOP_IOCTL below
9304                          */
9305                         mount_iterdrop(mp);
9306
9307                         if (arg & FSCTL_SYNC_FULLSYNC) {
9308                                 /* re-obtain vnode iocount on the root vp, if possible */
9309                                 error = vnode_getwithvid (vp, vvid);
9310                                 if (error == 0) {
9311                                         error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
9312                                         vnode_put (vp);
9313                                 }
9314                         }
9315                         /* mark the argument VP as having been released */
9316                         *arg_vp = NULL;
9317                 }
9318                 break;
9319
9320                 case FSCTL_SET_PACKAGE_EXTS: {
9321                         user_addr_t ext_strings;
9322                         uint32_t    num_entries;
9323                         uint32_t    max_width;
9324
9325                         if (   (is64bit && size != sizeof(user64_package_ext_info))
9326                                         || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
9327
9328                                 // either you're 64-bit and passed a 64-bit struct or
9329                                 // you're 32-bit and passed a 32-bit struct.  otherwise
9330                                 // it's not ok.
9331                                 error = EINVAL;
9332                                 break;
9333                         }
9334
9335                         if (is64bit) {
9336                                 ext_strings = ((user64_package_ext_info *)data)->strings;
9337                                 num_entries = ((user64_package_ext_info *)data)->num_entries;
9338                                 max_width   = ((user64_package_ext_info *)data)->max_width;
9339                         } else {
9340                                 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
9341                                 num_entries = ((user32_package_ext_info *)data)->num_entries;
9342                                 max_width   = ((user32_package_ext_info *)data)->max_width;
9343                         }
9344                         error = set_package_extensions_table(ext_strings, num_entries, max_width);
9345                 }
9346                 break;
9347
9348                 /* namespace handlers */
9349                 case FSCTL_NAMESPACE_HANDLER_GET: {
9350                         error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
9351                 }
9352                 break;
9353
9354                 /* Snapshot handlers */
9355                 case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
9356                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
9357                 }
9358                 break;
9359
9360                 case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
9361                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
9362                 }
9363                 break;
9364
9365                 case FSCTL_NAMESPACE_HANDLER_UPDATE: {
9366                         uint32_t token, val;
9367                         int i;
9368
9369                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9370                                 break;
9371                         }
9372
9373                         if (!nspace_is_special_process(p)) {
9374                                 error = EINVAL;
9375                                 break;
9376                         }
9377
9378                         token = ((uint32_t *)data)[0];
9379                         val   = ((uint32_t *)data)[1];
9380
9381                         lck_mtx_lock(&nspace_handler_lock);
9382
9383                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9384                                 if (nspace_items[i].token == token) {
9385                                         break;  /* exit for loop, not case stmt */
9386                                 }
9387                         }
9388
9389                         if (i >= MAX_NSPACE_ITEMS) {
9390                                 error = ENOENT;
9391                         } else {
9392                                 //
9393                                 // if this bit is set, when resolve_nspace_item() times out
9394                                 // it will loop and go back to sleep.
9395                                 //
9396                                 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
9397                         }
9398
9399                         lck_mtx_unlock(&nspace_handler_lock);
9400
9401                         if (error) {
9402                                 printf("nspace-handler-update: did not find token %u\n", token);
9403                         }
9404                 }
9405                 break;
9406
9407                 case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
9408                         uint32_t token, val;
9409                         int i;
9410
9411                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9412                                 break;
9413                         }
9414
9415                         if (!nspace_is_special_process(p)) {
9416                                 error = EINVAL;
9417                                 break;
9418                         }
9419
9420                         token = ((uint32_t *)data)[0];
9421                         val   = ((uint32_t *)data)[1];
9422
9423                         lck_mtx_lock(&nspace_handler_lock);
9424
9425                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9426                                 if (nspace_items[i].token == token) {
9427                                         break; /* exit for loop, not case statement */
9428                                 }
9429                         }
9430
9431                         if (i >= MAX_NSPACE_ITEMS) {
9432                                 printf("nspace-handler-unblock: did not find token %u\n", token);
9433                                 error = ENOENT;
9434                         } else {
9435                                 if (val == 0 && nspace_items[i].vp) {
9436                                         vnode_lock_spin(nspace_items[i].vp);
9437                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9438                                         vnode_unlock(nspace_items[i].vp);
9439                                 }
9440
9441                                 nspace_items[i].vp = NULL;
9442                                 nspace_items[i].arg = NULL;
9443                                 nspace_items[i].op = 0;
9444                                 nspace_items[i].vid = 0;
9445                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9446                                 nspace_items[i].token = 0;
9447
9448                                 wakeup((caddr_t)&(nspace_items[i].vp));
9449                         }
9450
9451                         lck_mtx_unlock(&nspace_handler_lock);
9452                 }
9453                 break;
9454
9455                 case FSCTL_NAMESPACE_HANDLER_CANCEL: {
9456                         uint32_t token, val;
9457                         int i;
9458
9459                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9460                                 break;
9461                         }
9462
9463                         if (!nspace_is_special_process(p)) {
9464                                 error = EINVAL;
9465                                 break;
9466                         }
9467
9468                         token = ((uint32_t *)data)[0];
9469                         val   = ((uint32_t *)data)[1];
9470
9471                         lck_mtx_lock(&nspace_handler_lock);
9472
9473                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9474                                 if (nspace_items[i].token == token) {
9475                                         break;  /* exit for loop, not case stmt */
9476                                 }
9477                         }
9478
9479                         if (i >= MAX_NSPACE_ITEMS) {
9480                                 printf("nspace-handler-cancel: did not find token %u\n", token);
9481                                 error = ENOENT;
9482                         } else {
9483                                 if (nspace_items[i].vp) {
9484                                         vnode_lock_spin(nspace_items[i].vp);
9485                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9486                                         vnode_unlock(nspace_items[i].vp);
9487                                 }
9488
9489                                 nspace_items[i].vp = NULL;
9490                                 nspace_items[i].arg = NULL;
9491                                 nspace_items[i].vid = 0;
9492                                 nspace_items[i].token = val;
9493                                 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
9494                                 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
9495
9496                                 wakeup((caddr_t)&(nspace_items[i].vp));
9497                         }
9498
9499                         lck_mtx_unlock(&nspace_handler_lock);
9500                 }
9501                 break;
9502
9503                 case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
9504                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9505                                 break;
9506                         }
9507
9508                         // we explicitly do not do the namespace_handler_proc check here
9509
9510                         lck_mtx_lock(&nspace_handler_lock);
9511                         snapshot_timestamp = ((uint32_t *)data)[0];
9512                         wakeup(&nspace_item_idx);
9513                         lck_mtx_unlock(&nspace_handler_lock);
9514                         printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
9515
9516                 }
9517                 break;
9518
9519                 case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
9520                 {
9521                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9522                                 break;
9523                         }
9524
9525                         lck_mtx_lock(&nspace_handler_lock);
9526                         nspace_allow_virtual_devs = ((uint32_t *)data)[0];
9527                         lck_mtx_unlock(&nspace_handler_lock);
9528                         printf("nspace-snapshot-handler will%s allow events on disk-images\n",
9529                                         nspace_allow_virtual_devs ? "" : " NOT");
9530                         error = 0;
9531
9532                 }
9533                 break;
9534
9535                 case FSCTL_SET_FSTYPENAME_OVERRIDE:
9536                 {
9537                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9538                                 break;
9539                         }
9540                         if (vp->v_mount) {
9541                                 mount_lock(vp->v_mount);
9542                                 if (data[0] != 0) {
9543                                         strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
9544                                         vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
9545                                         if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
9546                                                 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
9547                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
9548                                         }
9549                                 } else {
9550                                         if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
9551                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
9552                                         }
9553                                         vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
9554                                         vp->v_mount->fstypename_override[0] = '\0';
9555                                 }
9556                                 mount_unlock(vp->v_mount);
9557                         }
9558                 }
9559                 break;
9560
9561                 default: {
9562                         /* Invoke the filesystem-specific code */
9563                         error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
9564                 }
9565
9566         } /* end switch stmt */
9567
9568         /*
9569          * if no errors, copy any data to user. Size was
9570          * already set and checked above.
9571          */
9572         if (error == 0 && (cmd & IOC_OUT) && size)
9573                 error = copyout(data, udata, size);
9574
9575         if (memp) {
9576                 kfree(memp, size);
9577         }
9578
9579         return error;
9580 }
9581
9582 /* ARGSUSED */
9583 int
9584 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
9585 {
9586         int error;
9587         struct nameidata nd;
9588         u_long nameiflags;
9589         vnode_t vp = NULL;
9590         vfs_context_t ctx = vfs_context_current();
9591
9592         AUDIT_ARG(cmd, uap->cmd);
9593         AUDIT_ARG(value32, uap->options);
9594         /* Get the vnode for the file we are getting info on:  */
9595         nameiflags = 0;
9596         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9597         NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
9598                UIO_USERSPACE, uap->path, ctx);
9599         if ((error = namei(&nd))) goto done;
9600         vp = nd.ni_vp;
9601         nameidone(&nd);
9602
9603 #if CONFIG_MACF
9604         error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
9605         if (error) {
9606                 goto done;
9607         }
9608 #endif
9609
9610         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9611
9612 done:
9613         if (vp)
9614                 vnode_put(vp);
9615         return error;
9616 }
9617 /* ARGSUSED */
9618 int
9619 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
9620 {
9621         int error;
9622         vnode_t vp = NULL;
9623         vfs_context_t ctx = vfs_context_current();
9624         int fd = -1;
9625
9626         AUDIT_ARG(fd, uap->fd);
9627         AUDIT_ARG(cmd, uap->cmd);
9628         AUDIT_ARG(value32, uap->options);
9629
9630         /* Get the vnode for the file we are getting info on:  */
9631         if ((error = file_vnode(uap->fd, &vp)))
9632                 goto done;
9633         fd = uap->fd;
9634         if ((error = vnode_getwithref(vp))) {
9635                 goto done;
9636         }
9637
9638 #if CONFIG_MACF
9639         error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
9640         if (error) {
9641                 goto done;
9642         }
9643 #endif
9644
9645         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9646
9647 done:
9648         if (fd != -1)
9649                 file_drop(fd);
9650
9651         if (vp)
9652                 vnode_put(vp);
9653         return error;
9654 }
9655 /* end of fsctl system call */
9656
9657 /*
9658  *  Retrieve the data of an extended attribute.
9659  */
9660 int
9661 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
9662 {
9663         vnode_t vp;
9664         struct nameidata nd;
9665         char attrname[XATTR_MAXNAMELEN+1];
9666         vfs_context_t ctx = vfs_context_current();
9667         uio_t auio = NULL;
9668         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9669         size_t attrsize = 0;
9670         size_t namelen;
9671         u_int32_t nameiflags;
9672         int error;
9673         char uio_buf[ UIO_SIZEOF(1) ];
9674
9675         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9676                 return (EINVAL);
9677
9678         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9679         NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
9680         if ((error = namei(&nd))) {
9681                 return (error);
9682         }
9683         vp = nd.ni_vp;
9684         nameidone(&nd);
9685
9686         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9687                 goto out;
9688         }
9689         if (xattr_protected(attrname)) {
9690                 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
9691                         error = EPERM;
9692                         goto out;
9693                 }
9694         }
9695         /*
9696          * the specific check for 0xffffffff is a hack to preserve
9697          * binaray compatibilty in K64 with applications that discovered
9698          * that passing in a buf pointer and a size of -1 resulted in
9699          * just the size of the indicated extended attribute being returned.
9700          * this isn't part of the documented behavior, but because of the
9701          * original implemtation's check for "uap->size > 0", this behavior
9702          * was allowed. In K32 that check turned into a signed comparison
9703          * even though uap->size is unsigned...  in K64, we blow by that
9704          * check because uap->size is unsigned and doesn't get sign smeared
9705          * in the munger for a 32 bit user app.  we also need to add a
9706          * check to limit the maximum size of the buffer being passed in...
9707          * unfortunately, the underlying fileystems seem to just malloc
9708          * the requested size even if the actual extended attribute is tiny.
9709          * because that malloc is for kernel wired memory, we have to put a
9710          * sane limit on it.
9711          *
9712          * U32 running on K64 will yield 0x00000000ffffffff for uap->size
9713          * U64 running on K64 will yield -1 (64 bits wide)
9714          * U32/U64 running on K32 will yield -1 (32 bits wide)
9715          */
9716         if (uap->size == 0xffffffff || uap->size == (size_t)-1)
9717                 goto no_uio;
9718
9719         if (uap->value) {
9720                 if (uap->size > (size_t)XATTR_MAXSIZE)
9721                         uap->size = XATTR_MAXSIZE;
9722
9723                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9724                                             &uio_buf[0], sizeof(uio_buf));
9725                 uio_addiov(auio, uap->value, uap->size);
9726         }
9727 no_uio:
9728         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
9729 out:
9730         vnode_put(vp);
9731
9732         if (auio) {
9733                 *retval = uap->size - uio_resid(auio);
9734         } else {
9735                 *retval = (user_ssize_t)attrsize;
9736         }
9737
9738         return (error);
9739 }
9740
9741 /*
9742  * Retrieve the data of an extended attribute.
9743  */
9744 int
9745 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
9746 {
9747         vnode_t vp;
9748         char attrname[XATTR_MAXNAMELEN+1];
9749         uio_t auio = NULL;
9750         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9751         size_t attrsize = 0;
9752         size_t namelen;
9753         int error;
9754         char uio_buf[ UIO_SIZEOF(1) ];
9755
9756         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9757                 return (EINVAL);
9758
9759         if ( (error = file_vnode(uap->fd, &vp)) ) {
9760                 return (error);
9761         }
9762         if ( (error = vnode_getwithref(vp)) ) {
9763                 file_drop(uap->fd);
9764                 return(error);
9765         }
9766         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9767                 goto out;
9768         }
9769         if (xattr_protected(attrname)) {
9770                 error = EPERM;
9771                 goto out;
9772         }
9773         if (uap->value && uap->size > 0) {
9774                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9775                                             &uio_buf[0], sizeof(uio_buf));
9776                 uio_addiov(auio, uap->value, uap->size);
9777         }
9778
9779         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
9780 out:
9781         (void)vnode_put(vp);
9782         file_drop(uap->fd);
9783
9784         if (auio) {
9785                 *retval = uap->size - uio_resid(auio);
9786         } else {
9787                 *retval = (user_ssize_t)attrsize;
9788         }
9789         return (error);
9790 }
9791
9792 /*
9793  * Set the data of an extended attribute.
9794  */
9795 int
9796 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
9797 {
9798         vnode_t vp;
9799         struct nameidata nd;
9800         char attrname[XATTR_MAXNAMELEN+1];
9801         vfs_context_t ctx = vfs_context_current();
9802         uio_t auio = NULL;
9803         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9804         size_t namelen;
9805         u_int32_t nameiflags;
9806         int error;
9807         char uio_buf[ UIO_SIZEOF(1) ];
9808
9809         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9810                 return (EINVAL);
9811
9812         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9813                 if (error == EPERM) {
9814                         /* if the string won't fit in attrname, copyinstr emits EPERM */
9815                         return (ENAMETOOLONG);
9816                 }
9817                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9818                 return error;
9819         }
9820         if (xattr_protected(attrname))
9821                 return(EPERM);
9822         if (uap->size != 0 && uap->value == 0) {
9823                 return (EINVAL);
9824         }
9825
9826         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9827         NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
9828         if ((error = namei(&nd))) {
9829                 return (error);
9830         }
9831         vp = nd.ni_vp;
9832         nameidone(&nd);
9833
9834         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9835                                     &uio_buf[0], sizeof(uio_buf));
9836         uio_addiov(auio, uap->value, uap->size);
9837
9838         error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
9839 #if CONFIG_FSE
9840         if (error == 0) {
9841                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9842                     FSE_ARG_VNODE, vp,
9843                     FSE_ARG_DONE);
9844         }
9845 #endif
9846         vnode_put(vp);
9847         *retval = 0;
9848         return (error);
9849 }
9850
9851 /*
9852  * Set the data of an extended attribute.
9853  */
9854 int
9855 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
9856 {
9857         vnode_t vp;
9858         char attrname[XATTR_MAXNAMELEN+1];
9859         uio_t auio = NULL;
9860         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9861         size_t namelen;
9862         int error;
9863         char uio_buf[ UIO_SIZEOF(1) ];
9864 #if CONFIG_FSE
9865         vfs_context_t ctx = vfs_context_current();
9866 #endif
9867
9868         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9869                 return (EINVAL);
9870
9871         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9872                 return (error);
9873         }
9874         if (xattr_protected(attrname))
9875                 return(EPERM);
9876         if (uap->size != 0 && uap->value == 0) {
9877                 return (EINVAL);
9878         }
9879         if ( (error = file_vnode(uap->fd, &vp)) ) {
9880                 return (error);
9881         }
9882         if ( (error = vnode_getwithref(vp)) ) {
9883                 file_drop(uap->fd);
9884                 return(error);
9885         }
9886         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9887                                     &uio_buf[0], sizeof(uio_buf));
9888         uio_addiov(auio, uap->value, uap->size);
9889
9890         error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
9891 #if CONFIG_FSE
9892         if (error == 0) {
9893                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9894                     FSE_ARG_VNODE, vp,
9895                     FSE_ARG_DONE);
9896         }
9897 #endif
9898         vnode_put(vp);
9899         file_drop(uap->fd);
9900         *retval = 0;
9901         return (error);
9902 }
9903
9904 /*
9905  * Remove an extended attribute.
9906  * XXX Code duplication here.
9907  */
9908 int
9909 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
9910 {
9911         vnode_t vp;
9912         struct nameidata nd;
9913         char attrname[XATTR_MAXNAMELEN+1];
9914         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9915         vfs_context_t ctx = vfs_context_current();
9916         size_t namelen;
9917         u_int32_t nameiflags;
9918         int error;
9919
9920         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9921                 return (EINVAL);
9922
9923         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
9924         if (error != 0) {
9925                 return (error);
9926         }
9927         if (xattr_protected(attrname))
9928                 return(EPERM);
9929         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9930         NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
9931         if ((error = namei(&nd))) {
9932                 return (error);
9933         }
9934         vp = nd.ni_vp;
9935         nameidone(&nd);
9936
9937         error = vn_removexattr(vp, attrname, uap->options, ctx);
9938 #if CONFIG_FSE
9939         if (error == 0) {
9940                 add_fsevent(FSE_XATTR_REMOVED, ctx,
9941                     FSE_ARG_VNODE, vp,
9942                     FSE_ARG_DONE);
9943         }
9944 #endif
9945         vnode_put(vp);
9946         *retval = 0;
9947         return (error);
9948 }
9949
9950 /*
9951  * Remove an extended attribute.
9952  * XXX Code duplication here.
9953  */
9954 int
9955 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
9956 {
9957         vnode_t vp;
9958         char attrname[XATTR_MAXNAMELEN+1];
9959         size_t namelen;
9960         int error;
9961 #if CONFIG_FSE
9962         vfs_context_t ctx = vfs_context_current();
9963 #endif
9964
9965         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9966                 return (EINVAL);
9967
9968         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
9969         if (error != 0) {
9970                 return (error);
9971         }
9972         if (xattr_protected(attrname))
9973                 return(EPERM);
9974         if ( (error = file_vnode(uap->fd, &vp)) ) {
9975                 return (error);
9976         }
9977         if ( (error = vnode_getwithref(vp)) ) {
9978                 file_drop(uap->fd);
9979                 return(error);
9980         }
9981
9982         error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
9983 #if CONFIG_FSE
9984         if (error == 0) {
9985                 add_fsevent(FSE_XATTR_REMOVED, ctx,
9986                     FSE_ARG_VNODE, vp,
9987                     FSE_ARG_DONE);
9988         }
9989 #endif
9990         vnode_put(vp);
9991         file_drop(uap->fd);
9992         *retval = 0;
9993         return (error);
9994 }
9995
9996 /*
9997  * Retrieve the list of extended attribute names.
9998  * XXX Code duplication here.
9999  */
10000 int
10001 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10002 {
10003         vnode_t vp;
10004         struct nameidata nd;
10005         vfs_context_t ctx = vfs_context_current();
10006         uio_t auio = NULL;
10007         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10008         size_t attrsize = 0;
10009         u_int32_t nameiflags;
10010         int error;
10011         char uio_buf[ UIO_SIZEOF(1) ];
10012
10013         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10014                 return (EINVAL);
10015
10016         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10017         NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10018         if ((error = namei(&nd))) {
10019                 return (error);
10020         }
10021         vp = nd.ni_vp;
10022         nameidone(&nd);
10023         if (uap->namebuf != 0 && uap->bufsize > 0) {
10024                 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10025                                             &uio_buf[0], sizeof(uio_buf));
10026                 uio_addiov(auio, uap->namebuf, uap->bufsize);
10027         }
10028
10029         error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10030
10031         vnode_put(vp);
10032         if (auio) {
10033                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10034         } else {
10035                 *retval = (user_ssize_t)attrsize;
10036         }
10037         return (error);
10038 }
10039
10040 /*
10041  * Retrieve the list of extended attribute names.
10042  * XXX Code duplication here.
10043  */
10044 int
10045 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10046 {
10047         vnode_t vp;
10048         uio_t auio = NULL;
10049         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10050         size_t attrsize = 0;
10051         int error;
10052         char uio_buf[ UIO_SIZEOF(1) ];
10053
10054         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10055                 return (EINVAL);
10056
10057         if ( (error = file_vnode(uap->fd, &vp)) ) {
10058                 return (error);
10059         }
10060         if ( (error = vnode_getwithref(vp)) ) {
10061                 file_drop(uap->fd);
10062                 return(error);
10063         }
10064         if (uap->namebuf != 0 && uap->bufsize > 0) {
10065                 auio = uio_createwithbuffer(1, 0, spacetype,
10066                                                                           UIO_READ, &uio_buf[0], sizeof(uio_buf));
10067                 uio_addiov(auio, uap->namebuf, uap->bufsize);
10068         }
10069
10070         error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10071
10072         vnode_put(vp);
10073         file_drop(uap->fd);
10074         if (auio) {
10075                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10076         } else {
10077                 *retval = (user_ssize_t)attrsize;
10078         }
10079         return (error);
10080 }
10081
10082 static int fsgetpath_internal(
10083         vfs_context_t ctx, int volfs_id, uint64_t objid,
10084         vm_size_t bufsize, caddr_t buf, int *pathlen)
10085 {
10086         int error;
10087         struct mount *mp = NULL;
10088         vnode_t vp;
10089         int length;
10090         int bpflags;
10091
10092         if (bufsize > PAGE_SIZE) {
10093                 return (EINVAL);
10094         }
10095
10096         if (buf == NULL) {
10097                 return (ENOMEM);
10098         }
10099
10100         if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
10101                 error = ENOTSUP;  /* unexpected failure */
10102                 return ENOTSUP;
10103         }
10104
10105 unionget:
10106         if (objid == 2) {
10107                 error = VFS_ROOT(mp, &vp, ctx);
10108         } else {
10109                 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
10110         }
10111
10112         if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
10113                 /*
10114                  * If the fileid isn't found and we're in a union
10115                  * mount volume, then see if the fileid is in the
10116                  * mounted-on volume.
10117                  */
10118                 struct mount *tmp = mp;
10119                 mp = vnode_mount(tmp->mnt_vnodecovered);
10120                 vfs_unbusy(tmp);
10121                 if (vfs_busy(mp, LK_NOWAIT) == 0)
10122                         goto unionget;
10123         } else {
10124                 vfs_unbusy(mp);
10125         }
10126
10127         if (error) {
10128                 return error;
10129         }
10130
10131 #if CONFIG_MACF
10132         error = mac_vnode_check_fsgetpath(ctx, vp);
10133         if (error) {
10134                 vnode_put(vp);
10135                 return error;
10136         }
10137 #endif
10138
10139         /* Obtain the absolute path to this vnode. */
10140         bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
10141         bpflags |= BUILDPATH_CHECK_MOVED;
10142         error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
10143         vnode_put(vp);
10144
10145         if (error) {
10146                 goto out;
10147         }
10148
10149         AUDIT_ARG(text, buf);
10150
10151         if (kdebug_enable) {
10152                 long dbg_parms[NUMPARMS];
10153                 int  dbg_namelen;
10154
10155                 dbg_namelen = (int)sizeof(dbg_parms);
10156
10157         if (length < dbg_namelen) {
10158                         memcpy((char *)dbg_parms, buf, length);
10159                         memset((char *)dbg_parms + length, 0, dbg_namelen - length);
10160
10161                         dbg_namelen = length;
10162                 } else {
10163                         memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
10164                 }
10165
10166                 kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
10167         }
10168
10169         *pathlen = (user_ssize_t)length; /* may be superseded by error */
10170
10171 out:
10172         return (error);
10173 }
10174
10175 /*
10176  * Obtain the full pathname of a file system object by id.
10177  *
10178  * This is a private SPI used by the File Manager.
10179  */
10180 __private_extern__
10181 int
10182 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10183 {
10184         vfs_context_t ctx = vfs_context_current();
10185         fsid_t fsid;
10186         char *realpath;
10187         int length;
10188         int error;
10189
10190         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10191                 return (error);
10192         }
10193         AUDIT_ARG(value32, fsid.val[0]);
10194         AUDIT_ARG(value64, uap->objid);
10195         /* Restrict output buffer size for now. */
10196
10197         if (uap->bufsize > PAGE_SIZE) {
10198                 return (EINVAL);
10199         }
10200         MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10201         if (realpath == NULL) {
10202                 return (ENOMEM);
10203         }
10204
10205         error = fsgetpath_internal(
10206                 ctx, fsid.val[0], uap->objid,
10207                 uap->bufsize, realpath, &length);
10208
10209         if (error) {
10210                 goto out;
10211         }
10212
10213         error = copyout((caddr_t)realpath, uap->buf, length);
10214
10215         *retval = (user_ssize_t)length; /* may be superseded by error */
10216 out:
10217         if (realpath) {
10218                 FREE(realpath, M_TEMP);
10219         }
10220         return (error);
10221 }
10222
10223 /*
10224  * Common routine to handle various flavors of statfs data heading out
10225  *      to user space.
10226  *
10227  * Returns:     0                       Success
10228  *              EFAULT
10229  */
10230 static int
10231 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
10232     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
10233     boolean_t partial_copy)
10234 {
10235         int             error;
10236         int             my_size, copy_size;
10237
10238         if (is_64_bit) {
10239                 struct user64_statfs sfs;
10240                 my_size = copy_size = sizeof(sfs);
10241                 bzero(&sfs, my_size);
10242                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10243                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10244                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10245                 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
10246                 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
10247                 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
10248                 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
10249                 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
10250                 sfs.f_files = (user64_long_t)sfsp->f_files;
10251                 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
10252                 sfs.f_fsid = sfsp->f_fsid;
10253                 sfs.f_owner = sfsp->f_owner;
10254                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10255                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10256                 } else {
10257                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10258                 }
10259                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10260                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10261
10262                 if (partial_copy) {
10263                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10264                 }
10265                 error = copyout((caddr_t)&sfs, bufp, copy_size);
10266         }
10267         else {
10268                 struct user32_statfs sfs;
10269
10270                 my_size = copy_size = sizeof(sfs);
10271                 bzero(&sfs, my_size);
10272
10273                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10274                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10275                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10276
10277                 /*
10278                  * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
10279                  * have to fudge the numbers here in that case.   We inflate the blocksize in order
10280                  * to reflect the filesystem size as best we can.
10281                  */
10282                 if ((sfsp->f_blocks > INT_MAX)
10283                         /* Hack for 4061702 . I think the real fix is for Carbon to
10284                          * look for some volume capability and not depend on hidden
10285                          * semantics agreed between a FS and carbon.
10286                          * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
10287                          * for Carbon to set bNoVolumeSizes volume attribute.
10288                          * Without this the webdavfs files cannot be copied onto
10289                          * disk as they look huge. This change should not affect
10290                          * XSAN as they should not setting these to -1..
10291                          */
10292                          && (sfsp->f_blocks != 0xffffffffffffffffULL)
10293                          && (sfsp->f_bfree != 0xffffffffffffffffULL)
10294                          && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
10295                         int             shift;
10296
10297                         /*
10298                          * Work out how far we have to shift the block count down to make it fit.
10299                          * Note that it's possible to have to shift so far that the resulting
10300                          * blocksize would be unreportably large.  At that point, we will clip
10301                          * any values that don't fit.
10302                          *
10303                          * For safety's sake, we also ensure that f_iosize is never reported as
10304                          * being smaller than f_bsize.
10305                          */
10306                         for (shift = 0; shift < 32; shift++) {
10307                                 if ((sfsp->f_blocks >> shift) <= INT_MAX)
10308                                         break;
10309                                 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
10310                                         break;
10311                         }
10312 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
10313                         sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
10314                         sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
10315                         sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
10316 #undef __SHIFT_OR_CLIP
10317                         sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
10318                         sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
10319                 } else {
10320                         /* filesystem is small enough to be reported honestly */
10321                         sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
10322                         sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
10323                         sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
10324                         sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
10325                         sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
10326                 }
10327                 sfs.f_files = (user32_long_t)sfsp->f_files;
10328                 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
10329                 sfs.f_fsid = sfsp->f_fsid;
10330                 sfs.f_owner = sfsp->f_owner;
10331                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10332                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10333                 } else {
10334                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10335                 }
10336                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10337                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10338
10339                 if (partial_copy) {
10340                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10341                 }
10342                 error = copyout((caddr_t)&sfs, bufp, copy_size);
10343         }
10344
10345         if (sizep != NULL) {
10346                 *sizep = my_size;
10347         }
10348         return(error);
10349 }
10350
10351 /*
10352  * copy stat structure into user_stat structure.
10353  */
10354 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
10355 {
10356         bzero(usbp, sizeof(*usbp));
10357
10358         usbp->st_dev = sbp->st_dev;
10359         usbp->st_ino = sbp->st_ino;
10360         usbp->st_mode = sbp->st_mode;
10361         usbp->st_nlink = sbp->st_nlink;
10362         usbp->st_uid = sbp->st_uid;
10363         usbp->st_gid = sbp->st_gid;
10364         usbp->st_rdev = sbp->st_rdev;
10365 #ifndef _POSIX_C_SOURCE
10366         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10367         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10368         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10369         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10370         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10371         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10372 #else
10373         usbp->st_atime = sbp->st_atime;
10374         usbp->st_atimensec = sbp->st_atimensec;
10375         usbp->st_mtime = sbp->st_mtime;
10376         usbp->st_mtimensec = sbp->st_mtimensec;
10377         usbp->st_ctime = sbp->st_ctime;
10378         usbp->st_ctimensec = sbp->st_ctimensec;
10379 #endif
10380         usbp->st_size = sbp->st_size;
10381         usbp->st_blocks = sbp->st_blocks;
10382         usbp->st_blksize = sbp->st_blksize;
10383         usbp->st_flags = sbp->st_flags;
10384         usbp->st_gen = sbp->st_gen;
10385         usbp->st_lspare = sbp->st_lspare;
10386         usbp->st_qspare[0] = sbp->st_qspare[0];
10387         usbp->st_qspare[1] = sbp->st_qspare[1];
10388 }
10389
10390 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
10391 {
10392         bzero(usbp, sizeof(*usbp));
10393
10394         usbp->st_dev = sbp->st_dev;
10395         usbp->st_ino = sbp->st_ino;
10396         usbp->st_mode = sbp->st_mode;
10397         usbp->st_nlink = sbp->st_nlink;
10398         usbp->st_uid = sbp->st_uid;
10399         usbp->st_gid = sbp->st_gid;
10400         usbp->st_rdev = sbp->st_rdev;
10401 #ifndef _POSIX_C_SOURCE
10402         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10403         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10404         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10405         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10406         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10407         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10408 #else
10409         usbp->st_atime = sbp->st_atime;
10410         usbp->st_atimensec = sbp->st_atimensec;
10411         usbp->st_mtime = sbp->st_mtime;
10412         usbp->st_mtimensec = sbp->st_mtimensec;
10413         usbp->st_ctime = sbp->st_ctime;
10414         usbp->st_ctimensec = sbp->st_ctimensec;
10415 #endif
10416         usbp->st_size = sbp->st_size;
10417         usbp->st_blocks = sbp->st_blocks;
10418         usbp->st_blksize = sbp->st_blksize;
10419         usbp->st_flags = sbp->st_flags;
10420         usbp->st_gen = sbp->st_gen;
10421         usbp->st_lspare = sbp->st_lspare;
10422         usbp->st_qspare[0] = sbp->st_qspare[0];
10423         usbp->st_qspare[1] = sbp->st_qspare[1];
10424 }
10425
10426 /*
10427  * copy stat64 structure into user_stat64 structure.
10428  */
10429 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
10430 {
10431         bzero(usbp, sizeof(*usbp));
10432
10433         usbp->st_dev = sbp->st_dev;
10434         usbp->st_ino = sbp->st_ino;
10435         usbp->st_mode = sbp->st_mode;
10436         usbp->st_nlink = sbp->st_nlink;
10437         usbp->st_uid = sbp->st_uid;
10438         usbp->st_gid = sbp->st_gid;
10439         usbp->st_rdev = sbp->st_rdev;
10440 #ifndef _POSIX_C_SOURCE
10441         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10442         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10443         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10444         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10445         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10446         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10447         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10448         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10449 #else
10450         usbp->st_atime = sbp->st_atime;
10451         usbp->st_atimensec = sbp->st_atimensec;
10452         usbp->st_mtime = sbp->st_mtime;
10453         usbp->st_mtimensec = sbp->st_mtimensec;
10454         usbp->st_ctime = sbp->st_ctime;
10455         usbp->st_ctimensec = sbp->st_ctimensec;
10456         usbp->st_birthtime = sbp->st_birthtime;
10457         usbp->st_birthtimensec = sbp->st_birthtimensec;
10458 #endif
10459         usbp->st_size = sbp->st_size;
10460         usbp->st_blocks = sbp->st_blocks;
10461         usbp->st_blksize = sbp->st_blksize;
10462         usbp->st_flags = sbp->st_flags;
10463         usbp->st_gen = sbp->st_gen;
10464         usbp->st_lspare = sbp->st_lspare;
10465         usbp->st_qspare[0] = sbp->st_qspare[0];
10466         usbp->st_qspare[1] = sbp->st_qspare[1];
10467 }
10468
10469 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
10470 {
10471         bzero(usbp, sizeof(*usbp));
10472
10473         usbp->st_dev = sbp->st_dev;
10474         usbp->st_ino = sbp->st_ino;
10475         usbp->st_mode = sbp->st_mode;
10476         usbp->st_nlink = sbp->st_nlink;
10477         usbp->st_uid = sbp->st_uid;
10478         usbp->st_gid = sbp->st_gid;
10479         usbp->st_rdev = sbp->st_rdev;
10480 #ifndef _POSIX_C_SOURCE
10481         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10482         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10483         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10484         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10485         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10486         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10487         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10488         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10489 #else
10490         usbp->st_atime = sbp->st_atime;
10491         usbp->st_atimensec = sbp->st_atimensec;
10492         usbp->st_mtime = sbp->st_mtime;
10493         usbp->st_mtimensec = sbp->st_mtimensec;
10494         usbp->st_ctime = sbp->st_ctime;
10495         usbp->st_ctimensec = sbp->st_ctimensec;
10496         usbp->st_birthtime = sbp->st_birthtime;
10497         usbp->st_birthtimensec = sbp->st_birthtimensec;
10498 #endif
10499         usbp->st_size = sbp->st_size;
10500         usbp->st_blocks = sbp->st_blocks;
10501         usbp->st_blksize = sbp->st_blksize;
10502         usbp->st_flags = sbp->st_flags;
10503         usbp->st_gen = sbp->st_gen;
10504         usbp->st_lspare = sbp->st_lspare;
10505         usbp->st_qspare[0] = sbp->st_qspare[0];
10506         usbp->st_qspare[1] = sbp->st_qspare[1];
10507 }
10508
10509 /*
10510  * Purge buffer cache for simulating cold starts
10511  */
10512 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
10513 {
10514         ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
10515
10516         return VNODE_RETURNED;
10517 }
10518
10519 static int vfs_purge_callback(mount_t mp, __unused void * arg)
10520 {
10521         vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
10522
10523         return VFS_RETURNED;
10524 }
10525
10526 int
10527 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
10528 {
10529         if (!kauth_cred_issuser(kauth_cred_get()))
10530                 return EPERM;
10531
10532         vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
10533
10534         return 0;
10535 }
10536