1 /*
2 * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/xattr.h>
98 #include <sys/fcntl.h>
99 #include <sys/fsctl.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/disk.h>
102 #include <sys/content_protection.h>
103 #include <sys/clonefile.h>
104 #include <sys/snapshot.h>
105 #include <sys/priv.h>
106 #include <machine/cons.h>
107 #include <machine/limits.h>
108 #include <miscfs/specfs/specdev.h>
109
110 #include <vfs/vfs_disk_conditioner.h>
111
112 #include <security/audit/audit.h>
113 #include <bsm/audit_kevents.h>
114
115 #include <mach/mach_types.h>
116 #include <kern/kern_types.h>
117 #include <kern/kalloc.h>
118 #include <kern/task.h>
119
120 #include <vm/vm_pageout.h>
121 #include <vm/vm_protos.h>
122
123 #include <libkern/OSAtomic.h>
124 #include <pexpert/pexpert.h>
125 #include <IOKit/IOBSD.h>
126
127 #if ROUTEFS
128 #include <miscfs/routefs/routefs.h>
129 #endif /* ROUTEFS */
130
131 #if CONFIG_MACF
132 #include <security/mac.h>
133 #include <security/mac_framework.h>
134 #endif
135
136 #if CONFIG_FSE
137 #define GET_PATH(x) \
138 (x) = get_pathbuff();
139 #define RELEASE_PATH(x) \
140 release_pathbuff(x);
141 #else
142 #define GET_PATH(x) \
143 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
144 #define RELEASE_PATH(x) \
145 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
146 #endif /* CONFIG_FSE */
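/*
 * Illustrative sketch (not part of the original file): callers pair
 * GET_PATH with RELEASE_PATH around a temporary MAXPATHLEN buffer,
 * roughly like this (the vn_getpath() use is just an example):
 *
 *	char *path = NULL;
 *	int len = MAXPATHLEN;
 *
 *	GET_PATH(path);
 *	error = vn_getpath(vp, path, &len);
 *	...
 *	RELEASE_PATH(path);
 *
 * Under CONFIG_FSE the buffer comes from the fsevents path-buffer pool;
 * otherwise it is a plain MAXPATHLEN allocation from the M_NAMEI zone.
 */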
147
148 #ifndef HFS_GET_BOOT_INFO
149 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
150 #endif
151
152 #ifndef HFS_SET_BOOT_INFO
153 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
154 #endif
155
156 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
157 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
158 #endif
159
160 extern void disk_conditioner_unmount(mount_t mp);
161
162 /* struct for checkdirs iteration */
163 struct cdirargs {
164 vnode_t olddp;
165 vnode_t newdp;
166 };
167 /* callback for checkdirs iteration */
168 static int checkdirs_callback(proc_t p, void * arg);
169
170 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
171 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
172 void enablequotas(struct mount *mp, vfs_context_t ctx);
173 static int getfsstat_callback(mount_t mp, void * arg);
174 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
175 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
176 static int sync_callback(mount_t, void *);
177 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
178 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
179 boolean_t partial_copy);
180 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
181 user_addr_t bufp);
182 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
183 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
184 struct componentname *cnp, user_addr_t fsmountargs,
185 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
186 vfs_context_t ctx);
187 void vfs_notify_mount(vnode_t pdvp);
188
189 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
190
191 struct fd_vn_data * fg_vn_data_alloc(void);
192
193 /*
194 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}.
195 * Concurrent lookups (or lookups by id) on hard links can cause vn_getpath
196 * (which does not re-enter the filesystem the way vn_getpath_fsenter does)
197 * to return ENOENT, as the path cannot be built from the name cache alone.
198 * We have no option but to retry and hope to get one namei->reverse-path
199 * generation done without an intervening lookup or lookup-by-id on the
200 * hard-linked item. This is only an issue for the MAC hooks that cannot
201 * re-enter the filesystem: currently the hooks for rename, unlink and rmdir.
202 */
203 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
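/*
 * Illustrative sketch of the retry pattern described above (hypothetical
 * and simplified; the real unlink/rmdir/rename paths appear later in this
 * file):
 *
 *	int retry_count = 0;
 * retry:
 *	error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
 *	if (error == ENOENT && retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
 *		retry_count++;
 *		goto retry;
 *	}
 *
 * i.e. an ENOENT from the authorization hook is treated as transient and
 * the reverse-path generation is simply attempted again, up to the cap.
 */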
204
205 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
206
207 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
208
209 #ifdef CONFIG_IMGSRC_ACCESS
210 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
211 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
212 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
213 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
214 static void mount_end_update(mount_t mp);
215 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
216 #endif /* CONFIG_IMGSRC_ACCESS */
217
218 //snapshot functions
219 #if CONFIG_MNT_ROOTSNAP
220 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
221 #else
222 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
223 #endif
224
225 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
226
227 __private_extern__
228 int sync_internal(void);
229
230 __private_extern__
231 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
232
233 extern lck_grp_t *fd_vn_lck_grp;
234 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
235 extern lck_attr_t *fd_vn_lck_attr;
236
237 /*
238 * Incremented each time a mount or unmount operation occurs;
239 * used to invalidate the cached value of the rootvp in the
240 * mount structure utilized by cache_lookup_path.
241 */
242 uint32_t mount_generation = 0;
243
244 /* counts number of mount and unmount operations */
245 unsigned int vfs_nummntops = 0;
246
247 extern const struct fileops vnops;
248 #if CONFIG_APPLEDOUBLE
249 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
250 #endif /* CONFIG_APPLEDOUBLE */
251
252 /*
253 * Virtual File System System Calls
254 */
255
256 #if NFSCLIENT || DEVFS || ROUTEFS
257 /*
258 * Private in-kernel mounting spi (NFS only, not exported)
259 */
260 __private_extern__
261 boolean_t
262 vfs_iskernelmount(mount_t mp)
263 {
264 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
265 }
266
267 __private_extern__
268 int
269 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
270 void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
271 {
272 struct nameidata nd;
273 boolean_t did_namei;
274 int error;
275
276 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
277 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
278
279 /*
280 * Get the vnode to be covered if it's not supplied
281 */
282 if (vp == NULLVP) {
283 error = namei(&nd);
284 if (error) {
285 return error;
286 }
287 vp = nd.ni_vp;
288 pvp = nd.ni_dvp;
289 did_namei = TRUE;
290 } else {
291 char *pnbuf = CAST_DOWN(char *, path);
292
293 nd.ni_cnd.cn_pnbuf = pnbuf;
294 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
295 did_namei = FALSE;
296 }
297
298 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
299 syscall_flags, kern_flags, NULL, TRUE, ctx);
300
301 if (did_namei) {
302 vnode_put(vp);
303 vnode_put(pvp);
304 nameidone(&nd);
305 }
306
307 return error;
308 }
309 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
310
311 /*
312 * Mount a file system.
313 */
314 /* ARGSUSED */
315 int
316 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
317 {
318 struct __mac_mount_args muap;
319
320 muap.type = uap->type;
321 muap.path = uap->path;
322 muap.flags = uap->flags;
323 muap.data = uap->data;
324 muap.mac_p = USER_ADDR_NULL;
325 return __mac_mount(p, &muap, retval);
326 }
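/*
 * Illustrative userspace sketch (not part of this file): the mount(2)
 * handler above just forwards to __mac_mount() with a NULL MAC label,
 * so a plain call such as
 *
 *	example_args_t args = { 0 };	// stand-in for an fs-specific struct
 *	if (mount("examplefs", "/Volumes/Example", MNT_RDONLY | MNT_NOSUID, &args) == -1)
 *		err(1, "mount");
 *
 * ends up in __mac_mount() and then mount_common() below. The layout of
 * the data argument is filesystem specific; the names here are made up.
 */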
327
328 int
329 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
330 {
331 struct componentname cn;
332 vfs_context_t ctx = vfs_context_current();
333 size_t dummy = 0;
334 int error;
335 int flags = uap->flags;
336 char fstypename[MFSNAMELEN];
337 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
338 vnode_t pvp;
339 vnode_t vp;
340
341 AUDIT_ARG(fd, uap->fd);
342 AUDIT_ARG(fflags, flags);
343 /* fstypename will get audited by mount_common */
344
345 /* Sanity check the flags */
346 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
347 return ENOTSUP;
348 }
349
350 if (flags & MNT_UNION) {
351 return EPERM;
352 }
353
354 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
355 if (error) {
356 return error;
357 }
358
359 if ((error = file_vnode(uap->fd, &vp)) != 0) {
360 return error;
361 }
362
363 if ((error = vnode_getwithref(vp)) != 0) {
364 file_drop(uap->fd);
365 return error;
366 }
367
368 pvp = vnode_getparent(vp);
369 if (pvp == NULL) {
370 vnode_put(vp);
371 file_drop(uap->fd);
372 return EINVAL;
373 }
374
375 memset(&cn, 0, sizeof(struct componentname));
376 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
377 cn.cn_pnlen = MAXPATHLEN;
378
379 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
380 FREE(cn.cn_pnbuf, M_TEMP);
381 vnode_put(pvp);
382 vnode_put(vp);
383 file_drop(uap->fd);
384 return error;
385 }
386
387 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
388
389 FREE(cn.cn_pnbuf, M_TEMP);
390 vnode_put(pvp);
391 vnode_put(vp);
392 file_drop(uap->fd);
393
394 return error;
395 }
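/*
 * Illustrative userspace sketch (assumption: the libsyscall wrapper mirrors
 * fmount_args, i.e. fmount(type, fd, flags, data)); names are made up:
 *
 *	int dirfd = open("/Volumes/Example", O_RDONLY);
 *	example_args_t args = { 0 };
 *	if (dirfd < 0 || fmount("examplefs", dirfd, MNT_NOSUID, &args) == -1)
 *		err(1, "fmount");
 *
 * Compared with mount(2), the covered vnode is identified by file
 * descriptor rather than by path; the path string used for f_mntonname is
 * reconstructed above via vn_getpath().
 */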
396
397 void
398 vfs_notify_mount(vnode_t pdvp)
399 {
400 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
401 lock_vnode_and_post(pdvp, NOTE_WRITE);
402 }
403
404 /*
405 * __mac_mount:
406 * Mount a file system taking into account MAC label behavior.
407 * See mount(2) man page for more information
408 *
409 * Parameters: p Process requesting the mount
410 * uap User argument descriptor (see below)
411 * retval (ignored)
412 *
413 * Indirect: uap->type Filesystem type
414 * uap->path Path to mount
415 * uap->data Mount arguments
416 * uap->mac_p MAC info
417 * uap->flags Mount flags
418 *
419 *
420 * Returns: 0 Success
421 * !0 Not success
422 */
423 boolean_t root_fs_upgrade_try = FALSE;
424
425 int
426 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
427 {
428 vnode_t pvp = NULL;
429 vnode_t vp = NULL;
430 int need_nameidone = 0;
431 vfs_context_t ctx = vfs_context_current();
432 char fstypename[MFSNAMELEN];
433 struct nameidata nd;
434 size_t dummy = 0;
435 char *labelstr = NULL;
436 int flags = uap->flags;
437 int error;
438 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
439 boolean_t is_64bit = IS_64BIT_PROCESS(p);
440 #else
441 #pragma unused(p)
442 #endif
443 /*
444 * Get the fs type name from user space
445 */
446 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
447 if (error) {
448 return error;
449 }
450
451 /*
452 * Get the vnode to be covered
453 */
454 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
455 UIO_USERSPACE, uap->path, ctx);
456 error = namei(&nd);
457 if (error) {
458 goto out;
459 }
460 need_nameidone = 1;
461 vp = nd.ni_vp;
462 pvp = nd.ni_dvp;
463
464 #ifdef CONFIG_IMGSRC_ACCESS
465 /* Mounting image source cannot be batched with other operations */
466 if (flags == MNT_IMGSRC_BY_INDEX) {
467 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
468 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
469 goto out;
470 }
471 #endif /* CONFIG_IMGSRC_ACCESS */
472
473 #if CONFIG_MACF
474 /*
475 * Get the label string (if any) from user space
476 */
477 if (uap->mac_p != USER_ADDR_NULL) {
478 struct user_mac mac;
479 size_t ulen = 0;
480
481 if (is_64bit) {
482 struct user64_mac mac64;
483 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
484 mac.m_buflen = mac64.m_buflen;
485 mac.m_string = mac64.m_string;
486 } else {
487 struct user32_mac mac32;
488 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
489 mac.m_buflen = mac32.m_buflen;
490 mac.m_string = mac32.m_string;
491 }
492 if (error) {
493 goto out;
494 }
495 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
496 (mac.m_buflen < 2)) {
497 error = EINVAL;
498 goto out;
499 }
500 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
501 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
502 if (error) {
503 goto out;
504 }
505 AUDIT_ARG(mac_string, labelstr);
506 }
507 #endif /* CONFIG_MACF */
508
509 AUDIT_ARG(fflags, flags);
510
511 #if SECURE_KERNEL
512 if (flags & MNT_UNION) {
513 /* No union mounts on release kernels */
514 error = EPERM;
515 goto out;
516 }
517 #endif
518
519 if ((vp->v_flag & VROOT) &&
520 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
521 if (!(flags & MNT_UNION)) {
522 flags |= MNT_UPDATE;
523 } else {
524 /*
525 * For a union mount on '/', treat it as a fresh
526 * mount instead of an update.
527 * Otherwise, union mounting on '/' used to panic the
528 * system, since mnt_vnodecovered was found to
529 * be NULL for '/', which is required for unionlookup
530 * after it gets ENOENT on a union mount.
531 */
532 flags = (flags & ~(MNT_UPDATE));
533 }
534
535 #if SECURE_KERNEL
536 if ((flags & MNT_RDONLY) == 0) {
537 /* Release kernels are not allowed to mount "/" as rw */
538 error = EPERM;
539 goto out;
540 }
541 #endif
542 /*
543 * See 7392553 for more details on why this check exists.
544 * Suffice it to say: if this check is ON and something tries
545 * to mount the rootFS RW, we'll turn off the codesign
546 * bitmap optimization.
547 */
548 #if CHECK_CS_VALIDATION_BITMAP
549 if ((flags & MNT_RDONLY) == 0) {
550 root_fs_upgrade_try = TRUE;
551 }
552 #endif
553 }
554
555 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
556 labelstr, FALSE, ctx);
557
558 out:
559
560 #if CONFIG_MACF
561 if (labelstr) {
562 FREE(labelstr, M_MACTEMP);
563 }
564 #endif /* CONFIG_MACF */
565
566 if (vp) {
567 vnode_put(vp);
568 }
569 if (pvp) {
570 vnode_put(pvp);
571 }
572 if (need_nameidone) {
573 nameidone(&nd);
574 }
575
576 return error;
577 }
578
579 /*
580 * common mount implementation (final stage of mounting)
581 *
582 * Arguments:
583 * fstypename file system type (i.e. its vfs name)
584 * pvp parent of covered vnode
585 * vp covered vnode
586 * cnp component name (ie path) of covered vnode
587 * flags generic mount flags
588 * fsmountargs file system specific data
589 * labelstr optional MAC label
590 * kernelmount TRUE for mounts initiated from inside the kernel
591 * ctx caller's context
592 */
593 static int
594 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
595 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
596 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
597 {
598 #if !CONFIG_MACF
599 #pragma unused(labelstr)
600 #endif
601 struct vnode *devvp = NULLVP;
602 struct vnode *device_vnode = NULLVP;
603 #if CONFIG_MACF
604 struct vnode *rvp;
605 #endif
606 struct mount *mp;
607 struct vfstable *vfsp = (struct vfstable *)0;
608 struct proc *p = vfs_context_proc(ctx);
609 int error, flag = 0;
610 user_addr_t devpath = USER_ADDR_NULL;
611 int ronly = 0;
612 int mntalloc = 0;
613 boolean_t vfsp_ref = FALSE;
614 boolean_t is_rwlock_locked = FALSE;
615 boolean_t did_rele = FALSE;
616 boolean_t have_usecount = FALSE;
617
618 /*
619 * Process an update for an existing mount
620 */
621 if (flags & MNT_UPDATE) {
622 if ((vp->v_flag & VROOT) == 0) {
623 error = EINVAL;
624 goto out1;
625 }
626 mp = vp->v_mount;
627
628 /* unmount in progress return error */
629 mount_lock_spin(mp);
630 if (mp->mnt_lflag & MNT_LUNMOUNT) {
631 mount_unlock(mp);
632 error = EBUSY;
633 goto out1;
634 }
635 mount_unlock(mp);
636 lck_rw_lock_exclusive(&mp->mnt_rwlock);
637 is_rwlock_locked = TRUE;
638 /*
639 * We only allow the filesystem to be reloaded if it
640 * is currently mounted read-only.
641 */
642 if ((flags & MNT_RELOAD) &&
643 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
644 error = ENOTSUP;
645 goto out1;
646 }
647
648 /*
649 * If content protection is enabled, update mounts are not
650 * allowed to turn it off.
651 */
652 if ((mp->mnt_flag & MNT_CPROTECT) &&
653 ((flags & MNT_CPROTECT) == 0)) {
654 error = EINVAL;
655 goto out1;
656 }
657
658 #ifdef CONFIG_IMGSRC_ACCESS
659 /* Can't downgrade the backer of the root FS */
660 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
661 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
662 error = ENOTSUP;
663 goto out1;
664 }
665 #endif /* CONFIG_IMGSRC_ACCESS */
666
667 /*
668 * Only root, or the user that did the original mount is
669 * permitted to update it.
670 */
671 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
672 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
673 goto out1;
674 }
675 #if CONFIG_MACF
676 error = mac_mount_check_remount(ctx, mp);
677 if (error != 0) {
678 goto out1;
679 }
680 #endif
681 /*
682 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
683 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
684 */
685 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
686 flags |= MNT_NOSUID | MNT_NODEV;
687 if (mp->mnt_flag & MNT_NOEXEC) {
688 flags |= MNT_NOEXEC;
689 }
690 }
691 flag = mp->mnt_flag;
692
693
694
695 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
696
697 vfsp = mp->mnt_vtable;
698 goto update;
699 }
700
701 /*
702 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
703 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
704 */
705 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
706 flags |= MNT_NOSUID | MNT_NODEV;
707 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
708 flags |= MNT_NOEXEC;
709 }
710 }
711
712 /* XXXAUDIT: Should we capture the type on the error path as well? */
713 AUDIT_ARG(text, fstypename);
714 mount_list_lock();
715 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
716 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
717 vfsp->vfc_refcount++;
718 vfsp_ref = TRUE;
719 break;
720 }
721 }
722 mount_list_unlock();
723 if (vfsp == NULL) {
724 error = ENODEV;
725 goto out1;
726 }
727
728 /*
729 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
730 */
731 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
732 error = EINVAL; /* unsupported request */
733 goto out1;
734 }
735
736 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
737 if (error != 0) {
738 goto out1;
739 }
740
741 /*
742 * Allocate and initialize the filesystem (mount_t)
743 */
744 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
745 M_MOUNT, M_WAITOK);
746 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
747 mntalloc = 1;
748
749 /* Initialize the default IO constraints */
750 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
751 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
752 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
753 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
754 mp->mnt_devblocksize = DEV_BSIZE;
755 mp->mnt_alignmentmask = PAGE_MASK;
756 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
757 mp->mnt_ioscale = 1;
758 mp->mnt_ioflags = 0;
759 mp->mnt_realrootvp = NULLVP;
760 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
761
762 TAILQ_INIT(&mp->mnt_vnodelist);
763 TAILQ_INIT(&mp->mnt_workerqueue);
764 TAILQ_INIT(&mp->mnt_newvnodes);
765 mount_lock_init(mp);
766 lck_rw_lock_exclusive(&mp->mnt_rwlock);
767 is_rwlock_locked = TRUE;
768 mp->mnt_op = vfsp->vfc_vfsops;
769 mp->mnt_vtable = vfsp;
770 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
771 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
772 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
773 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
774 mp->mnt_vnodecovered = vp;
775 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
776 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
777 mp->mnt_devbsdunit = 0;
778
779 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
780 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
781
782 #if NFSCLIENT || DEVFS || ROUTEFS
783 if (kernelmount) {
784 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
785 }
786 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
787 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
788 }
789 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
790
791 update:
792
793 /*
794 * Set the mount level flags.
795 */
796 if (flags & MNT_RDONLY) {
797 mp->mnt_flag |= MNT_RDONLY;
798 } else if (mp->mnt_flag & MNT_RDONLY) {
799 // disallow read/write upgrades of file systems that
800 // had the TYPENAME_OVERRIDE feature set.
801 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
802 error = EPERM;
803 goto out1;
804 }
805 mp->mnt_kern_flag |= MNTK_WANTRDWR;
806 }
807 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
808 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
809 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
810 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
811 MNT_QUARANTINE | MNT_CPROTECT);
812
813 #if SECURE_KERNEL
814 #if !CONFIG_MNT_SUID
815 /*
816 * On release builds of iOS based platforms, always enforce NOSUID on
817 * all mounts. We do this here because we can catch update mounts as well as
818 * non-update mounts in this case.
819 */
820 mp->mnt_flag |= (MNT_NOSUID);
821 #endif
822 #endif
823
824 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
825 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
826 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
827 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
828 MNT_QUARANTINE | MNT_CPROTECT);
829
830 #if CONFIG_MACF
831 if (flags & MNT_MULTILABEL) {
832 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
833 error = EINVAL;
834 goto out1;
835 }
836 mp->mnt_flag |= MNT_MULTILABEL;
837 }
838 #endif
839 /*
840 * Process device path for local file systems if requested
841 */
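/*
 * Sketch of what user space passes for these local-args filesystems
 * (struct and field names hypothetical):
 *
 *	struct example_mount_args {
 *		char	*devpath;	// consumed here by the copyin below
 *		...			// fs-specific fields, handed to VFS_MOUNT
 *	};
 *
 * The leading pointer is copied in (32- or 64-bit as appropriate) and
 * fsmountargs is advanced past it before the remainder is passed through
 * to the filesystem.
 */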
842 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
843 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
844 if (vfs_context_is64bit(ctx)) {
845 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
846 goto out1;
847 }
848 fsmountargs += sizeof(devpath);
849 } else {
850 user32_addr_t tmp;
851 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
852 goto out1;
853 }
854 /* munge into LP64 addr */
855 devpath = CAST_USER_ADDR_T(tmp);
856 fsmountargs += sizeof(tmp);
857 }
858
859 /* Lookup device and authorize access to it */
860 if ((devpath)) {
861 struct nameidata nd;
862
863 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
864 if ((error = namei(&nd))) {
865 goto out1;
866 }
867
868 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
869 devvp = nd.ni_vp;
870
871 nameidone(&nd);
872
873 if (devvp->v_type != VBLK) {
874 error = ENOTBLK;
875 goto out2;
876 }
877 if (major(devvp->v_rdev) >= nblkdev) {
878 error = ENXIO;
879 goto out2;
880 }
881 /*
882 * If mount by non-root, then verify that user has necessary
883 * permissions on the device.
884 */
885 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
886 mode_t accessmode = KAUTH_VNODE_READ_DATA;
887
888 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
889 accessmode |= KAUTH_VNODE_WRITE_DATA;
890 }
891 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
892 goto out2;
893 }
894 }
895 }
896 /* On first mount, preflight and open device */
897 if (devpath && ((flags & MNT_UPDATE) == 0)) {
898 if ((error = vnode_ref(devvp))) {
899 goto out2;
900 }
901 /*
902 * Disallow multiple mounts of the same device.
903 * Disallow mounting of a device that is currently in use
904 * (except for root, which might share swap device for miniroot).
905 * Flush out any old buffers remaining from a previous use.
906 */
907 if ((error = vfs_mountedon(devvp))) {
908 goto out3;
909 }
910
911 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
912 error = EBUSY;
913 goto out3;
914 }
915 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
916 error = ENOTBLK;
917 goto out3;
918 }
919 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
920 goto out3;
921 }
922
923 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
924 #if CONFIG_MACF
925 error = mac_vnode_check_open(ctx,
926 devvp,
927 ronly ? FREAD : FREAD | FWRITE);
928 if (error) {
929 goto out3;
930 }
931 #endif /* MAC */
932 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
933 goto out3;
934 }
935
936 mp->mnt_devvp = devvp;
937 device_vnode = devvp;
938 } else if ((mp->mnt_flag & MNT_RDONLY) &&
939 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
940 (device_vnode = mp->mnt_devvp)) {
941 dev_t dev;
942 int maj;
943 /*
944 * If upgrade to read-write by non-root, then verify
945 * that user has necessary permissions on the device.
946 */
947 vnode_getalways(device_vnode);
948
949 if (suser(vfs_context_ucred(ctx), NULL) &&
950 (error = vnode_authorize(device_vnode, NULL,
951 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
952 ctx)) != 0) {
953 vnode_put(device_vnode);
954 goto out2;
955 }
956
957 /* Tell the device that we're upgrading */
958 dev = (dev_t)device_vnode->v_rdev;
959 maj = major(dev);
960
961 if ((u_int)maj >= (u_int)nblkdev) {
962 panic("Volume mounted on a device with invalid major number.");
963 }
964
965 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
966 vnode_put(device_vnode);
967 device_vnode = NULLVP;
968 if (error != 0) {
969 goto out2;
970 }
971 }
972 }
973 #if CONFIG_MACF
974 if ((flags & MNT_UPDATE) == 0) {
975 mac_mount_label_init(mp);
976 mac_mount_label_associate(ctx, mp);
977 }
978 if (labelstr) {
979 if ((flags & MNT_UPDATE) != 0) {
980 error = mac_mount_check_label_update(ctx, mp);
981 if (error != 0) {
982 goto out3;
983 }
984 }
985 }
986 #endif
987 /*
988 * Mount the filesystem.
989 */
990 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
991 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
992 (caddr_t)fsmountargs, 0, ctx);
993 } else {
994 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
995 }
996
997 if (flags & MNT_UPDATE) {
998 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
999 mp->mnt_flag &= ~MNT_RDONLY;
1000 }
1001 mp->mnt_flag &= ~
1002 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1003 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1004 if (error) {
1005 mp->mnt_flag = flag; /* restore flag value */
1006 }
1007 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1008 lck_rw_done(&mp->mnt_rwlock);
1009 is_rwlock_locked = FALSE;
1010 if (!error) {
1011 enablequotas(mp, ctx);
1012 }
1013 goto exit;
1014 }
1015
1016 /*
1017 * Put the new filesystem on the mount list after root.
1018 */
1019 if (error == 0) {
1020 struct vfs_attr vfsattr;
1021 #if CONFIG_MACF
1022 if (vfs_flags(mp) & MNT_MULTILABEL) {
1023 error = VFS_ROOT(mp, &rvp, ctx);
1024 if (error) {
1025 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1026 goto out3;
1027 }
1028 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1029 /*
1030 * drop reference provided by VFS_ROOT
1031 */
1032 vnode_put(rvp);
1033
1034 if (error) {
1035 goto out3;
1036 }
1037 }
1038 #endif /* MAC */
1039
1040 vnode_lock_spin(vp);
1041 CLR(vp->v_flag, VMOUNT);
1042 vp->v_mountedhere = mp;
1043 vnode_unlock(vp);
1044
1045 /*
1046 * taking the name_cache_lock exclusively will
1047 * ensure that everyone is out of the fast path who
1048 * might be trying to use a now stale copy of
1049 * vp->v_mountedhere->mnt_realrootvp
1050 * bumping mount_generation causes the cached values
1051 * to be invalidated
1052 */
1053 name_cache_lock();
1054 mount_generation++;
1055 name_cache_unlock();
1056
1057 error = vnode_ref(vp);
1058 if (error != 0) {
1059 goto out4;
1060 }
1061
1062 have_usecount = TRUE;
1063
1064 error = checkdirs(vp, ctx);
1065 if (error != 0) {
1066 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1067 goto out4;
1068 }
1069 /*
1070 * there is no cleanup code here, so the return value is cast to void;
1071 * we need to revisit this
1072 */
1073 (void)VFS_START(mp, 0, ctx);
1074
1075 if (mount_list_add(mp) != 0) {
1076 /*
1077 * The system is shutting down trying to umount
1078 * everything, so fail with a plausible errno.
1079 */
1080 error = EBUSY;
1081 goto out4;
1082 }
1083 lck_rw_done(&mp->mnt_rwlock);
1084 is_rwlock_locked = FALSE;
1085
1086 /* Check if this mounted file system supports EAs or named streams. */
1087 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1088 VFSATTR_INIT(&vfsattr);
1089 VFSATTR_WANTED(&vfsattr, f_capabilities);
1090 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1091 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1092 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1093 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1094 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1095 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1096 }
1097 #if NAMEDSTREAMS
1098 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1099 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1100 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1101 }
1102 #endif
1103 /* Check if this file system supports path from id lookups. */
1104 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1105 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1106 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1107 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1108 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1109 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1110 }
1111
1112 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1113 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1114 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1115 }
1116 }
1117 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1118 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1119 }
1120 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1121 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1122 }
1123 /* increment the operations count */
1124 OSAddAtomic(1, &vfs_nummntops);
1125 enablequotas(mp, ctx);
1126
1127 if (device_vnode) {
1128 device_vnode->v_specflags |= SI_MOUNTEDON;
1129
1130 /*
1131 * cache the IO attributes for the underlying physical media...
1132 * an error return indicates the underlying driver doesn't
1133 * support all the queries necessary... however, reasonable
1134 * defaults will have been set, so no reason to bail or care
1135 */
1136 vfs_init_io_attributes(device_vnode, mp);
1137 }
1138
1139 /* Now that mount is setup, notify the listeners */
1140 vfs_notify_mount(pvp);
1141 IOBSDMountChange(mp, kIOMountChangeMount);
1142 } else {
1143 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1144 if (mp->mnt_vnodelist.tqh_first != NULL) {
1145 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1146 mp->mnt_vtable->vfc_name, error);
1147 }
1148
1149 vnode_lock_spin(vp);
1150 CLR(vp->v_flag, VMOUNT);
1151 vnode_unlock(vp);
1152 mount_list_lock();
1153 mp->mnt_vtable->vfc_refcount--;
1154 mount_list_unlock();
1155
1156 if (device_vnode) {
1157 vnode_rele(device_vnode);
1158 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1159 }
1160 lck_rw_done(&mp->mnt_rwlock);
1161 is_rwlock_locked = FALSE;
1162
1163 /*
1164 * if we get here, we have a mount structure that needs to be freed,
1165 * but since the coveredvp hasn't yet been updated to point at it,
1166 * no need to worry about other threads holding a crossref on this mp
1167 * so it's ok to just free it
1168 */
1169 mount_lock_destroy(mp);
1170 #if CONFIG_MACF
1171 mac_mount_label_destroy(mp);
1172 #endif
1173 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1174 }
1175 exit:
1176 /*
1177 * drop I/O count on the device vp if there was one
1178 */
1179 if (devpath && devvp) {
1180 vnode_put(devvp);
1181 }
1182
1183 return error;
1184
1185 /* Error condition exits */
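/*
 * Cleanup gets progressively lighter the earlier the failure happened:
 * out4 undoes a mount that already reached the covered vnode (force
 * VFS_UNMOUNT, mark the mount dead, close/rele the device, detach from
 * vp), out3 drops the device usecount taken for a fresh mount, out2
 * drops the device iocount from the namei() lookup, and out1 releases
 * the rwlock and frees the mount structure itself.
 */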
1186 out4:
1187 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1188
1189 /*
1190 * If the mount has been placed on the covered vp,
1191 * it may have been discovered by now, so we have
1192 * to treat this just like an unmount
1193 */
1194 mount_lock_spin(mp);
1195 mp->mnt_lflag |= MNT_LDEAD;
1196 mount_unlock(mp);
1197
1198 if (device_vnode != NULLVP) {
1199 vnode_rele(device_vnode);
1200 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1201 ctx);
1202 did_rele = TRUE;
1203 }
1204
1205 vnode_lock_spin(vp);
1206
1207 mp->mnt_crossref++;
1208 vp->v_mountedhere = (mount_t) 0;
1209
1210 vnode_unlock(vp);
1211
1212 if (have_usecount) {
1213 vnode_rele(vp);
1214 }
1215 out3:
1216 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1217 vnode_rele(devvp);
1218 }
1219 out2:
1220 if (devpath && devvp) {
1221 vnode_put(devvp);
1222 }
1223 out1:
1224 /* Release mnt_rwlock only when it was taken */
1225 if (is_rwlock_locked == TRUE) {
1226 lck_rw_done(&mp->mnt_rwlock);
1227 }
1228
1229 if (mntalloc) {
1230 if (mp->mnt_crossref) {
1231 mount_dropcrossref(mp, vp, 0);
1232 } else {
1233 mount_lock_destroy(mp);
1234 #if CONFIG_MACF
1235 mac_mount_label_destroy(mp);
1236 #endif
1237 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1238 }
1239 }
1240 if (vfsp_ref) {
1241 mount_list_lock();
1242 vfsp->vfc_refcount--;
1243 mount_list_unlock();
1244 }
1245
1246 return error;
1247 }
1248
1249 /*
1250 * Flush in-core data, check for competing mount attempts,
1251 * and set VMOUNT
1252 */
1253 int
1254 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1255 {
1256 #if !CONFIG_MACF
1257 #pragma unused(cnp,fsname)
1258 #endif
1259 struct vnode_attr va;
1260 int error;
1261
1262 if (!skip_auth) {
1263 /*
1264 * If the user is not root, ensure that they own the directory
1265 * onto which we are attempting to mount.
1266 */
1267 VATTR_INIT(&va);
1268 VATTR_WANTED(&va, va_uid);
1269 if ((error = vnode_getattr(vp, &va, ctx)) ||
1270 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1271 (!vfs_context_issuser(ctx)))) {
1272 error = EPERM;
1273 goto out;
1274 }
1275 }
1276
1277 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1278 goto out;
1279 }
1280
1281 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1282 goto out;
1283 }
1284
1285 if (vp->v_type != VDIR) {
1286 error = ENOTDIR;
1287 goto out;
1288 }
1289
1290 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1291 error = EBUSY;
1292 goto out;
1293 }
1294
1295 #if CONFIG_MACF
1296 error = mac_mount_check_mount(ctx, vp,
1297 cnp, fsname);
1298 if (error != 0) {
1299 goto out;
1300 }
1301 #endif
1302
1303 vnode_lock_spin(vp);
1304 SET(vp->v_flag, VMOUNT);
1305 vnode_unlock(vp);
1306
1307 out:
1308 return error;
1309 }
1310
1311 #if CONFIG_IMGSRC_ACCESS
1312
1313 #if DEBUG
1314 #define IMGSRC_DEBUG(args...) printf(args)
1315 #else
1316 #define IMGSRC_DEBUG(args...) do { } while(0)
1317 #endif
1318
1319 static int
1320 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1321 {
1322 struct nameidata nd;
1323 vnode_t vp, realdevvp;
1324 mode_t accessmode;
1325 int error;
1326
1327 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1328 if ((error = namei(&nd))) {
1329 IMGSRC_DEBUG("namei() failed with %d\n", error);
1330 return error;
1331 }
1332
1333 vp = nd.ni_vp;
1334
1335 if (!vnode_isblk(vp)) {
1336 IMGSRC_DEBUG("Not block device.\n");
1337 error = ENOTBLK;
1338 goto out;
1339 }
1340
1341 realdevvp = mp->mnt_devvp;
1342 if (realdevvp == NULLVP) {
1343 IMGSRC_DEBUG("No device backs the mount.\n");
1344 error = ENXIO;
1345 goto out;
1346 }
1347
1348 error = vnode_getwithref(realdevvp);
1349 if (error != 0) {
1350 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1351 goto out;
1352 }
1353
1354 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1355 IMGSRC_DEBUG("Wrong dev_t.\n");
1356 error = ENXIO;
1357 goto out1;
1358 }
1359
1360 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1361
1362 /*
1363 * If mount by non-root, then verify that user has necessary
1364 * permissions on the device.
1365 */
1366 if (!vfs_context_issuser(ctx)) {
1367 accessmode = KAUTH_VNODE_READ_DATA;
1368 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1369 accessmode |= KAUTH_VNODE_WRITE_DATA;
1370 }
1371 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1372 IMGSRC_DEBUG("Access denied.\n");
1373 goto out1;
1374 }
1375 }
1376
1377 *devvpp = vp;
1378
1379 out1:
1380 vnode_put(realdevvp);
1381 out:
1382 nameidone(&nd);
1383 if (error) {
1384 vnode_put(vp);
1385 }
1386
1387 return error;
1388 }
1389
1390 /*
1391 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1392 * and call checkdirs()
1393 */
1394 static int
1395 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1396 {
1397 int error;
1398
1399 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1400
1401 vnode_lock_spin(vp);
1402 CLR(vp->v_flag, VMOUNT);
1403 vp->v_mountedhere = mp;
1404 vnode_unlock(vp);
1405
1406 /*
1407 * taking the name_cache_lock exclusively will
1408 * ensure that everyone is out of the fast path who
1409 * might be trying to use a now stale copy of
1410 * vp->v_mountedhere->mnt_realrootvp
1411 * bumping mount_generation causes the cached values
1412 * to be invalidated
1413 */
1414 name_cache_lock();
1415 mount_generation++;
1416 name_cache_unlock();
1417
1418 error = vnode_ref(vp);
1419 if (error != 0) {
1420 goto out;
1421 }
1422
1423 error = checkdirs(vp, ctx);
1424 if (error != 0) {
1425 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1426 vnode_rele(vp);
1427 goto out;
1428 }
1429
1430 out:
1431 if (error != 0) {
1432 mp->mnt_vnodecovered = NULLVP;
1433 }
1434 return error;
1435 }
1436
1437 static void
1438 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1439 {
1440 vnode_rele(vp);
1441 vnode_lock_spin(vp);
1442 vp->v_mountedhere = (mount_t)NULL;
1443 vnode_unlock(vp);
1444
1445 mp->mnt_vnodecovered = NULLVP;
1446 }
1447
1448 static int
1449 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1450 {
1451 int error;
1452
1453 /* unmount in progress return error */
1454 mount_lock_spin(mp);
1455 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1456 mount_unlock(mp);
1457 return EBUSY;
1458 }
1459 mount_unlock(mp);
1460 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1461
1462 /*
1463 * We only allow the filesystem to be reloaded if it
1464 * is currently mounted read-only.
1465 */
1466 if ((flags & MNT_RELOAD) &&
1467 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1468 error = ENOTSUP;
1469 goto out;
1470 }
1471
1472 /*
1473 * Only root, or the user that did the original mount is
1474 * permitted to update it.
1475 */
1476 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1477 (!vfs_context_issuser(ctx))) {
1478 error = EPERM;
1479 goto out;
1480 }
1481 #if CONFIG_MACF
1482 error = mac_mount_check_remount(ctx, mp);
1483 if (error != 0) {
1484 goto out;
1485 }
1486 #endif
1487
1488 out:
1489 if (error) {
1490 lck_rw_done(&mp->mnt_rwlock);
1491 }
1492
1493 return error;
1494 }
1495
1496 static void
1497 mount_end_update(mount_t mp)
1498 {
1499 lck_rw_done(&mp->mnt_rwlock);
1500 }
1501
1502 static int
1503 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1504 {
1505 vnode_t vp;
1506
1507 if (height >= MAX_IMAGEBOOT_NESTING) {
1508 return EINVAL;
1509 }
1510
1511 vp = imgsrc_rootvnodes[height];
1512 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1513 *rvpp = vp;
1514 return 0;
1515 } else {
1516 return ENOENT;
1517 }
1518 }
1519
1520 static int
1521 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1522 const char *fsname, vfs_context_t ctx,
1523 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1524 {
1525 int error;
1526 mount_t mp;
1527 boolean_t placed = FALSE;
1528 vnode_t devvp = NULLVP;
1529 struct vfstable *vfsp;
1530 user_addr_t devpath;
1531 char *old_mntonname;
1532 vnode_t rvp;
1533 uint32_t height;
1534 uint32_t flags;
1535
1536 /* If we didn't imageboot, nothing to move */
1537 if (imgsrc_rootvnodes[0] == NULLVP) {
1538 return EINVAL;
1539 }
1540
1541 /* Only root can do this */
1542 if (!vfs_context_issuser(ctx)) {
1543 return EPERM;
1544 }
1545
1546 IMGSRC_DEBUG("looking for root vnode.\n");
1547
1548 /*
1549 * Get root vnode of filesystem we're moving.
1550 */
1551 if (by_index) {
1552 if (is64bit) {
1553 struct user64_mnt_imgsrc_args mia64;
1554 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1555 if (error != 0) {
1556 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1557 return error;
1558 }
1559
1560 height = mia64.mi_height;
1561 flags = mia64.mi_flags;
1562 devpath = mia64.mi_devpath;
1563 } else {
1564 struct user32_mnt_imgsrc_args mia32;
1565 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1566 if (error != 0) {
1567 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1568 return error;
1569 }
1570
1571 height = mia32.mi_height;
1572 flags = mia32.mi_flags;
1573 devpath = mia32.mi_devpath;
1574 }
1575 } else {
1576 /*
1577 * For binary compatibility--assumes one level of nesting.
1578 */
1579 if (is64bit) {
1580 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1581 return error;
1582 }
1583 } else {
1584 user32_addr_t tmp;
1585 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1586 return error;
1587 }
1588
1589 /* munge into LP64 addr */
1590 devpath = CAST_USER_ADDR_T(tmp);
1591 }
1592
1593 height = 0;
1594 flags = 0;
1595 }
1596
1597 if (flags != 0) {
1598 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1599 return EINVAL;
1600 }
1601
1602 error = get_imgsrc_rootvnode(height, &rvp);
1603 if (error != 0) {
1604 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1605 return error;
1606 }
1607
1608 IMGSRC_DEBUG("got root vnode.\n");
1609
1610 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1611
1612 /* Can only move once */
1613 mp = vnode_mount(rvp);
1614 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1615 IMGSRC_DEBUG("Already moved.\n");
1616 error = EBUSY;
1617 goto out0;
1618 }
1619
1620 IMGSRC_DEBUG("Starting updated.\n");
1621
1622 /* Get exclusive rwlock on mount, authorize update on mp */
1623 error = mount_begin_update(mp, ctx, 0);
1624 if (error != 0) {
1625 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1626 goto out0;
1627 }
1628
1629 /*
1630 * It can only be moved once. Flag is set under the rwlock,
1631 * so we're now safe to proceed.
1632 */
1633 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1634 IMGSRC_DEBUG("Already moved [2]\n");
1635 goto out1;
1636 }
1637
1638
1639 IMGSRC_DEBUG("Preparing coveredvp.\n");
1640
1641 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1642 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1643 if (error != 0) {
1644 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1645 goto out1;
1646 }
1647
1648 IMGSRC_DEBUG("Covered vp OK.\n");
1649
1650 /* Sanity check the name caller has provided */
1651 vfsp = mp->mnt_vtable;
1652 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1653 IMGSRC_DEBUG("Wrong fs name.\n");
1654 error = EINVAL;
1655 goto out2;
1656 }
1657
1658 /* Check the device vnode and update mount-from name, for local filesystems */
1659 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1660 IMGSRC_DEBUG("Local, doing device validation.\n");
1661
1662 if (devpath != USER_ADDR_NULL) {
1663 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1664 if (error) {
1665 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1666 goto out2;
1667 }
1668
1669 vnode_put(devvp);
1670 }
1671 }
1672
1673 /*
1674 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1675 * and increment the name cache's mount generation
1676 */
1677
1678 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1679 error = place_mount_and_checkdirs(mp, vp, ctx);
1680 if (error != 0) {
1681 goto out2;
1682 }
1683
1684 placed = TRUE;
1685
1686 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1687 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1688
1689 /* Forbid future moves */
1690 mount_lock(mp);
1691 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1692 mount_unlock(mp);
1693
1694 /* Finally, add to mount list, completely ready to go */
1695 if (mount_list_add(mp) != 0) {
1696 /*
1697 * The system is shutting down trying to umount
1698 * everything, so fail with a plausible errno.
1699 */
1700 error = EBUSY;
1701 goto out3;
1702 }
1703
1704 mount_end_update(mp);
1705 vnode_put(rvp);
1706 FREE(old_mntonname, M_TEMP);
1707
1708 vfs_notify_mount(pvp);
1709
1710 return 0;
1711 out3:
1712 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1713
1714 mount_lock(mp);
1715 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1716 mount_unlock(mp);
1717
1718 out2:
1719 /*
1720 * Placing the mp on the vnode clears VMOUNT,
1721 * so cleanup is different after that point
1722 */
1723 if (placed) {
1724 /* Rele the vp, clear VMOUNT and v_mountedhere */
1725 undo_place_on_covered_vp(mp, vp);
1726 } else {
1727 vnode_lock_spin(vp);
1728 CLR(vp->v_flag, VMOUNT);
1729 vnode_unlock(vp);
1730 }
1731 out1:
1732 mount_end_update(mp);
1733
1734 out0:
1735 vnode_put(rvp);
1736 FREE(old_mntonname, M_TEMP);
1737 return error;
1738 }
1739
1740 #endif /* CONFIG_IMGSRC_ACCESS */
1741
1742 void
1743 enablequotas(struct mount *mp, vfs_context_t ctx)
1744 {
1745 struct nameidata qnd;
1746 int type;
1747 char qfpath[MAXPATHLEN];
1748 const char *qfname = QUOTAFILENAME;
1749 const char *qfopsname = QUOTAOPSNAME;
1750 const char *qfextension[] = INITQFNAMES;
1751
1752 /* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1753 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
1754 return;
1755 }
1756 /*
1757 * Enable filesystem disk quotas if necessary.
1758 * We ignore errors, as this should not interfere with the final mount
1759 */
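/*
 * Sketch of the paths probed below, assuming the usual <sys/quota.h>
 * definitions (QUOTAFILENAME ".quota", QUOTAOPSNAME ".quota.ops",
 * INITQFNAMES { "user", "group", ... }):
 *
 *	<mnt_on_name>/.quota.ops.user	trigger file, looked up via namei()
 *	<mnt_on_name>/.quota.user	handed to VFS_QUOTACTL(Q_QUOTAON)
 *
 * and likewise for the "group" quota type.
 */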
1760 for (type = 0; type < MAXQUOTAS; type++) {
1761 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1762 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1763 CAST_USER_ADDR_T(qfpath), ctx);
1764 if (namei(&qnd) != 0) {
1765 continue; /* option file to trigger quotas is not present */
1766 }
1767 vnode_put(qnd.ni_vp);
1768 nameidone(&qnd);
1769 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1770
1771 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1772 }
1773 return;
1774 }
1775
1776
1777 static int
1778 checkdirs_callback(proc_t p, void * arg)
1779 {
1780 struct cdirargs * cdrp = (struct cdirargs *)arg;
1781 vnode_t olddp = cdrp->olddp;
1782 vnode_t newdp = cdrp->newdp;
1783 struct filedesc *fdp;
1784 vnode_t tvp;
1785 vnode_t fdp_cvp;
1786 vnode_t fdp_rvp;
1787 int cdir_changed = 0;
1788 int rdir_changed = 0;
1789
1790 /*
1791 * XXX Also needs to iterate each thread in the process to see if it
1792 * XXX is using a per-thread current working directory, and, if so,
1793 * XXX update that as well.
1794 */
1795
1796 proc_fdlock(p);
1797 fdp = p->p_fd;
1798 if (fdp == (struct filedesc *)0) {
1799 proc_fdunlock(p);
1800 return PROC_RETURNED;
1801 }
1802 fdp_cvp = fdp->fd_cdir;
1803 fdp_rvp = fdp->fd_rdir;
1804 proc_fdunlock(p);
1805
1806 if (fdp_cvp == olddp) {
1807 vnode_ref(newdp);
1808 tvp = fdp->fd_cdir;
1809 fdp_cvp = newdp;
1810 cdir_changed = 1;
1811 vnode_rele(tvp);
1812 }
1813 if (fdp_rvp == olddp) {
1814 vnode_ref(newdp);
1815 tvp = fdp->fd_rdir;
1816 fdp_rvp = newdp;
1817 rdir_changed = 1;
1818 vnode_rele(tvp);
1819 }
1820 if (cdir_changed || rdir_changed) {
1821 proc_fdlock(p);
1822 fdp->fd_cdir = fdp_cvp;
1823 fdp->fd_rdir = fdp_rvp;
1824 proc_fdunlock(p);
1825 }
1826 return PROC_RETURNED;
1827 }
1828
1829
1830
1831 /*
1832 * Scan all active processes to see if any of them have a current
1833 * or root directory onto which the new filesystem has just been
1834 * mounted. If so, replace them with the new mount point.
1835 */
1836 static int
1837 checkdirs(vnode_t olddp, vfs_context_t ctx)
1838 {
1839 vnode_t newdp;
1840 vnode_t tvp;
1841 int err;
1842 struct cdirargs cdr;
1843
1844 if (olddp->v_usecount == 1) {
1845 return 0;
1846 }
1847 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1848
1849 if (err != 0) {
1850 #if DIAGNOSTIC
1851 panic("mount: lost mount: error %d", err);
1852 #endif
1853 return err;
1854 }
1855
1856 cdr.olddp = olddp;
1857 cdr.newdp = newdp;
1858 /* do not block for exec/fork transitions as the vnodes in cwd & rootdir are not changing */
1859 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1860
1861 if (rootvnode == olddp) {
1862 vnode_ref(newdp);
1863 tvp = rootvnode;
1864 rootvnode = newdp;
1865 vnode_rele(tvp);
1866 }
1867
1868 vnode_put(newdp);
1869 return 0;
1870 }
1871
1872 /*
1873 * Unmount a file system.
1874 *
1875 * Note: unmount takes a path to the vnode mounted on as argument,
1876 * not special file (as before).
1877 */
1878 /* ARGSUSED */
1879 int
1880 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1881 {
1882 vnode_t vp;
1883 struct mount *mp;
1884 int error;
1885 struct nameidata nd;
1886 vfs_context_t ctx = vfs_context_current();
1887
1888 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1889 UIO_USERSPACE, uap->path, ctx);
1890 error = namei(&nd);
1891 if (error) {
1892 return error;
1893 }
1894 vp = nd.ni_vp;
1895 mp = vp->v_mount;
1896 nameidone(&nd);
1897
1898 #if CONFIG_MACF
1899 error = mac_mount_check_umount(ctx, mp);
1900 if (error != 0) {
1901 vnode_put(vp);
1902 return error;
1903 }
1904 #endif
1905 /*
1906 * Must be the root of the filesystem
1907 */
1908 if ((vp->v_flag & VROOT) == 0) {
1909 vnode_put(vp);
1910 return EINVAL;
1911 }
1912 mount_ref(mp, 0);
1913 vnode_put(vp);
1914 /* safedounmount consumes the mount ref */
1915 return safedounmount(mp, uap->flags, ctx);
1916 }
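/*
 * Illustrative userspace sketch (not part of this file): unmount(2) takes
 * the path of the covered mount point itself, e.g.
 *
 *	if (unmount("/Volumes/Example", MNT_FORCE) == -1)
 *		err(1, "unmount");
 *
 * The namei() above resolves that path, requires it to be the root of the
 * mounted filesystem (VROOT), and then hands the mount_t to safedounmount().
 */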
1917
1918 int
1919 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1920 {
1921 mount_t mp;
1922
1923 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1924 if (mp == (mount_t)0) {
1925 return ENOENT;
1926 }
1927 mount_ref(mp, 0);
1928 mount_iterdrop(mp);
1929 /* safedounmount consumes the mount ref */
1930 return safedounmount(mp, flags, ctx);
1931 }
1932
1933
1934 /*
1935 * The mount struct comes with a mount ref, which will be consumed.
1936 * Do the actual file system unmount, preventing some common foot-shooting.
1937 */
1938 int
1939 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1940 {
1941 int error;
1942 proc_t p = vfs_context_proc(ctx);
1943
1944 /*
1945 * If the file system is not responding and MNT_NOBLOCK
1946 * is set and not a forced unmount then return EBUSY.
1947 */
1948 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1949 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1950 error = EBUSY;
1951 goto out;
1952 }
1953
1954 /*
1955 * Skip authorization if the mount is tagged as permissive and
1956 * this is not a forced-unmount attempt.
1957 */
1958 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1959 /*
1960 * Only root, or the user that did the original mount is
1961 * permitted to unmount this filesystem.
1962 */
1963 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1964 (error = suser(kauth_cred_get(), &p->p_acflag))) {
1965 goto out;
1966 }
1967 }
1968 /*
1969 * Don't allow unmounting the root file system.
1970 */
1971 if (mp->mnt_flag & MNT_ROOTFS) {
1972 error = EBUSY; /* the root is always busy */
1973 goto out;
1974 }
1975
1976 #ifdef CONFIG_IMGSRC_ACCESS
1977 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1978 error = EBUSY;
1979 goto out;
1980 }
1981 #endif /* CONFIG_IMGSRC_ACCESS */
1982
1983 return dounmount(mp, flags, 1, ctx);
1984
1985 out:
1986 mount_drop(mp, 0);
1987 return error;
1988 }
1989
1990 /*
1991 * Do the actual file system unmount.
1992 */
1993 int
1994 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1995 {
1996 vnode_t coveredvp = (vnode_t)0;
1997 int error;
1998 int needwakeup = 0;
1999 int forcedunmount = 0;
2000 int lflags = 0;
2001 struct vnode *devvp = NULLVP;
2002 #if CONFIG_TRIGGERS
2003 proc_t p = vfs_context_proc(ctx);
2004 int did_vflush = 0;
2005 int pflags_save = 0;
2006 #endif /* CONFIG_TRIGGERS */
2007
2008 #if CONFIG_FSE
2009 if (!(flags & MNT_FORCE)) {
2010 fsevent_unmount(mp, ctx); /* has to come first! */
2011 }
2012 #endif
2013
2014 mount_lock(mp);
2015
2016 /*
2017 * If an unmount is already in progress, just return EBUSY.
2018 * Even a forced unmount cannot override.
2019 */
2020 if (mp->mnt_lflag & MNT_LUNMOUNT) {
2021 if (withref != 0) {
2022 mount_drop(mp, 1);
2023 }
2024 mount_unlock(mp);
2025 return EBUSY;
2026 }
2027
2028 if (flags & MNT_FORCE) {
2029 forcedunmount = 1;
2030 mp->mnt_lflag |= MNT_LFORCE;
2031 }
2032
2033 #if CONFIG_TRIGGERS
2034 if (flags & MNT_NOBLOCK && p != kernproc) {
2035 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2036 }
2037 #endif
2038
2039 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2040 mp->mnt_lflag |= MNT_LUNMOUNT;
2041 mp->mnt_flag &= ~MNT_ASYNC;
2042 /*
2043 * anyone currently in the fast path that
2044 * trips over the cached rootvp will be
2045 * dumped out and forced into the slow path
2046 * to regenerate a new cached value
2047 */
2048 mp->mnt_realrootvp = NULLVP;
2049 mount_unlock(mp);
2050
2051 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2052 /*
2053 * Force unmount any mounts in this filesystem.
2054 * If any unmounts fail - just leave them dangling.
2055 * Avoids recursion.
2056 */
2057 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2058 }
2059
2060 /*
2061 * taking the name_cache_lock exclusively will
2062 * ensure that everyone is out of the fast path who
2063 * might be trying to use a now-stale copy of
2064 * vp->v_mountedhere->mnt_realrootvp;
2065 * bumping mount_generation causes the cached values
2066 * to be invalidated
2067 */
2068 name_cache_lock();
2069 mount_generation++;
2070 name_cache_unlock();
2071
2072
2073 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2074 if (withref != 0) {
2075 mount_drop(mp, 0);
2076 }
2077 error = 0;
2078 if (forcedunmount == 0) {
2079 ubc_umount(mp); /* release cached vnodes */
2080 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2081 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2082 if (error) {
2083 mount_lock(mp);
2084 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2085 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2086 mp->mnt_lflag &= ~MNT_LFORCE;
2087 goto out;
2088 }
2089 }
2090 }
2091
2092 /* free disk_conditioner_info structure for this mount */
2093 disk_conditioner_unmount(mp);
2094
2095 IOBSDMountChange(mp, kIOMountChangeUnmount);
2096
2097 #if CONFIG_TRIGGERS
2098 vfs_nested_trigger_unmounts(mp, flags, ctx);
2099 did_vflush = 1;
2100 #endif
2101 if (forcedunmount) {
2102 lflags |= FORCECLOSE;
2103 }
2104 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2105 if ((forcedunmount == 0) && error) {
2106 mount_lock(mp);
2107 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2108 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2109 mp->mnt_lflag &= ~MNT_LFORCE;
2110 goto out;
2111 }
2112
2113 /* make sure no one is in the mount iterations or lookup */
2114 mount_iterdrain(mp);
2115
2116 error = VFS_UNMOUNT(mp, flags, ctx);
2117 if (error) {
2118 mount_iterreset(mp);
2119 mount_lock(mp);
2120 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2121 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2122 mp->mnt_lflag &= ~MNT_LFORCE;
2123 goto out;
2124 }
2125
2126 /* increment the operations count */
2127 if (!error) {
2128 OSAddAtomic(1, &vfs_nummntops);
2129 }
2130
2131 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2132 /* hold an io reference and drop the usecount before close */
2133 devvp = mp->mnt_devvp;
2134 vnode_getalways(devvp);
2135 vnode_rele(devvp);
2136 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2137 ctx);
2138 vnode_clearmountedon(devvp);
2139 vnode_put(devvp);
2140 }
2141 lck_rw_done(&mp->mnt_rwlock);
2142 mount_list_remove(mp);
2143 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2144
2145 /* mark the mount point hook in the vp but do not drop the ref yet */
2146 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2147 /*
2148 * The covered vnode needs special handling. Trying to get an
2149 * iocount must not block here as this may lead to deadlocks
2150 * if the Filesystem to which the covered vnode belongs is
2151 * undergoing forced unmounts. Since we hold a usecount, the
2152 * vnode cannot be reused (it can, however, still be terminated)
2153 */
2154 vnode_getalways(coveredvp);
2155 vnode_lock_spin(coveredvp);
2156
2157 mp->mnt_crossref++;
2158 coveredvp->v_mountedhere = (struct mount *)0;
2159 CLR(coveredvp->v_flag, VMOUNT);
2160
2161 vnode_unlock(coveredvp);
2162 vnode_put(coveredvp);
2163 }
2164
2165 mount_list_lock();
2166 mp->mnt_vtable->vfc_refcount--;
2167 mount_list_unlock();
2168
2169 cache_purgevfs(mp); /* remove cache entries for this file sys */
2170 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2171 mount_lock(mp);
2172 mp->mnt_lflag |= MNT_LDEAD;
2173
2174 if (mp->mnt_lflag & MNT_LWAIT) {
2175 /*
2176 * do the wakeup here
2177 * in case we block in mount_refdrain
2178 * which will drop the mount lock
2179 * and allow anyone blocked in vfs_busy
2180 * to wakeup and see the LDEAD state
2181 */
2182 mp->mnt_lflag &= ~MNT_LWAIT;
2183 wakeup((caddr_t)mp);
2184 }
2185 mount_refdrain(mp);
2186 out:
2187 if (mp->mnt_lflag & MNT_LWAIT) {
2188 mp->mnt_lflag &= ~MNT_LWAIT;
2189 needwakeup = 1;
2190 }
2191
2192 #if CONFIG_TRIGGERS
2193 if (flags & MNT_NOBLOCK && p != kernproc) {
2194 // Restore P_NOREMOTEHANG bit to its previous value
2195 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2196 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2197 }
2198 }
2199
2200 /*
2201 * Callback and context are set together under the mount lock, and
2202 * never cleared, so we're safe to examine them here, drop the lock,
2203 * and call out.
2204 */
2205 if (mp->mnt_triggercallback != NULL) {
2206 mount_unlock(mp);
2207 if (error == 0) {
2208 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2209 } else if (did_vflush) {
2210 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2211 }
2212 } else {
2213 mount_unlock(mp);
2214 }
2215 #else
2216 mount_unlock(mp);
2217 #endif /* CONFIG_TRIGGERS */
2218
2219 lck_rw_done(&mp->mnt_rwlock);
2220
2221 if (needwakeup) {
2222 wakeup((caddr_t)mp);
2223 }
2224
2225 if (!error) {
2226 if ((coveredvp != NULLVP)) {
2227 vnode_t pvp = NULLVP;
2228
2229 /*
2230 * The covered vnode needs special handling. Trying to
2231 * get an iocount must not block here as this may lead
2232 * to deadlocks if the Filesystem to which the covered
2233 * vnode belongs is undergoing forced unmounts. Since we
2234 * hold a usecount, the vnode cannot be reused
2235 * (it can, however, still be terminated).
2236 */
2237 vnode_getalways(coveredvp);
2238
2239 mount_dropcrossref(mp, coveredvp, 0);
2240 /*
2241 * We'll _try_ to detect if this really needs to be
2242 * done. The coveredvp can only be in termination (or
2243 * terminated) if the coveredvp's mount point is in a
2244 * forced unmount (or has been) since we still hold the
2245 * ref.
2246 */
2247 if (!vnode_isrecycled(coveredvp)) {
2248 pvp = vnode_getparent(coveredvp);
2249 #if CONFIG_TRIGGERS
2250 if (coveredvp->v_resolve) {
2251 vnode_trigger_rearm(coveredvp, ctx);
2252 }
2253 #endif
2254 }
2255
2256 vnode_rele(coveredvp);
2257 vnode_put(coveredvp);
2258 coveredvp = NULLVP;
2259
2260 if (pvp) {
2261 lock_vnode_and_post(pvp, NOTE_WRITE);
2262 vnode_put(pvp);
2263 }
2264 } else if (mp->mnt_flag & MNT_ROOTFS) {
2265 mount_lock_destroy(mp);
2266 #if CONFIG_MACF
2267 mac_mount_label_destroy(mp);
2268 #endif
2269 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2270 } else {
2271 panic("dounmount: no coveredvp");
2272 }
2273 }
2274 return error;
2275 }
2276
2277 /*
2278 * Unmount any mounts in this filesystem.
2279 */
2280 void
2281 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2282 {
2283 mount_t smp;
2284 fsid_t *fsids, fsid;
2285 int fsids_sz;
2286 int count = 0, i, m = 0;
2287 vnode_t vp;
2288
2289 mount_list_lock();
2290
2291 // Get an array to hold the submounts' fsids.
2292 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2293 count++;
2294 fsids_sz = count * sizeof(fsid_t);
2295 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2296 if (fsids == NULL) {
2297 mount_list_unlock();
2298 goto out;
2299 }
2300 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2301
2302 /*
2303 * Fill the array with submount fsids.
2304 * Since mounts are always added to the tail of the mount list, the
2305 * list is always in mount order.
2306 * For each mount check if the mounted-on vnode belongs to a
2307 * mount that's already added to our array of mounts to be unmounted.
2308 */
2309 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2310 vp = smp->mnt_vnodecovered;
2311 if (vp == NULL) {
2312 continue;
2313 }
2314 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2315 for (i = 0; i <= m; i++) {
2316 if (fsids[i].val[0] == fsid.val[0] &&
2317 fsids[i].val[1] == fsid.val[1]) {
2318 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2319 break;
2320 }
2321 }
2322 }
2323 mount_list_unlock();
2324
2325 // Unmount the submounts in reverse order. Ignore errors.
2326 for (i = m; i > 0; i--) {
2327 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2328 if (smp) {
2329 mount_ref(smp, 0);
2330 mount_iterdrop(smp);
2331 (void) dounmount(smp, flags, 1, ctx);
2332 }
2333 }
2334 out:
2335 if (fsids) {
2336 FREE(fsids, M_TEMP);
2337 }
2338 }
2339
2340 void
2341 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2342 {
2343 vnode_lock(dp);
2344 mp->mnt_crossref--;
2345
2346 if (mp->mnt_crossref < 0) {
2347 panic("mount cross refs -ve");
2348 }
2349
2350 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2351 if (need_put) {
2352 vnode_put_locked(dp);
2353 }
2354 vnode_unlock(dp);
2355
2356 mount_lock_destroy(mp);
2357 #if CONFIG_MACF
2358 mac_mount_label_destroy(mp);
2359 #endif
2360 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2361 return;
2362 }
2363 if (need_put) {
2364 vnode_put_locked(dp);
2365 }
2366 vnode_unlock(dp);
2367 }
2368
2369
2370 /*
2371 * Sync each mounted filesystem.
2372 */
2373 #if DIAGNOSTIC
2374 int syncprt = 0;
2375 #endif
2376
2377 int print_vmpage_stat = 0;
2378
2379 static int
2380 sync_callback(mount_t mp, __unused void *arg)
2381 {
2382 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2383 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2384
2385 mp->mnt_flag &= ~MNT_ASYNC;
2386 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2387 if (asyncflag) {
2388 mp->mnt_flag |= MNT_ASYNC;
2389 }
2390 }
2391
2392 return VFS_RETURNED;
2393 }
2394
2395 /* ARGSUSED */
2396 int
2397 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2398 {
2399 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2400
2401 if (print_vmpage_stat) {
2402 vm_countdirtypages();
2403 }
2404
2405 #if DIAGNOSTIC
2406 if (syncprt) {
2407 vfs_bufstats();
2408 }
2409 #endif /* DIAGNOSTIC */
2410 return 0;
2411 }
2412
2413 typedef enum {
2414 SYNC_ALL = 0,
2415 SYNC_ONLY_RELIABLE_MEDIA = 1,
2416 SYNC_ONLY_UNRELIABLE_MEDIA = 2
2417 } sync_type_t;
2418
2419 static int
2420 sync_internal_callback(mount_t mp, void *arg)
2421 {
2422 if (arg) {
2423 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2424 (mp->mnt_flag & MNT_LOCAL);
2425 sync_type_t sync_type = *((sync_type_t *)arg);
2426
2427 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2428 return VFS_RETURNED;
2429 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2430 return VFS_RETURNED;
2431 }
2432 }
2433
2434 (void)sync_callback(mp, NULL);
2435
2436 return VFS_RETURNED;
2437 }
2438
2439 int sync_thread_state = 0;
2440 int sync_timeout_seconds = 5;
2441
2442 #define SYNC_THREAD_RUN 0x0001
2443 #define SYNC_THREAD_RUNNING 0x0002
2444
2445 static void
2446 sync_thread(__unused void *arg, __unused wait_result_t wr)
2447 {
2448 sync_type_t sync_type;
2449
2450 lck_mtx_lock(sync_mtx_lck);
2451 while (sync_thread_state & SYNC_THREAD_RUN) {
2452 sync_thread_state &= ~SYNC_THREAD_RUN;
2453 lck_mtx_unlock(sync_mtx_lck);
2454
2455 sync_type = SYNC_ONLY_RELIABLE_MEDIA;
2456 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2457 sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
2458 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2459
2460 lck_mtx_lock(sync_mtx_lck);
2461 }
2462 /*
2463 * This wakeup _has_ to be issued before the lock is released otherwise
2464 * we may end up waking up a thread in sync_internal which is
2465 * expecting a wakeup from a thread it just created and not from this
2466 * thread which is about to exit.
2467 */
2468 wakeup(&sync_thread_state);
2469 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2470 lck_mtx_unlock(sync_mtx_lck);
2471
2472 if (print_vmpage_stat) {
2473 vm_countdirtypages();
2474 }
2475
2476 #if DIAGNOSTIC
2477 if (syncprt) {
2478 vfs_bufstats();
2479 }
2480 #endif /* DIAGNOSTIC */
2481 }
2482
2483 struct timeval sync_timeout_last_print = {0, 0};
2484
2485 /*
2486 * An in-kernel sync for power management to call.
2487 * This function always returns within sync_timeout_seconds seconds.
2488 */
2489 __private_extern__ int
2490 sync_internal(void)
2491 {
2492 thread_t thd;
2493 int error;
2494 int thread_created = FALSE;
2495 struct timespec ts = {sync_timeout_seconds, 0};
2496
2497 lck_mtx_lock(sync_mtx_lck);
2498 sync_thread_state |= SYNC_THREAD_RUN;
2499 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
2500 int kr;
2501
2502 sync_thread_state |= SYNC_THREAD_RUNNING;
2503 kr = kernel_thread_start(sync_thread, NULL, &thd);
2504 if (kr != KERN_SUCCESS) {
2505 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2506 lck_mtx_unlock(sync_mtx_lck);
2507 printf("sync_thread failed\n");
2508 return 0;
2509 }
2510 thread_created = TRUE;
2511 }
2512
2513 error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
2514 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2515 if (error) {
2516 struct timeval now;
2517
2518 microtime(&now);
2519 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
2520 printf("sync timed out: %d sec\n", sync_timeout_seconds);
2521 sync_timeout_last_print.tv_sec = now.tv_sec;
2522 }
2523 }
2524
2525 if (thread_created) {
2526 thread_deallocate(thd);
2527 }
2528
2529 return 0;
2530 } /* end of sync_internal call */
2531
2532 /*
2533 * Change filesystem quotas.
2534 */
2535 #if QUOTA
2536 int
2537 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2538 {
2539 struct mount *mp;
2540 int error, quota_cmd, quota_status = 0;
2541 caddr_t datap;
2542 size_t fnamelen;
2543 struct nameidata nd;
2544 vfs_context_t ctx = vfs_context_current();
2545 struct dqblk my_dqblk = {};
2546
2547 AUDIT_ARG(uid, uap->uid);
2548 AUDIT_ARG(cmd, uap->cmd);
2549 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2550 uap->path, ctx);
2551 error = namei(&nd);
2552 if (error) {
2553 return error;
2554 }
2555 mp = nd.ni_vp->v_mount;
2556 vnode_put(nd.ni_vp);
2557 nameidone(&nd);
2558
2559 /* copyin any data we will need for downstream code */
2560 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2561
2562 switch (quota_cmd) {
2563 case Q_QUOTAON:
2564 /* uap->arg specifies a file from which to take the quotas */
2565 fnamelen = MAXPATHLEN;
2566 datap = kalloc(MAXPATHLEN);
2567 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2568 break;
2569 case Q_GETQUOTA:
2570 /* uap->arg is a pointer to a dqblk structure. */
2571 datap = (caddr_t) &my_dqblk;
2572 break;
2573 case Q_SETQUOTA:
2574 case Q_SETUSE:
2575 /* uap->arg is a pointer to a dqblk structure. */
2576 datap = (caddr_t) &my_dqblk;
2577 if (proc_is64bit(p)) {
2578 struct user_dqblk my_dqblk64;
2579 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2580 if (error == 0) {
2581 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2582 }
2583 } else {
2584 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2585 }
2586 break;
2587 case Q_QUOTASTAT:
2588 /* uap->arg is a pointer to an integer */
2589 datap = (caddr_t) &quota_status;
2590 break;
2591 default:
2592 datap = NULL;
2593 break;
2594 } /* switch */
2595
2596 if (error == 0) {
2597 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2598 }
2599
2600 switch (quota_cmd) {
2601 case Q_QUOTAON:
2602 if (datap != NULL) {
2603 kfree(datap, MAXPATHLEN);
2604 }
2605 break;
2606 case Q_GETQUOTA:
2607 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2608 if (error == 0) {
2609 if (proc_is64bit(p)) {
2610 struct user_dqblk my_dqblk64;
2611
2612 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2613 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2614 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
2615 } else {
2616 error = copyout(datap, uap->arg, sizeof(struct dqblk));
2617 }
2618 }
2619 break;
2620 case Q_QUOTASTAT:
2621 /* uap->arg is a pointer to an integer */
2622 if (error == 0) {
2623 error = copyout(datap, uap->arg, sizeof(quota_status));
2624 }
2625 break;
2626 default:
2627 break;
2628 } /* switch */
2629
2630 return error;
2631 }
2632 #else
2633 int
2634 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2635 {
2636 return EOPNOTSUPP;
2637 }
2638 #endif /* QUOTA */
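
/*
 * Illustrative userspace sketch (not compiled into the kernel, and only
 * meaningful when QUOTA is configured) of querying a user's disk quota via
 * the quotactl(2) handler above.  The path and uid are placeholders, and the
 * dqb_curbytes field name is assumed from <sys/quota.h>.
 *
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <sys/quota.h>
 *
 *	void
 *	show_quota(const char *path, uid_t uid)
 *	{
 *		struct dqblk dq;
 *
 *		// QCMD() packs Q_GETQUOTA and the quota type, mirroring the
 *		// SUBCMDSHIFT unpacking performed by quotactl() above.
 *		if (quotactl(path, QCMD(Q_GETQUOTA, USRQUOTA), uid, (caddr_t)&dq) == 0)
 *			printf("uid %u: %llu bytes in use\n", (unsigned)uid,
 *			    (unsigned long long)dq.dqb_curbytes);
 *		else
 *			perror("quotactl");
 *	}
 */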
2639
2640 /*
2641 * Get filesystem statistics.
2642 *
2643 * Returns: 0 Success
2644 * namei:???
2645 * vfs_update_vfsstat:???
2646 * munge_statfs:EFAULT
2647 */
2648 /* ARGSUSED */
2649 int
2650 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2651 {
2652 struct mount *mp;
2653 struct vfsstatfs *sp;
2654 int error;
2655 struct nameidata nd;
2656 vfs_context_t ctx = vfs_context_current();
2657 vnode_t vp;
2658
2659 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2660 UIO_USERSPACE, uap->path, ctx);
2661 error = namei(&nd);
2662 if (error != 0) {
2663 return error;
2664 }
2665 vp = nd.ni_vp;
2666 mp = vp->v_mount;
2667 sp = &mp->mnt_vfsstat;
2668 nameidone(&nd);
2669
2670 #if CONFIG_MACF
2671 error = mac_mount_check_stat(ctx, mp);
2672 if (error != 0) {
2673 return error;
2674 }
2675 #endif
2676
2677 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2678 if (error != 0) {
2679 vnode_put(vp);
2680 return error;
2681 }
2682
2683 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2684 vnode_put(vp);
2685 return error;
2686 }
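
/*
 * Illustrative userspace sketch (not compiled into the kernel) of the
 * statfs(2) call serviced above; the path and function name are placeholders.
 * The fields printed are the ones refreshed by vfs_update_vfsstat() and
 * copied out by munge_statfs().
 *
 *	#include <stdio.h>
 *	#include <sys/param.h>
 *	#include <sys/mount.h>
 *
 *	void
 *	print_fs_info(const char *path)
 *	{
 *		struct statfs sf;
 *
 *		if (statfs(path, &sf) != 0) {
 *			perror("statfs");
 *			return;
 *		}
 *		printf("%s: %s mounted on %s (%s)\n", path,
 *		    sf.f_mntfromname, sf.f_mntonname, sf.f_fstypename);
 *	}
 */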
2687
2688 /*
2689 * Get filesystem statistics.
2690 */
2691 /* ARGSUSED */
2692 int
2693 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2694 {
2695 vnode_t vp;
2696 struct mount *mp;
2697 struct vfsstatfs *sp;
2698 int error;
2699
2700 AUDIT_ARG(fd, uap->fd);
2701
2702 if ((error = file_vnode(uap->fd, &vp))) {
2703 return error;
2704 }
2705
2706 error = vnode_getwithref(vp);
2707 if (error) {
2708 file_drop(uap->fd);
2709 return error;
2710 }
2711
2712 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2713
2714 mp = vp->v_mount;
2715 if (!mp) {
2716 error = EBADF;
2717 goto out;
2718 }
2719
2720 #if CONFIG_MACF
2721 error = mac_mount_check_stat(vfs_context_current(), mp);
2722 if (error != 0) {
2723 goto out;
2724 }
2725 #endif
2726
2727 sp = &mp->mnt_vfsstat;
2728 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2729 goto out;
2730 }
2731
2732 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2733
2734 out:
2735 file_drop(uap->fd);
2736 vnode_put(vp);
2737
2738 return error;
2739 }
2740
2741 /*
2742 * Common routine to handle copying of statfs64 data to user space
2743 */
2744 static int
2745 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2746 {
2747 int error;
2748 struct statfs64 sfs;
2749
2750 bzero(&sfs, sizeof(sfs));
2751
2752 sfs.f_bsize = sfsp->f_bsize;
2753 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2754 sfs.f_blocks = sfsp->f_blocks;
2755 sfs.f_bfree = sfsp->f_bfree;
2756 sfs.f_bavail = sfsp->f_bavail;
2757 sfs.f_files = sfsp->f_files;
2758 sfs.f_ffree = sfsp->f_ffree;
2759 sfs.f_fsid = sfsp->f_fsid;
2760 sfs.f_owner = sfsp->f_owner;
2761 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2762 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2763 sfs.f_fssubtype = sfsp->f_fssubtype;
2764 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2765 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2766 } else {
2767 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2768 }
2769 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2770 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2771
2772 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2773
2774 return error;
2775 }
2776
2777 /*
2778 * Get file system statistics in 64-bit mode
2779 */
2780 int
2781 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2782 {
2783 struct mount *mp;
2784 struct vfsstatfs *sp;
2785 int error;
2786 struct nameidata nd;
2787 vfs_context_t ctxp = vfs_context_current();
2788 vnode_t vp;
2789
2790 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2791 UIO_USERSPACE, uap->path, ctxp);
2792 error = namei(&nd);
2793 if (error != 0) {
2794 return error;
2795 }
2796 vp = nd.ni_vp;
2797 mp = vp->v_mount;
2798 sp = &mp->mnt_vfsstat;
2799 nameidone(&nd);
2800
2801 #if CONFIG_MACF
2802 error = mac_mount_check_stat(ctxp, mp);
2803 if (error != 0) {
2804 return error;
2805 }
2806 #endif
2807
2808 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2809 if (error != 0) {
2810 vnode_put(vp);
2811 return error;
2812 }
2813
2814 error = statfs64_common(mp, sp, uap->buf);
2815 vnode_put(vp);
2816
2817 return error;
2818 }
2819
2820 /*
2821 * Get file system statistics in 64-bit mode
2822 */
2823 int
2824 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2825 {
2826 struct vnode *vp;
2827 struct mount *mp;
2828 struct vfsstatfs *sp;
2829 int error;
2830
2831 AUDIT_ARG(fd, uap->fd);
2832
2833 if ((error = file_vnode(uap->fd, &vp))) {
2834 return error;
2835 }
2836
2837 error = vnode_getwithref(vp);
2838 if (error) {
2839 file_drop(uap->fd);
2840 return error;
2841 }
2842
2843 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2844
2845 mp = vp->v_mount;
2846 if (!mp) {
2847 error = EBADF;
2848 goto out;
2849 }
2850
2851 #if CONFIG_MACF
2852 error = mac_mount_check_stat(vfs_context_current(), mp);
2853 if (error != 0) {
2854 goto out;
2855 }
2856 #endif
2857
2858 sp = &mp->mnt_vfsstat;
2859 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2860 goto out;
2861 }
2862
2863 error = statfs64_common(mp, sp, uap->buf);
2864
2865 out:
2866 file_drop(uap->fd);
2867 vnode_put(vp);
2868
2869 return error;
2870 }
2871
2872 struct getfsstat_struct {
2873 user_addr_t sfsp;
2874 user_addr_t *mp;
2875 int count;
2876 int maxcount;
2877 int flags;
2878 int error;
2879 };
2880
2881
2882 static int
2883 getfsstat_callback(mount_t mp, void * arg)
2884 {
2885 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2886 struct vfsstatfs *sp;
2887 int error, my_size;
2888 vfs_context_t ctx = vfs_context_current();
2889
2890 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2891 #if CONFIG_MACF
2892 error = mac_mount_check_stat(ctx, mp);
2893 if (error != 0) {
2894 fstp->error = error;
2895 return VFS_RETURNED_DONE;
2896 }
2897 #endif
2898 sp = &mp->mnt_vfsstat;
2899 /*
2900 * If MNT_NOWAIT is specified, do not refresh the
2901 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2902 */
2903 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2904 (error = vfs_update_vfsstat(mp, ctx,
2905 VFS_USER_EVENT))) {
2906 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2907 return VFS_RETURNED;
2908 }
2909
2910 /*
2911 * Need to handle LP64 version of struct statfs
2912 */
2913 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2914 if (error) {
2915 fstp->error = error;
2916 return VFS_RETURNED_DONE;
2917 }
2918 fstp->sfsp += my_size;
2919
2920 if (fstp->mp) {
2921 #if CONFIG_MACF
2922 error = mac_mount_label_get(mp, *fstp->mp);
2923 if (error) {
2924 fstp->error = error;
2925 return VFS_RETURNED_DONE;
2926 }
2927 #endif
2928 fstp->mp++;
2929 }
2930 }
2931 fstp->count++;
2932 return VFS_RETURNED;
2933 }
2934
2935 /*
2936 * Get statistics on all filesystems.
2937 */
2938 int
2939 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2940 {
2941 struct __mac_getfsstat_args muap;
2942
2943 muap.buf = uap->buf;
2944 muap.bufsize = uap->bufsize;
2945 muap.mac = USER_ADDR_NULL;
2946 muap.macsize = 0;
2947 muap.flags = uap->flags;
2948
2949 return __mac_getfsstat(p, &muap, retval);
2950 }
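
/*
 * Illustrative userspace sketch (not compiled into the kernel) of the common
 * two-pass use of getfsstat(2), which is serviced by __mac_getfsstat() below:
 * a NULL buffer returns the number of mounts, then a sized buffer collects
 * the statistics.  MNT_NOWAIT matches the cache behavior described in
 * getfsstat_callback() above.  The helper name is a placeholder.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <sys/param.h>
 *	#include <sys/ucred.h>
 *	#include <sys/mount.h>
 *
 *	void
 *	list_mounts(void)
 *	{
 *		struct statfs *buf;
 *		int i, n;
 *
 *		n = getfsstat(NULL, 0, MNT_NOWAIT);	// pass 1: count only
 *		if (n <= 0)
 *			return;
 *		buf = calloc(n, sizeof(*buf));
 *		if (buf == NULL)
 *			return;
 *		n = getfsstat(buf, (int)(n * sizeof(*buf)), MNT_NOWAIT);
 *		for (i = 0; i < n; i++)
 *			printf("%s on %s\n", buf[i].f_mntfromname, buf[i].f_mntonname);
 *		free(buf);
 *	}
 */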
2951
2952 /*
2953 * __mac_getfsstat: Get MAC-related file system statistics
2954 *
2955 * Parameters: p (ignored)
2956 * uap User argument descriptor (see below)
2957 * retval Count of file system statistics (N stats)
2958 *
2959 * Indirect: uap->bufsize Buffer size
2960 * uap->macsize MAC info size
2961 * uap->buf Buffer where information will be returned
2962 * uap->mac MAC info
2963 * uap->flags File system flags
2964 *
2965 *
2966 * Returns: 0 Success
2967 * !0 Not success
2968 *
2969 */
2970 int
2971 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2972 {
2973 user_addr_t sfsp;
2974 user_addr_t *mp;
2975 size_t count, maxcount, bufsize, macsize;
2976 struct getfsstat_struct fst;
2977
2978 bufsize = (size_t) uap->bufsize;
2979 macsize = (size_t) uap->macsize;
2980
2981 if (IS_64BIT_PROCESS(p)) {
2982 maxcount = bufsize / sizeof(struct user64_statfs);
2983 } else {
2984 maxcount = bufsize / sizeof(struct user32_statfs);
2985 }
2986 sfsp = uap->buf;
2987 count = 0;
2988
2989 mp = NULL;
2990
2991 #if CONFIG_MACF
2992 if (uap->mac != USER_ADDR_NULL) {
2993 u_int32_t *mp0;
2994 int error;
2995 unsigned int i;
2996
2997 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2998 if (count != maxcount) {
2999 return EINVAL;
3000 }
3001
3002 /* Copy in the array */
3003 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
3004 if (mp0 == NULL) {
3005 return ENOMEM;
3006 }
3007
3008 error = copyin(uap->mac, mp0, macsize);
3009 if (error) {
3010 FREE(mp0, M_MACTEMP);
3011 return error;
3012 }
3013
3014 /* Normalize to an array of user_addr_t */
3015 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
3016 if (mp == NULL) {
3017 FREE(mp0, M_MACTEMP);
3018 return ENOMEM;
3019 }
3020
3021 for (i = 0; i < count; i++) {
3022 if (IS_64BIT_PROCESS(p)) {
3023 mp[i] = ((user_addr_t *)mp0)[i];
3024 } else {
3025 mp[i] = (user_addr_t)mp0[i];
3026 }
3027 }
3028 FREE(mp0, M_MACTEMP);
3029 }
3030 #endif
3031
3032
3033 fst.sfsp = sfsp;
3034 fst.mp = mp;
3035 fst.flags = uap->flags;
3036 fst.count = 0;
3037 fst.error = 0;
3038 fst.maxcount = maxcount;
3039
3040
3041 vfs_iterate(0, getfsstat_callback, &fst);
3042
3043 if (mp) {
3044 FREE(mp, M_MACTEMP);
3045 }
3046
3047 if (fst.error) {
3048 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3049 return fst.error;
3050 }
3051
3052 if (fst.sfsp && fst.count > fst.maxcount) {
3053 *retval = fst.maxcount;
3054 } else {
3055 *retval = fst.count;
3056 }
3057 return 0;
3058 }
3059
3060 static int
3061 getfsstat64_callback(mount_t mp, void * arg)
3062 {
3063 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3064 struct vfsstatfs *sp;
3065 int error;
3066
3067 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3068 #if CONFIG_MACF
3069 error = mac_mount_check_stat(vfs_context_current(), mp);
3070 if (error != 0) {
3071 fstp->error = error;
3072 return VFS_RETURNED_DONE;
3073 }
3074 #endif
3075 sp = &mp->mnt_vfsstat;
3076 /*
3077 * If MNT_NOWAIT is specified, do not refresh the fsstat
3078 * cache. MNT_WAIT overrides MNT_NOWAIT.
3079 *
3080 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3081 * getfsstat, since the constants are out of the same
3082 * namespace.
3083 */
3084 if (((fstp->flags & MNT_NOWAIT) == 0 ||
3085 (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3086 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
3087 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3088 return VFS_RETURNED;
3089 }
3090
3091 error = statfs64_common(mp, sp, fstp->sfsp);
3092 if (error) {
3093 fstp->error = error;
3094 return VFS_RETURNED_DONE;
3095 }
3096 fstp->sfsp += sizeof(struct statfs64);
3097 }
3098 fstp->count++;
3099 return VFS_RETURNED;
3100 }
3101
3102 /*
3103 * Get statistics on all file systems in 64 bit mode.
3104 */
3105 int
3106 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3107 {
3108 user_addr_t sfsp;
3109 int count, maxcount;
3110 struct getfsstat_struct fst;
3111
3112 maxcount = uap->bufsize / sizeof(struct statfs64);
3113
3114 sfsp = uap->buf;
3115 count = 0;
3116
3117 fst.sfsp = sfsp;
3118 fst.flags = uap->flags;
3119 fst.count = 0;
3120 fst.error = 0;
3121 fst.maxcount = maxcount;
3122
3123 vfs_iterate(0, getfsstat64_callback, &fst);
3124
3125 if (fst.error) {
3126 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3127 return fst.error;
3128 }
3129
3130 if (fst.sfsp && fst.count > fst.maxcount) {
3131 *retval = fst.maxcount;
3132 } else {
3133 *retval = fst.count;
3134 }
3135
3136 return 0;
3137 }
3138
3139 /*
3140 * Gets the vnode associated with the file descriptor passed
3141 * as input.
3142 *
3143 * INPUT
3144 * ctx - vfs context of caller
3145 * fd - file descriptor for which vnode is required.
3146 * vpp - Pointer to pointer to vnode to be returned.
3147 *
3148 * The vnode is returned with an iocount so any vnode obtained
3149 * by this call needs a vnode_put
3150 *
3151 */
3152 int
3153 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3154 {
3155 int error;
3156 vnode_t vp;
3157 struct fileproc *fp;
3158 proc_t p = vfs_context_proc(ctx);
3159
3160 *vpp = NULLVP;
3161
3162 error = fp_getfvp(p, fd, &fp, &vp);
3163 if (error) {
3164 return error;
3165 }
3166
3167 error = vnode_getwithref(vp);
3168 if (error) {
3169 (void)fp_drop(p, fd, fp, 0);
3170 return error;
3171 }
3172
3173 (void)fp_drop(p, fd, fp, 0);
3174 *vpp = vp;
3175 return error;
3176 }
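
/*
 * A minimal in-kernel usage sketch of the helper above, following the
 * iocount rule stated in its comment; "fd" is whatever descriptor the
 * caller was handed.
 *
 *	vfs_context_t ctx = vfs_context_current();
 *	vnode_t vp;
 *	int error;
 *
 *	error = vnode_getfromfd(ctx, fd, &vp);
 *	if (error == 0) {
 *		// ... use vp; an iocount is held across this region ...
 *		vnode_put(vp);	// drop the iocount taken by vnode_getfromfd
 *	}
 */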
3177
3178 /*
3179 * Wrapper function around namei to start lookup from a directory
3180 * specified by a file descriptor ni_dirfd.
3181 *
3182 * In addition to all the errors returned by namei, this call can
3183 * return ENOTDIR if the file descriptor does not refer to a directory,
3184 * and EBADF if the file descriptor is not valid.
3185 */
3186 int
3187 nameiat(struct nameidata *ndp, int dirfd)
3188 {
3189 if ((dirfd != AT_FDCWD) &&
3190 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3191 !(ndp->ni_cnd.cn_flags & USEDVP)) {
3192 int error = 0;
3193 char c;
3194
3195 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3196 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3197 if (error) {
3198 return error;
3199 }
3200 } else {
3201 c = *((char *)(ndp->ni_dirp));
3202 }
3203
3204 if (c != '/') {
3205 vnode_t dvp_at;
3206
3207 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3208 &dvp_at);
3209 if (error) {
3210 return error;
3211 }
3212
3213 if (vnode_vtype(dvp_at) != VDIR) {
3214 vnode_put(dvp_at);
3215 return ENOTDIR;
3216 }
3217
3218 ndp->ni_dvp = dvp_at;
3219 ndp->ni_cnd.cn_flags |= USEDVP;
3220 error = namei(ndp);
3221 ndp->ni_cnd.cn_flags &= ~USEDVP;
3222 vnode_put(dvp_at);
3223 return error;
3224 }
3225 }
3226
3227 return namei(ndp);
3228 }
3229
3230 /*
3231 * Change current working directory to a given file descriptor.
3232 */
3233 /* ARGSUSED */
3234 static int
3235 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3236 {
3237 struct filedesc *fdp = p->p_fd;
3238 vnode_t vp;
3239 vnode_t tdp;
3240 vnode_t tvp;
3241 struct mount *mp;
3242 int error;
3243 vfs_context_t ctx = vfs_context_current();
3244
3245 AUDIT_ARG(fd, uap->fd);
3246 if (per_thread && uap->fd == -1) {
3247 /*
3248 * Switching back from per-thread to per-process CWD; verify we
3249 * in fact have one before proceeding. The only success case
3250 * for this code path is to return 0 preemptively after zapping
3251 * the thread structure contents.
3252 */
3253 thread_t th = vfs_context_thread(ctx);
3254 if (th) {
3255 uthread_t uth = get_bsdthread_info(th);
3256 tvp = uth->uu_cdir;
3257 uth->uu_cdir = NULLVP;
3258 if (tvp != NULLVP) {
3259 vnode_rele(tvp);
3260 return 0;
3261 }
3262 }
3263 return EBADF;
3264 }
3265
3266 if ((error = file_vnode(uap->fd, &vp))) {
3267 return error;
3268 }
3269 if ((error = vnode_getwithref(vp))) {
3270 file_drop(uap->fd);
3271 return error;
3272 }
3273
3274 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3275
3276 if (vp->v_type != VDIR) {
3277 error = ENOTDIR;
3278 goto out;
3279 }
3280
3281 #if CONFIG_MACF
3282 error = mac_vnode_check_chdir(ctx, vp);
3283 if (error) {
3284 goto out;
3285 }
3286 #endif
3287 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3288 if (error) {
3289 goto out;
3290 }
3291
3292 while (!error && (mp = vp->v_mountedhere) != NULL) {
3293 if (vfs_busy(mp, LK_NOWAIT)) {
3294 error = EACCES;
3295 goto out;
3296 }
3297 error = VFS_ROOT(mp, &tdp, ctx);
3298 vfs_unbusy(mp);
3299 if (error) {
3300 break;
3301 }
3302 vnode_put(vp);
3303 vp = tdp;
3304 }
3305 if (error) {
3306 goto out;
3307 }
3308 if ((error = vnode_ref(vp))) {
3309 goto out;
3310 }
3311 vnode_put(vp);
3312
3313 if (per_thread) {
3314 thread_t th = vfs_context_thread(ctx);
3315 if (th) {
3316 uthread_t uth = get_bsdthread_info(th);
3317 tvp = uth->uu_cdir;
3318 uth->uu_cdir = vp;
3319 OSBitOrAtomic(P_THCWD, &p->p_flag);
3320 } else {
3321 vnode_rele(vp);
3322 return ENOENT;
3323 }
3324 } else {
3325 proc_fdlock(p);
3326 tvp = fdp->fd_cdir;
3327 fdp->fd_cdir = vp;
3328 proc_fdunlock(p);
3329 }
3330
3331 if (tvp) {
3332 vnode_rele(tvp);
3333 }
3334 file_drop(uap->fd);
3335
3336 return 0;
3337 out:
3338 vnode_put(vp);
3339 file_drop(uap->fd);
3340
3341 return error;
3342 }
3343
3344 int
3345 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3346 {
3347 return common_fchdir(p, uap, 0);
3348 }
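
/*
 * Illustrative userspace sketch (not compiled into the kernel) of the usual
 * save-and-restore pattern built on fchdir(2), which is handled by
 * common_fchdir() above; the paths and helper name are placeholders.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	run_in_dir(const char *dir)
 *	{
 *		int saved, ret = -1;
 *
 *		saved = open(".", O_RDONLY | O_DIRECTORY);	// remember the current directory
 *		if (saved < 0)
 *			return -1;
 *		if (chdir(dir) == 0) {
 *			// ... do work relative to dir ...
 *			ret = fchdir(saved);	// return to the saved directory
 *		}
 *		close(saved);
 *		return ret;
 *	}
 */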
3349
3350 int
3351 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3352 {
3353 return common_fchdir(p, (void *)uap, 1);
3354 }
3355
3356 /*
3357 * Change current working directory (".").
3358 *
3359 * Returns: 0 Success
3360 * change_dir:ENOTDIR
3361 * change_dir:???
3362 * vnode_ref:ENOENT No such file or directory
3363 */
3364 /* ARGSUSED */
3365 static int
3366 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3367 {
3368 struct filedesc *fdp = p->p_fd;
3369 int error;
3370 struct nameidata nd;
3371 vnode_t tvp;
3372 vfs_context_t ctx = vfs_context_current();
3373
3374 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3375 UIO_USERSPACE, uap->path, ctx);
3376 error = change_dir(&nd, ctx);
3377 if (error) {
3378 return error;
3379 }
3380 if ((error = vnode_ref(nd.ni_vp))) {
3381 vnode_put(nd.ni_vp);
3382 return error;
3383 }
3384 /*
3385 * drop the iocount we picked up in change_dir
3386 */
3387 vnode_put(nd.ni_vp);
3388
3389 if (per_thread) {
3390 thread_t th = vfs_context_thread(ctx);
3391 if (th) {
3392 uthread_t uth = get_bsdthread_info(th);
3393 tvp = uth->uu_cdir;
3394 uth->uu_cdir = nd.ni_vp;
3395 OSBitOrAtomic(P_THCWD, &p->p_flag);
3396 } else {
3397 vnode_rele(nd.ni_vp);
3398 return ENOENT;
3399 }
3400 } else {
3401 proc_fdlock(p);
3402 tvp = fdp->fd_cdir;
3403 fdp->fd_cdir = nd.ni_vp;
3404 proc_fdunlock(p);
3405 }
3406
3407 if (tvp) {
3408 vnode_rele(tvp);
3409 }
3410
3411 return 0;
3412 }
3413
3414
3415 /*
3416 * chdir
3417 *
3418 * Change current working directory (".") for the entire process
3419 *
3420 * Parameters: p Process requesting the call
3421 * uap User argument descriptor (see below)
3422 * retval (ignored)
3423 *
3424 * Indirect parameters: uap->path Directory path
3425 *
3426 * Returns: 0 Success
3427 * common_chdir: ENOTDIR
3428 * common_chdir: ENOENT No such file or directory
3429 * common_chdir: ???
3430 *
3431 */
3432 int
3433 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3434 {
3435 return common_chdir(p, (void *)uap, 0);
3436 }
3437
3438 /*
3439 * __pthread_chdir
3440 *
3441 * Change current working directory (".") for a single thread
3442 *
3443 * Parameters: p Process requesting the call
3444 * uap User argument descriptor (see below)
3445 * retval (ignored)
3446 *
3447 * Indirect parameters: uap->path Directory path
3448 *
3449 * Returns: 0 Success
3450 * common_chdir: ENOTDIR
3451 * common_chdir: ENOENT No such file or directory
3452 * common_chdir: ???
3453 *
3454 */
3455 int
3456 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3457 {
3458 return common_chdir(p, (void *)uap, 1);
3459 }
3460
3461
3462 /*
3463 * Change notion of root (``/'') directory.
3464 */
3465 /* ARGSUSED */
3466 int
3467 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3468 {
3469 struct filedesc *fdp = p->p_fd;
3470 int error;
3471 struct nameidata nd;
3472 vnode_t tvp;
3473 vfs_context_t ctx = vfs_context_current();
3474
3475 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
3476 return error;
3477 }
3478
3479 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3480 UIO_USERSPACE, uap->path, ctx);
3481 error = change_dir(&nd, ctx);
3482 if (error) {
3483 return error;
3484 }
3485
3486 #if CONFIG_MACF
3487 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3488 &nd.ni_cnd);
3489 if (error) {
3490 vnode_put(nd.ni_vp);
3491 return error;
3492 }
3493 #endif
3494
3495 if ((error = vnode_ref(nd.ni_vp))) {
3496 vnode_put(nd.ni_vp);
3497 return error;
3498 }
3499 vnode_put(nd.ni_vp);
3500
3501 proc_fdlock(p);
3502 tvp = fdp->fd_rdir;
3503 fdp->fd_rdir = nd.ni_vp;
3504 fdp->fd_flags |= FD_CHROOT;
3505 proc_fdunlock(p);
3506
3507 if (tvp != NULL) {
3508 vnode_rele(tvp);
3509 }
3510
3511 return 0;
3512 }
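
/*
 * Illustrative userspace sketch (not compiled into the kernel) of the
 * chroot(2) handler above.  It is subject to the suser() check performed
 * there, and the directory path is a placeholder.  chdir() into the new
 * root first so the working directory does not escape it.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	confine_to(const char *newroot)
 *	{
 *		// change directory first so "." sits inside the new root
 *		if (chdir(newroot) != 0 || chroot(newroot) != 0) {
 *			perror("chroot");
 *			return -1;
 *		}
 *		return 0;
 *	}
 */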
3513
3514 /*
3515 * Common routine for chroot and chdir.
3516 *
3517 * Returns: 0 Success
3518 * ENOTDIR Not a directory
3519 * namei:??? [anything namei can return]
3520 * vnode_authorize:??? [anything vnode_authorize can return]
3521 */
3522 static int
3523 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3524 {
3525 vnode_t vp;
3526 int error;
3527
3528 if ((error = namei(ndp))) {
3529 return error;
3530 }
3531 nameidone(ndp);
3532 vp = ndp->ni_vp;
3533
3534 if (vp->v_type != VDIR) {
3535 vnode_put(vp);
3536 return ENOTDIR;
3537 }
3538
3539 #if CONFIG_MACF
3540 error = mac_vnode_check_chdir(ctx, vp);
3541 if (error) {
3542 vnode_put(vp);
3543 return error;
3544 }
3545 #endif
3546
3547 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3548 if (error) {
3549 vnode_put(vp);
3550 return error;
3551 }
3552
3553 return error;
3554 }
3555
3556 /*
3557 * Allocate the per-fd vnode data (for directories) associated with the file glob.
3558 */
3559 struct fd_vn_data *
3560 fg_vn_data_alloc(void)
3561 {
3562 struct fd_vn_data *fvdata;
3563
3564 /* Allocate per fd vnode data */
3565 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3566 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3567 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3568 return fvdata;
3569 }
3570
3571 /*
3572 * Free the vnode data (for directories) associated with the file glob.
3573 */
3574 void
3575 fg_vn_data_free(void *fgvndata)
3576 {
3577 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3578
3579 if (fvdata->fv_buf) {
3580 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3581 }
3582 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3583 FREE(fvdata, M_FD_VN_DATA);
3584 }
3585
3586 /*
3587 * Check permissions, allocate an open file structure,
3588 * and call the device open routine if any.
3589 *
3590 * Returns: 0 Success
3591 * EINVAL
3592 * EINTR
3593 * falloc:ENFILE
3594 * falloc:EMFILE
3595 * falloc:ENOMEM
3596 * vn_open_auth:???
3597 * dupfdopen:???
3598 * VNOP_ADVLOCK:???
3599 * vnode_setsize:???
3600 *
3601 * XXX Need to implement uid, gid
3602 */
3603 int
3604 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3605 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3606 int32_t *retval)
3607 {
3608 proc_t p = vfs_context_proc(ctx);
3609 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3610 struct fileproc *fp;
3611 vnode_t vp;
3612 int flags, oflags;
3613 int type, indx, error;
3614 struct flock lf;
3615 struct vfs_context context;
3616
3617 oflags = uflags;
3618
3619 if ((oflags & O_ACCMODE) == O_ACCMODE) {
3620 return EINVAL;
3621 }
3622
3623 flags = FFLAGS(uflags);
3624 CLR(flags, FENCRYPTED);
3625 CLR(flags, FUNENCRYPTED);
3626
3627 AUDIT_ARG(fflags, oflags);
3628 AUDIT_ARG(mode, vap->va_mode);
3629
3630 if ((error = falloc_withalloc(p,
3631 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3632 return error;
3633 }
3634 uu->uu_dupfd = -indx - 1;
3635
3636 if ((error = vn_open_auth(ndp, &flags, vap))) {
3637 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) { /* XXX from fdopen */
3638 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3639 fp_drop(p, indx, NULL, 0);
3640 *retval = indx;
3641 return 0;
3642 }
3643 }
3644 if (error == ERESTART) {
3645 error = EINTR;
3646 }
3647 fp_free(p, indx, fp);
3648 return error;
3649 }
3650 uu->uu_dupfd = 0;
3651 vp = ndp->ni_vp;
3652
3653 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3654 fp->f_fglob->fg_ops = &vnops;
3655 fp->f_fglob->fg_data = (caddr_t)vp;
3656
3657 if (flags & (O_EXLOCK | O_SHLOCK)) {
3658 lf.l_whence = SEEK_SET;
3659 lf.l_start = 0;
3660 lf.l_len = 0;
3661 if (flags & O_EXLOCK) {
3662 lf.l_type = F_WRLCK;
3663 } else {
3664 lf.l_type = F_RDLCK;
3665 }
3666 type = F_FLOCK;
3667 if ((flags & FNONBLOCK) == 0) {
3668 type |= F_WAIT;
3669 }
3670 #if CONFIG_MACF
3671 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3672 F_SETLK, &lf);
3673 if (error) {
3674 goto bad;
3675 }
3676 #endif
3677 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
3678 goto bad;
3679 }
3680 fp->f_fglob->fg_flag |= FHASLOCK;
3681 }
3682
3683 #if DEVELOPMENT || DEBUG
3684 /*
3685 * XXX VSWAP: Check for entitlements or special flag here
3686 * so we can restrict access appropriately.
3687 */
3688 #else /* DEVELOPMENT || DEBUG */
3689
3690 if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
3691 /* block attempt to write/truncate swapfile */
3692 error = EPERM;
3693 goto bad;
3694 }
3695 #endif /* DEVELOPMENT || DEBUG */
3696
3697 /* try to truncate by setting the size attribute */
3698 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
3699 goto bad;
3700 }
3701
3702 /*
3703 * For directories we hold some additional information in the fd.
3704 */
3705 if (vnode_vtype(vp) == VDIR) {
3706 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3707 } else {
3708 fp->f_fglob->fg_vn_data = NULL;
3709 }
3710
3711 vnode_put(vp);
3712
3713 /*
3714 * The first terminal open (without O_NOCTTY) by a session leader
3715 * results in it being set as the controlling terminal.
3716 */
3717 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3718 !(flags & O_NOCTTY)) {
3719 int tmp = 0;
3720
3721 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3722 (caddr_t)&tmp, ctx);
3723 }
3724
3725 proc_fdlock(p);
3726 if (flags & O_CLOEXEC) {
3727 *fdflags(p, indx) |= UF_EXCLOSE;
3728 }
3729 if (flags & O_CLOFORK) {
3730 *fdflags(p, indx) |= UF_FORKCLOSE;
3731 }
3732 procfdtbl_releasefd(p, indx, NULL);
3733
3734 #if CONFIG_SECLUDED_MEMORY
3735 if (secluded_for_filecache &&
3736 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3737 vnode_vtype(vp) == VREG) {
3738 memory_object_control_t moc;
3739
3740 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3741
3742 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3743 /* nothing to do... */
3744 } else if (fp->f_fglob->fg_flag & FWRITE) {
3745 /* writable -> no longer eligible for secluded pages */
3746 memory_object_mark_eligible_for_secluded(moc,
3747 FALSE);
3748 } else if (secluded_for_filecache == 1) {
3749 char pathname[32] = { 0, };
3750 size_t copied;
3751 /* XXX FBDP: better way to detect /Applications/ ? */
3752 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3753 copyinstr(ndp->ni_dirp,
3754 pathname,
3755 sizeof(pathname),
3756 &copied);
3757 } else {
3758 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3759 pathname,
3760 sizeof(pathname),
3761 &copied);
3762 }
3763 pathname[sizeof(pathname) - 1] = '\0';
3764 if (strncmp(pathname,
3765 "/Applications/",
3766 strlen("/Applications/")) == 0 &&
3767 strncmp(pathname,
3768 "/Applications/Camera.app/",
3769 strlen("/Applications/Camera.app/")) != 0) {
3770 /*
3771 * not writable
3772 * AND from "/Applications/"
3773 * AND not from "/Applications/Camera.app/"
3774 * ==> eligible for secluded
3775 */
3776 memory_object_mark_eligible_for_secluded(moc,
3777 TRUE);
3778 }
3779 } else if (secluded_for_filecache == 2) {
3780 #if __arm64__
3781 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
3782 #elif __arm__
3783 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
3784 #else
3785 /* not implemented... */
3786 #endif
3787 if (!strncmp(vp->v_name,
3788 DYLD_SHARED_CACHE_NAME,
3789 strlen(DYLD_SHARED_CACHE_NAME)) ||
3790 !strncmp(vp->v_name,
3791 "dyld",
3792 strlen(vp->v_name)) ||
3793 !strncmp(vp->v_name,
3794 "launchd",
3795 strlen(vp->v_name)) ||
3796 !strncmp(vp->v_name,
3797 "Camera",
3798 strlen(vp->v_name)) ||
3799 !strncmp(vp->v_name,
3800 "mediaserverd",
3801 strlen(vp->v_name)) ||
3802 !strncmp(vp->v_name,
3803 "SpringBoard",
3804 strlen(vp->v_name)) ||
3805 !strncmp(vp->v_name,
3806 "backboardd",
3807 strlen(vp->v_name))) {
3808 /*
3809 * This file matters when launching Camera:
3810 * do not store its contents in the secluded
3811 * pool that will be drained on Camera launch.
3812 */
3813 memory_object_mark_eligible_for_secluded(moc,
3814 FALSE);
3815 }
3816 }
3817 }
3818 #endif /* CONFIG_SECLUDED_MEMORY */
3819
3820 fp_drop(p, indx, fp, 1);
3821 proc_fdunlock(p);
3822
3823 *retval = indx;
3824
3825 return 0;
3826 bad:
3827 context = *vfs_context_current();
3828 context.vc_ucred = fp->f_fglob->fg_cred;
3829
3830 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3831 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3832 lf.l_whence = SEEK_SET;
3833 lf.l_start = 0;
3834 lf.l_len = 0;
3835 lf.l_type = F_UNLCK;
3836
3837 (void)VNOP_ADVLOCK(
3838 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3839 }
3840
3841 vn_close(vp, fp->f_fglob->fg_flag, &context);
3842 vnode_put(vp);
3843 fp_free(p, indx, fp);
3844
3845 return error;
3846 }
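
/*
 * Illustrative userspace sketch (not compiled into the kernel) of the
 * open-time advisory locking handled by the VNOP_ADVLOCK block in open1()
 * above: O_EXLOCK requests an exclusive flock()-style lock and O_NONBLOCK
 * keeps the lock attempt from sleeping.  The path and helper name are
 * placeholders.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	open_locked(const char *path)
 *	{
 *		// exclusive open-time lock; fails rather than blocks if held elsewhere
 *		int fd = open(path, O_RDWR | O_CREAT | O_EXLOCK | O_NONBLOCK, 0644);
 *
 *		if (fd < 0)
 *			perror("open");
 *		return fd;	// close(fd) releases the lock along with the descriptor
 *	}
 */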
3847
3848 /*
3849 * While most of the *at syscall handlers can call nameiat(), which
3850 * is a wrapper around namei, the use of namei and the initialization
3851 * of nameidata are far removed and in different functions - namei
3852 * gets called in vn_open_auth for open1. So we'll just do here what
3853 * nameiat() does.
3854 */
3855 static int
3856 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3857 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3858 int dirfd)
3859 {
3860 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3861 int error;
3862 char c;
3863
3864 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3865 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3866 if (error) {
3867 return error;
3868 }
3869 } else {
3870 c = *((char *)(ndp->ni_dirp));
3871 }
3872
3873 if (c != '/') {
3874 vnode_t dvp_at;
3875
3876 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3877 &dvp_at);
3878 if (error) {
3879 return error;
3880 }
3881
3882 if (vnode_vtype(dvp_at) != VDIR) {
3883 vnode_put(dvp_at);
3884 return ENOTDIR;
3885 }
3886
3887 ndp->ni_dvp = dvp_at;
3888 ndp->ni_cnd.cn_flags |= USEDVP;
3889 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3890 retval);
3891 vnode_put(dvp_at);
3892 return error;
3893 }
3894 }
3895
3896 return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
3897 }
3898
3899 /*
3900 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3901 *
3902 * Parameters: p Process requesting the open
3903 * uap User argument descriptor (see below)
3904 * retval Pointer to an area to receive the
3905 * return value from the system call
3906 *
3907 * Indirect: uap->path Path to open (same as 'open')
3908 * uap->flags Flags to open (same as 'open')
3909 * uap->uid UID to set, if creating
3910 * uap->gid GID to set, if creating
3911 * uap->mode File mode, if creating (same as 'open')
3912 * uap->xsecurity ACL to set, if creating
3913 *
3914 * Returns: 0 Success
3915 * !0 errno value
3916 *
3917 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3918 *
3919 * XXX: We should enumerate the possible errno values here, and where
3920 * in the code they originated.
3921 */
3922 int
3923 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3924 {
3925 struct filedesc *fdp = p->p_fd;
3926 int ciferror;
3927 kauth_filesec_t xsecdst;
3928 struct vnode_attr va;
3929 struct nameidata nd;
3930 int cmode;
3931
3932 AUDIT_ARG(owner, uap->uid, uap->gid);
3933
3934 xsecdst = NULL;
3935 if ((uap->xsecurity != USER_ADDR_NULL) &&
3936 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
3937 return ciferror;
3938 }
3939
3940 VATTR_INIT(&va);
3941 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3942 VATTR_SET(&va, va_mode, cmode);
3943 if (uap->uid != KAUTH_UID_NONE) {
3944 VATTR_SET(&va, va_uid, uap->uid);
3945 }
3946 if (uap->gid != KAUTH_GID_NONE) {
3947 VATTR_SET(&va, va_gid, uap->gid);
3948 }
3949 if (xsecdst != NULL) {
3950 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3951 }
3952
3953 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3954 uap->path, vfs_context_current());
3955
3956 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3957 fileproc_alloc_init, NULL, retval);
3958 if (xsecdst != NULL) {
3959 kauth_filesec_free(xsecdst);
3960 }
3961
3962 return ciferror;
3963 }
3964
3965 /*
3966 * Go through the data-protected atomically controlled open (2)
3967 *
3968 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3969 */
3970 int
3971 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
3972 {
3973 int flags = uap->flags;
3974 int class = uap->class;
3975 int dpflags = uap->dpflags;
3976
3977 /*
3978 * Follow the same path as normal open(2)
3979 * Look up the item if it exists, and acquire the vnode.
3980 */
3981 struct filedesc *fdp = p->p_fd;
3982 struct vnode_attr va;
3983 struct nameidata nd;
3984 int cmode;
3985 int error;
3986
3987 VATTR_INIT(&va);
3988 /* Mask off all but regular access permissions */
3989 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3990 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3991
3992 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3993 uap->path, vfs_context_current());
3994
3995 /*
3996 * Initialize the extra fields in vnode_attr to pass our
3997 * data-protection request down to the filesystem:
3998 * 1. target cprotect class.
3999 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4000 */
4001 if (flags & O_CREAT) {
4002 /* lower level kernel code validates that the class is valid before applying it. */
4003 if (class != PROTECTION_CLASS_DEFAULT) {
4004 /*
4005 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4006 * file behave the same as open (2)
4007 */
4008 VATTR_SET(&va, va_dataprotect_class, class);
4009 }
4010 }
4011
4012 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4013 if (flags & (O_RDWR | O_WRONLY)) {
4014 /* Not allowed to write raw encrypted bytes */
4015 return EINVAL;
4016 }
4017 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4018 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4019 }
4020 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4021 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4022 }
4023 }
4024
4025 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4026 fileproc_alloc_init, NULL, retval);
4027
4028 return error;
4029 }
4030
4031 static int
4032 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4033 int fd, enum uio_seg segflg, int *retval)
4034 {
4035 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4036 struct vnode_attr va;
4037 struct nameidata nd;
4038 int cmode;
4039
4040 VATTR_INIT(&va);
4041 /* Mask off all but regular access permissions */
4042 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4043 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4044
4045 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4046 segflg, path, ctx);
4047
4048 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4049 retval, fd);
4050 }
4051
4052 int
4053 open(proc_t p, struct open_args *uap, int32_t *retval)
4054 {
4055 __pthread_testcancel(1);
4056 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4057 }
4058
4059 int
4060 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4061 int32_t *retval)
4062 {
4063 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4064 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4065 }
4066
4067 int
4068 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4069 int32_t *retval)
4070 {
4071 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4072 uap->mode, uap->fd, UIO_USERSPACE, retval);
4073 }
4074
4075 int
4076 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4077 {
4078 __pthread_testcancel(1);
4079 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4080 }
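
/*
 * Illustrative userspace sketch (not compiled into the kernel) of openat(2),
 * which flows through openat_nocancel()/openat_internal() above: the path is
 * resolved relative to a directory descriptor, and AT_FDCWD gives plain
 * open(2) behavior.  The paths and helper name are placeholders.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	open_relative(void)
 *	{
 *		int dirfd, fd;
 *
 *		dirfd = open("/etc", O_RDONLY | O_DIRECTORY);
 *		if (dirfd < 0)
 *			return -1;
 *		fd = openat(dirfd, "hosts", O_RDONLY);	// resolved relative to dirfd
 *		close(dirfd);
 *		return fd;	// AT_FDCWD in place of dirfd would resolve against the CWD
 *	}
 */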
4081
4082 /*
4083 * openbyid_np: open a file given a file system id and a file system object id
4084 * the HFS file system object id is an fsobj_id_t {uint32, uint32};
4085 * for file systems that don't support object ids, it is a node id (uint64_t).
4086 *
4087 * Parameters: p Process requesting the open
4088 * uap User argument descriptor (see below)
4089 * retval Pointer to an area to receive the
4090 * return value from the system call
4091 *
4092 * Indirect: uap->path Path to open (same as 'open')
4093 *
4094 * uap->fsid id of target file system
4095 * uap->objid id of target file system object
4096 * uap->flags Flags to open (same as 'open')
4097 *
4098 * Returns: 0 Success
4099 * !0 errno value
4100 *
4101 *
4102 * XXX: We should enumerate the possible errno values here, and where
4103 * in the code they originated.
4104 */
4105 int
4106 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4107 {
4108 fsid_t fsid;
4109 uint64_t objid;
4110 int error;
4111 char *buf = NULL;
4112 int buflen = MAXPATHLEN;
4113 int pathlen = 0;
4114 vfs_context_t ctx = vfs_context_current();
4115
4116 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4117 return error;
4118 }
4119
4120 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4121 return error;
4122 }
4123
4124 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4125 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4126 return error;
4127 }
4128
4129 AUDIT_ARG(value32, fsid.val[0]);
4130 AUDIT_ARG(value64, objid);
4131
4132 /* resolve the path from fsid and objid */
4133 do {
4134 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
4135 if (buf == NULL) {
4136 return ENOMEM;
4137 }
4138
4139 error = fsgetpath_internal(
4140 ctx, fsid.val[0], objid,
4141 buflen, buf, &pathlen);
4142
4143 if (error) {
4144 FREE(buf, M_TEMP);
4145 buf = NULL;
4146 }
4147 } while (error == ENOSPC && (buflen += MAXPATHLEN));
4148
4149 if (error) {
4150 return error;
4151 }
4152
4153 buf[pathlen] = 0;
4154
4155 error = openat_internal(
4156 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4157
4158 FREE(buf, M_TEMP);
4159
4160 return error;
4161 }
4162
4163
4164 /*
4165 * Create a special file.
4166 */
4167 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4168
4169 int
4170 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4171 {
4172 struct vnode_attr va;
4173 vfs_context_t ctx = vfs_context_current();
4174 int error;
4175 struct nameidata nd;
4176 vnode_t vp, dvp;
4177
4178 VATTR_INIT(&va);
4179 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4180 VATTR_SET(&va, va_rdev, uap->dev);
4181
4182 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4183 if ((uap->mode & S_IFMT) == S_IFIFO) {
4184 return mkfifo1(ctx, uap->path, &va);
4185 }
4186
4187 AUDIT_ARG(mode, uap->mode);
4188 AUDIT_ARG(value32, uap->dev);
4189
4190 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
4191 return error;
4192 }
4193 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4194 UIO_USERSPACE, uap->path, ctx);
4195 error = namei(&nd);
4196 if (error) {
4197 return error;
4198 }
4199 dvp = nd.ni_dvp;
4200 vp = nd.ni_vp;
4201
4202 if (vp != NULL) {
4203 error = EEXIST;
4204 goto out;
4205 }
4206
4207 switch (uap->mode & S_IFMT) {
4208 case S_IFCHR:
4209 VATTR_SET(&va, va_type, VCHR);
4210 break;
4211 case S_IFBLK:
4212 VATTR_SET(&va, va_type, VBLK);
4213 break;
4214 default:
4215 error = EINVAL;
4216 goto out;
4217 }
4218
4219 #if CONFIG_MACF
4220 error = mac_vnode_check_create(ctx,
4221 nd.ni_dvp, &nd.ni_cnd, &va);
4222 if (error) {
4223 goto out;
4224 }
4225 #endif
4226
4227 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4228 goto out;
4229 }
4230
4231 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
4232 goto out;
4233 }
4234
4235 if (vp) {
4236 int update_flags = 0;
4237
4238 // Make sure the name & parent pointers are hooked up
4239 if (vp->v_name == NULL) {
4240 update_flags |= VNODE_UPDATE_NAME;
4241 }
4242 if (vp->v_parent == NULLVP) {
4243 update_flags |= VNODE_UPDATE_PARENT;
4244 }
4245
4246 if (update_flags) {
4247 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4248 }
4249
4250 #if CONFIG_FSE
4251 add_fsevent(FSE_CREATE_FILE, ctx,
4252 FSE_ARG_VNODE, vp,
4253 FSE_ARG_DONE);
4254 #endif
4255 }
4256
4257 out:
4258 /*
4259 * nameidone has to happen before we vnode_put(dvp)
4260 * since it may need to release the fs_nodelock on the dvp
4261 */
4262 nameidone(&nd);
4263
4264 if (vp) {
4265 vnode_put(vp);
4266 }
4267 vnode_put(dvp);
4268
4269 return error;
4270 }
4271
4272 /*
4273 * Create a named pipe.
4274 *
4275 * Returns: 0 Success
4276 * EEXIST
4277 * namei:???
4278 * vnode_authorize:???
4279 * vn_create:???
4280 */
4281 static int
4282 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4283 {
4284 vnode_t vp, dvp;
4285 int error;
4286 struct nameidata nd;
4287
4288 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4289 UIO_USERSPACE, upath, ctx);
4290 error = namei(&nd);
4291 if (error) {
4292 return error;
4293 }
4294 dvp = nd.ni_dvp;
4295 vp = nd.ni_vp;
4296
4297 /* check that this is a new file and authorize addition */
4298 if (vp != NULL) {
4299 error = EEXIST;
4300 goto out;
4301 }
4302 VATTR_SET(vap, va_type, VFIFO);
4303
4304 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
4305 goto out;
4306 }
4307
4308 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4309 out:
4310 /*
4311 * nameidone has to happen before we vnode_put(dvp)
4312 * since it may need to release the fs_nodelock on the dvp
4313 */
4314 nameidone(&nd);
4315
4316 if (vp) {
4317 vnode_put(vp);
4318 }
4319 vnode_put(dvp);
4320
4321 return error;
4322 }
4323
4324
4325 /*
4326 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4327 *
4328 * Parameters: p Process requesting the open
4329 * uap User argument descriptor (see below)
4330 * retval (Ignored)
4331 *
4332 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4333 * uap->uid UID to set
4334 * uap->gid GID to set
4335 * uap->mode File mode to set (same as 'mkfifo')
4336 * uap->xsecurity ACL to set, if creating
4337 *
4338 * Returns: 0 Success
4339 * !0 errno value
4340 *
4341 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4342 *
4343 * XXX: We should enumerate the possible errno values here, and where
4344 * in the code they originated.
4345 */
4346 int
4347 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4348 {
4349 int ciferror;
4350 kauth_filesec_t xsecdst;
4351 struct vnode_attr va;
4352
4353 AUDIT_ARG(owner, uap->uid, uap->gid);
4354
4355 xsecdst = KAUTH_FILESEC_NONE;
4356 if (uap->xsecurity != USER_ADDR_NULL) {
4357 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4358 return ciferror;
4359 }
4360 }
4361
4362 VATTR_INIT(&va);
4363 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4364 if (uap->uid != KAUTH_UID_NONE) {
4365 VATTR_SET(&va, va_uid, uap->uid);
4366 }
4367 if (uap->gid != KAUTH_GID_NONE) {
4368 VATTR_SET(&va, va_gid, uap->gid);
4369 }
4370 if (xsecdst != KAUTH_FILESEC_NONE) {
4371 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4372 }
4373
4374 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4375
4376 if (xsecdst != KAUTH_FILESEC_NONE) {
4377 kauth_filesec_free(xsecdst);
4378 }
4379 return ciferror;
4380 }
4381
4382 /* ARGSUSED */
4383 int
4384 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4385 {
4386 struct vnode_attr va;
4387
4388 VATTR_INIT(&va);
4389 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4390
4391 return mkfifo1(vfs_context_current(), uap->path, &va);
4392 }
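
/*
 * Illustrative userspace sketch (not part of this file): creating and
 * opening a named pipe with the standard mkfifo(2) interface serviced by
 * mkfifo1() above.  The path is a placeholder.
 *
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *
 *	if (mkfifo("/tmp/myfifo", 0600) == 0) {
 *		// open non-blocking so the open does not wait for a writer
 *		int fd = open("/tmp/myfifo", O_RDONLY | O_NONBLOCK);
 *		// ... read from fd, then close(fd)
 *	}
 */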
4393
4394
4395 static char *
4396 my_strrchr(char *p, int ch)
4397 {
4398 char *save;
4399
4400 for (save = NULL;; ++p) {
4401 if (*p == ch) {
4402 save = p;
4403 }
4404 if (!*p) {
4405 return save;
4406 }
4407 }
4408 /* NOTREACHED */
4409 }
4410
4411 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4412
4413 int
4414 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4415 {
4416 int ret, len = _len;
4417
4418 *truncated_path = 0;
4419 ret = vn_getpath(dvp, path, &len);
4420 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4421 if (leafname) {
4422 path[len - 1] = '/';
4423 len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
4424 if (len > MAXPATHLEN) {
4425 char *ptr;
4426
4427 // the string got truncated!
4428 *truncated_path = 1;
4429 ptr = my_strrchr(path, '/');
4430 if (ptr) {
4431 *ptr = '\0'; // chop off the string at the last directory component
4432 }
4433 len = strlen(path) + 1;
4434 }
4435 }
4436 } else if (ret == 0) {
4437 *truncated_path = 1;
4438 } else if (ret != 0) {
4439 struct vnode *mydvp = dvp;
4440
4441 if (ret != ENOSPC) {
4442 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4443 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4444 }
4445 *truncated_path = 1;
4446
4447 do {
4448 if (mydvp->v_parent != NULL) {
4449 mydvp = mydvp->v_parent;
4450 } else if (mydvp->v_mount) {
4451 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4452 break;
4453 } else {
4454 // no parent and no mount point? only thing is to punt and say "/" changed
4455 strlcpy(path, "/", _len);
4456 len = 2;
4457 mydvp = NULL;
4458 }
4459
4460 if (mydvp == NULL) {
4461 break;
4462 }
4463
4464 len = _len;
4465 ret = vn_getpath(mydvp, path, &len);
4466 } while (ret == ENOSPC);
4467 }
4468
4469 return len;
4470 }
4471
4472
4473 /*
4474 * Make a hard file link.
4475 *
4476 * Returns: 0 Success
4477 * EPERM
4478 * EEXIST
4479 * EXDEV
4480 * namei:???
4481 * vnode_authorize:???
4482 * VNOP_LINK:???
4483 */
4484 /* ARGSUSED */
4485 static int
4486 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4487 user_addr_t link, int flag, enum uio_seg segflg)
4488 {
4489 vnode_t vp, dvp, lvp;
4490 struct nameidata nd;
4491 int follow;
4492 int error;
4493 #if CONFIG_FSE
4494 fse_info finfo;
4495 #endif
4496 int need_event, has_listeners, need_kpath2;
4497 char *target_path = NULL;
4498 int truncated = 0;
4499
4500 vp = dvp = lvp = NULLVP;
4501
4502 /* look up the object we are linking to */
4503 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4504 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4505 segflg, path, ctx);
4506
4507 error = nameiat(&nd, fd1);
4508 if (error) {
4509 return error;
4510 }
4511 vp = nd.ni_vp;
4512
4513 nameidone(&nd);
4514
4515 /*
4516 * Normally, linking to directories is not supported.
4517 * However, some file systems may have limited support.
4518 */
4519 if (vp->v_type == VDIR) {
4520 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4521 error = EPERM; /* POSIX */
4522 goto out;
4523 }
4524
4525 /* Linking to a directory requires ownership. */
4526 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4527 struct vnode_attr dva;
4528
4529 VATTR_INIT(&dva);
4530 VATTR_WANTED(&dva, va_uid);
4531 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4532 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4533 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4534 error = EACCES;
4535 goto out;
4536 }
4537 }
4538 }
4539
4540 /* lookup the target node */
4541 #if CONFIG_TRIGGERS
4542 nd.ni_op = OP_LINK;
4543 #endif
4544 nd.ni_cnd.cn_nameiop = CREATE;
4545 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4546 nd.ni_dirp = link;
4547 error = nameiat(&nd, fd2);
4548 if (error != 0) {
4549 goto out;
4550 }
4551 dvp = nd.ni_dvp;
4552 lvp = nd.ni_vp;
4553
4554 #if CONFIG_MACF
4555 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
4556 goto out2;
4557 }
4558 #endif
4559
4560 /* or to anything that kauth doesn't want us to (eg. immutable items) */
4561 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
4562 goto out2;
4563 }
4564
4565 /* target node must not exist */
4566 if (lvp != NULLVP) {
4567 error = EEXIST;
4568 goto out2;
4569 }
4570 /* cannot link across mountpoints */
4571 if (vnode_mount(vp) != vnode_mount(dvp)) {
4572 error = EXDEV;
4573 goto out2;
4574 }
4575
4576 /* authorize creation of the target node */
4577 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4578 goto out2;
4579 }
4580
4581 /* and finally make the link */
4582 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4583 if (error) {
4584 goto out2;
4585 }
4586
4587 #if CONFIG_MACF
4588 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4589 #endif
4590
4591 #if CONFIG_FSE
4592 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4593 #else
4594 need_event = 0;
4595 #endif
4596 has_listeners = kauth_authorize_fileop_has_listeners();
4597
4598 need_kpath2 = 0;
4599 #if CONFIG_AUDIT
4600 if (AUDIT_RECORD_EXISTS()) {
4601 need_kpath2 = 1;
4602 }
4603 #endif
4604
4605 if (need_event || has_listeners || need_kpath2) {
4606 char *link_to_path = NULL;
4607 int len, link_name_len;
4608
4609 /* build the path to the new link file */
4610 GET_PATH(target_path);
4611 if (target_path == NULL) {
4612 error = ENOMEM;
4613 goto out2;
4614 }
4615
4616 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4617
4618 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
4619
4620 if (has_listeners) {
4621 /* build the path to file we are linking to */
4622 GET_PATH(link_to_path);
4623 if (link_to_path == NULL) {
4624 error = ENOMEM;
4625 goto out2;
4626 }
4627
4628 link_name_len = MAXPATHLEN;
4629 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4630 /*
4631 * Call out to allow 3rd party notification of link creation.
4632 * Ignore result of kauth_authorize_fileop call.
4633 */
4634 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4635 (uintptr_t)link_to_path,
4636 (uintptr_t)target_path);
4637 }
4638 if (link_to_path != NULL) {
4639 RELEASE_PATH(link_to_path);
4640 }
4641 }
4642 #if CONFIG_FSE
4643 if (need_event) {
4644 /* construct fsevent */
4645 if (get_fse_info(vp, &finfo, ctx) == 0) {
4646 if (truncated) {
4647 finfo.mode |= FSE_TRUNCATED_PATH;
4648 }
4649
4650 // build the path to the destination of the link
4651 add_fsevent(FSE_CREATE_FILE, ctx,
4652 FSE_ARG_STRING, len, target_path,
4653 FSE_ARG_FINFO, &finfo,
4654 FSE_ARG_DONE);
4655 }
4656 if (vp->v_parent) {
4657 add_fsevent(FSE_STAT_CHANGED, ctx,
4658 FSE_ARG_VNODE, vp->v_parent,
4659 FSE_ARG_DONE);
4660 }
4661 }
4662 #endif
4663 }
4664 out2:
4665 /*
4666 * nameidone has to happen before we vnode_put(dvp)
4667 * since it may need to release the fs_nodelock on the dvp
4668 */
4669 nameidone(&nd);
4670 if (target_path != NULL) {
4671 RELEASE_PATH(target_path);
4672 }
4673 out:
4674 if (lvp) {
4675 vnode_put(lvp);
4676 }
4677 if (dvp) {
4678 vnode_put(dvp);
4679 }
4680 vnode_put(vp);
4681 return error;
4682 }
4683
4684 int
4685 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4686 {
4687 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4688 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
4689 }
4690
4691 int
4692 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4693 {
4694 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
4695 return EINVAL;
4696 }
4697
4698 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4699 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
4700 }
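
/*
 * Illustrative userspace sketch (not part of this file): the only flag
 * accepted by linkat(2) here is AT_SYMLINK_FOLLOW, which hard-links the
 * file a symlink points at rather than the symlink itself.  Paths are
 * placeholders.
 *
 *	#include <fcntl.h>	// AT_FDCWD, AT_SYMLINK_FOLLOW
 *	#include <unistd.h>
 *
 *	// create "newlink" as a hard link to the target of "symlink-to-file"
 *	int r = linkat(AT_FDCWD, "symlink-to-file",
 *	    AT_FDCWD, "newlink", AT_SYMLINK_FOLLOW);
 */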
4701
4702 /*
4703 * Make a symbolic link.
4704 *
4705 * We could add support for ACLs here too...
4706 */
4707 /* ARGSUSED */
4708 static int
4709 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4710 user_addr_t link, enum uio_seg segflg)
4711 {
4712 struct vnode_attr va;
4713 char *path;
4714 int error;
4715 struct nameidata nd;
4716 vnode_t vp, dvp;
4717 size_t dummy = 0;
4718 proc_t p;
4719
4720 error = 0;
4721 if (UIO_SEG_IS_USER_SPACE(segflg)) {
4722 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4723 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4724 } else {
4725 path = (char *)path_data;
4726 }
4727 if (error) {
4728 goto out;
4729 }
4730 AUDIT_ARG(text, path); /* This is the link string */
4731
4732 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4733 segflg, link, ctx);
4734
4735 error = nameiat(&nd, fd);
4736 if (error) {
4737 goto out;
4738 }
4739 dvp = nd.ni_dvp;
4740 vp = nd.ni_vp;
4741
4742 p = vfs_context_proc(ctx);
4743 VATTR_INIT(&va);
4744 VATTR_SET(&va, va_type, VLNK);
4745 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4746
4747 #if CONFIG_MACF
4748 error = mac_vnode_check_create(ctx,
4749 dvp, &nd.ni_cnd, &va);
4750 #endif
4751 if (error != 0) {
4752 goto skipit;
4753 }
4754
4755 if (vp != NULL) {
4756 error = EEXIST;
4757 goto skipit;
4758 }
4759
4760 /* authorize */
4761 if (error == 0) {
4762 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4763 }
4764 /* get default ownership, etc. */
4765 if (error == 0) {
4766 error = vnode_authattr_new(dvp, &va, 0, ctx);
4767 }
4768 if (error == 0) {
4769 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4770 }
4771
4772 #if CONFIG_MACF
4773 if (error == 0 && vp) {
4774 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4775 }
4776 #endif
4777
4778 /* do fallback attribute handling */
4779 if (error == 0 && vp) {
4780 error = vnode_setattr_fallback(vp, &va, ctx);
4781 }
4782
4783 if (error == 0) {
4784 int update_flags = 0;
4785
4786 /* check if a new vnode was created, else try to get one */
4787 if (vp == NULL) {
4788 nd.ni_cnd.cn_nameiop = LOOKUP;
4789 #if CONFIG_TRIGGERS
4790 nd.ni_op = OP_LOOKUP;
4791 #endif
4792 nd.ni_cnd.cn_flags = 0;
4793 error = nameiat(&nd, fd);
4794 vp = nd.ni_vp;
4795
4796 if (vp == NULL) {
4797 goto skipit;
4798 }
4799 }
4800
4801 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4802 /* call out to allow 3rd party notification of symlink creation.
4803 * Ignore result of kauth_authorize_fileop call.
4804 */
4805 if (kauth_authorize_fileop_has_listeners() &&
4806 namei(&nd) == 0) {
4807 char *new_link_path = NULL;
4808 int len;
4809
4810 /* build the path to the new link file */
4811 new_link_path = get_pathbuff();
4812 len = MAXPATHLEN;
4813 vn_getpath(dvp, new_link_path, &len);
4814 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4815 new_link_path[len - 1] = '/';
4816 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
4817 }
4818
4819 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4820 (uintptr_t)path, (uintptr_t)new_link_path);
4821 if (new_link_path != NULL) {
4822 release_pathbuff(new_link_path);
4823 }
4824 }
4825 #endif
4826 // Make sure the name & parent pointers are hooked up
4827 if (vp->v_name == NULL) {
4828 update_flags |= VNODE_UPDATE_NAME;
4829 }
4830 if (vp->v_parent == NULLVP) {
4831 update_flags |= VNODE_UPDATE_PARENT;
4832 }
4833
4834 if (update_flags) {
4835 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4836 }
4837
4838 #if CONFIG_FSE
4839 add_fsevent(FSE_CREATE_FILE, ctx,
4840 FSE_ARG_VNODE, vp,
4841 FSE_ARG_DONE);
4842 #endif
4843 }
4844
4845 skipit:
4846 /*
4847 * nameidone has to happen before we vnode_put(dvp)
4848 * since it may need to release the fs_nodelock on the dvp
4849 */
4850 nameidone(&nd);
4851
4852 if (vp) {
4853 vnode_put(vp);
4854 }
4855 vnode_put(dvp);
4856 out:
4857 if (path && (path != (char *)path_data)) {
4858 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4859 }
4860
4861 return error;
4862 }
4863
4864 int
4865 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4866 {
4867 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4868 uap->link, UIO_USERSPACE);
4869 }
4870
4871 int
4872 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4873 __unused int32_t *retval)
4874 {
4875 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4876 uap->path2, UIO_USERSPACE);
4877 }
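
/*
 * Illustrative userspace sketch (not part of this file): path2 is
 * resolved relative to fd when it is not absolute, while path1 (the link
 * contents) is stored verbatim.  Paths are placeholders.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);
 *	// creates /tmp/hosts-link -> /etc/hosts
 *	int r = symlinkat("/etc/hosts", dirfd, "hosts-link");
 */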
4878
4879 /*
4880 * Delete a whiteout from the filesystem.
4881 * No longer supported.
4882 */
4883 int
4884 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4885 {
4886 return ENOTSUP;
4887 }
4888
4889 /*
4890 * Delete a name from the filesystem.
4891 */
4892 /* ARGSUSED */
4893 static int
4894 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4895 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4896 {
4897 struct nameidata nd;
4898 vnode_t vp, dvp;
4899 int error;
4900 struct componentname *cnp;
4901 char *path = NULL;
4902 int len = 0;
4903 #if CONFIG_FSE
4904 fse_info finfo;
4905 struct vnode_attr va;
4906 #endif
4907 int flags;
4908 int need_event;
4909 int has_listeners;
4910 int truncated_path;
4911 int batched;
4912 struct vnode_attr *vap;
4913 int do_retry;
4914 int retry_count = 0;
4915 int cn_flags;
4916
4917 cn_flags = LOCKPARENT;
4918 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
4919 cn_flags |= AUDITVNPATH1;
4920 }
4921 /* If a starting dvp is passed, it trumps any fd passed. */
4922 if (start_dvp) {
4923 cn_flags |= USEDVP;
4924 }
4925
4926 #if NAMEDRSRCFORK
4927 /* unlink or delete is allowed on rsrc forks and named streams */
4928 cn_flags |= CN_ALLOWRSRCFORK;
4929 #endif
4930
4931 retry:
4932 do_retry = 0;
4933 flags = 0;
4934 need_event = 0;
4935 has_listeners = 0;
4936 truncated_path = 0;
4937 vap = NULL;
4938
4939 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4940
4941 nd.ni_dvp = start_dvp;
4942 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4943 cnp = &nd.ni_cnd;
4944
4945 continue_lookup:
4946 error = nameiat(&nd, fd);
4947 if (error) {
4948 return error;
4949 }
4950
4951 dvp = nd.ni_dvp;
4952 vp = nd.ni_vp;
4953
4954
4955 /* With Carbon delete semantics, busy files cannot be deleted */
4956 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4957 flags |= VNODE_REMOVE_NODELETEBUSY;
4958 }
4959
4960 /* Skip any potential upcalls if told to. */
4961 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4962 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4963 }
4964
4965 if (vp) {
4966 batched = vnode_compound_remove_available(vp);
4967 /*
4968 * The root of a mounted filesystem cannot be deleted.
4969 */
4970 if (vp->v_flag & VROOT) {
4971 error = EBUSY;
4972 }
4973
4974 #if DEVELOPMENT || DEBUG
4975 /*
4976 * XXX VSWAP: Check for entitlements or special flag here
4977 * so we can restrict access appropriately.
4978 */
4979 #else /* DEVELOPMENT || DEBUG */
4980
4981 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
4982 error = EPERM;
4983 goto out;
4984 }
4985 #endif /* DEVELOPMENT || DEBUG */
4986
4987 if (!batched) {
4988 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4989 if (error) {
4990 if (error == ENOENT) {
4991 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4992 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4993 do_retry = 1;
4994 retry_count++;
4995 }
4996 }
4997 goto out;
4998 }
4999 }
5000 } else {
5001 batched = 1;
5002
5003 if (!vnode_compound_remove_available(dvp)) {
5004 panic("No vp, but no compound remove?");
5005 }
5006 }
5007
5008 #if CONFIG_FSE
5009 need_event = need_fsevent(FSE_DELETE, dvp);
5010 if (need_event) {
5011 if (!batched) {
5012 if ((vp->v_flag & VISHARDLINK) == 0) {
5013 /* XXX need to get these data in batched VNOP */
5014 get_fse_info(vp, &finfo, ctx);
5015 }
5016 } else {
5017 error = vfs_get_notify_attributes(&va);
5018 if (error) {
5019 goto out;
5020 }
5021
5022 vap = &va;
5023 }
5024 }
5025 #endif
5026 has_listeners = kauth_authorize_fileop_has_listeners();
5027 if (need_event || has_listeners) {
5028 if (path == NULL) {
5029 GET_PATH(path);
5030 if (path == NULL) {
5031 error = ENOMEM;
5032 goto out;
5033 }
5034 }
5035 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5036 }
5037
5038 #if NAMEDRSRCFORK
5039 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5040 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5041 } else
5042 #endif
5043 {
5044 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
5045 vp = nd.ni_vp;
5046 if (error == EKEEPLOOKING) {
5047 if (!batched) {
5048 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5049 }
5050
5051 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
5052 panic("EKEEPLOOKING, but continue flag not set?");
5053 }
5054
5055 if (vnode_isdir(vp)) {
5056 error = EISDIR;
5057 goto out;
5058 }
5059 goto continue_lookup;
5060 } else if (error == ENOENT && batched) {
5061 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
5062 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5063 /*
5064 * For compound VNOPs, the authorization callback may
5065 * return ENOENT in case of racing hardlink lookups
5066 * hitting the name cache, redrive the lookup.
5067 */
5068 do_retry = 1;
5069 retry_count += 1;
5070 goto out;
5071 }
5072 }
5073 }
5074
5075 /*
5076 * Call out to allow 3rd party notification of delete.
5077 * Ignore result of kauth_authorize_fileop call.
5078 */
5079 if (!error) {
5080 if (has_listeners) {
5081 kauth_authorize_fileop(vfs_context_ucred(ctx),
5082 KAUTH_FILEOP_DELETE,
5083 (uintptr_t)vp,
5084 (uintptr_t)path);
5085 }
5086
5087 if (vp->v_flag & VISHARDLINK) {
5088 //
5089 // if a hardlink gets deleted we want to blow away the
5090 // v_parent link because the path that got us to this
5091 // instance of the link is no longer valid. this will
5092 // force the next call to get the path to ask the file
5093 // system instead of just following the v_parent link.
5094 //
5095 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5096 }
5097
5098 #if CONFIG_FSE
5099 if (need_event) {
5100 if (vp->v_flag & VISHARDLINK) {
5101 get_fse_info(vp, &finfo, ctx);
5102 } else if (vap) {
5103 vnode_get_fse_info_from_vap(vp, &finfo, vap);
5104 }
5105 if (truncated_path) {
5106 finfo.mode |= FSE_TRUNCATED_PATH;
5107 }
5108 add_fsevent(FSE_DELETE, ctx,
5109 FSE_ARG_STRING, len, path,
5110 FSE_ARG_FINFO, &finfo,
5111 FSE_ARG_DONE);
5112 }
5113 #endif
5114 }
5115
5116 out:
5117 if (path != NULL) {
5118 RELEASE_PATH(path);
5119 }
5120
5121 #if NAMEDRSRCFORK
5122 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5123 * will cause its shadow file to go away if necessary.
5124 */
5125 if (vp && (vnode_isnamedstream(vp)) &&
5126 (vp->v_parent != NULLVP) &&
5127 vnode_isshadow(vp)) {
5128 vnode_recycle(vp);
5129 }
5130 #endif
5131 /*
5132 * nameidone has to happen before we vnode_put(dvp)
5133 * since it may need to release the fs_nodelock on the dvp
5134 */
5135 nameidone(&nd);
5136 vnode_put(dvp);
5137 if (vp) {
5138 vnode_put(vp);
5139 }
5140
5141 if (do_retry) {
5142 goto retry;
5143 }
5144
5145 return error;
5146 }
5147
5148 int
5149 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5150 enum uio_seg segflg, int unlink_flags)
5151 {
5152 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5153 unlink_flags);
5154 }
5155
5156 /*
5157 * Delete a name from the filesystem using Carbon semantics.
5158 */
5159 int
5160 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5161 {
5162 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5163 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5164 }
5165
5166 /*
5167 * Delete a name from the filesystem using POSIX semantics.
5168 */
5169 int
5170 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5171 {
5172 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5173 uap->path, UIO_USERSPACE, 0);
5174 }
5175
5176 int
5177 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5178 {
5179 if (uap->flag & ~AT_REMOVEDIR) {
5180 return EINVAL;
5181 }
5182
5183 if (uap->flag & AT_REMOVEDIR) {
5184 return rmdirat_internal(vfs_context_current(), uap->fd,
5185 uap->path, UIO_USERSPACE);
5186 } else {
5187 return unlinkat_internal(vfs_context_current(), uap->fd,
5188 NULLVP, uap->path, UIO_USERSPACE, 0);
5189 }
5190 }
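
/*
 * Illustrative userspace sketch (not part of this file): unlinkat(2)
 * with AT_REMOVEDIR behaves like rmdir(2), otherwise like unlink(2),
 * matching the dispatch above.  Paths are placeholders.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);
 *	(void)unlinkat(dirfd, "scratch-file", 0);		// remove a file
 *	(void)unlinkat(dirfd, "scratch-dir", AT_REMOVEDIR);	// remove a directory
 */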
5191
5192 /*
5193 * Reposition read/write file offset.
5194 */
5195 int
5196 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5197 {
5198 struct fileproc *fp;
5199 vnode_t vp;
5200 struct vfs_context *ctx;
5201 off_t offset = uap->offset, file_size;
5202 int error;
5203
5204 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
5205 if (error == ENOTSUP) {
5206 return ESPIPE;
5207 }
5208 return error;
5209 }
5210 if (vnode_isfifo(vp)) {
5211 file_drop(uap->fd);
5212 return ESPIPE;
5213 }
5214
5215
5216 ctx = vfs_context_current();
5217 #if CONFIG_MACF
5218 if (uap->whence == L_INCR && uap->offset == 0) {
5219 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5220 fp->f_fglob);
5221 } else {
5222 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5223 fp->f_fglob);
5224 }
5225 if (error) {
5226 file_drop(uap->fd);
5227 return error;
5228 }
5229 #endif
5230 if ((error = vnode_getwithref(vp))) {
5231 file_drop(uap->fd);
5232 return error;
5233 }
5234
5235 switch (uap->whence) {
5236 case L_INCR:
5237 offset += fp->f_fglob->fg_offset;
5238 break;
5239 case L_XTND:
5240 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
5241 break;
5242 }
5243 offset += file_size;
5244 break;
5245 case L_SET:
5246 break;
5247 case SEEK_HOLE:
5248 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5249 break;
5250 case SEEK_DATA:
5251 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5252 break;
5253 default:
5254 error = EINVAL;
5255 }
5256 if (error == 0) {
5257 if (uap->offset > 0 && offset < 0) {
5258 /* Incremented/relative move past max size */
5259 error = EOVERFLOW;
5260 } else {
5261 /*
5262 * Allow negative offsets on character devices, per
5263 * POSIX 1003.1-2001. Most likely for writing disk
5264 * labels.
5265 */
5266 if (offset < 0 && vp->v_type != VCHR) {
5267 /* Decremented/relative move before start */
5268 error = EINVAL;
5269 } else {
5270 /* Success */
5271 fp->f_fglob->fg_offset = offset;
5272 *retval = fp->f_fglob->fg_offset;
5273 }
5274 }
5275 }
5276
5277 /*
5278 * An lseek can affect whether data is "available to read." Use
5279 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5280 */
5281 post_event_if_success(vp, error, NOTE_NONE);
5282 (void)vnode_put(vp);
5283 file_drop(uap->fd);
5284 return error;
5285 }
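
/*
 * Illustrative userspace sketch (not part of this file): besides the
 * classic SEEK_SET/SEEK_CUR/SEEK_END (L_SET/L_INCR/L_XTND above), the
 * whence values SEEK_DATA and SEEK_HOLE, serviced via
 * FSIOC_FIOSEEKDATA/FSIOC_FIOSEEKHOLE, let a caller walk the allocated
 * regions of a sparse file where the file system supports it.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	// given an open descriptor fd on a sparse file:
 *	off_t data = lseek(fd, 0, SEEK_DATA);     // first data byte at/after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE);  // first hole at/after 'data'
 */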
5286
5287
5288 /*
5289 * Check access permissions.
5290 *
5291 * Returns: 0 Success
5292 * vnode_authorize:???
5293 */
5294 static int
5295 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5296 {
5297 kauth_action_t action;
5298 int error;
5299
5300 /*
5301 * If just the regular access bits, convert them to something
5302 * that vnode_authorize will understand.
5303 */
5304 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5305 action = 0;
5306 if (uflags & R_OK) {
5307 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5308 }
5309 if (uflags & W_OK) {
5310 if (vnode_isdir(vp)) {
5311 action |= KAUTH_VNODE_ADD_FILE |
5312 KAUTH_VNODE_ADD_SUBDIRECTORY;
5313 /* might want delete rights here too */
5314 } else {
5315 action |= KAUTH_VNODE_WRITE_DATA;
5316 }
5317 }
5318 if (uflags & X_OK) {
5319 if (vnode_isdir(vp)) {
5320 action |= KAUTH_VNODE_SEARCH;
5321 } else {
5322 action |= KAUTH_VNODE_EXECUTE;
5323 }
5324 }
5325 } else {
5326 /* take advantage of definition of uflags */
5327 action = uflags >> 8;
5328 }
5329
5330 #if CONFIG_MACF
5331 error = mac_vnode_check_access(ctx, vp, uflags);
5332 if (error) {
5333 return error;
5334 }
5335 #endif /* MAC */
5336
5337 /* action == 0 means only check for existence */
5338 if (action != 0) {
5339 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5340 } else {
5341 error = 0;
5342 }
5343
5344 return error;
5345 }
5346
5347
5348
5349 /*
5350 * access_extended: Check access permissions in bulk.
5351 *
5352 * Description: uap->entries Pointer to an array of accessx
5353 * descriptor structs, plus one or
5354 * more NULL terminated strings (see
5355 * "Notes" section below).
5356 * uap->size Size of the area pointed to by
5357 * uap->entries.
5358 * uap->results Pointer to the results array.
5359 *
5360 * Returns: 0 Success
5361 * ENOMEM Insufficient memory
5362 * EINVAL Invalid arguments
5363 * namei:EFAULT Bad address
5364 * namei:ENAMETOOLONG Filename too long
5365 * namei:ENOENT No such file or directory
5366 * namei:ELOOP Too many levels of symbolic links
5367 * namei:EBADF Bad file descriptor
5368 * namei:ENOTDIR Not a directory
5369 * namei:???
5370 * access1:
5371 *
5372 * Implicit returns:
5373 * uap->results Array contents modified
5374 *
5375 * Notes: The uap->entries are structured as an arbitrary length array
5376 * of accessx descriptors, followed by one or more NULL terminated
5377 * strings
5378 *
5379 * struct accessx_descriptor[0]
5380 * ...
5381 * struct accessx_descriptor[n]
5382 * char name_data[0];
5383 *
5384 * We determine the entry count by walking the buffer containing
5385 * the uap->entries argument descriptor. For each descriptor we
5386 * see, the valid values for the offset ad_name_offset will be
5387 * in the byte range:
5388 *
5389 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5390 * to
5391 * [ uap->entries + uap->size - 2 ]
5392 *
5393 * since we must have at least one string, and the string must
5394 * be at least one character plus the NULL terminator in length.
5395 *
5396 * XXX: Need to support the check-as uid argument
5397 */
5398 int
5399 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5400 {
5401 struct accessx_descriptor *input = NULL;
5402 errno_t *result = NULL;
5403 errno_t error = 0;
5404 int wantdelete = 0;
5405 unsigned int desc_max, desc_actual, i, j;
5406 struct vfs_context context;
5407 struct nameidata nd;
5408 int niopts;
5409 vnode_t vp = NULL;
5410 vnode_t dvp = NULL;
5411 #define ACCESSX_MAX_DESCR_ON_STACK 10
5412 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5413
5414 context.vc_ucred = NULL;
5415
5416 /*
5417 * Validate parameters; if valid, copy the descriptor array and string
5418 * arguments into local memory. Before proceeding, the following
5419 * conditions must have been met:
5420 *
5421 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5422 * o There must be sufficient room in the request for at least one
5423 * descriptor and a one byte NUL-terminated string.
5424 * o The allocation of local storage must not fail.
5425 */
5426 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5427 return ENOMEM;
5428 }
5429 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5430 return EINVAL;
5431 }
5432 if (uap->size <= sizeof(stack_input)) {
5433 input = stack_input;
5434 } else {
5435 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5436 if (input == NULL) {
5437 error = ENOMEM;
5438 goto out;
5439 }
5440 }
5441 error = copyin(uap->entries, input, uap->size);
5442 if (error) {
5443 goto out;
5444 }
5445
5446 AUDIT_ARG(opaque, input, uap->size);
5447
5448 /*
5449 * Force NUL termination of the copyin buffer to avoid namei() running
5450 * off the end. If the caller passes us bogus data, they may get a
5451 * bogus result.
5452 */
5453 ((char *)input)[uap->size - 1] = 0;
5454
5455 /*
5456 * Access is defined as checking against the process' real identity,
5457 * even if operations are checking the effective identity. This
5458 * requires that we use a local vfs context.
5459 */
5460 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5461 context.vc_thread = current_thread();
5462
5463 /*
5464 * Find out how many entries we have, so we can allocate the result
5465 * array by walking the list and adjusting the count downward by the
5466 * earliest string offset we see.
5467 */
5468 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5469 desc_actual = desc_max;
5470 for (i = 0; i < desc_actual; i++) {
5471 /*
5472 * Take the offset to the name string for this entry and
5473 * convert to an input array index, which would be one off
5474 * the end of the array if this entry was the lowest-addressed
5475 * name string.
5476 */
5477 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5478
5479 /*
5480 * An offset greater than the max allowable offset is an error.
5481 * It is also an error for any valid entry to point
5482 * to a location prior to the end of the current entry, if
5483 * it's not a reference to the string of the previous entry.
5484 */
5485 if (j > desc_max || (j != 0 && j <= i)) {
5486 error = EINVAL;
5487 goto out;
5488 }
5489
5490 /* Also do not let ad_name_offset point to something beyond the size of the input */
5491 if (input[i].ad_name_offset >= uap->size) {
5492 error = EINVAL;
5493 goto out;
5494 }
5495
5496 /*
5497 * An offset of 0 means use the previous descriptor's offset;
5498 * this is used to chain multiple requests for the same file
5499 * to avoid multiple lookups.
5500 */
5501 if (j == 0) {
5502 /* This is not valid for the first entry */
5503 if (i == 0) {
5504 error = EINVAL;
5505 goto out;
5506 }
5507 continue;
5508 }
5509
5510 /*
5511 * If the offset of the string for this descriptor is before
5512 * what we believe is the current actual last descriptor,
5513 * then we need to adjust our estimate downward; this permits
5514 * the string table following the last descriptor to be out
5515 * of order relative to the descriptor list.
5516 */
5517 if (j < desc_actual) {
5518 desc_actual = j;
5519 }
5520 }
5521
5522 /*
5523 * We limit the actual number of descriptors we are willing to process
5524 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5525 * requested does not exceed this limit,
5526 */
5527 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5528 error = ENOMEM;
5529 goto out;
5530 }
5531 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5532 if (result == NULL) {
5533 error = ENOMEM;
5534 goto out;
5535 }
5536
5537 /*
5538 * Do the work by iterating over the descriptor entries we know to
5539 * at least appear to contain valid data.
5540 */
5541 error = 0;
5542 for (i = 0; i < desc_actual; i++) {
5543 /*
5544 * If the ad_name_offset is 0, then we use the previous
5545 * results to make the check; otherwise, we are looking up
5546 * a new file name.
5547 */
5548 if (input[i].ad_name_offset != 0) {
5549 /* discard old vnodes */
5550 if (vp) {
5551 vnode_put(vp);
5552 vp = NULL;
5553 }
5554 if (dvp) {
5555 vnode_put(dvp);
5556 dvp = NULL;
5557 }
5558
5559 /*
5560 * Scan forward in the descriptor list to see if we
5561 * need the parent vnode. We will need it if we are
5562 * deleting, since we must have rights to remove
5563 * entries in the parent directory, as well as the
5564 * rights to delete the object itself.
5565 */
5566 wantdelete = input[i].ad_flags & _DELETE_OK;
5567 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5568 if (input[j].ad_flags & _DELETE_OK) {
5569 wantdelete = 1;
5570 }
5571 }
5572
5573 niopts = FOLLOW | AUDITVNPATH1;
5574
5575 /* need parent for vnode_authorize for deletion test */
5576 if (wantdelete) {
5577 niopts |= WANTPARENT;
5578 }
5579
5580 /* do the lookup */
5581 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5582 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5583 &context);
5584 error = namei(&nd);
5585 if (!error) {
5586 vp = nd.ni_vp;
5587 if (wantdelete) {
5588 dvp = nd.ni_dvp;
5589 }
5590 }
5591 nameidone(&nd);
5592 }
5593
5594 /*
5595 * Handle lookup errors.
5596 */
5597 switch (error) {
5598 case ENOENT:
5599 case EACCES:
5600 case EPERM:
5601 case ENOTDIR:
5602 result[i] = error;
5603 break;
5604 case 0:
5605 /* run this access check */
5606 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5607 break;
5608 default:
5609 /* fatal lookup error */
5610
5611 goto out;
5612 }
5613 }
5614
5615 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5616
5617 /* copy out results */
5618 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5619
5620 out:
5621 if (input && input != stack_input) {
5622 FREE(input, M_TEMP);
5623 }
5624 if (result) {
5625 FREE(result, M_TEMP);
5626 }
5627 if (vp) {
5628 vnode_put(vp);
5629 }
5630 if (dvp) {
5631 vnode_put(dvp);
5632 }
5633 if (IS_VALID_CRED(context.vc_ucred)) {
5634 kauth_cred_unref(&context.vc_ucred);
5635 }
5636 return error;
5637 }
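
/*
 * Illustrative userspace sketch (not part of this file), assuming the
 * accessx_np() wrapper and struct accessx_descriptor as declared in
 * Darwin's <unistd.h>/<sys/unistd.h>: two checks against the same path
 * share a single lookup by giving the second descriptor an
 * ad_name_offset of 0, as described in the comment above.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	struct req {
 *		struct accessx_descriptor ad[2];
 *		char name[32];
 *	} req;
 *	int results[2];
 *
 *	memset(&req, 0, sizeof(req));
 *	strlcpy(req.name, "/etc/hosts", sizeof(req.name));
 *	req.ad[0].ad_name_offset = offsetof(struct req, name);
 *	req.ad[0].ad_flags = R_OK;
 *	req.ad[1].ad_name_offset = 0;	// reuse the path from entry 0
 *	req.ad[1].ad_flags = W_OK;
 *	int r = accessx_np(req.ad, sizeof(req), results, getuid());
 *	// results[0] and results[1] then hold per-check errno values
 */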
5638
5639
5640 /*
5641 * Returns: 0 Success
5642 * namei:EFAULT Bad address
5643 * namei:ENAMETOOLONG Filename too long
5644 * namei:ENOENT No such file or directory
5645 * namei:ELOOP Too many levels of symbolic links
5646 * namei:EBADF Bad file descriptor
5647 * namei:ENOTDIR Not a directory
5648 * namei:???
5649 * access1:
5650 */
5651 static int
5652 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5653 int flag, enum uio_seg segflg)
5654 {
5655 int error;
5656 struct nameidata nd;
5657 int niopts;
5658 struct vfs_context context;
5659 #if NAMEDRSRCFORK
5660 int is_namedstream = 0;
5661 #endif
5662
5663 /*
5664 * Unless the AT_EACCESS option is used, access is defined as checking
5665 * against the process' real identity, even if operations are checking
5666 * the effective identity. So we need to tweak the credential
5667 * in the context for that case.
5668 */
5669 if (!(flag & AT_EACCESS)) {
5670 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5671 } else {
5672 context.vc_ucred = ctx->vc_ucred;
5673 }
5674 context.vc_thread = ctx->vc_thread;
5675
5676
5677 niopts = FOLLOW | AUDITVNPATH1;
5678 /* need parent for vnode_authorize for deletion test */
5679 if (amode & _DELETE_OK) {
5680 niopts |= WANTPARENT;
5681 }
5682 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5683 path, &context);
5684
5685 #if NAMEDRSRCFORK
5686 /* access(F_OK) calls are allowed for resource forks. */
5687 if (amode == F_OK) {
5688 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5689 }
5690 #endif
5691 error = nameiat(&nd, fd);
5692 if (error) {
5693 goto out;
5694 }
5695
5696 #if NAMEDRSRCFORK
5697 /* Grab reference on the shadow stream file vnode to
5698 * force an inactive on release which will mark it
5699 * for recycle.
5700 */
5701 if (vnode_isnamedstream(nd.ni_vp) &&
5702 (nd.ni_vp->v_parent != NULLVP) &&
5703 vnode_isshadow(nd.ni_vp)) {
5704 is_namedstream = 1;
5705 vnode_ref(nd.ni_vp);
5706 }
5707 #endif
5708
5709 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5710
5711 #if NAMEDRSRCFORK
5712 if (is_namedstream) {
5713 vnode_rele(nd.ni_vp);
5714 }
5715 #endif
5716
5717 vnode_put(nd.ni_vp);
5718 if (amode & _DELETE_OK) {
5719 vnode_put(nd.ni_dvp);
5720 }
5721 nameidone(&nd);
5722
5723 out:
5724 if (!(flag & AT_EACCESS)) {
5725 kauth_cred_unref(&context.vc_ucred);
5726 }
5727 return error;
5728 }
5729
5730 int
5731 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5732 {
5733 return faccessat_internal(vfs_context_current(), AT_FDCWD,
5734 uap->path, uap->flags, 0, UIO_USERSPACE);
5735 }
5736
5737 int
5738 faccessat(__unused proc_t p, struct faccessat_args *uap,
5739 __unused int32_t *retval)
5740 {
5741 if (uap->flag & ~AT_EACCESS) {
5742 return EINVAL;
5743 }
5744
5745 return faccessat_internal(vfs_context_current(), uap->fd,
5746 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
5747 }
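
/*
 * Illustrative userspace sketch (not part of this file): with
 * AT_EACCESS the check uses the effective rather than the real
 * credentials, matching the credential handling in
 * faccessat_internal() above.  The path is a placeholder.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int dirfd = open("/private/var", O_RDONLY | O_DIRECTORY);
 *	int ok = faccessat(dirfd, "log", R_OK | X_OK, AT_EACCESS);
 */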
5748
5749 /*
5750 * Returns: 0 Success
5751 * EFAULT
5752 * copyout:EFAULT
5753 * namei:???
5754 * vn_stat:???
5755 */
5756 static int
5757 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5758 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5759 enum uio_seg segflg, int fd, int flag)
5760 {
5761 struct nameidata nd;
5762 int follow;
5763 union {
5764 struct stat sb;
5765 struct stat64 sb64;
5766 } source = {};
5767 union {
5768 struct user64_stat user64_sb;
5769 struct user32_stat user32_sb;
5770 struct user64_stat64 user64_sb64;
5771 struct user32_stat64 user32_sb64;
5772 } dest = {};
5773 caddr_t sbp;
5774 int error, my_size;
5775 kauth_filesec_t fsec;
5776 size_t xsecurity_bufsize;
5777 void * statptr;
5778
5779 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5780 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5781 segflg, path, ctx);
5782
5783 #if NAMEDRSRCFORK
5784 int is_namedstream = 0;
5785 /* stat calls are allowed for resource forks. */
5786 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5787 #endif
5788 error = nameiat(&nd, fd);
5789 if (error) {
5790 return error;
5791 }
5792 fsec = KAUTH_FILESEC_NONE;
5793
5794 statptr = (void *)&source;
5795
5796 #if NAMEDRSRCFORK
5797 /* Grab reference on the shadow stream file vnode to
5798 * force an inactive on release which will mark it
5799 * for recycle.
5800 */
5801 if (vnode_isnamedstream(nd.ni_vp) &&
5802 (nd.ni_vp->v_parent != NULLVP) &&
5803 vnode_isshadow(nd.ni_vp)) {
5804 is_namedstream = 1;
5805 vnode_ref(nd.ni_vp);
5806 }
5807 #endif
5808
5809 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5810
5811 #if NAMEDRSRCFORK
5812 if (is_namedstream) {
5813 vnode_rele(nd.ni_vp);
5814 }
5815 #endif
5816 vnode_put(nd.ni_vp);
5817 nameidone(&nd);
5818
5819 if (error) {
5820 return error;
5821 }
5822 /* Zap spare fields */
5823 if (isstat64 != 0) {
5824 source.sb64.st_lspare = 0;
5825 source.sb64.st_qspare[0] = 0LL;
5826 source.sb64.st_qspare[1] = 0LL;
5827 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5828 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5829 my_size = sizeof(dest.user64_sb64);
5830 sbp = (caddr_t)&dest.user64_sb64;
5831 } else {
5832 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5833 my_size = sizeof(dest.user32_sb64);
5834 sbp = (caddr_t)&dest.user32_sb64;
5835 }
5836 /*
5837 * Check if we raced (post lookup) against the last unlink of a file.
5838 */
5839 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5840 source.sb64.st_nlink = 1;
5841 }
5842 } else {
5843 source.sb.st_lspare = 0;
5844 source.sb.st_qspare[0] = 0LL;
5845 source.sb.st_qspare[1] = 0LL;
5846 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5847 munge_user64_stat(&source.sb, &dest.user64_sb);
5848 my_size = sizeof(dest.user64_sb);
5849 sbp = (caddr_t)&dest.user64_sb;
5850 } else {
5851 munge_user32_stat(&source.sb, &dest.user32_sb);
5852 my_size = sizeof(dest.user32_sb);
5853 sbp = (caddr_t)&dest.user32_sb;
5854 }
5855
5856 /*
5857 * Check if we raced (post lookup) against the last unlink of a file.
5858 */
5859 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5860 source.sb.st_nlink = 1;
5861 }
5862 }
5863 if ((error = copyout(sbp, ub, my_size)) != 0) {
5864 goto out;
5865 }
5866
5867 /* caller wants extended security information? */
5868 if (xsecurity != USER_ADDR_NULL) {
5869 /* did we get any? */
5870 if (fsec == KAUTH_FILESEC_NONE) {
5871 if (susize(xsecurity_size, 0) != 0) {
5872 error = EFAULT;
5873 goto out;
5874 }
5875 } else {
5876 /* find the user buffer size */
5877 xsecurity_bufsize = fusize(xsecurity_size);
5878
5879 /* copy out the actual data size */
5880 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5881 error = EFAULT;
5882 goto out;
5883 }
5884
5885 /* if the caller supplied enough room, copy out to it */
5886 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
5887 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5888 }
5889 }
5890 }
5891 out:
5892 if (fsec != KAUTH_FILESEC_NONE) {
5893 kauth_filesec_free(fsec);
5894 }
5895 return error;
5896 }
5897
5898 /*
5899 * stat_extended: Get file status; with extended security (ACL).
5900 *
5901 * Parameters: p (ignored)
5902 * uap User argument descriptor (see below)
5903 * retval (ignored)
5904 *
5905 * Indirect: uap->path Path of file to get status from
5906 * uap->ub User buffer (holds file status info)
5907 * uap->xsecurity ACL to get (extended security)
5908 * uap->xsecurity_size Size of ACL
5909 *
5910 * Returns: 0 Success
5911 * !0 errno value
5912 *
5913 */
5914 int
5915 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5916 __unused int32_t *retval)
5917 {
5918 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5919 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5920 0);
5921 }
5922
5923 /*
5924 * Returns: 0 Success
5925 * fstatat_internal:??? [see fstatat_internal() in this file]
5926 */
5927 int
5928 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5929 {
5930 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5931 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
5932 }
5933
5934 int
5935 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5936 {
5937 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5938 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
5939 }
5940
5941 /*
5942 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5943 *
5944 * Parameters: p (ignored)
5945 * uap User argument descriptor (see below)
5946 * retval (ignored)
5947 *
5948 * Indirect: uap->path Path of file to get status from
5949 * uap->ub User buffer (holds file status info)
5950 * uap->xsecurity ACL to get (extended security)
5951 * uap->xsecurity_size Size of ACL
5952 *
5953 * Returns: 0 Success
5954 * !0 errno value
5955 *
5956 */
5957 int
5958 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5959 {
5960 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5961 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5962 0);
5963 }
5964
5965 /*
5966 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5967 *
5968 * Parameters: p (ignored)
5969 * uap User argument descriptor (see below)
5970 * retval (ignored)
5971 *
5972 * Indirect: uap->path Path of file to get status from
5973 * uap->ub User buffer (holds file status info)
5974 * uap->xsecurity ACL to get (extended security)
5975 * uap->xsecurity_size Size of ACL
5976 *
5977 * Returns: 0 Success
5978 * !0 errno value
5979 *
5980 */
5981 int
5982 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5983 {
5984 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5985 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5986 AT_SYMLINK_NOFOLLOW);
5987 }
5988
5989 /*
5990 * Get file status; this version does not follow links.
5991 */
5992 int
5993 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5994 {
5995 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5996 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
5997 }
5998
5999 int
6000 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6001 {
6002 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6003 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6004 }
6005
6006 /*
6007 * lstat64_extended: Get file status; can handle large inode numbers; does not
6008 * follow links; with extended security (ACL).
6009 *
6010 * Parameters: p (ignored)
6011 * uap User argument descriptor (see below)
6012 * retval (ignored)
6013 *
6014 * Indirect: uap->path Path of file to get status from
6015 * uap->ub User buffer (holds file status info)
6016 * uap->xsecurity ACL to get (extended security)
6017 * uap->xsecurity_size Size of ACL
6018 *
6019 * Returns: 0 Success
6020 * !0 errno value
6021 *
6022 */
6023 int
6024 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6025 {
6026 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6027 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6028 AT_SYMLINK_NOFOLLOW);
6029 }
6030
6031 int
6032 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6033 {
6034 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6035 return EINVAL;
6036 }
6037
6038 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6039 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6040 }
6041
6042 int
6043 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6044 __unused int32_t *retval)
6045 {
6046 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6047 return EINVAL;
6048 }
6049
6050 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6051 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6052 }
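
/*
 * Illustrative userspace sketch (not part of this file): fstatat(2)
 * with AT_SYMLINK_NOFOLLOW behaves like lstat(2) relative to fd, as in
 * the dispatch above.  The path is a placeholder.
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *
 *	struct stat st;
 *	// stat the symlink itself, not its target
 *	int r = fstatat(AT_FDCWD, "some-symlink", &st, AT_SYMLINK_NOFOLLOW);
 */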
6053
6054 /*
6055 * Get configurable pathname variables.
6056 *
6057 * Returns: 0 Success
6058 * namei:???
6059 * vn_pathconf:???
6060 *
6061 * Notes: Global implementation constants are intended to be
6062 * implemented in this function directly; all other constants
6063 * are per-FS implementation, and therefore must be handled in
6064 * each respective FS, instead.
6065 *
6066 * XXX We implement some things globally right now that should actually be
6067 * XXX per-FS; we will need to deal with this at some point.
6068 */
6069 /* ARGSUSED */
6070 int
6071 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6072 {
6073 int error;
6074 struct nameidata nd;
6075 vfs_context_t ctx = vfs_context_current();
6076
6077 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6078 UIO_USERSPACE, uap->path, ctx);
6079 error = namei(&nd);
6080 if (error) {
6081 return error;
6082 }
6083
6084 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6085
6086 vnode_put(nd.ni_vp);
6087 nameidone(&nd);
6088 return error;
6089 }
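
/*
 * Illustrative userspace sketch (not part of this file): querying a
 * per-path limit with pathconf(2), which lands in vn_pathconf() above.
 *
 *	#include <unistd.h>
 *
 *	long name_max = pathconf("/tmp", _PC_NAME_MAX);
 *	if (name_max == -1) {
 *		// either the limit is unsupported/unlimited (errno unchanged)
 *		// or an error occurred (errno set)
 *	}
 */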
6090
6091 /*
6092 * Return target name of a symbolic link.
6093 */
6094 /* ARGSUSED */
6095 static int
6096 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
6097 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6098 int *retval)
6099 {
6100 vnode_t vp;
6101 uio_t auio;
6102 int error;
6103 struct nameidata nd;
6104 char uio_buf[UIO_SIZEOF(1)];
6105
6106 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6107 seg, path, ctx);
6108
6109 error = nameiat(&nd, fd);
6110 if (error) {
6111 return error;
6112 }
6113 vp = nd.ni_vp;
6114
6115 nameidone(&nd);
6116
6117 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6118 &uio_buf[0], sizeof(uio_buf));
6119 uio_addiov(auio, buf, bufsize);
6120 if (vp->v_type != VLNK) {
6121 error = EINVAL;
6122 } else {
6123 #if CONFIG_MACF
6124 error = mac_vnode_check_readlink(ctx, vp);
6125 #endif
6126 if (error == 0) {
6127 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6128 ctx);
6129 }
6130 if (error == 0) {
6131 error = VNOP_READLINK(vp, auio, ctx);
6132 }
6133 }
6134 vnode_put(vp);
6135
6136 *retval = bufsize - (int)uio_resid(auio);
6137 return error;
6138 }
6139
6140 int
6141 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6142 {
6143 enum uio_seg procseg;
6144
6145 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6146 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6147 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6148 uap->count, procseg, retval);
6149 }
6150
6151 int
6152 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6153 {
6154 enum uio_seg procseg;
6155
6156 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6157 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6158 procseg, uap->buf, uap->bufsize, procseg, retval);
6159 }
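
/*
 * Illustrative userspace sketch (not part of this file): readlink(2)
 * and readlinkat(2) do not NUL-terminate the buffer; the return value
 * is the number of bytes copied, so the caller terminates it.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/param.h>	// MAXPATHLEN
 *
 *	char target[MAXPATHLEN];
 *	ssize_t n = readlinkat(AT_FDCWD, "/etc", target, sizeof(target) - 1);
 *	if (n >= 0) {
 *		target[n] = '\0';
 *	}
 */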
6160
6161 /*
6162 * Change file flags.
6163 *
6164 * NOTE: this will vnode_put() `vp'
6165 */
6166 static int
6167 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6168 {
6169 struct vnode_attr va;
6170 kauth_action_t action;
6171 int error;
6172
6173 VATTR_INIT(&va);
6174 VATTR_SET(&va, va_flags, flags);
6175
6176 #if CONFIG_MACF
6177 error = mac_vnode_check_setflags(ctx, vp, flags);
6178 if (error) {
6179 goto out;
6180 }
6181 #endif
6182
6183 /* request authorisation, disregard immutability */
6184 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6185 goto out;
6186 }
6187 /*
6188 * Request that the auth layer disregard those file flags it's allowed to when
6189 * authorizing this operation; we need to do this in order to be able to
6190 * clear immutable flags.
6191 */
6192 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6193 goto out;
6194 }
6195 error = vnode_setattr(vp, &va, ctx);
6196
6197 #if CONFIG_MACF
6198 if (error == 0) {
6199 mac_vnode_notify_setflags(ctx, vp, flags);
6200 }
6201 #endif
6202
6203 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6204 error = ENOTSUP;
6205 }
6206 out:
6207 vnode_put(vp);
6208 return error;
6209 }
6210
6211 /*
6212 * Change flags of a file given a path name.
6213 */
6214 /* ARGSUSED */
6215 int
6216 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6217 {
6218 vnode_t vp;
6219 vfs_context_t ctx = vfs_context_current();
6220 int error;
6221 struct nameidata nd;
6222
6223 AUDIT_ARG(fflags, uap->flags);
6224 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6225 UIO_USERSPACE, uap->path, ctx);
6226 error = namei(&nd);
6227 if (error) {
6228 return error;
6229 }
6230 vp = nd.ni_vp;
6231 nameidone(&nd);
6232
6233 /* we don't vnode_put() here because chflags1 does internally */
6234 error = chflags1(vp, uap->flags, ctx);
6235
6236 return error;
6237 }
6238
6239 /*
6240 * Change flags of a file given a file descriptor.
6241 */
6242 /* ARGSUSED */
6243 int
6244 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6245 {
6246 vnode_t vp;
6247 int error;
6248
6249 AUDIT_ARG(fd, uap->fd);
6250 AUDIT_ARG(fflags, uap->flags);
6251 if ((error = file_vnode(uap->fd, &vp))) {
6252 return error;
6253 }
6254
6255 if ((error = vnode_getwithref(vp))) {
6256 file_drop(uap->fd);
6257 return error;
6258 }
6259
6260 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6261
6262 /* we don't vnode_put() here because chflags1 does internally */
6263 error = chflags1(vp, uap->flags, vfs_context_current());
6264
6265 file_drop(uap->fd);
6266 return error;
6267 }
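
/*
 * Illustrative userspace sketch (not part of this file): setting and
 * clearing the user-immutable flag with chflags(2); clearing it relies
 * on the KAUTH_VNODE_NOIMMUTABLE handling in chflags1() above.  The
 * path is a placeholder.
 *
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	(void)chflags("/tmp/important", UF_IMMUTABLE);	// make immutable
 *	(void)chflags("/tmp/important", 0);		// clear all flags
 */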
6268
6269 /*
6270 * Change security information on a filesystem object.
6271 *
6272 * Returns: 0 Success
6273 * EPERM Operation not permitted
6274 * vnode_authattr:??? [anything vnode_authattr can return]
6275 * vnode_authorize:??? [anything vnode_authorize can return]
6276 * vnode_setattr:??? [anything vnode_setattr can return]
6277 *
6278 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6279 * translated to EPERM before being returned.
6280 */
6281 static int
6282 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6283 {
6284 kauth_action_t action;
6285 int error;
6286
6287 AUDIT_ARG(mode, vap->va_mode);
6288 /* XXX audit new args */
6289
6290 #if NAMEDSTREAMS
6291 /* chmod calls are not allowed for resource forks. */
6292 if (vp->v_flag & VISNAMEDSTREAM) {
6293 return EPERM;
6294 }
6295 #endif
6296
6297 #if CONFIG_MACF
6298 if (VATTR_IS_ACTIVE(vap, va_mode) &&
6299 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
6300 return error;
6301 }
6302
6303 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6304 if ((error = mac_vnode_check_setowner(ctx, vp,
6305 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6306 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
6307 return error;
6308 }
6309 }
6310
6311 if (VATTR_IS_ACTIVE(vap, va_acl) &&
6312 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
6313 return error;
6314 }
6315 #endif
6316
6317 /* make sure that the caller is allowed to set this security information */
6318 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6319 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6320 if (error == EACCES) {
6321 error = EPERM;
6322 }
6323 return error;
6324 }
6325
6326 if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
6327 return error;
6328 }
6329
6330 #if CONFIG_MACF
6331 if (VATTR_IS_ACTIVE(vap, va_mode)) {
6332 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6333 }
6334
6335 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6336 mac_vnode_notify_setowner(ctx, vp,
6337 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6338 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6339 }
6340
6341 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6342 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6343 }
6344 #endif
6345
6346 return error;
6347 }
6348
6349
6350 /*
6351 * Change mode of a file given a path name.
6352 *
6353 * Returns: 0 Success
6354 * namei:??? [anything namei can return]
6355 * chmod_vnode:??? [anything chmod_vnode can return]
6356 */
6357 static int
6358 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6359 int fd, int flag, enum uio_seg segflg)
6360 {
6361 struct nameidata nd;
6362 int follow, error;
6363
6364 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6365 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6366 segflg, path, ctx);
6367 if ((error = nameiat(&nd, fd))) {
6368 return error;
6369 }
6370 error = chmod_vnode(ctx, nd.ni_vp, vap);
6371 vnode_put(nd.ni_vp);
6372 nameidone(&nd);
6373 return error;
6374 }
6375
6376 /*
6377 * chmod_extended: Change the mode of a file given a path name; with extended
6378 * argument list (including extended security (ACL)).
6379 *
6380 * Parameters: p Process requesting the open
6381 * uap User argument descriptor (see below)
6382 * retval (ignored)
6383 *
6384 * Indirect: uap->path Path to object (same as 'chmod')
6385 * uap->uid UID to set
6386 * uap->gid GID to set
6387 * uap->mode File mode to set (same as 'chmod')
6388 * uap->xsecurity ACL to set (or delete)
6389 *
6390 * Returns: 0 Success
6391 * !0 errno value
6392 *
6393 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6394 *
6395 * XXX: We should enumerate the possible errno values here, and where
6396 * in the code they originated.
6397 */
6398 int
6399 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6400 {
6401 int error;
6402 struct vnode_attr va;
6403 kauth_filesec_t xsecdst;
6404
6405 AUDIT_ARG(owner, uap->uid, uap->gid);
6406
6407 VATTR_INIT(&va);
6408 if (uap->mode != -1) {
6409 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6410 }
6411 if (uap->uid != KAUTH_UID_NONE) {
6412 VATTR_SET(&va, va_uid, uap->uid);
6413 }
6414 if (uap->gid != KAUTH_GID_NONE) {
6415 VATTR_SET(&va, va_gid, uap->gid);
6416 }
6417
6418 xsecdst = NULL;
6419 switch (uap->xsecurity) {
6420 /* explicit remove request */
6421 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6422 VATTR_SET(&va, va_acl, NULL);
6423 break;
6424 /* not being set */
6425 case USER_ADDR_NULL:
6426 break;
6427 default:
6428 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6429 return error;
6430 }
6431 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6432 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6433 }
6434
6435 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6436 UIO_USERSPACE);
6437
6438 if (xsecdst != NULL) {
6439 kauth_filesec_free(xsecdst);
6440 }
6441 return error;
6442 }
6443
6444 /*
6445 * Returns: 0 Success
6446 * chmodat:??? [anything chmodat can return]
6447 */
6448 static int
6449 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6450 int flag, enum uio_seg segflg)
6451 {
6452 struct vnode_attr va;
6453
6454 VATTR_INIT(&va);
6455 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6456
6457 return chmodat(ctx, path, &va, fd, flag, segflg);
6458 }
6459
6460 int
6461 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6462 {
6463 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6464 AT_FDCWD, 0, UIO_USERSPACE);
6465 }
6466
6467 int
6468 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6469 {
6470 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6471 return EINVAL;
6472 }
6473
6474 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6475 uap->fd, uap->flag, UIO_USERSPACE);
6476 }
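
/*
 * Illustrative userspace sketch: AT_SYMLINK_NOFOLLOW is the only flag
 * fchmodat() accepts here; anything else is rejected with EINVAL by the
 * check above.  The path is hypothetical.
 *
 *     #include <sys/stat.h>
 *     #include <fcntl.h>
 *     #include <stdio.h>
 *
 *     // Change the mode of "notes.txt" relative to the current working
 *     // directory, without following it if it is a symlink.
 *     if (fchmodat(AT_FDCWD, "notes.txt", 0640, AT_SYMLINK_NOFOLLOW) == -1)
 *             perror("fchmodat");
 */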
6477
6478 /*
6479 * Change mode of a file given a file descriptor.
6480 */
6481 static int
6482 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6483 {
6484 vnode_t vp;
6485 int error;
6486
6487 AUDIT_ARG(fd, fd);
6488
6489 if ((error = file_vnode(fd, &vp)) != 0) {
6490 return error;
6491 }
6492 if ((error = vnode_getwithref(vp)) != 0) {
6493 file_drop(fd);
6494 return error;
6495 }
6496 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6497
6498 error = chmod_vnode(vfs_context_current(), vp, vap);
6499 (void)vnode_put(vp);
6500 file_drop(fd);
6501
6502 return error;
6503 }
6504
6505 /*
6506 * fchmod_extended: Change mode of a file given a file descriptor; with
6507 * extended argument list (including extended security (ACL)).
6508 *
6509 * Parameters: p Process requesting to change file mode
6510 * uap User argument descriptor (see below)
6511 * retval (ignored)
6512 *
6513 * Indirect: uap->mode File mode to set (same as 'chmod')
6514 * uap->uid UID to set
6515 * uap->gid GID to set
6516 * uap->xsecurity ACL to set (or delete)
6517 * uap->fd File descriptor of file to change mode
6518 *
6519 * Returns: 0 Success
6520 * !0 errno value
6521 *
6522 */
6523 int
6524 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6525 {
6526 int error;
6527 struct vnode_attr va;
6528 kauth_filesec_t xsecdst;
6529
6530 AUDIT_ARG(owner, uap->uid, uap->gid);
6531
6532 VATTR_INIT(&va);
6533 if (uap->mode != -1) {
6534 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6535 }
6536 if (uap->uid != KAUTH_UID_NONE) {
6537 VATTR_SET(&va, va_uid, uap->uid);
6538 }
6539 if (uap->gid != KAUTH_GID_NONE) {
6540 VATTR_SET(&va, va_gid, uap->gid);
6541 }
6542
6543 xsecdst = NULL;
6544 switch (uap->xsecurity) {
6545 case USER_ADDR_NULL:
6546 VATTR_SET(&va, va_acl, NULL);
6547 break;
6548 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6549 VATTR_SET(&va, va_acl, NULL);
6550 break;
6551 /* not being set */
6552 case CAST_USER_ADDR_T(-1):
6553 break;
6554 default:
6555 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6556 return error;
6557 }
6558 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6559 }
6560
6561 error = fchmod1(p, uap->fd, &va);
6562
6563
6564 switch (uap->xsecurity) {
6565 case USER_ADDR_NULL:
6566 case CAST_USER_ADDR_T(-1):
6567 break;
6568 default:
6569 if (xsecdst != NULL) {
6570 kauth_filesec_free(xsecdst);
6571 }
6572 }
6573 return error;
6574 }
6575
6576 int
6577 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6578 {
6579 struct vnode_attr va;
6580
6581 VATTR_INIT(&va);
6582 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6583
6584 return fchmod1(p, uap->fd, &va);
6585 }
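
/*
 * Illustrative userspace sketch: fchmod() operates on an already-open
 * descriptor and, like chmod(), the requested mode is masked with
 * ALLPERMS (07777) above.  The path is hypothetical.
 *
 *     #include <fcntl.h>
 *     #include <sys/stat.h>
 *     #include <stdio.h>
 *
 *     int fd = open("/tmp/example.txt", O_RDONLY | O_CREAT, 0600);
 *     if (fd == -1 || fchmod(fd, S_IRUSR | S_IWUSR | S_IRGRP) == -1) // 0640
 *             perror("fchmod");
 */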
6586
6587
6588 /*
6589 * Set ownership given a path name.
6590 */
6591 /* ARGSUSED */
6592 static int
6593 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6594 gid_t gid, int flag, enum uio_seg segflg)
6595 {
6596 vnode_t vp;
6597 struct vnode_attr va;
6598 int error;
6599 struct nameidata nd;
6600 int follow;
6601 kauth_action_t action;
6602
6603 AUDIT_ARG(owner, uid, gid);
6604
6605 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6606 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6607 path, ctx);
6608 error = nameiat(&nd, fd);
6609 if (error) {
6610 return error;
6611 }
6612 vp = nd.ni_vp;
6613
6614 nameidone(&nd);
6615
6616 VATTR_INIT(&va);
6617 if (uid != (uid_t)VNOVAL) {
6618 VATTR_SET(&va, va_uid, uid);
6619 }
6620 if (gid != (gid_t)VNOVAL) {
6621 VATTR_SET(&va, va_gid, gid);
6622 }
6623
6624 #if CONFIG_MACF
6625 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6626 if (error) {
6627 goto out;
6628 }
6629 #endif
6630
6631 /* preflight and authorize attribute changes */
6632 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6633 goto out;
6634 }
6635 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6636 goto out;
6637 }
6638 error = vnode_setattr(vp, &va, ctx);
6639
6640 #if CONFIG_MACF
6641 if (error == 0) {
6642 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6643 }
6644 #endif
6645
6646 out:
6647 /*
6648 * EACCES is only allowed from namei(); permissions failure should
6649 * return EPERM, so we need to translate the error code.
6650 */
6651 if (error == EACCES) {
6652 error = EPERM;
6653 }
6654
6655 vnode_put(vp);
6656 return error;
6657 }
6658
6659 int
6660 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6661 {
6662 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6663 uap->uid, uap->gid, 0, UIO_USERSPACE);
6664 }
6665
6666 int
6667 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6668 {
6669 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6670 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
6671 }
6672
6673 int
6674 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6675 {
6676 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6677 return EINVAL;
6678 }
6679
6680 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6681 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
6682 }
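
/*
 * Illustrative userspace sketch: a uid or gid of -1 (VNOVAL in the checks
 * above) means "leave this id unchanged", so a group-only change on a
 * symlink itself looks like the following.  The path is hypothetical and
 * getgid() merely supplies some valid group id.
 *
 *     #include <unistd.h>
 *     #include <fcntl.h>
 *     #include <stdio.h>
 *
 *     if (fchownat(AT_FDCWD, "link", (uid_t)-1, getgid(),
 *         AT_SYMLINK_NOFOLLOW) == -1)
 *             perror("fchownat");
 */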
6683
6684 /*
6685 * Set ownership given a file descriptor.
6686 */
6687 /* ARGSUSED */
6688 int
6689 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6690 {
6691 struct vnode_attr va;
6692 vfs_context_t ctx = vfs_context_current();
6693 vnode_t vp;
6694 int error;
6695 kauth_action_t action;
6696
6697 AUDIT_ARG(owner, uap->uid, uap->gid);
6698 AUDIT_ARG(fd, uap->fd);
6699
6700 if ((error = file_vnode(uap->fd, &vp))) {
6701 return error;
6702 }
6703
6704 if ((error = vnode_getwithref(vp))) {
6705 file_drop(uap->fd);
6706 return error;
6707 }
6708 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6709
6710 VATTR_INIT(&va);
6711 if (uap->uid != VNOVAL) {
6712 VATTR_SET(&va, va_uid, uap->uid);
6713 }
6714 if (uap->gid != VNOVAL) {
6715 VATTR_SET(&va, va_gid, uap->gid);
6716 }
6717
6718 #if NAMEDSTREAMS
6719 /* chown calls are not allowed for resource forks. */
6720 if (vp->v_flag & VISNAMEDSTREAM) {
6721 error = EPERM;
6722 goto out;
6723 }
6724 #endif
6725
6726 #if CONFIG_MACF
6727 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6728 if (error) {
6729 goto out;
6730 }
6731 #endif
6732
6733 /* preflight and authorize attribute changes */
6734 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6735 goto out;
6736 }
6737 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6738 if (error == EACCES) {
6739 error = EPERM;
6740 }
6741 goto out;
6742 }
6743 error = vnode_setattr(vp, &va, ctx);
6744
6745 #if CONFIG_MACF
6746 if (error == 0) {
6747 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
6748 }
6749 #endif
6750
6751 out:
6752 (void)vnode_put(vp);
6753 file_drop(uap->fd);
6754 return error;
6755 }
6756
6757 static int
6758 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6759 {
6760 int error;
6761
6762 if (usrtvp == USER_ADDR_NULL) {
6763 struct timeval old_tv;
6764 /* XXX Y2038 bug because of microtime argument */
6765 microtime(&old_tv);
6766 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6767 tsp[1] = tsp[0];
6768 } else {
6769 if (IS_64BIT_PROCESS(current_proc())) {
6770 struct user64_timeval tv[2];
6771 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6772 if (error) {
6773 return error;
6774 }
6775 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6776 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6777 } else {
6778 struct user32_timeval tv[2];
6779 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6780 if (error) {
6781 return error;
6782 }
6783 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6784 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6785 }
6786 }
6787 return 0;
6788 }
6789
6790 static int
6791 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6792 int nullflag)
6793 {
6794 int error;
6795 struct vnode_attr va;
6796 kauth_action_t action;
6797
6798 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6799
6800 VATTR_INIT(&va);
6801 VATTR_SET(&va, va_access_time, ts[0]);
6802 VATTR_SET(&va, va_modify_time, ts[1]);
6803 if (nullflag) {
6804 va.va_vaflags |= VA_UTIMES_NULL;
6805 }
6806
6807 #if NAMEDSTREAMS
6808 /* utimes calls are not allowed for resource forks. */
6809 if (vp->v_flag & VISNAMEDSTREAM) {
6810 error = EPERM;
6811 goto out;
6812 }
6813 #endif
6814
6815 #if CONFIG_MACF
6816 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6817 if (error) {
6818 goto out;
6819 }
6820 #endif
6821 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6822 if (!nullflag && error == EACCES) {
6823 error = EPERM;
6824 }
6825 goto out;
6826 }
6827
6828 /* since we may not need to auth anything, check here */
6829 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6830 if (!nullflag && error == EACCES) {
6831 error = EPERM;
6832 }
6833 goto out;
6834 }
6835 error = vnode_setattr(vp, &va, ctx);
6836
6837 #if CONFIG_MACF
6838 if (error == 0) {
6839 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
6840 }
6841 #endif
6842
6843 out:
6844 return error;
6845 }
6846
6847 /*
6848 * Set the access and modification times of a file.
6849 */
6850 /* ARGSUSED */
6851 int
6852 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6853 {
6854 struct timespec ts[2];
6855 user_addr_t usrtvp;
6856 int error;
6857 struct nameidata nd;
6858 vfs_context_t ctx = vfs_context_current();
6859
6860 /*
6861 * AUDIT: Needed to change the order of operations to do the
6862 * name lookup first because auditing wants the path.
6863 */
6864 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6865 UIO_USERSPACE, uap->path, ctx);
6866 error = namei(&nd);
6867 if (error) {
6868 return error;
6869 }
6870 nameidone(&nd);
6871
6872 /*
6873 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
6874 * the current time instead.
6875 */
6876 usrtvp = uap->tptr;
6877 if ((error = getutimes(usrtvp, ts)) != 0) {
6878 goto out;
6879 }
6880
6881 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6882
6883 out:
6884 vnode_put(nd.ni_vp);
6885 return error;
6886 }
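
/*
 * Illustrative userspace sketch: when tptr is NULL, getutimes() above
 * substitutes the current time for both timestamps, so a plain "touch"
 * is simply utimes(path, NULL).  The path and times are hypothetical.
 *
 *     #include <sys/time.h>
 *     #include <stdio.h>
 *
 *     struct timeval tv[2] = {
 *             { .tv_sec = 1000000000, .tv_usec = 0 },  // access time
 *             { .tv_sec = 1000000000, .tv_usec = 0 },  // modification time
 *     };
 *     if (utimes("/tmp/example.txt", tv) == -1)        // explicit times
 *             perror("utimes");
 *     if (utimes("/tmp/example.txt", NULL) == -1)      // both set to "now"
 *             perror("utimes");
 */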
6887
6888 /*
6889 * Set the access and modification times of a file.
6890 */
6891 /* ARGSUSED */
6892 int
6893 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6894 {
6895 struct timespec ts[2];
6896 vnode_t vp;
6897 user_addr_t usrtvp;
6898 int error;
6899
6900 AUDIT_ARG(fd, uap->fd);
6901 usrtvp = uap->tptr;
6902 if ((error = getutimes(usrtvp, ts)) != 0) {
6903 return error;
6904 }
6905 if ((error = file_vnode(uap->fd, &vp)) != 0) {
6906 return error;
6907 }
6908 if ((error = vnode_getwithref(vp))) {
6909 file_drop(uap->fd);
6910 return error;
6911 }
6912
6913 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6914 vnode_put(vp);
6915 file_drop(uap->fd);
6916 return error;
6917 }
6918
6919 /*
6920 * Truncate a file given its path name.
6921 */
6922 /* ARGSUSED */
6923 int
6924 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6925 {
6926 vnode_t vp;
6927 struct vnode_attr va;
6928 vfs_context_t ctx = vfs_context_current();
6929 int error;
6930 struct nameidata nd;
6931 kauth_action_t action;
6932
6933 if (uap->length < 0) {
6934 return EINVAL;
6935 }
6936 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6937 UIO_USERSPACE, uap->path, ctx);
6938 if ((error = namei(&nd))) {
6939 return error;
6940 }
6941 vp = nd.ni_vp;
6942
6943 nameidone(&nd);
6944
6945 VATTR_INIT(&va);
6946 VATTR_SET(&va, va_data_size, uap->length);
6947
6948 #if CONFIG_MACF
6949 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6950 if (error) {
6951 goto out;
6952 }
6953 #endif
6954
6955 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6956 goto out;
6957 }
6958 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6959 goto out;
6960 }
6961 error = vnode_setattr(vp, &va, ctx);
6962
6963 #if CONFIG_MACF
6964 if (error == 0) {
6965 mac_vnode_notify_truncate(ctx, NOCRED, vp);
6966 }
6967 #endif
6968
6969 out:
6970 vnode_put(vp);
6971 return error;
6972 }
6973
6974 /*
6975 * Truncate a file given a file descriptor.
6976 */
6977 /* ARGSUSED */
6978 int
6979 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6980 {
6981 vfs_context_t ctx = vfs_context_current();
6982 struct vnode_attr va;
6983 vnode_t vp;
6984 struct fileproc *fp;
6985 int error;
6986 int fd = uap->fd;
6987
6988 AUDIT_ARG(fd, uap->fd);
6989 if (uap->length < 0) {
6990 return EINVAL;
6991 }
6992
6993 if ((error = fp_lookup(p, fd, &fp, 0))) {
6994 return error;
6995 }
6996
6997 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6998 case DTYPE_PSXSHM:
6999 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
7000 goto out;
7001 case DTYPE_VNODE:
7002 break;
7003 default:
7004 error = EINVAL;
7005 goto out;
7006 }
7007
7008 vp = (vnode_t)fp->f_fglob->fg_data;
7009
7010 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
7011 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7012 error = EINVAL;
7013 goto out;
7014 }
7015
7016 if ((error = vnode_getwithref(vp)) != 0) {
7017 goto out;
7018 }
7019
7020 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7021
7022 #if CONFIG_MACF
7023 error = mac_vnode_check_truncate(ctx,
7024 fp->f_fglob->fg_cred, vp);
7025 if (error) {
7026 (void)vnode_put(vp);
7027 goto out;
7028 }
7029 #endif
7030 VATTR_INIT(&va);
7031 VATTR_SET(&va, va_data_size, uap->length);
7032 error = vnode_setattr(vp, &va, ctx);
7033
7034 #if CONFIG_MACF
7035 if (error == 0) {
7036 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
7037 }
7038 #endif
7039
7040 (void)vnode_put(vp);
7041 out:
7042 file_drop(fd);
7043 return error;
7044 }
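
/*
 * Illustrative userspace sketch: ftruncate() needs a descriptor opened
 * for writing (the FWRITE check above otherwise returns EINVAL) and
 * rejects negative lengths up front.  The path and size are hypothetical.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *     #include <stdio.h>
 *
 *     int fd = open("/tmp/example.dat", O_RDWR | O_CREAT, 0644);
 *     if (fd == -1 || ftruncate(fd, 4096) == -1)  // grow or shrink to 4 KiB
 *             perror("ftruncate");
 */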
7045
7046
7047 /*
7048 * Sync an open file with synchronized I/O _file_ integrity completion
7049 */
7050 /* ARGSUSED */
7051 int
7052 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7053 {
7054 __pthread_testcancel(1);
7055 return fsync_common(p, uap, MNT_WAIT);
7056 }
7057
7058
7059 /*
7060 * Sync an open file with synchronized I/O _file_ integrity completion
7061 *
7062 * Notes: This is a legacy support function that does not test for
7063 * thread cancellation points.
7064 */
7065 /* ARGSUSED */
7066 int
7067 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7068 {
7069 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7070 }
7071
7072
7073 /*
7074 * Sync an open file with synchronized I/O _data_ integrity completion
7075 */
7076 /* ARGSUSED */
7077 int
7078 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7079 {
7080 __pthread_testcancel(1);
7081 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7082 }
7083
7084
7085 /*
7086 * fsync_common
7087 *
7088 * Common fsync code to support both synchronized I/O file integrity completion
7089 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7090 *
7091 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7092 * will only guarantee that the file data contents are retrievable. If
7093 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
7094 * requires that additional metadata unnecessary for retrieving the file data
7095 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7096 * storage.
7097 *
7098 * Parameters: p The process
7099 * uap->fd The descriptor to synchronize
7100 * flags The data integrity flags
7101 *
7102 * Returns: int Success
7103 * fp_getfvp:EBADF Bad file descriptor
7104 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7105 * VNOP_FSYNC:??? unspecified
7106 *
7107 * Notes: We use struct fsync_args because it is a short name, and all
7108 * caller argument structures are otherwise identical.
7109 */
7110 static int
7111 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7112 {
7113 vnode_t vp;
7114 struct fileproc *fp;
7115 vfs_context_t ctx = vfs_context_current();
7116 int error;
7117
7118 AUDIT_ARG(fd, uap->fd);
7119
7120 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7121 return error;
7122 }
7123 if ((error = vnode_getwithref(vp))) {
7124 file_drop(uap->fd);
7125 return error;
7126 }
7127
7128 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7129
7130 error = VNOP_FSYNC(vp, flags, ctx);
7131
7132 #if NAMEDRSRCFORK
7133 /* Sync resource fork shadow file if necessary. */
7134 if ((error == 0) &&
7135 (vp->v_flag & VISNAMEDSTREAM) &&
7136 (vp->v_parent != NULLVP) &&
7137 vnode_isshadow(vp) &&
7138 (fp->f_flags & FP_WRITTEN)) {
7139 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7140 }
7141 #endif
7142
7143 (void)vnode_put(vp);
7144 file_drop(uap->fd);
7145 return error;
7146 }
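
/*
 * Illustrative userspace sketch: both calls funnel into fsync_common()
 * above; fsync() requests file integrity (MNT_WAIT) while fdatasync()
 * requests only data integrity (MNT_DWAIT).  The path is hypothetical.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *     #include <stdio.h>
 *
 *     int fd = open("/tmp/example.log", O_WRONLY | O_CREAT, 0644);
 *     if (fd == -1 || write(fd, "x", 1) == -1)
 *             perror("setup");
 *     if (fdatasync(fd) == -1)    // data contents retrievable (MNT_DWAIT)
 *             perror("fdatasync");
 *     if (fsync(fd) == -1)        // data plus metadata such as timestamps
 *             perror("fsync");
 */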
7147
7148 /*
7149 * Duplicate files. Source must be a file, target must be a file or
7150 * must not exist.
7151 *
7152 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7153 * perform inheritance correctly.
7154 */
7155 /* ARGSUSED */
7156 int
7157 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
7158 {
7159 vnode_t tvp, fvp, tdvp, sdvp;
7160 struct nameidata fromnd, tond;
7161 int error;
7162 vfs_context_t ctx = vfs_context_current();
7163 #if CONFIG_MACF
7164 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
7165 struct vnode_attr va;
7166 #endif
7167
7168 /* Check that the flags are valid. */
7169
7170 if (uap->flags & ~CPF_MASK) {
7171 return EINVAL;
7172 }
7173
7174 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
7175 UIO_USERSPACE, uap->from, ctx);
7176 if ((error = namei(&fromnd))) {
7177 return error;
7178 }
7179 fvp = fromnd.ni_vp;
7180
7181 NDINIT(&tond, CREATE, OP_LINK,
7182 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7183 UIO_USERSPACE, uap->to, ctx);
7184 if ((error = namei(&tond))) {
7185 goto out1;
7186 }
7187 tdvp = tond.ni_dvp;
7188 tvp = tond.ni_vp;
7189
7190 if (tvp != NULL) {
7191 if (!(uap->flags & CPF_OVERWRITE)) {
7192 error = EEXIST;
7193 goto out;
7194 }
7195 }
7196
7197 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
7198 error = EISDIR;
7199 goto out;
7200 }
7201
7202 /* This calls existing MAC hooks for open */
7203 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
7204 NULL))) {
7205 goto out;
7206 }
7207
7208 if (tvp) {
7209 /*
7210 * See unlinkat_internal for an explanation of the potential
7211 * ENOENT from the MAC hook but the gist is that the MAC hook
7212 * can fail because vn_getpath isn't able to return the full
7213 * path. We choose to ignore this failure.
7214 */
7215 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
7216 if (error && error != ENOENT) {
7217 goto out;
7218 }
7219 error = 0;
7220 }
7221
7222 #if CONFIG_MACF
7223 VATTR_INIT(&va);
7224 VATTR_SET(&va, va_type, fvp->v_type);
7225 /* Mask off all but regular access permissions */
7226 VATTR_SET(&va, va_mode,
7227 ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
7228 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
7229 if (error) {
7230 goto out;
7231 }
7232 #endif /* CONFIG_MACF */
7233
7234 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
7235 goto out;
7236 }
7237
7238 if (fvp == tdvp) {
7239 error = EINVAL;
7240 }
7241 /*
7242 * If source is the same as the destination (that is the
7243 * same inode number) then there is nothing to do.
7244 * (fixed to have POSIX semantics - CSM 3/2/98)
7245 */
7246 if (fvp == tvp) {
7247 error = -1;
7248 }
7249 if (!error) {
7250 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
7251 }
7252 out:
7253 sdvp = tond.ni_startdir;
7254 /*
7255 * nameidone has to happen before we vnode_put(tdvp)
7256 * since it may need to release the fs_nodelock on the tdvp
7257 */
7258 nameidone(&tond);
7259
7260 if (tvp) {
7261 vnode_put(tvp);
7262 }
7263 vnode_put(tdvp);
7264 vnode_put(sdvp);
7265 out1:
7266 vnode_put(fvp);
7267
7268 nameidone(&fromnd);
7269
7270 if (error == -1) {
7271 return 0;
7272 }
7273 return error;
7274 }
7275
7276 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7277
7278 /*
7279 * Helper function for doing clones. The caller is expected to provide an
7280 * iocounted source vnode and release it.
7281 */
7282 static int
7283 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7284 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7285 {
7286 vnode_t tvp, tdvp;
7287 struct nameidata tond;
7288 int error;
7289 int follow;
7290 boolean_t free_src_acl;
7291 boolean_t attr_cleanup;
7292 enum vtype v_type;
7293 kauth_action_t action;
7294 struct componentname *cnp;
7295 uint32_t defaulted;
7296 struct vnode_attr va;
7297 struct vnode_attr nva;
7298 uint32_t vnop_flags;
7299
7300 v_type = vnode_vtype(fvp);
7301 switch (v_type) {
7302 case VLNK:
7303 /* FALLTHRU */
7304 case VREG:
7305 action = KAUTH_VNODE_ADD_FILE;
7306 break;
7307 case VDIR:
7308 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7309 fvp->v_mountedhere) {
7310 return EINVAL;
7311 }
7312 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7313 break;
7314 default:
7315 return EINVAL;
7316 }
7317
7318 AUDIT_ARG(fd2, dst_dirfd);
7319 AUDIT_ARG(value32, flags);
7320
7321 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7322 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7323 UIO_USERSPACE, dst, ctx);
7324 if ((error = nameiat(&tond, dst_dirfd))) {
7325 return error;
7326 }
7327 cnp = &tond.ni_cnd;
7328 tdvp = tond.ni_dvp;
7329 tvp = tond.ni_vp;
7330
7331 free_src_acl = FALSE;
7332 attr_cleanup = FALSE;
7333
7334 if (tvp != NULL) {
7335 error = EEXIST;
7336 goto out;
7337 }
7338
7339 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7340 error = EXDEV;
7341 goto out;
7342 }
7343
7344 #if CONFIG_MACF
7345 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
7346 goto out;
7347 }
7348 #endif
7349 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
7350 goto out;
7351 }
7352
7353 action = KAUTH_VNODE_GENERIC_READ_BITS;
7354 if (data_read_authorised) {
7355 action &= ~KAUTH_VNODE_READ_DATA;
7356 }
7357 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
7358 goto out;
7359 }
7360
7361 /*
7362 * certain attributes may need to be changed from the source; we ask for
7363 * those here.
7364 */
7365 VATTR_INIT(&va);
7366 VATTR_WANTED(&va, va_uid);
7367 VATTR_WANTED(&va, va_gid);
7368 VATTR_WANTED(&va, va_mode);
7369 VATTR_WANTED(&va, va_flags);
7370 VATTR_WANTED(&va, va_acl);
7371
7372 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
7373 goto out;
7374 }
7375
7376 VATTR_INIT(&nva);
7377 VATTR_SET(&nva, va_type, v_type);
7378 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7379 VATTR_SET(&nva, va_acl, va.va_acl);
7380 free_src_acl = TRUE;
7381 }
7382
7383 /* Handle ACL inheritance, initialize vap. */
7384 if (v_type == VLNK) {
7385 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7386 } else {
7387 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7388 if (error) {
7389 goto out;
7390 }
7391 attr_cleanup = TRUE;
7392 }
7393
7394 vnop_flags = VNODE_CLONEFILE_DEFAULT;
7395 /*
7396 * We've got initial values for all security parameters.
7397 * If we are superuser, then we can change owners to be the
7398 * same as the source. Both superuser and the owner have default
7399 * WRITE_SECURITY privileges so all other fields can be taken
7400 * from source as well.
7401 */
7402 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7403 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
7404 VATTR_SET(&nva, va_uid, va.va_uid);
7405 }
7406 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
7407 VATTR_SET(&nva, va_gid, va.va_gid);
7408 }
7409 } else {
7410 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7411 }
7412
7413 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
7414 VATTR_SET(&nva, va_mode, va.va_mode);
7415 }
7416 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7417 VATTR_SET(&nva, va_flags,
7418 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7419 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7420 }
7421
7422 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7423
7424 if (!error && tvp) {
7425 int update_flags = 0;
7426 #if CONFIG_FSE
7427 int fsevent;
7428 #endif /* CONFIG_FSE */
7429
7430 #if CONFIG_MACF
7431 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7432 VNODE_LABEL_CREATE, ctx);
7433 #endif
7434 /*
7435 * If some of the requested attributes weren't handled by the
7436 * VNOP, use our fallback code.
7437 */
7438 if (!VATTR_ALL_SUPPORTED(&va)) {
7439 (void)vnode_setattr_fallback(tvp, &nva, ctx);
7440 }
7441
7442 // Make sure the name & parent pointers are hooked up
7443 if (tvp->v_name == NULL) {
7444 update_flags |= VNODE_UPDATE_NAME;
7445 }
7446 if (tvp->v_parent == NULLVP) {
7447 update_flags |= VNODE_UPDATE_PARENT;
7448 }
7449
7450 if (update_flags) {
7451 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7452 cnp->cn_namelen, cnp->cn_hash, update_flags);
7453 }
7454
7455 #if CONFIG_FSE
7456 switch (vnode_vtype(tvp)) {
7457 case VLNK:
7458 /* FALLTHRU */
7459 case VREG:
7460 fsevent = FSE_CREATE_FILE;
7461 break;
7462 case VDIR:
7463 fsevent = FSE_CREATE_DIR;
7464 break;
7465 default:
7466 goto out;
7467 }
7468
7469 if (need_fsevent(fsevent, tvp)) {
7470 /*
7471 * The following is a sequence of three explicit events.
7472 * A pair of FSE_CLONE events representing the source and destination
7473 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7474 * fseventsd may coalesce the destination clone and create events
7475 * into a single event resulting in the following sequence for a client
7476 * FSE_CLONE (src)
7477 * FSE_CLONE | FSE_CREATE (dst)
7478 */
7479 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7480 FSE_ARG_DONE);
7481 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7482 FSE_ARG_DONE);
7483 }
7484 #endif /* CONFIG_FSE */
7485 }
7486
7487 out:
7488 if (attr_cleanup) {
7489 vn_attribute_cleanup(&nva, defaulted);
7490 }
7491 if (free_src_acl && va.va_acl) {
7492 kauth_acl_free(va.va_acl);
7493 }
7494 nameidone(&tond);
7495 if (tvp) {
7496 vnode_put(tvp);
7497 }
7498 vnode_put(tdvp);
7499 return error;
7500 }
7501
7502 /*
7503 * clone files or directories, target must not exist.
7504 */
7505 /* ARGSUSED */
7506 int
7507 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7508 __unused int32_t *retval)
7509 {
7510 vnode_t fvp;
7511 struct nameidata fromnd;
7512 int follow;
7513 int error;
7514 vfs_context_t ctx = vfs_context_current();
7515
7516 /* Check that the flags are valid. */
7517 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7518 return EINVAL;
7519 }
7520
7521 AUDIT_ARG(fd, uap->src_dirfd);
7522
7523 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7524 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7525 UIO_USERSPACE, uap->src, ctx);
7526 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
7527 return error;
7528 }
7529
7530 fvp = fromnd.ni_vp;
7531 nameidone(&fromnd);
7532
7533 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7534 uap->flags, ctx);
7535
7536 vnode_put(fvp);
7537 return error;
7538 }
7539
7540 int
7541 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7542 __unused int32_t *retval)
7543 {
7544 vnode_t fvp;
7545 struct fileproc *fp;
7546 int error;
7547 vfs_context_t ctx = vfs_context_current();
7548
7549 /* Check that the flags are valid. */
7550 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7551 return EINVAL;
7552 }
7553
7554 AUDIT_ARG(fd, uap->src_fd);
7555 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7556 if (error) {
7557 return error;
7558 }
7559
7560 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7561 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7562 error = EBADF;
7563 goto out;
7564 }
7565
7566 if ((error = vnode_getwithref(fvp))) {
7567 goto out;
7568 }
7569
7570 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7571
7572 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7573 uap->flags, ctx);
7574
7575 vnode_put(fvp);
7576 out:
7577 file_drop(uap->src_fd);
7578 return error;
7579 }
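
/*
 * Illustrative userspace sketch, assuming the clonefileat() wrapper and
 * CLONE_* flags from <sys/clonefile.h>: the destination must not already
 * exist (EEXIST above) and must live on the same mount as the source
 * (EXDEV above).  The paths are hypothetical.
 *
 *     #include <sys/clonefile.h>
 *     #include <fcntl.h>
 *     #include <stdio.h>
 *
 *     // Clone src into dst without copying ownership to the new file.
 *     if (clonefileat(AT_FDCWD, "big.db", AT_FDCWD, "big-copy.db",
 *         CLONE_NOOWNERCOPY) == -1)
 *             perror("clonefileat");
 */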
7580
7581 /*
7582 * Rename files. Source and destination must either both be directories,
7583 * or both not be directories. If target is a directory, it must be empty.
7584 */
7585 /* ARGSUSED */
7586 static int
7587 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7588 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7589 {
7590 if (flags & ~VFS_RENAME_FLAGS_MASK) {
7591 return EINVAL;
7592 }
7593
7594 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
7595 return EINVAL;
7596 }
7597
7598 vnode_t tvp, tdvp;
7599 vnode_t fvp, fdvp;
7600 struct nameidata *fromnd, *tond;
7601 int error;
7602 int do_retry;
7603 int retry_count;
7604 int mntrename;
7605 int need_event;
7606 int need_kpath2;
7607 int has_listeners;
7608 const char *oname = NULL;
7609 char *from_name = NULL, *to_name = NULL;
7610 int from_len = 0, to_len = 0;
7611 int holding_mntlock;
7612 mount_t locked_mp = NULL;
7613 vnode_t oparent = NULLVP;
7614 #if CONFIG_FSE
7615 fse_info from_finfo, to_finfo;
7616 #endif
7617 int from_truncated = 0, to_truncated;
7618 int batched = 0;
7619 struct vnode_attr *fvap, *tvap;
7620 int continuing = 0;
7621 /* carving out a chunk for structs that are too big to be on stack. */
7622 struct {
7623 struct nameidata from_node, to_node;
7624 struct vnode_attr fv_attr, tv_attr;
7625 } * __rename_data;
7626 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7627 fromnd = &__rename_data->from_node;
7628 tond = &__rename_data->to_node;
7629
7630 holding_mntlock = 0;
7631 do_retry = 0;
7632 retry_count = 0;
7633 retry:
7634 fvp = tvp = NULL;
7635 fdvp = tdvp = NULL;
7636 fvap = tvap = NULL;
7637 mntrename = FALSE;
7638
7639 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7640 segflg, from, ctx);
7641 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7642
7643 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7644 segflg, to, ctx);
7645 tond->ni_flag = NAMEI_COMPOUNDRENAME;
7646
7647 continue_lookup:
7648 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7649 if ((error = nameiat(fromnd, fromfd))) {
7650 goto out1;
7651 }
7652 fdvp = fromnd->ni_dvp;
7653 fvp = fromnd->ni_vp;
7654
7655 if (fvp && fvp->v_type == VDIR) {
7656 tond->ni_cnd.cn_flags |= WILLBEDIR;
7657 }
7658 }
7659
7660 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7661 if ((error = nameiat(tond, tofd))) {
7662 /*
7663 * Translate error code for rename("dir1", "dir2/.").
7664 */
7665 if (error == EISDIR && fvp->v_type == VDIR) {
7666 error = EINVAL;
7667 }
7668 goto out1;
7669 }
7670 tdvp = tond->ni_dvp;
7671 tvp = tond->ni_vp;
7672 }
7673
7674 #if DEVELOPMENT || DEBUG
7675 /*
7676 * XXX VSWAP: Check for entitlements or special flag here
7677 * so we can restrict access appropriately.
7678 */
7679 #else /* DEVELOPMENT || DEBUG */
7680
7681 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
7682 error = EPERM;
7683 goto out1;
7684 }
7685
7686 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
7687 error = EPERM;
7688 goto out1;
7689 }
7690 #endif /* DEVELOPMENT || DEBUG */
7691
7692 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
7693 error = ENOENT;
7694 goto out1;
7695 }
7696
7697 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
7698 error = EEXIST;
7699 goto out1;
7700 }
7701
7702 batched = vnode_compound_rename_available(fdvp);
7703
7704 #if CONFIG_FSE
7705 need_event = need_fsevent(FSE_RENAME, fdvp);
7706 if (need_event) {
7707 if (fvp) {
7708 get_fse_info(fvp, &from_finfo, ctx);
7709 } else {
7710 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
7711 if (error) {
7712 goto out1;
7713 }
7714
7715 fvap = &__rename_data->fv_attr;
7716 }
7717
7718 if (tvp) {
7719 get_fse_info(tvp, &to_finfo, ctx);
7720 } else if (batched) {
7721 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
7722 if (error) {
7723 goto out1;
7724 }
7725
7726 tvap = &__rename_data->tv_attr;
7727 }
7728 }
7729 #else
7730 need_event = 0;
7731 #endif /* CONFIG_FSE */
7732
7733 has_listeners = kauth_authorize_fileop_has_listeners();
7734
7735 need_kpath2 = 0;
7736 #if CONFIG_AUDIT
7737 if (AUDIT_RECORD_EXISTS()) {
7738 need_kpath2 = 1;
7739 }
7740 #endif
7741
7742 if (need_event || has_listeners) {
7743 if (from_name == NULL) {
7744 GET_PATH(from_name);
7745 if (from_name == NULL) {
7746 error = ENOMEM;
7747 goto out1;
7748 }
7749 }
7750
7751 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
7752 }
7753
7754 if (need_event || need_kpath2 || has_listeners) {
7755 if (to_name == NULL) {
7756 GET_PATH(to_name);
7757 if (to_name == NULL) {
7758 error = ENOMEM;
7759 goto out1;
7760 }
7761 }
7762
7763 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
7764 if (to_name && need_kpath2) {
7765 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
7766 }
7767 }
7768 if (!fvp) {
7769 /*
7770 * Claim: this check will never reject a valid rename.
7771 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
7772 * Suppose fdvp and tdvp are not on the same mount.
7773 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
7774 * then you can't move it to within another dir on the same mountpoint.
7775 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
7776 *
7777 * If this check passes, then we are safe to pass these vnodes to the same FS.
7778 */
7779 if (fdvp->v_mount != tdvp->v_mount) {
7780 error = EXDEV;
7781 goto out1;
7782 }
7783 goto skipped_lookup;
7784 }
7785
7786 if (!batched) {
7787 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
7788 if (error) {
7789 if (error == ENOENT) {
7790 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7791 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7792 /*
7793 * We encountered a race where after doing the namei, tvp stops
7794 * being valid. If so, simply re-drive the rename call from the
7795 * top.
7796 */
7797 do_retry = 1;
7798 retry_count += 1;
7799 }
7800 }
7801 goto out1;
7802 }
7803 }
7804
7805 /*
7806 * If the source and destination are the same (i.e. they're
7807 * links to the same vnode) and the target file system is
7808 * case sensitive, then there is nothing to do.
7809 *
7810 * XXX Come back to this.
7811 */
7812 if (fvp == tvp) {
7813 int pathconf_val;
7814
7815 /*
7816 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
7817 * then assume that this file system is case sensitive.
7818 */
7819 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
7820 pathconf_val != 0) {
7821 goto out1;
7822 }
7823 }
7824
7825 /*
7826 * Allow the renaming of mount points.
7827 * - target must not exist
7828 * - target must reside in the same directory as source
7829 * - union mounts cannot be renamed
7830 * - "/" cannot be renamed
7831 *
7832 * XXX Handle this in VFS after a continued lookup (if we missed
7833 * in the cache to start off)
7834 *
7835 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
7836 * we'll skip past here. The file system is responsible for
7837 * checking that @tvp is not a descendent of @fvp and vice versa
7838 * so it should always return EINVAL if either @tvp or @fvp is the
7839 * root of a volume.
7840 */
7841 if ((fvp->v_flag & VROOT) &&
7842 (fvp->v_type == VDIR) &&
7843 (tvp == NULL) &&
7844 (fvp->v_mountedhere == NULL) &&
7845 (fdvp == tdvp) &&
7846 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
7847 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
7848 vnode_t coveredvp;
7849
7850 /* switch fvp to the covered vnode */
7851 coveredvp = fvp->v_mount->mnt_vnodecovered;
7852 if ((vnode_getwithref(coveredvp))) {
7853 error = ENOENT;
7854 goto out1;
7855 }
7856 vnode_put(fvp);
7857
7858 fvp = coveredvp;
7859 mntrename = TRUE;
7860 }
7861 /*
7862 * Check for cross-device rename.
7863 */
7864 if ((fvp->v_mount != tdvp->v_mount) ||
7865 (tvp && (fvp->v_mount != tvp->v_mount))) {
7866 error = EXDEV;
7867 goto out1;
7868 }
7869
7870 /*
7871 * If source is the same as the destination (that is the
7872 * same inode number) then there is nothing to do...
7873 * EXCEPT if the underlying file system supports case
7874 * insensitivity and is case preserving. In this case
7875 * the file system needs to handle the special case of
7876 * getting the same vnode as target (fvp) and source (tvp).
7877 *
7878 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
7879 * and _PC_CASE_PRESERVING can have this exception, and they need to
7880 * handle the special case of getting the same vnode as target and
7881 * source. NOTE: Then the target is unlocked going into vnop_rename,
7882 * so as not to cause locking problems. There is a single reference on tvp.
7883 *
7884 * NOTE - that fvp == tvp also occurs if they are hard linked and
7885 * that correct behaviour then is just to return success without doing
7886 * anything.
7887 *
7888 * XXX filesystem should take care of this itself, perhaps...
7889 */
7890 if (fvp == tvp && fdvp == tdvp) {
7891 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
7892 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
7893 fromnd->ni_cnd.cn_namelen)) {
7894 goto out1;
7895 }
7896 }
7897
7898 if (holding_mntlock && fvp->v_mount != locked_mp) {
7899 /*
7900 * we're holding a reference and lock
7901 * on locked_mp, but it no longer matches
7902 * what we want to do... so drop our hold
7903 */
7904 mount_unlock_renames(locked_mp);
7905 mount_drop(locked_mp, 0);
7906 holding_mntlock = 0;
7907 }
7908 if (tdvp != fdvp && fvp->v_type == VDIR) {
7909 /*
7910 * serialize renames that re-shape
7911 * the tree... if holding_mntlock is
7912 * set, then we're ready to go...
7913 * otherwise we
7914 * first need to drop the iocounts
7915 * we picked up, second take the
7916 * lock to serialize the access,
7917 * then finally start the lookup
7918 * process over with the lock held
7919 */
7920 if (!holding_mntlock) {
7921 /*
7922 * need to grab a reference on
7923 * the mount point before we
7924 * drop all the iocounts... once
7925 * the iocounts are gone, the mount
7926 * could follow
7927 */
7928 locked_mp = fvp->v_mount;
7929 mount_ref(locked_mp, 0);
7930
7931 /*
7932 * nameidone has to happen before we vnode_put(tvp)
7933 * since it may need to release the fs_nodelock on the tvp
7934 */
7935 nameidone(tond);
7936
7937 if (tvp) {
7938 vnode_put(tvp);
7939 }
7940 vnode_put(tdvp);
7941
7942 /*
7943 * nameidone has to happen before we vnode_put(fdvp)
7944 * since it may need to release the fs_nodelock on the fvp
7945 */
7946 nameidone(fromnd);
7947
7948 vnode_put(fvp);
7949 vnode_put(fdvp);
7950
7951 mount_lock_renames(locked_mp);
7952 holding_mntlock = 1;
7953
7954 goto retry;
7955 }
7956 } else {
7957 /*
7958 * when we dropped the iocounts to take
7959 * the lock, we allowed the identity of
7960 * the various vnodes to change... if they did,
7961 * we may no longer be dealing with a rename
7962 * that reshapes the tree... once we're holding
7963 * the iocounts, the vnodes can't change type
7964 * so we're free to drop the lock at this point
7965 * and continue on
7966 */
7967 if (holding_mntlock) {
7968 mount_unlock_renames(locked_mp);
7969 mount_drop(locked_mp, 0);
7970 holding_mntlock = 0;
7971 }
7972 }
7973
7974 // save these off so we can later verify that fvp is the same
7975 oname = fvp->v_name;
7976 oparent = fvp->v_parent;
7977
7978 skipped_lookup:
7979 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
7980 tdvp, &tvp, &tond->ni_cnd, tvap,
7981 flags, ctx);
7982
7983 if (holding_mntlock) {
7984 /*
7985 * we can drop our serialization
7986 * lock now
7987 */
7988 mount_unlock_renames(locked_mp);
7989 mount_drop(locked_mp, 0);
7990 holding_mntlock = 0;
7991 }
7992 if (error) {
7993 if (error == EKEEPLOOKING) {
7994 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7995 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7996 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
7997 }
7998 }
7999
8000 fromnd->ni_vp = fvp;
8001 tond->ni_vp = tvp;
8002
8003 goto continue_lookup;
8004 }
8005
8006 /*
8007 * We may encounter a race in the VNOP where the destination didn't
8008 * exist when we did the namei, but it does by the time we go and
8009 * try to create the entry. In this case, we should re-drive this rename
8010 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8011 * but other filesystems susceptible to this race could return it, too.
8012 */
8013 if (error == ERECYCLE) {
8014 do_retry = 1;
8015 }
8016
8017 /*
8018 * For compound VNOPs, the authorization callback may return
8019 * ENOENT in case of racing hardlink lookups hitting the name
8020 * cache; redrive the lookup.
8021 */
8022 if (batched && error == ENOENT) {
8023 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8024 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8025 do_retry = 1;
8026 retry_count += 1;
8027 }
8028 }
8029
8030 goto out1;
8031 }
8032
8033 /* call out to allow 3rd party notification of rename.
8034 * Ignore result of kauth_authorize_fileop call.
8035 */
8036 kauth_authorize_fileop(vfs_context_ucred(ctx),
8037 KAUTH_FILEOP_RENAME,
8038 (uintptr_t)from_name, (uintptr_t)to_name);
8039 if (flags & VFS_RENAME_SWAP) {
8040 kauth_authorize_fileop(vfs_context_ucred(ctx),
8041 KAUTH_FILEOP_RENAME,
8042 (uintptr_t)to_name, (uintptr_t)from_name);
8043 }
8044
8045 #if CONFIG_FSE
8046 if (from_name != NULL && to_name != NULL) {
8047 if (from_truncated || to_truncated) {
8048 // set it here since only the from_finfo gets reported up to user space
8049 from_finfo.mode |= FSE_TRUNCATED_PATH;
8050 }
8051
8052 if (tvap && tvp) {
8053 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8054 }
8055 if (fvap) {
8056 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8057 }
8058
8059 if (tvp) {
8060 add_fsevent(FSE_RENAME, ctx,
8061 FSE_ARG_STRING, from_len, from_name,
8062 FSE_ARG_FINFO, &from_finfo,
8063 FSE_ARG_STRING, to_len, to_name,
8064 FSE_ARG_FINFO, &to_finfo,
8065 FSE_ARG_DONE);
8066 if (flags & VFS_RENAME_SWAP) {
8067 /*
8068 * Strictly speaking, swap is the equivalent of
8069 * *three* renames. FSEvents clients should only take
8070 * the events as a hint, so we only bother reporting
8071 * two.
8072 */
8073 add_fsevent(FSE_RENAME, ctx,
8074 FSE_ARG_STRING, to_len, to_name,
8075 FSE_ARG_FINFO, &to_finfo,
8076 FSE_ARG_STRING, from_len, from_name,
8077 FSE_ARG_FINFO, &from_finfo,
8078 FSE_ARG_DONE);
8079 }
8080 } else {
8081 add_fsevent(FSE_RENAME, ctx,
8082 FSE_ARG_STRING, from_len, from_name,
8083 FSE_ARG_FINFO, &from_finfo,
8084 FSE_ARG_STRING, to_len, to_name,
8085 FSE_ARG_DONE);
8086 }
8087 }
8088 #endif /* CONFIG_FSE */
8089
8090 /*
8091 * update filesystem's mount point data
8092 */
8093 if (mntrename) {
8094 char *cp, *pathend, *mpname;
8095 char * tobuf;
8096 struct mount *mp;
8097 int maxlen;
8098 size_t len = 0;
8099
8100 mp = fvp->v_mountedhere;
8101
8102 if (vfs_busy(mp, LK_NOWAIT)) {
8103 error = EBUSY;
8104 goto out1;
8105 }
8106 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8107
8108 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8109 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8110 } else {
8111 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8112 }
8113 if (!error) {
8114 /* find current mount point prefix */
8115 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8116 for (cp = pathend; *cp != '\0'; ++cp) {
8117 if (*cp == '/') {
8118 pathend = cp + 1;
8119 }
8120 }
8121 /* find last component of target name */
8122 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8123 if (*cp == '/') {
8124 mpname = cp + 1;
8125 }
8126 }
8127 /* append name to prefix */
8128 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8129 bzero(pathend, maxlen);
8130 strlcpy(pathend, mpname, maxlen);
8131 }
8132 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8133
8134 vfs_unbusy(mp);
8135 }
8136 /*
8137 * fix up name & parent pointers. note that we first
8138 * check that fvp has the same name/parent pointers it
8139 * had before the rename call... this is a 'weak' check
8140 * at best...
8141 *
8142 * XXX oparent and oname may not be set in the compound vnop case
8143 */
8144 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8145 int update_flags;
8146
8147 update_flags = VNODE_UPDATE_NAME;
8148
8149 if (fdvp != tdvp) {
8150 update_flags |= VNODE_UPDATE_PARENT;
8151 }
8152
8153 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8154 }
8155 out1:
8156 if (to_name != NULL) {
8157 RELEASE_PATH(to_name);
8158 to_name = NULL;
8159 }
8160 if (from_name != NULL) {
8161 RELEASE_PATH(from_name);
8162 from_name = NULL;
8163 }
8164 if (holding_mntlock) {
8165 mount_unlock_renames(locked_mp);
8166 mount_drop(locked_mp, 0);
8167 holding_mntlock = 0;
8168 }
8169 if (tdvp) {
8170 /*
8171 * nameidone has to happen before we vnode_put(tdvp)
8172 * since it may need to release the fs_nodelock on the tdvp
8173 */
8174 nameidone(tond);
8175
8176 if (tvp) {
8177 vnode_put(tvp);
8178 }
8179 vnode_put(tdvp);
8180 }
8181 if (fdvp) {
8182 /*
8183 * nameidone has to happen before we vnode_put(fdvp)
8184 * since it may need to release the fs_nodelock on the fdvp
8185 */
8186 nameidone(fromnd);
8187
8188 if (fvp) {
8189 vnode_put(fvp);
8190 }
8191 vnode_put(fdvp);
8192 }
8193
8194 /*
8195 * If things changed after we did the namei, then we will re-drive
8196 * this rename call from the top.
8197 */
8198 if (do_retry) {
8199 do_retry = 0;
8200 goto retry;
8201 }
8202
8203 FREE(__rename_data, M_TEMP);
8204 return error;
8205 }
8206
8207 int
8208 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8209 {
8210 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8211 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8212 }
8213
8214 int
8215 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8216 {
8217 return renameat_internal(
8218 vfs_context_current(),
8219 uap->fromfd, uap->from,
8220 uap->tofd, uap->to,
8221 UIO_USERSPACE, uap->flags);
8222 }
8223
8224 int
8225 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8226 {
8227 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8228 uap->tofd, uap->to, UIO_USERSPACE, 0);
8229 }
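
/*
 * Illustrative userspace sketch, assuming Apple's renameatx_np() wrapper
 * and the RENAME_* flags from <stdio.h>: RENAME_EXCL maps to
 * VFS_RENAME_EXCL (fail with EEXIST if the target exists) and RENAME_SWAP
 * to VFS_RENAME_SWAP (atomically exchange the two names); the two flags
 * are mutually exclusive, as checked above.  The paths are hypothetical.
 *
 *     #include <stdio.h>
 *     #include <fcntl.h>
 *
 *     // Atomically swap a freshly written file with the live one.
 *     if (renameatx_np(AT_FDCWD, "config.new", AT_FDCWD, "config",
 *         RENAME_SWAP) == -1)
 *             perror("renameatx_np");
 */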
8230
8231 /*
8232 * Make a directory file.
8233 *
8234 * Returns: 0 Success
8235 * EEXIST
8236 * namei:???
8237 * vnode_authorize:???
8238 * vn_create:???
8239 */
8240 /* ARGSUSED */
8241 static int
8242 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
8243 enum uio_seg segflg)
8244 {
8245 vnode_t vp, dvp;
8246 int error;
8247 int update_flags = 0;
8248 int batched;
8249 struct nameidata nd;
8250
8251 AUDIT_ARG(mode, vap->va_mode);
8252 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
8253 path, ctx);
8254 nd.ni_cnd.cn_flags |= WILLBEDIR;
8255 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
8256
8257 continue_lookup:
8258 error = nameiat(&nd, fd);
8259 if (error) {
8260 return error;
8261 }
8262 dvp = nd.ni_dvp;
8263 vp = nd.ni_vp;
8264
8265 if (vp != NULL) {
8266 error = EEXIST;
8267 goto out;
8268 }
8269
8270 batched = vnode_compound_mkdir_available(dvp);
8271
8272 VATTR_SET(vap, va_type, VDIR);
8273
8274 /*
8275 * XXX
8276 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
8277 * only get EXISTS or EISDIR for existing path components, and not that it could see
8278 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
8279 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
8280 */
8281 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
8282 if (error == EACCES || error == EPERM) {
8283 int error2;
8284
8285 nameidone(&nd);
8286 vnode_put(dvp);
8287 dvp = NULLVP;
8288
8289 /*
8290 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
8291 * rather than EACCES if the target exists.
8292 */
8293 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
8294 path, ctx);
8295 error2 = nameiat(&nd, fd);
8296 if (error2) {
8297 goto out;
8298 } else {
8299 vp = nd.ni_vp;
8300 error = EEXIST;
8301 goto out;
8302 }
8303 }
8304
8305 goto out;
8306 }
8307
8308 /*
8309 * make the directory
8310 */
8311 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
8312 if (error == EKEEPLOOKING) {
8313 nd.ni_vp = vp;
8314 goto continue_lookup;
8315 }
8316
8317 goto out;
8318 }
8319
8320 // Make sure the name & parent pointers are hooked up
8321 if (vp->v_name == NULL) {
8322 update_flags |= VNODE_UPDATE_NAME;
8323 }
8324 if (vp->v_parent == NULLVP) {
8325 update_flags |= VNODE_UPDATE_PARENT;
8326 }
8327
8328 if (update_flags) {
8329 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
8330 }
8331
8332 #if CONFIG_FSE
8333 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
8334 #endif
8335
8336 out:
8337 /*
8338 * nameidone has to happen before we vnode_put(dvp)
8339 * since it may need to release the fs_nodelock on the dvp
8340 */
8341 nameidone(&nd);
8342
8343 if (vp) {
8344 vnode_put(vp);
8345 }
8346 if (dvp) {
8347 vnode_put(dvp);
8348 }
8349
8350 return error;
8351 }
8352
8353 /*
8354 * mkdir_extended: Create a directory; with extended security (ACL).
8355 *
8356 * Parameters: p Process requesting to create the directory
8357 * uap User argument descriptor (see below)
8358 * retval (ignored)
8359 *
8360 * Indirect: uap->path Path of directory to create
8361 * uap->mode Access permissions to set
8362 * uap->xsecurity ACL to set
8363 *
8364 * Returns: 0 Success
8365 * !0 Not success
8366 *
8367 */
8368 int
8369 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8370 {
8371 int ciferror;
8372 kauth_filesec_t xsecdst;
8373 struct vnode_attr va;
8374
8375 AUDIT_ARG(owner, uap->uid, uap->gid);
8376
8377 xsecdst = NULL;
8378 if ((uap->xsecurity != USER_ADDR_NULL) &&
8379 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8380 return ciferror;
8381 }
8382
8383 VATTR_INIT(&va);
8384 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8385 if (xsecdst != NULL) {
8386 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8387 }
8388
8389 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8390 UIO_USERSPACE);
8391 if (xsecdst != NULL) {
8392 kauth_filesec_free(xsecdst);
8393 }
8394 return ciferror;
8395 }
8396
8397 int
8398 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8399 {
8400 struct vnode_attr va;
8401
8402 VATTR_INIT(&va);
8403 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8404
8405 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8406 UIO_USERSPACE);
8407 }
8408
8409 int
8410 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8411 {
8412 struct vnode_attr va;
8413
8414 VATTR_INIT(&va);
8415 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8416
8417 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8418 UIO_USERSPACE);
8419 }
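
/*
 * Illustrative userspace sketch: the requested mode is masked with
 * ACCESSPERMS and the process umask before the directory is created (see
 * the VATTR_SET above), so a 0777 request typically yields 0755 under the
 * common 022 umask.  The path is hypothetical.
 *
 *     #include <sys/stat.h>
 *     #include <fcntl.h>
 *     #include <stdio.h>
 *
 *     if (mkdirat(AT_FDCWD, "output", 0777) == -1)
 *             perror("mkdirat");
 */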
8420
8421 static int
8422 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8423 enum uio_seg segflg)
8424 {
8425 vnode_t vp, dvp;
8426 int error;
8427 struct nameidata nd;
8428 char *path = NULL;
8429 int len = 0;
8430 int has_listeners = 0;
8431 int need_event = 0;
8432 int truncated = 0;
8433 #if CONFIG_FSE
8434 struct vnode_attr va;
8435 #endif /* CONFIG_FSE */
8436 struct vnode_attr *vap = NULL;
8437 int restart_count = 0;
8438 int batched;
8439
8440 int restart_flag;
8441
8442 /*
8443 * This loop exists to restart rmdir in the unlikely case that two
8444 * processes are simultaneously trying to remove the same directory
8445 * containing orphaned appleDouble files.
8446 */
8447 do {
8448 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8449 segflg, dirpath, ctx);
8450 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8451 continue_lookup:
8452 restart_flag = 0;
8453 vap = NULL;
8454
8455 error = nameiat(&nd, fd);
8456 if (error) {
8457 return error;
8458 }
8459
8460 dvp = nd.ni_dvp;
8461 vp = nd.ni_vp;
8462
8463 if (vp) {
8464 batched = vnode_compound_rmdir_available(vp);
8465
8466 if (vp->v_flag & VROOT) {
8467 /*
8468 * The root of a mounted filesystem cannot be deleted.
8469 */
8470 error = EBUSY;
8471 goto out;
8472 }
8473
8474 #if DEVELOPMENT || DEBUG
8475 /*
8476 * XXX VSWAP: Check for entitlements or special flag here
8477 * so we can restrict access appropriately.
8478 */
8479 #else /* DEVELOPMENT || DEBUG */
8480
8481 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8482 error = EPERM;
8483 goto out;
8484 }
8485 #endif /* DEVELOPMENT || DEBUG */
8486
8487 /*
8488 * Removed a check here; we used to abort if vp's vid
8489 * was not the same as what we'd seen the last time around.
8490 * I do not think that check was valid, because if we retry
8491 * and all dirents are gone, the directory could legitimately
8492 * be recycled but still be present in a situation where we would
8493 * have had permission to delete. Therefore, we won't make
8494 * an effort to preserve that check now that we may not have a
8495 * vp here.
8496 */
8497
8498 if (!batched) {
8499 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8500 if (error) {
8501 if (error == ENOENT) {
8502 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8503 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8504 restart_flag = 1;
8505 restart_count += 1;
8506 }
8507 }
8508 goto out;
8509 }
8510 }
8511 } else {
8512 batched = 1;
8513
8514 if (!vnode_compound_rmdir_available(dvp)) {
8515 panic("No error, but no compound rmdir?");
8516 }
8517 }
8518
8519 #if CONFIG_FSE
8520 fse_info finfo;
8521
8522 need_event = need_fsevent(FSE_DELETE, dvp);
8523 if (need_event) {
8524 if (!batched) {
8525 get_fse_info(vp, &finfo, ctx);
8526 } else {
8527 error = vfs_get_notify_attributes(&va);
8528 if (error) {
8529 goto out;
8530 }
8531
8532 vap = &va;
8533 }
8534 }
8535 #endif
8536 has_listeners = kauth_authorize_fileop_has_listeners();
8537 if (need_event || has_listeners) {
8538 if (path == NULL) {
8539 GET_PATH(path);
8540 if (path == NULL) {
8541 error = ENOMEM;
8542 goto out;
8543 }
8544 }
8545
8546 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
8547 #if CONFIG_FSE
8548 if (truncated) {
8549 finfo.mode |= FSE_TRUNCATED_PATH;
8550 }
8551 #endif
8552 }
8553
8554 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8555 nd.ni_vp = vp;
8556 if (vp == NULLVP) {
8557 /* Couldn't find a vnode */
8558 goto out;
8559 }
8560
8561 if (error == EKEEPLOOKING) {
8562 goto continue_lookup;
8563 } else if (batched && error == ENOENT) {
8564 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8565 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8566 /*
8567 * For compound VNOPs, the authorization callback
8568 * may return ENOENT in the case of racing hard link lookups;
8569 * redrive the lookup.
8570 */
8571 restart_flag = 1;
8572 restart_count += 1;
8573 goto out;
8574 }
8575 }
8576 #if CONFIG_APPLEDOUBLE
8577 /*
8578 * Special case to remove orphaned AppleDouble
8579 * files. I don't like putting this in the kernel,
8580 * but carbon does not like putting this in carbon either,
8581 * so here we are.
8582 */
8583 if (error == ENOTEMPTY) {
8584 error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
8585 if (error == EBUSY) {
8586 goto out;
8587 }
8588
8589
8590 /*
8591 * Assuming everything went well, we will try the RMDIR again
8592 */
8593 if (!error) {
8594 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8595 }
8596 }
8597 #endif /* CONFIG_APPLEDOUBLE */
8598 /*
8599 * Call out to allow 3rd party notification of delete.
8600 * Ignore result of kauth_authorize_fileop call.
8601 */
8602 if (!error) {
8603 if (has_listeners) {
8604 kauth_authorize_fileop(vfs_context_ucred(ctx),
8605 KAUTH_FILEOP_DELETE,
8606 (uintptr_t)vp,
8607 (uintptr_t)path);
8608 }
8609
8610 if (vp->v_flag & VISHARDLINK) {
8611 // see the comment in unlink1() about why we update
8612 // the parent of a hard link when it is removed
8613 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
8614 }
8615
8616 #if CONFIG_FSE
8617 if (need_event) {
8618 if (vap) {
8619 vnode_get_fse_info_from_vap(vp, &finfo, vap);
8620 }
8621 add_fsevent(FSE_DELETE, ctx,
8622 FSE_ARG_STRING, len, path,
8623 FSE_ARG_FINFO, &finfo,
8624 FSE_ARG_DONE);
8625 }
8626 #endif
8627 }
8628
8629 out:
8630 if (path != NULL) {
8631 RELEASE_PATH(path);
8632 path = NULL;
8633 }
8634 /*
8635 * nameidone has to happen before we vnode_put(dvp)
8636 * since it may need to release the fs_nodelock on the dvp
8637 */
8638 nameidone(&nd);
8639 vnode_put(dvp);
8640
8641 if (vp) {
8642 vnode_put(vp);
8643 }
8644
8645 if (restart_flag == 0) {
8646 wakeup_one((caddr_t)vp);
8647 return error;
8648 }
8649 tsleep(vp, PVFS, "rm AD", 1);
8650 } while (restart_flag != 0);
8651
8652 return error;
8653 }
8654
8655 /*
8656 * Remove a directory file.
8657 */
8658 /* ARGSUSED */
8659 int
8660 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8661 {
8662 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
8663 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE);
8664 }
8665
8666 /* Get direntry length padded to 8 byte alignment */
8667 #define DIRENT64_LEN(namlen) \
8668 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
8669
8670 /* Get dirent length padded to 4 byte alignment */
8671 #define DIRENT_LEN(namelen) \
8672 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
8673
8674 /* Get the end of this dirent */
8675 #define DIRENT_END(dep) \
8676 (((char *)(dep)) + (dep)->d_reclen - 1)
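/*
 * Worked example (added for illustration): per the sizing comment in
 * vnode_readdir64() below, a minimal entry (name length <= 3) packs into a
 * 12-byte, 4-byte-aligned struct dirent but expands to a 32-byte,
 * 8-byte-aligned struct direntry.  That worst-case 12:32 (i.e. 3:8) ratio is
 * why the conversion path below stages its reads in a kernel buffer sized at
 * 3/8 of the caller's buffer.
 */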
8677
8678 errno_t
8679 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8680 int *numdirent, vfs_context_t ctxp)
8681 {
8682 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8683 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8684 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
8685 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8686 } else {
8687 size_t bufsize;
8688 void * bufptr;
8689 uio_t auio;
8690 struct direntry *entry64;
8691 struct dirent *dep;
8692 int bytesread;
8693 int error;
8694
8695 /*
8696 * We're here because the underlying file system does not
8697 * support direntries, or the mount denies that support, so we must
8698 * fall back to dirents and convert them to direntries.
8699 *
8700 * Our kernel buffer needs to be smaller since re-packing will
8701 * expand each dirent. The worst case (when the name length
8702 * is 3 or less) corresponds to a struct direntry size of 32
8703 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8704 * (4-byte aligned). So having a buffer that is 3/8 the size
8705 * will prevent us from reading more than we can pack.
8706 *
8707 * Since this buffer is wired memory, we will limit the
8708 * buffer size to a maximum of 32K. We would really like to
8709 * use 32K in the MIN(), but we use magic number 87371 to
8710 * prevent uio_resid() * 3 / 8 from overflowing.
8711 */
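/*
 * (Arithmetic note, added for illustration: with the 87371 cap,
 * bufsize is at most 3 * 87371 / 8 = 32764 bytes, just under 32K.)
 */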
8712 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8713 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8714 if (bufptr == NULL) {
8715 return ENOMEM;
8716 }
8717
8718 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8719 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8720 auio->uio_offset = uio->uio_offset;
8721
8722 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8723
8724 dep = (struct dirent *)bufptr;
8725 bytesread = bufsize - uio_resid(auio);
8726
8727 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8728 M_TEMP, M_WAITOK);
8729 /*
8730 * Convert all the entries and copy them out to user's buffer.
8731 */
8732 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8733 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
8734
8735 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
8736 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
8737 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
8738 vp->v_mount->mnt_vfsstat.f_mntonname,
8739 vp->v_name ? vp->v_name : "<unknown>");
8740 error = EIO;
8741 break;
8742 }
8743
8744 bzero(entry64, enbufsize);
8745 /* Convert a dirent to a dirent64. */
8746 entry64->d_ino = dep->d_ino;
8747 entry64->d_seekoff = 0;
8748 entry64->d_reclen = enbufsize;
8749 entry64->d_namlen = dep->d_namlen;
8750 entry64->d_type = dep->d_type;
8751 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8752
8753 /* Move to next entry. */
8754 dep = (struct dirent *)((char *)dep + dep->d_reclen);
8755
8756 /* Copy entry64 to user's buffer. */
8757 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8758 }
8759
8760 /* Update the real offset using the offset we got from VNOP_READDIR. */
8761 if (error == 0) {
8762 uio->uio_offset = auio->uio_offset;
8763 }
8764 uio_free(auio);
8765 FREE(bufptr, M_TEMP);
8766 FREE(entry64, M_TEMP);
8767 return error;
8768 }
8769 }
8770
8771 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
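/* (For illustration: 128 * 1024 * 1024 = 134,217,728 bytes; larger requests
 * are silently clamped to this limit in getdirentries_common() below.) */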
8772
8773 /*
8774 * Read a block of directory entries in a file system independent format.
8775 */
8776 static int
8777 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
8778 off_t *offset, int flags)
8779 {
8780 vnode_t vp;
8781 struct vfs_context context = *vfs_context_current(); /* local copy */
8782 struct fileproc *fp;
8783 uio_t auio;
8784 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8785 off_t loff;
8786 int error, eofflag, numdirent;
8787 char uio_buf[UIO_SIZEOF(1)];
8788
8789 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
8790 if (error) {
8791 return error;
8792 }
8793 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8794 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8795 error = EBADF;
8796 goto out;
8797 }
8798
8799 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
8800 bufsize = GETDIRENTRIES_MAXBUFSIZE;
8801 }
8802
8803 #if CONFIG_MACF
8804 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
8805 if (error) {
8806 goto out;
8807 }
8808 #endif
8809 if ((error = vnode_getwithref(vp))) {
8810 goto out;
8811 }
8812 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8813
8814 unionread:
8815 if (vp->v_type != VDIR) {
8816 (void)vnode_put(vp);
8817 error = EINVAL;
8818 goto out;
8819 }
8820
8821 #if CONFIG_MACF
8822 error = mac_vnode_check_readdir(&context, vp);
8823 if (error != 0) {
8824 (void)vnode_put(vp);
8825 goto out;
8826 }
8827 #endif /* MAC */
8828
8829 loff = fp->f_fglob->fg_offset;
8830 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8831 uio_addiov(auio, bufp, bufsize);
8832
8833 if (flags & VNODE_READDIR_EXTENDED) {
8834 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
8835 fp->f_fglob->fg_offset = uio_offset(auio);
8836 } else {
8837 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
8838 fp->f_fglob->fg_offset = uio_offset(auio);
8839 }
8840 if (error) {
8841 (void)vnode_put(vp);
8842 goto out;
8843 }
8844
8845 if ((user_ssize_t)bufsize == uio_resid(auio)) {
8846 if (union_dircheckp) {
8847 error = union_dircheckp(&vp, fp, &context);
8848 if (error == -1) {
8849 goto unionread;
8850 }
8851 if (error) {
8852 (void)vnode_put(vp);
8853 goto out;
8854 }
8855 }
8856
8857 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
8858 struct vnode *tvp = vp;
8859 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
8860 vnode_ref(vp);
8861 fp->f_fglob->fg_data = (caddr_t) vp;
8862 fp->f_fglob->fg_offset = 0;
8863 vnode_rele(tvp);
8864 vnode_put(tvp);
8865 goto unionread;
8866 }
8867 vp = tvp;
8868 }
8869 }
8870
8871 vnode_put(vp);
8872 if (offset) {
8873 *offset = loff;
8874 }
8875
8876 *bytesread = bufsize - uio_resid(auio);
8877 out:
8878 file_drop(fd);
8879 return error;
8880 }
8881
8882
8883 int
8884 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8885 {
8886 off_t offset;
8887 ssize_t bytesread;
8888 int error;
8889
8890 AUDIT_ARG(fd, uap->fd);
8891 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8892
8893 if (error == 0) {
8894 if (proc_is64bit(p)) {
8895 user64_long_t base = (user64_long_t)offset;
8896 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8897 } else {
8898 user32_long_t base = (user32_long_t)offset;
8899 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8900 }
8901 *retval = bytesread;
8902 }
8903 return error;
8904 }
8905
8906 int
8907 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8908 {
8909 off_t offset;
8910 ssize_t bytesread;
8911 int error;
8912
8913 AUDIT_ARG(fd, uap->fd);
8914 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8915
8916 if (error == 0) {
8917 *retval = bytesread;
8918 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8919 }
8920 return error;
8921 }
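/*
 * Illustrative sketch (added; not part of the original source): a consumer
 * of the extended (struct direntry) format walks the returned buffer by
 * d_reclen, much like the conversion loop in vnode_readdir64() walks the
 * legacy records:
 *
 *     struct direntry *dp = (struct direntry *)buf;
 *     while ((char *)dp < buf + bytesread) {
 *         // use dp->d_ino, dp->d_type, dp->d_name ...
 *         dp = (struct direntry *)((char *)dp + dp->d_reclen);
 *     }
 *
 * where buf is a char buffer and bytesread is the byte count returned by
 * getdirentries64().  In the conversion path above, each d_reclen is padded
 * to 8 bytes (see DIRENT64_LEN).
 */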
8922
8923
8924 /*
8925 * Set the mode mask for creation of filesystem nodes.
8926 * XXX implement xsecurity
8927 */
8928 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8929 static int
8930 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8931 {
8932 struct filedesc *fdp;
8933
8934 AUDIT_ARG(mask, newmask);
8935 proc_fdlock(p);
8936 fdp = p->p_fd;
8937 *retval = fdp->fd_cmask;
8938 fdp->fd_cmask = newmask & ALLPERMS;
8939 proc_fdunlock(p);
8940 return 0;
8941 }
8942
8943 /*
8944 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8945 *
8946 * Parameters: p Process requesting to set the umask
8947 * uap User argument descriptor (see below)
8948 * retval umask of the process (parameter p)
8949 *
8950 * Indirect: uap->newmask umask to set
8951 * uap->xsecurity ACL to set
8952 *
8953 * Returns: 0 Success
8954 * !0 Not success
8955 *
8956 */
8957 int
8958 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8959 {
8960 int ciferror;
8961 kauth_filesec_t xsecdst;
8962
8963 xsecdst = KAUTH_FILESEC_NONE;
8964 if (uap->xsecurity != USER_ADDR_NULL) {
8965 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
8966 return ciferror;
8967 }
8968 } else {
8969 xsecdst = KAUTH_FILESEC_NONE;
8970 }
8971
8972 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8973
8974 if (xsecdst != KAUTH_FILESEC_NONE) {
8975 kauth_filesec_free(xsecdst);
8976 }
8977 return ciferror;
8978 }
8979
8980 int
8981 umask(proc_t p, struct umask_args *uap, int32_t *retval)
8982 {
8983 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
8984 }
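/*
 * Illustrative note (added; not original source): umask1() returns the
 * previous mask through retval and installs the new one, so a process whose
 * current cmask is 022 calling umask(027) gets 022 back, and subsequent
 * creates are masked with 027 (e.g. an open(..., O_CREAT, 0666) produces
 * mode 0640).
 */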
8985
8986 /*
8987 * Void all references to file by ripping underlying filesystem
8988 * away from vnode.
8989 */
8990 /* ARGSUSED */
8991 int
8992 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
8993 {
8994 vnode_t vp;
8995 struct vnode_attr va;
8996 vfs_context_t ctx = vfs_context_current();
8997 int error;
8998 struct nameidata nd;
8999
9000 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9001 uap->path, ctx);
9002 error = namei(&nd);
9003 if (error) {
9004 return error;
9005 }
9006 vp = nd.ni_vp;
9007
9008 nameidone(&nd);
9009
9010 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9011 error = ENOTSUP;
9012 goto out;
9013 }
9014
9015 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9016 error = EBUSY;
9017 goto out;
9018 }
9019
9020 #if CONFIG_MACF
9021 error = mac_vnode_check_revoke(ctx, vp);
9022 if (error) {
9023 goto out;
9024 }
9025 #endif
9026
9027 VATTR_INIT(&va);
9028 VATTR_WANTED(&va, va_uid);
9029 if ((error = vnode_getattr(vp, &va, ctx))) {
9030 goto out;
9031 }
9032 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9033 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9034 goto out;
9035 }
9036 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9037 VNOP_REVOKE(vp, REVOKEALL, ctx);
9038 }
9039 out:
9040 vnode_put(vp);
9041 return error;
9042 }
9043
9044
9045 /*
9046 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
9047 * The following system calls are designed to support features
9048 * which are specific to the HFS & HFS Plus volume formats
9049 */
9050
9051
9052 /*
9053 * Obtain attribute information on objects in a directory while enumerating
9054 * the directory.
9055 */
9056 /* ARGSUSED */
9057 int
9058 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
9059 {
9060 vnode_t vp;
9061 struct fileproc *fp;
9062 uio_t auio = NULL;
9063 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9064 uint32_t count = 0, savecount = 0;
9065 uint32_t newstate = 0;
9066 int error, eofflag;
9067 uint32_t loff = 0;
9068 struct attrlist attributelist;
9069 vfs_context_t ctx = vfs_context_current();
9070 int fd = uap->fd;
9071 char uio_buf[UIO_SIZEOF(1)];
9072 kauth_action_t action;
9073
9074 AUDIT_ARG(fd, fd);
9075
9076 /* Get the attributes into kernel space */
9077 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
9078 return error;
9079 }
9080 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
9081 return error;
9082 }
9083 savecount = count;
9084 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
9085 return error;
9086 }
9087 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9088 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9089 error = EBADF;
9090 goto out;
9091 }
9092
9093
9094 #if CONFIG_MACF
9095 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
9096 fp->f_fglob);
9097 if (error) {
9098 goto out;
9099 }
9100 #endif
9101
9102
9103 if ((error = vnode_getwithref(vp))) {
9104 goto out;
9105 }
9106
9107 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9108
9109 unionread:
9110 if (vp->v_type != VDIR) {
9111 (void)vnode_put(vp);
9112 error = EINVAL;
9113 goto out;
9114 }
9115
9116 #if CONFIG_MACF
9117 error = mac_vnode_check_readdir(ctx, vp);
9118 if (error != 0) {
9119 (void)vnode_put(vp);
9120 goto out;
9121 }
9122 #endif /* MAC */
9123
9124 /* set up the uio structure which will contain the users return buffer */
9125 loff = fp->f_fglob->fg_offset;
9126 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9127 uio_addiov(auio, uap->buffer, uap->buffersize);
9128
9129 /*
9130 * If the only item requested is file names, we can let that past with
9131 * just LIST_DIRECTORY. If they want any other attributes, that means
9132 * they need SEARCH as well.
9133 */
9134 action = KAUTH_VNODE_LIST_DIRECTORY;
9135 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
9136 attributelist.fileattr || attributelist.dirattr) {
9137 action |= KAUTH_VNODE_SEARCH;
9138 }
9139
9140 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
9141 /* Believe it or not, uap->options only has 32-bits of valid
9142 * info, so truncate before extending again */
9143
9144 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
9145 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
9146 }
9147
9148 if (error) {
9149 (void) vnode_put(vp);
9150 goto out;
9151 }
9152
9153 /*
9154 * If we've got the last entry of a directory in a union mount
9155 * then reset the eofflag and pretend there's still more to come.
9156 * The next call will again set eofflag and the buffer will be empty,
9157 * so traverse to the underlying directory and do the directory
9158 * read there.
9159 */
9160 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
9161 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
9162 eofflag = 0;
9163 } else { // Empty buffer
9164 struct vnode *tvp = vp;
9165 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
9166 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
9167 fp->f_fglob->fg_data = (caddr_t) vp;
9168 fp->f_fglob->fg_offset = 0; // reset index for new dir
9169 count = savecount;
9170 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
9171 vnode_put(tvp);
9172 goto unionread;
9173 }
9174 vp = tvp;
9175 }
9176 }
9177
9178 (void)vnode_put(vp);
9179
9180 if (error) {
9181 goto out;
9182 }
9183 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
9184
9185 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
9186 goto out;
9187 }
9188 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
9189 goto out;
9190 }
9191 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
9192 goto out;
9193 }
9194
9195 *retval = eofflag; /* similar to getdirentries */
9196 error = 0;
9197 out:
9198 file_drop(fd);
9199 return error; /* error was set earlier; retval is 0 or 1 now */
9200 } /* end of getdirentriesattr system call */
9201
9202 /*
9203 * Exchange data between two files
9204 */
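/*
 * Illustrative note (added; not original source): this backs the
 * exchangedata(2) call, e.g. exchangedata("/tmp/a", "/tmp/b", FSOPT_NOFOLLOW)
 * from user space.  Both paths must name regular files on the same volume,
 * and the filesystem must implement VNOP_EXCHANGE for the swap to succeed.
 */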
9205
9206 /* ARGSUSED */
9207 int
9208 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
9209 {
9210 struct nameidata fnd, snd;
9211 vfs_context_t ctx = vfs_context_current();
9212 vnode_t fvp;
9213 vnode_t svp;
9214 int error;
9215 u_int32_t nameiflags;
9216 char *fpath = NULL;
9217 char *spath = NULL;
9218 int flen = 0, slen = 0;
9219 int from_truncated = 0, to_truncated = 0;
9220 #if CONFIG_FSE
9221 fse_info f_finfo, s_finfo;
9222 #endif
9223
9224 nameiflags = 0;
9225 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
9226 nameiflags |= FOLLOW;
9227 }
9228
9229 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
9230 UIO_USERSPACE, uap->path1, ctx);
9231
9232 error = namei(&fnd);
9233 if (error) {
9234 goto out2;
9235 }
9236
9237 nameidone(&fnd);
9238 fvp = fnd.ni_vp;
9239
9240 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
9241 UIO_USERSPACE, uap->path2, ctx);
9242
9243 error = namei(&snd);
9244 if (error) {
9245 vnode_put(fvp);
9246 goto out2;
9247 }
9248 nameidone(&snd);
9249 svp = snd.ni_vp;
9250
9251 /*
9252 * if the files are the same, return EINVAL
9253 */
9254 if (svp == fvp) {
9255 error = EINVAL;
9256 goto out;
9257 }
9258
9259 /*
9260 * if the files are on different volumes, return an error
9261 */
9262 if (svp->v_mount != fvp->v_mount) {
9263 error = EXDEV;
9264 goto out;
9265 }
9266
9267 /* If they're not files, return an error */
9268 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
9269 error = EINVAL;
9270 goto out;
9271 }
9272
9273 #if CONFIG_MACF
9274 error = mac_vnode_check_exchangedata(ctx,
9275 fvp, svp);
9276 if (error) {
9277 goto out;
9278 }
9279 #endif
9280 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
9281 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
9282 goto out;
9283 }
9284
9285 if (
9286 #if CONFIG_FSE
9287 need_fsevent(FSE_EXCHANGE, fvp) ||
9288 #endif
9289 kauth_authorize_fileop_has_listeners()) {
9290 GET_PATH(fpath);
9291 GET_PATH(spath);
9292 if (fpath == NULL || spath == NULL) {
9293 error = ENOMEM;
9294 goto out;
9295 }
9296
9297 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
9298 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
9299
9300 #if CONFIG_FSE
9301 get_fse_info(fvp, &f_finfo, ctx);
9302 get_fse_info(svp, &s_finfo, ctx);
9303 if (from_truncated || to_truncated) {
9304 // set it here since only the f_finfo gets reported up to user space
9305 f_finfo.mode |= FSE_TRUNCATED_PATH;
9306 }
9307 #endif
9308 }
9309 /* Ok, make the call */
9310 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
9311
9312 if (error == 0) {
9313 const char *tmpname;
9314
9315 if (fpath != NULL && spath != NULL) {
9316 /* call out to allow 3rd party notification of exchangedata.
9317 * Ignore result of kauth_authorize_fileop call.
9318 */
9319 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
9320 (uintptr_t)fpath, (uintptr_t)spath);
9321 }
9322 name_cache_lock();
9323
9324 tmpname = fvp->v_name;
9325 fvp->v_name = svp->v_name;
9326 svp->v_name = tmpname;
9327
9328 if (fvp->v_parent != svp->v_parent) {
9329 vnode_t tmp;
9330
9331 tmp = fvp->v_parent;
9332 fvp->v_parent = svp->v_parent;
9333 svp->v_parent = tmp;
9334 }
9335 name_cache_unlock();
9336
9337 #if CONFIG_FSE
9338 if (fpath != NULL && spath != NULL) {
9339 add_fsevent(FSE_EXCHANGE, ctx,
9340 FSE_ARG_STRING, flen, fpath,
9341 FSE_ARG_FINFO, &f_finfo,
9342 FSE_ARG_STRING, slen, spath,
9343 FSE_ARG_FINFO, &s_finfo,
9344 FSE_ARG_DONE);
9345 }
9346 #endif
9347 }
9348
9349 out:
9350 if (fpath != NULL) {
9351 RELEASE_PATH(fpath);
9352 }
9353 if (spath != NULL) {
9354 RELEASE_PATH(spath);
9355 }
9356 vnode_put(svp);
9357 vnode_put(fvp);
9358 out2:
9359 return error;
9360 }
9361
9362 /*
9363 * Return (in MB) the amount of free space on the given vnode's volume.
9364 */
9365 uint32_t freespace_mb(vnode_t vp);
9366
9367 uint32_t
9368 freespace_mb(vnode_t vp)
9369 {
9370 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9371 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9372 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9373 }
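/*
 * Worked example (added for illustration): with f_bavail = 1,000,000 free
 * blocks of f_bsize = 4096 bytes each, the product is 4,096,000,000 bytes
 * and the >> 20 yields 3906 MB.
 */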
9374
9375 #if CONFIG_SEARCHFS
9376
9377 /* ARGSUSED */
9378
9379 int
9380 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
9381 {
9382 vnode_t vp, tvp;
9383 int i, error = 0;
9384 int fserror = 0;
9385 struct nameidata nd;
9386 struct user64_fssearchblock searchblock;
9387 struct searchstate *state;
9388 struct attrlist *returnattrs;
9389 struct timeval timelimit;
9390 void *searchparams1, *searchparams2;
9391 uio_t auio = NULL;
9392 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9393 uint32_t nummatches;
9394 int mallocsize;
9395 uint32_t nameiflags;
9396 vfs_context_t ctx = vfs_context_current();
9397 char uio_buf[UIO_SIZEOF(1)];
9398
9399 /* Start by copying in fsearchblock parameter list */
9400 if (IS_64BIT_PROCESS(p)) {
9401 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
9402 timelimit.tv_sec = searchblock.timelimit.tv_sec;
9403 timelimit.tv_usec = searchblock.timelimit.tv_usec;
9404 } else {
9405 struct user32_fssearchblock tmp_searchblock;
9406
9407 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
9408 // munge into 64-bit version
9409 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
9410 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
9411 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
9412 searchblock.maxmatches = tmp_searchblock.maxmatches;
9413 /*
9414 * These casts are safe. We will promote tv_sec from a 32 bit long into a 64 bit
9415 * value if necessary, and tv_usec is already a signed 32 bit int.
9416 */
9417 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9418 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9419 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9420 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9421 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9422 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9423 searchblock.searchattrs = tmp_searchblock.searchattrs;
9424 }
9425 if (error) {
9426 return error;
9427 }
9428
9429 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9430 */
9431 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9432 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
9433 return EINVAL;
9434 }
9435
9436 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
9437 /* It all has to go into local memory and it's not that big so we might as well put it all together. */
9438 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated */
9439 /* block. */
9440 /* */
9441 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
9442 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
9443 /* assumes the size is still 556 bytes it will continue to work */
9444
9445 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9446 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
9447
9448 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9449
9450 /* Now set up the various pointers to the correct place in our newly allocated memory */
9451
9452 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9453 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9454 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
9455
9456 /* Now copy in the stuff given our local variables. */
9457
9458 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
9459 goto freeandexit;
9460 }
9461
9462 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
9463 goto freeandexit;
9464 }
9465
9466 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
9467 goto freeandexit;
9468 }
9469
9470 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
9471 goto freeandexit;
9472 }
9473
9474 /*
9475 * When searching a union mount, need to set the
9476 * start flag at the first call on each layer to
9477 * reset state for the new volume.
9478 */
9479 if (uap->options & SRCHFS_START) {
9480 state->ss_union_layer = 0;
9481 } else {
9482 uap->options |= state->ss_union_flags;
9483 }
9484 state->ss_union_flags = 0;
9485
9486 /*
9487 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
9488 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
9489 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
9490 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
9491 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
9492 */
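/*
 * Buffer layout assumed by the checks below (added for illustration): the
 * params buffer starts with a u_int32_t total length, followed by an
 * attrreference_t whose attr_dataoffset is relative to the start of that
 * attrreference_t, followed by the name bytes themselves:
 *
 *     [u_int32_t length][attrreference_t { attr_dataoffset, attr_length }][name ...]
 *
 * The checks reject negative or out-of-range offsets and lengths so that a
 * filesystem consuming searchparams1 never reads past the copied-in buffer.
 */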
9493
9494 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
9495 attrreference_t* string_ref;
9496 u_int32_t* start_length;
9497 user64_size_t param_length;
9498
9499 /* validate searchparams1 */
9500 param_length = searchblock.sizeofsearchparams1;
9501 /* skip the word that specifies length of the buffer */
9502 start_length = (u_int32_t*) searchparams1;
9503 start_length = start_length + 1;
9504 string_ref = (attrreference_t*) start_length;
9505
9506 /* ensure no negative offsets or too big offsets */
9507 if (string_ref->attr_dataoffset < 0) {
9508 error = EINVAL;
9509 goto freeandexit;
9510 }
9511 if (string_ref->attr_length > MAXPATHLEN) {
9512 error = EINVAL;
9513 goto freeandexit;
9514 }
9515
9516 /* Check for pointer overflow in the string ref */
9517 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
9518 error = EINVAL;
9519 goto freeandexit;
9520 }
9521
9522 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
9523 error = EINVAL;
9524 goto freeandexit;
9525 }
9526 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
9527 error = EINVAL;
9528 goto freeandexit;
9529 }
9530 }
9531
9532 /* set up the uio structure which will contain the users return buffer */
9533 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9534 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
9535
9536 nameiflags = 0;
9537 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
9538 nameiflags |= FOLLOW;
9539 }
9540 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
9541 UIO_USERSPACE, uap->path, ctx);
9542
9543 error = namei(&nd);
9544 if (error) {
9545 goto freeandexit;
9546 }
9547 vp = nd.ni_vp;
9548 nameidone(&nd);
9549
9550 /*
9551 * Switch to the root vnode for the volume
9552 */
9553 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
9554 vnode_put(vp);
9555 if (error) {
9556 goto freeandexit;
9557 }
9558 vp = tvp;
9559
9560 /*
9561 * If it's a union mount, the path lookup takes
9562 * us to the top layer. But we may need to descend
9563 * to a lower layer. For non-union mounts the layer
9564 * is always zero.
9565 */
9566 for (i = 0; i < (int) state->ss_union_layer; i++) {
9567 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
9568 break;
9569 }
9570 tvp = vp;
9571 vp = vp->v_mount->mnt_vnodecovered;
9572 if (vp == NULL) {
9573 vnode_put(tvp);
9574 error = ENOENT;
9575 goto freeandexit;
9576 }
9577 error = vnode_getwithref(vp);
9578 vnode_put(tvp);
9579 if (error) {
9580 goto freeandexit;
9581 }
9582 }
9583
9584 #if CONFIG_MACF
9585 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
9586 if (error) {
9587 vnode_put(vp);
9588 goto freeandexit;
9589 }
9590 #endif
9591
9592
9593 /*
9594 * If searchblock.maxmatches == 0, then skip the search. This has happened
9595 * before and sometimes the underlying code doesn't deal with it well.
9596 */
9597 if (searchblock.maxmatches == 0) {
9598 nummatches = 0;
9599 goto saveandexit;
9600 }
9601
9602 /*
9603 * All right, we have everything we need, so let's make that call.
9604 *
9605 * We keep special track of the return value from the file system:
9606 * EAGAIN is an acceptable error condition that shouldn't keep us
9607 * from copying out any results...
9608 */
9609
9610 fserror = VNOP_SEARCHFS(vp,
9611 searchparams1,
9612 searchparams2,
9613 &searchblock.searchattrs,
9614 (u_long)searchblock.maxmatches,
9615 &timelimit,
9616 returnattrs,
9617 &nummatches,
9618 (u_long)uap->scriptcode,
9619 (u_long)uap->options,
9620 auio,
9621 (struct searchstate *) &state->ss_fsstate,
9622 ctx);
9623
9624 /*
9625 * If it's a union mount we need to be called again
9626 * to search the mounted-on filesystem.
9627 */
9628 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
9629 state->ss_union_flags = SRCHFS_START;
9630 state->ss_union_layer++; // search next layer down
9631 fserror = EAGAIN;
9632 }
9633
9634 saveandexit:
9635
9636 vnode_put(vp);
9637
9638 /* Now copy out the stuff that needs copying out: the number of matches and the
9639 * search state. Everything else was already put into the return buffer by the VNOP call. */
9640
9641 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
9642 goto freeandexit;
9643 }
9644
9645 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
9646 goto freeandexit;
9647 }
9648
9649 error = fserror;
9650
9651 freeandexit:
9652
9653 FREE(searchparams1, M_TEMP);
9654
9655 return error;
9656 } /* end of searchfs system call */
9657
9658 #else /* CONFIG_SEARCHFS */
9659
9660 int
9661 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9662 {
9663 return ENOTSUP;
9664 }
9665
9666 #endif /* CONFIG_SEARCHFS */
9667
9668
9669 lck_grp_attr_t * nspace_group_attr;
9670 lck_attr_t * nspace_lock_attr;
9671 lck_grp_t * nspace_mutex_group;
9672
9673 lck_mtx_t nspace_handler_lock;
9674 lck_mtx_t nspace_handler_exclusion_lock;
9675
9676 time_t snapshot_timestamp = 0;
9677 int nspace_allow_virtual_devs = 0;
9678
9679 void nspace_handler_init(void);
9680
9681 typedef struct nspace_item_info {
9682 struct vnode *vp;
9683 void *arg;
9684 uint64_t op;
9685 uint32_t vid;
9686 uint32_t flags;
9687 uint32_t token;
9688 uint32_t refcount;
9689 } nspace_item_info;
9690
9691 #define MAX_NSPACE_ITEMS 128
9692 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
9693 uint32_t nspace_item_idx = 0; // also used as the sleep/wakeup rendezvous address
9694 uint32_t nspace_token_id = 0;
9695 uint32_t nspace_handler_timeout = 15; // seconds
9696
9697 #define NSPACE_ITEM_NEW 0x0001
9698 #define NSPACE_ITEM_PROCESSING 0x0002
9699 #define NSPACE_ITEM_DEAD 0x0004
9700 #define NSPACE_ITEM_CANCELLED 0x0008
9701 #define NSPACE_ITEM_DONE 0x0010
9702 #define NSPACE_ITEM_RESET_TIMER 0x0020
9703
9704 #define NSPACE_ITEM_NSPACE_EVENT 0x0040
9705 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
9706
9707 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
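/*
 * Rough slot lifecycle (added for clarity): resolve_nspace_item_ext() below
 * parks a request in nspace_items[] with NSPACE_ITEM_NEW and sleeps on the
 * slot; wait_for_namespace_event() marks it NSPACE_ITEM_PROCESSING and hands
 * a file descriptor for the vnode to the user-space handler; the slot is
 * later marked NSPACE_ITEM_DONE (or NSPACE_ITEM_CANCELLED) and the waiter is
 * woken.  nspace_item_idx doubles as the rendezvous address handlers sleep
 * on while no work is pending.
 */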
9708
9709 //#pragma optimization_level 0
9710
9711 typedef enum {
9712 NSPACE_HANDLER_NSPACE = 0,
9713 NSPACE_HANDLER_SNAPSHOT = 1,
9714
9715 NSPACE_HANDLER_COUNT,
9716 } nspace_type_t;
9717
9718 typedef struct {
9719 uint64_t handler_tid;
9720 struct proc *handler_proc;
9721 int handler_busy;
9722 } nspace_handler_t;
9723
9724 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
9725
9726 /* namespace fsctl functions */
9727 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
9728 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
9729 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
9730 static nspace_type_t nspace_type_for_op(uint64_t op);
9731 static int nspace_is_special_process(struct proc *proc);
9732 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
9733 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
9734 static int validate_namespace_args(int is64bit, int size);
9735 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9736
9737
9738 static inline int
9739 nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9740 {
9741 switch (nspace_type) {
9742 case NSPACE_HANDLER_NSPACE:
9743 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9744 case NSPACE_HANDLER_SNAPSHOT:
9745 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9746 default:
9747 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9748 return 0;
9749 }
9750 }
9751
9752 static inline int
9753 nspace_item_flags_for_type(nspace_type_t nspace_type)
9754 {
9755 switch (nspace_type) {
9756 case NSPACE_HANDLER_NSPACE:
9757 return NSPACE_ITEM_NSPACE_EVENT;
9758 case NSPACE_HANDLER_SNAPSHOT:
9759 return NSPACE_ITEM_SNAPSHOT_EVENT;
9760 default:
9761 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9762 return 0;
9763 }
9764 }
9765
9766 static inline int
9767 nspace_open_flags_for_type(nspace_type_t nspace_type)
9768 {
9769 switch (nspace_type) {
9770 case NSPACE_HANDLER_NSPACE:
9771 return FREAD | FWRITE | O_EVTONLY;
9772 case NSPACE_HANDLER_SNAPSHOT:
9773 return FREAD | O_EVTONLY;
9774 default:
9775 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9776 return 0;
9777 }
9778 }
9779
9780 static inline nspace_type_t
9781 nspace_type_for_op(uint64_t op)
9782 {
9783 switch (op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9784 case NAMESPACE_HANDLER_NSPACE_EVENT:
9785 return NSPACE_HANDLER_NSPACE;
9786 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9787 return NSPACE_HANDLER_SNAPSHOT;
9788 default:
9789 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9790 return NSPACE_HANDLER_NSPACE;
9791 }
9792 }
9793
9794 static inline int
9795 nspace_is_special_process(struct proc *proc)
9796 {
9797 int i;
9798 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9799 if (proc == nspace_handlers[i].handler_proc) {
9800 return 1;
9801 }
9802 }
9803 return 0;
9804 }
9805
9806 void
9807 nspace_handler_init(void)
9808 {
9809 nspace_lock_attr = lck_attr_alloc_init();
9810 nspace_group_attr = lck_grp_attr_alloc_init();
9811 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9812 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9813 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9814 memset(&nspace_items[0], 0, sizeof(nspace_items));
9815 }
9816
9817 void
9818 nspace_proc_exit(struct proc *p)
9819 {
9820 int i, event_mask = 0;
9821
9822 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9823 if (p == nspace_handlers[i].handler_proc) {
9824 event_mask |= nspace_item_flags_for_type(i);
9825 nspace_handlers[i].handler_tid = 0;
9826 nspace_handlers[i].handler_proc = NULL;
9827 }
9828 }
9829
9830 if (event_mask == 0) {
9831 return;
9832 }
9833
9834 lck_mtx_lock(&nspace_handler_lock);
9835 if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
9836 // if this process was the snapshot handler, zero snapshot_timestamp
9837 snapshot_timestamp = 0;
9838 }
9839
9840 //
9841 // unblock anyone that's waiting for the handler that died
9842 //
9843 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9844 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
9845 if (nspace_items[i].flags & event_mask) {
9846 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9847 vnode_lock_spin(nspace_items[i].vp);
9848 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9849 vnode_unlock(nspace_items[i].vp);
9850 }
9851 nspace_items[i].vp = NULL;
9852 nspace_items[i].vid = 0;
9853 nspace_items[i].flags = NSPACE_ITEM_DONE;
9854 nspace_items[i].token = 0;
9855
9856 wakeup((caddr_t)&(nspace_items[i].vp));
9857 }
9858 }
9859 }
9860
9861 wakeup((caddr_t)&nspace_item_idx);
9862 lck_mtx_unlock(&nspace_handler_lock);
9863 }
9864
9865
9866 int
9867 resolve_nspace_item(struct vnode *vp, uint64_t op)
9868 {
9869 return resolve_nspace_item_ext(vp, op, NULL);
9870 }
9871
9872 int
9873 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
9874 {
9875 int i, error, keep_waiting;
9876 struct timespec ts;
9877 nspace_type_t nspace_type = nspace_type_for_op(op);
9878
9879 // only allow namespace events on regular files, directories and symlinks.
9880 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
9881 return 0;
9882 }
9883
9884 //
9885 // if this is a snapshot event and the vnode is on a
9886 // disk image just pretend nothing happened since any
9887 // change to the disk image will cause the disk image
9888 // itself to get backed up and this avoids multi-way
9889 // deadlocks between the snapshot handler and the ever
9890 // popular diskimages-helper process. the variable
9891 // nspace_allow_virtual_devs allows this behavior to
9892 // be overridden (for use by the Mobile TimeMachine
9893 // testing infrastructure which uses disk images)
9894 //
9895 if ((op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
9896 && (vp->v_mount != NULL)
9897 && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
9898 && !nspace_allow_virtual_devs) {
9899 return 0;
9900 }
9901
9902 // if (thread_tid(current_thread()) == namespace_handler_tid) {
9903 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9904 return 0;
9905 }
9906
9907 if (nspace_is_special_process(current_proc())) {
9908 return EDEADLK;
9909 }
9910
9911 lck_mtx_lock(&nspace_handler_lock);
9912
9913 retry:
9914 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9915 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
9916 break;
9917 }
9918 }
9919
9920 if (i >= MAX_NSPACE_ITEMS) {
9921 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9922 if (nspace_items[i].flags == 0) {
9923 break;
9924 }
9925 }
9926 } else {
9927 nspace_items[i].refcount++;
9928 }
9929
9930 if (i >= MAX_NSPACE_ITEMS) {
9931 ts.tv_sec = nspace_handler_timeout;
9932 ts.tv_nsec = 0;
9933
9934 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS | PCATCH, "nspace-no-space", &ts);
9935 if (error == 0) {
9936 // an entry got free'd up, go see if we can get a slot
9937 goto retry;
9938 } else {
9939 lck_mtx_unlock(&nspace_handler_lock);
9940 return error;
9941 }
9942 }
9943
9944 //
9945 // if it didn't already exist, add it. if it did exist
9946 // we'll get woken up when someone does a wakeup() on
9947 // the slot in the nspace_items table.
9948 //
9949 if (vp != nspace_items[i].vp) {
9950 nspace_items[i].vp = vp;
9951 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user
9952 nspace_items[i].op = op;
9953 nspace_items[i].vid = vnode_vid(vp);
9954 nspace_items[i].flags = NSPACE_ITEM_NEW;
9955 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
9956 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
9957 if (arg) {
9958 vnode_lock_spin(vp);
9959 vp->v_flag |= VNEEDSSNAPSHOT;
9960 vnode_unlock(vp);
9961 }
9962 }
9963
9964 nspace_items[i].token = 0;
9965 nspace_items[i].refcount = 1;
9966
9967 wakeup((caddr_t)&nspace_item_idx);
9968 }
9969
9970 //
9971 // Now go to sleep until the handler does a wakeup on this
9972 // slot in the nspace_items table (or we timeout).
9973 //
9974 keep_waiting = 1;
9975 while (keep_waiting) {
9976 ts.tv_sec = nspace_handler_timeout;
9977 ts.tv_nsec = 0;
9978 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS | PCATCH, "namespace-done", &ts);
9979
9980 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
9981 error = 0;
9982 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
9983 error = nspace_items[i].token;
9984 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
9985 if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
9986 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
9987 continue;
9988 } else {
9989 error = ETIMEDOUT;
9990 }
9991 } else if (error == 0) {
9992 // hmmm, why did we get woken up?
9993 printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
9994 nspace_items[i].token);
9995 }
9996
9997 if (--nspace_items[i].refcount == 0) {
9998 nspace_items[i].vp = NULL; // clear this so that no one will match on it again
9999 nspace_items[i].arg = NULL;
10000 nspace_items[i].token = 0; // clear this so that the handler will not find it anymore
10001 nspace_items[i].flags = 0; // this clears it for re-use
10002 }
10003 wakeup(&nspace_token_id);
10004 keep_waiting = 0;
10005 }
10006
10007 lck_mtx_unlock(&nspace_handler_lock);
10008
10009 return error;
10010 }
10011
10012 int
10013 nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
10014 {
10015 int snapshot_error = 0;
10016
10017 if (vp == NULL) {
10018 return 0;
10019 }
10020
10021 /* Swap files are special; skip them */
10022 if (vnode_isswap(vp)) {
10023 return 0;
10024 }
10025
10026 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
10027 // the change time is within this epoch
10028 int error;
10029
10030 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
10031 if (error == EDEADLK) {
10032 snapshot_error = 0;
10033 } else if (error) {
10034 if (error == EAGAIN) {
10035 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
10036 } else if (error == EINTR) {
10037 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
10038 snapshot_error = EINTR;
10039 }
10040 }
10041 }
10042
10043 return snapshot_error;
10044 }
10045
10046 int
10047 get_nspace_item_status(struct vnode *vp, int32_t *status)
10048 {
10049 int i;
10050
10051 lck_mtx_lock(&nspace_handler_lock);
10052 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
10053 if (nspace_items[i].vp == vp) {
10054 break;
10055 }
10056 }
10057
10058 if (i >= MAX_NSPACE_ITEMS) {
10059 lck_mtx_unlock(&nspace_handler_lock);
10060 return ENOENT;
10061 }
10062
10063 *status = nspace_items[i].flags;
10064 lck_mtx_unlock(&nspace_handler_lock);
10065 return 0;
10066 }
10067
10068
10069 #if 0
10070 static int
10071 build_volfs_path(struct vnode *vp, char *path, int *len)
10072 {
10073 struct vnode_attr va;
10074 int ret;
10075
10076 VATTR_INIT(&va);
10077 VATTR_WANTED(&va, va_fsid);
10078 VATTR_WANTED(&va, va_fileid);
10079
10080 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
10081 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
10082 ret = -1;
10083 } else {
10084 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
10085 ret = 0;
10086 }
10087
10088 return ret;
10089 }
10090 #endif
10091
10092 //
10093 // Note: this function does NOT check permissions on all of the
10094 // parent directories leading to this vnode. It should only be
10095 // called on behalf of a root process. Otherwise a process may
10096 // get access to a file because the file itself is readable even
10097 // though its parent directories would prevent access.
10098 //
10099 static int
10100 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
10101 {
10102 int error, action;
10103
10104 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10105 return error;
10106 }
10107
10108 #if CONFIG_MACF
10109 error = mac_vnode_check_open(ctx, vp, fmode);
10110 if (error) {
10111 return error;
10112 }
10113 #endif
10114
10115 /* compute action to be authorized */
10116 action = 0;
10117 if (fmode & FREAD) {
10118 action |= KAUTH_VNODE_READ_DATA;
10119 }
10120 if (fmode & (FWRITE | O_TRUNC)) {
10121 /*
10122 * If we are writing, appending, and not truncating,
10123 * indicate that we are appending so that if the
10124 * UF_APPEND or SF_APPEND bits are set, we do not deny
10125 * the open.
10126 */
10127 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
10128 action |= KAUTH_VNODE_APPEND_DATA;
10129 } else {
10130 action |= KAUTH_VNODE_WRITE_DATA;
10131 }
10132 }
10133
10134 if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) {
10135 return error;
10136 }
10137
10138
10139 //
10140 // if the vnode is tagged VOPENEVT and the current process
10141 // has the P_CHECKOPENEVT flag set, then we OR in the O_EVTONLY
10142 // flag to the open mode so that this open won't count against
10143 // the vnode when carbon delete() does a vnode_isinuse() to see
10144 // if a file is currently in use. this allows spotlight
10145 // importers to not interfere with carbon apps that depend on
10146 // the no-delete-if-busy semantics of carbon delete().
10147 //
10148 if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
10149 fmode |= O_EVTONLY;
10150 }
10151
10152 if ((error = VNOP_OPEN(vp, fmode, ctx))) {
10153 return error;
10154 }
10155 if ((error = vnode_ref_ext(vp, fmode, 0))) {
10156 VNOP_CLOSE(vp, fmode, ctx);
10157 return error;
10158 }
10159
10160 /* Call out to allow 3rd party notification of open.
10161 * Ignore result of kauth_authorize_fileop call.
10162 */
10163 #if CONFIG_MACF
10164 mac_vnode_notify_open(ctx, vp, fmode);
10165 #endif
10166 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
10167 (uintptr_t)vp, 0);
10168
10169
10170 return 0;
10171 }
10172
10173 static int
10174 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
10175 {
10176 int i;
10177 int error = 0;
10178 int unblock = 0;
10179 task_t curtask;
10180
10181 lck_mtx_lock(&nspace_handler_exclusion_lock);
10182 if (nspace_handlers[nspace_type].handler_busy) {
10183 lck_mtx_unlock(&nspace_handler_exclusion_lock);
10184 return EBUSY;
10185 }
10186
10187 nspace_handlers[nspace_type].handler_busy = 1;
10188 lck_mtx_unlock(&nspace_handler_exclusion_lock);
10189
10190 /*
10191 * Any process that gets here will be one of the namespace handlers.
10192 * As such, it should be prevented from acquiring DMG vnodes during vnode reclamation,
10193 * since doing so can deadlock: the namespace handler may prevent
10194 * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE
10195 * process.
10196 */
10197 curtask = current_task();
10198 bsd_set_dependency_capable(curtask);
10199
10200 lck_mtx_lock(&nspace_handler_lock);
10201 if (nspace_handlers[nspace_type].handler_proc == NULL) {
10202 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
10203 nspace_handlers[nspace_type].handler_proc = current_proc();
10204 }
10205
10206 if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
10207 (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
10208 error = EINVAL;
10209 }
10210
10211 while (error == 0) {
10212 /* Try to find matching namespace item */
10213 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
10214 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
10215 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
10216 break;
10217 }
10218 }
10219 }
10220
10221 if (i >= MAX_NSPACE_ITEMS) {
10222 /* Nothing is there yet. Wait for wake up and retry */
10223 error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS | PCATCH, "namespace-items", 0);
10224 if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
10225 /* Prevent infinite loop if snapshot handler exited */
10226 error = EINVAL;
10227 break;
10228 }
10229 continue;
10230 }
10231
10232 nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
10233 nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
10234 nspace_items[i].token = ++nspace_token_id;
10235
10236 assert(nspace_items[i].vp);
10237 struct fileproc *fp;
10238 int32_t indx;
10239 int32_t fmode;
10240 struct proc *p = current_proc();
10241 vfs_context_t ctx = vfs_context_current();
10242 struct vnode_attr va;
10243 bool vn_get_successful = false;
10244 bool vn_open_successful = false;
10245 bool fp_alloc_successful = false;
10246
10247 /*
10248 * Use vnode pointer to acquire a file descriptor for
10249 * hand-off to userland
10250 */
10251 fmode = nspace_open_flags_for_type(nspace_type);
10252 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
10253 if (error) {
10254 goto cleanup;
10255 }
10256 vn_get_successful = true;
10257
10258 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
10259 if (error) {
10260 goto cleanup;
10261 }
10262 vn_open_successful = true;
10263
10264 error = falloc(p, &fp, &indx, ctx);
10265 if (error) {
10266 goto cleanup;
10267 }
10268 fp_alloc_successful = true;
10269
10270 fp->f_fglob->fg_flag = fmode;
10271 fp->f_fglob->fg_ops = &vnops;
10272 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
10273
10274 proc_fdlock(p);
10275 procfdtbl_releasefd(p, indx, NULL);
10276 fp_drop(p, indx, fp, 1);
10277 proc_fdunlock(p);
10278
10279 /*
10280 * All variants of the namespace handler struct support these three fields:
10281 * token, flags, and the FD pointer
10282 */
10283 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
10284 if (error) {
10285 goto cleanup;
10286 }
10287 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
10288 if (error) {
10289 goto cleanup;
10290 }
10291 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
10292 if (error) {
10293 goto cleanup;
10294 }
10295
10296 /*
10297 * Handle optional fields:
10298 * the extended version supports an info ptr (offset, length), and
10299 *
10300 * the namedata version supports a unique per-link object ID
10301 *
10302 */
10303 if (nhd->infoptr) {
10304 uio_t uio = (uio_t)nspace_items[i].arg;
10305 uint64_t u_offset, u_length;
10306
10307 if (uio) {
10308 u_offset = uio_offset(uio);
10309 u_length = uio_resid(uio);
10310 } else {
10311 u_offset = 0;
10312 u_length = 0;
10313 }
10314 error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
10315 if (error) {
10316 goto cleanup;
10317 }
10318 error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
10319 if (error) {
10320 goto cleanup;
10321 }
10322 }
10323
10324 if (nhd->objid) {
10325 VATTR_INIT(&va);
10326 VATTR_WANTED(&va, va_linkid);
10327 error = vnode_getattr(nspace_items[i].vp, &va, ctx);
10328 if (error) {
10329 goto cleanup;
10330 }
10331
10332 uint64_t linkid = 0;
10333 if (VATTR_IS_SUPPORTED(&va, va_linkid)) {
10334 linkid = (uint64_t)va.va_linkid;
10335 }
10336 error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
10337 }
10338 cleanup:
10339 if (error) {
10340 if (fp_alloc_successful) {
10341 fp_free(p, indx, fp);
10342 }
10343 if (vn_open_successful) {
10344 vn_close(nspace_items[i].vp, fmode, ctx);
10345 }
10346 unblock = 1;
10347 }
10348
10349 if (vn_get_successful) {
10350 vnode_put(nspace_items[i].vp);
10351 }
10352
10353 break;
10354 }
10355
10356 if (unblock) {
10357 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
10358 vnode_lock_spin(nspace_items[i].vp);
10359 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10360 vnode_unlock(nspace_items[i].vp);
10361 }
10362 nspace_items[i].vp = NULL;
10363 nspace_items[i].vid = 0;
10364 nspace_items[i].flags = NSPACE_ITEM_DONE;
10365 nspace_items[i].token = 0;
10366
10367 wakeup((caddr_t)&(nspace_items[i].vp));
10368 }
10369
10370 if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
10371 // just go through every snapshot event and unblock it immediately.
10372 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
10373 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
10374 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
10375 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
10376 nspace_items[i].vp = NULL;
10377 nspace_items[i].vid = 0;
10378 nspace_items[i].flags = NSPACE_ITEM_DONE;
10379 nspace_items[i].token = 0;
10380
10381 wakeup((caddr_t)&(nspace_items[i].vp));
10382 }
10383 }
10384 }
10385 }
10386 }
10387
10388 lck_mtx_unlock(&nspace_handler_lock);
10389
10390 lck_mtx_lock(&nspace_handler_exclusion_lock);
10391 nspace_handlers[nspace_type].handler_busy = 0;
10392 lck_mtx_unlock(&nspace_handler_exclusion_lock);
10393
10394 return error;
10395 }
10396
10397 static inline int
10398 validate_namespace_args(int is64bit, int size)
10399 {
10400 if (is64bit) {
10401 /* Must be one of these */
10402 if (size == sizeof(user64_namespace_handler_info)) {
10403 goto sizeok;
10404 }
10405 if (size == sizeof(user64_namespace_handler_info_ext)) {
10406 goto sizeok;
10407 }
10408 if (size == sizeof(user64_namespace_handler_data)) {
10409 goto sizeok;
10410 }
10411 return EINVAL;
10412 } else {
10413 /* 32 bit -- must be one of these */
10414 if (size == sizeof(user32_namespace_handler_info)) {
10415 goto sizeok;
10416 }
10417 if (size == sizeof(user32_namespace_handler_info_ext)) {
10418 goto sizeok;
10419 }
10420 if (size == sizeof(user32_namespace_handler_data)) {
10421 goto sizeok;
10422 }
10423 return EINVAL;
10424 }
10425
10426 sizeok:
10427
10428 return 0;
10429 }
10430
10431 static int
10432 process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
10433 {
10434 int error = 0;
10435 namespace_handler_data nhd;
10436
10437 bzero(&nhd, sizeof(namespace_handler_data));
10438
10439 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10440 return error;
10441 }
10442
10443 error = validate_namespace_args(is64bit, size);
10444 if (error) {
10445 return error;
10446 }
10447
10448 /* Copy in the userland pointers into our kernel-only struct */
10449
10450 if (is64bit) {
10451 /* 64 bit userland structures */
10452 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
10453 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
10454 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
10455
10456 /* If the size is greater than the standard info struct, add in extra fields */
10457 if (size > (sizeof(user64_namespace_handler_info))) {
10458 if (size >= (sizeof(user64_namespace_handler_info_ext))) {
10459 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
10460 }
10461 if (size == (sizeof(user64_namespace_handler_data))) {
10462 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
10463 }
10464 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10465 }
10466 } else {
10467 /* 32 bit userland structures */
10468 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
10469 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
10470 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
10471
10472 if (size > (sizeof(user32_namespace_handler_info))) {
10473 if (size >= (sizeof(user32_namespace_handler_info_ext))) {
10474 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
10475 }
10476 if (size == (sizeof(user32_namespace_handler_data))) {
10477 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
10478 }
10479 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10480 }
10481 }
10482
10483 return wait_for_namespace_event(&nhd, nspace_type);
10484 }
10485
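/*
 * Background on the compat shim below (illustrative sketch, not part of the
 * original comments): IOCBASECMD(), as defined in <sys/ioccom.h>, strips the
 * size bits from an ioctl command word, e.g.
 *
 *     IOCBASECMD(FSIOC_SYNC_VOLUME) == (FSIOC_SYNC_VOLUME & ~(IOCPARM_MASK << 16))
 *
 * A caller issuing the stripped form would be seen as having a zero-length
 * parameter, so fsctl_internal() would skip the normal copyin/copyout of its
 * argument. fsctl_bogus_command_compat() simply maps the known legacy forms
 * back to their full command words.
 */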
10486 static unsigned long
10487 fsctl_bogus_command_compat(unsigned long cmd)
10488 {
10489 switch (cmd) {
10490 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10491 return FSIOC_SYNC_VOLUME;
10492 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10493 return FSIOC_ROUTEFS_SETROUTEID;
10494 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10495 return FSIOC_SET_PACKAGE_EXTS;
10496 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
10497 return FSIOC_NAMESPACE_HANDLER_GET;
10498 case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
10499 return FSIOC_OLD_SNAPSHOT_HANDLER_GET;
10500 case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
10501 return FSIOC_SNAPSHOT_HANDLER_GET_EXT;
10502 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
10503 return FSIOC_NAMESPACE_HANDLER_UPDATE;
10504 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
10505 return FSIOC_NAMESPACE_HANDLER_UNBLOCK;
10506 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
10507 return FSIOC_NAMESPACE_HANDLER_CANCEL;
10508 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
10509 return FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME;
10510 case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
10511 return FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS;
10512 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10513 return FSIOC_SET_FSTYPENAME_OVERRIDE;
10514 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10515 return DISK_CONDITIONER_IOC_GET;
10516 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10517 return DISK_CONDITIONER_IOC_SET;
10518 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10519 return FSIOC_FIOSEEKHOLE;
10520 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10521 return FSIOC_FIOSEEKDATA;
10522 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10523 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
10524 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10525 return SPOTLIGHT_IOC_GET_LAST_MTIME;
10526 }
10527
10528 return cmd;
10529 }
10530
10531 /*
10532 * Make a filesystem-specific control call:
10533 */
10534 /* ARGSUSED */
10535 static int
10536 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10537 {
10538 int error = 0;
10539 boolean_t is64bit;
10540 u_int size;
10541 #define STK_PARAMS 128
10542 char stkbuf[STK_PARAMS] = {0};
10543 caddr_t data, memp;
10544 vnode_t vp = *arg_vp;
10545
10546 cmd = fsctl_bogus_command_compat(cmd);
10547
10548 size = IOCPARM_LEN(cmd);
10549 if (size > IOCPARM_MAX) {
10550 return EINVAL;
10551 }
10552
10553 is64bit = proc_is64bit(p);
10554
10555 memp = NULL;
10556
10557 if (size > sizeof(stkbuf)) {
10558 if ((memp = (caddr_t)kalloc(size)) == 0) {
10559 return ENOMEM;
10560 }
10561 data = memp;
10562 } else {
10563 data = &stkbuf[0];
10564 }
10565
10566 if (cmd & IOC_IN) {
10567 if (size) {
10568 error = copyin(udata, data, size);
10569 if (error) {
10570 if (memp) {
10571 kfree(memp, size);
10572 }
10573 return error;
10574 }
10575 } else {
10576 if (is64bit) {
10577 *(user_addr_t *)data = udata;
10578 } else {
10579 *(uint32_t *)data = (uint32_t)udata;
10580 }
10581 }
10582 } else if ((cmd & IOC_OUT) && size) {
10583 /*
10584 * Zero the buffer so the user always
10585 * gets back something deterministic.
10586 */
10587 bzero(data, size);
10588 } else if (cmd & IOC_VOID) {
10589 if (is64bit) {
10590 *(user_addr_t *)data = udata;
10591 } else {
10592 *(uint32_t *)data = (uint32_t)udata;
10593 }
10594 }
10595
10596 /* Check to see if it's a generic command */
10597 switch (cmd) {
10598 case FSIOC_SYNC_VOLUME: {
10599 mount_t mp = vp->v_mount;
10600 int arg = *(uint32_t*)data;
10601
10602 /* record vid of vp so we can drop it below. */
10603 uint32_t vvid = vp->v_id;
10604
10605 /*
10606 * Then grab mount_iterref so that we can release the vnode.
10607 * Without this, a thread may call vnode_iterate_prepare then
10608 * get into a deadlock because we've never released the root vp
10609 */
10610 error = mount_iterref(mp, 0);
10611 if (error) {
10612 break;
10613 }
10614 vnode_put(vp);
10615
10616 /* issue the sync for this volume */
10617 (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
10618
10619 /*
10620 * Then release the mount_iterref once we're done syncing; it's not
10621 * needed for the VNOP_IOCTL below
10622 */
10623 mount_iterdrop(mp);
10624
10625 if (arg & FSCTL_SYNC_FULLSYNC) {
10626 /* re-obtain vnode iocount on the root vp, if possible */
10627 error = vnode_getwithvid(vp, vvid);
10628 if (error == 0) {
10629 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
10630 vnode_put(vp);
10631 }
10632 }
10633 /* mark the argument VP as having been released */
10634 *arg_vp = NULL;
10635 }
10636 break;
10637
10638 case FSIOC_ROUTEFS_SETROUTEID: {
10639 #if ROUTEFS
10640 char routepath[MAXPATHLEN];
10641 size_t len = 0;
10642
10643 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10644 break;
10645 }
10646 bzero(routepath, MAXPATHLEN);
10647 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
10648 if (error) {
10649 break;
10650 }
10651 error = routefs_kernel_mount(routepath);
10652 if (error) {
10653 break;
10654 }
10655 #endif
10656 }
10657 break;
10658
10659 case FSIOC_SET_PACKAGE_EXTS: {
10660 user_addr_t ext_strings;
10661 uint32_t num_entries;
10662 uint32_t max_width;
10663
10664 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
10665 break;
10666 }
10667
10668 if ((is64bit && size != sizeof(user64_package_ext_info))
10669 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
10670 // either you're 64-bit and passed a 64-bit struct or
10671 // you're 32-bit and passed a 32-bit struct. otherwise
10672 // it's not ok.
10673 error = EINVAL;
10674 break;
10675 }
10676
10677 if (is64bit) {
10678 ext_strings = ((user64_package_ext_info *)data)->strings;
10679 num_entries = ((user64_package_ext_info *)data)->num_entries;
10680 max_width = ((user64_package_ext_info *)data)->max_width;
10681 } else {
10682 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10683 num_entries = ((user32_package_ext_info *)data)->num_entries;
10684 max_width = ((user32_package_ext_info *)data)->max_width;
10685 }
10686 error = set_package_extensions_table(ext_strings, num_entries, max_width);
10687 }
10688 break;
10689
10690 /* namespace handlers */
10691 case FSIOC_NAMESPACE_HANDLER_GET: {
10692 error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10693 }
10694 break;
10695
10696 /* Snapshot handlers */
10697 case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
10698 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10699 }
10700 break;
10701
10702 case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
10703 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10704 }
10705 break;
10706
10707 case FSIOC_NAMESPACE_HANDLER_UPDATE: {
10708 uint32_t token, val;
10709 int i;
10710
10711 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10712 break;
10713 }
10714
10715 if (!nspace_is_special_process(p)) {
10716 error = EINVAL;
10717 break;
10718 }
10719
10720 token = ((uint32_t *)data)[0];
10721 val = ((uint32_t *)data)[1];
10722
10723 lck_mtx_lock(&nspace_handler_lock);
10724
10725 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
10726 if (nspace_items[i].token == token) {
10727 break; /* exit for loop, not case stmt */
10728 }
10729 }
10730
10731 if (i >= MAX_NSPACE_ITEMS) {
10732 error = ENOENT;
10733 } else {
10734 //
10735 // if this bit is set, when resolve_nspace_item() times out
10736 // it will loop and go back to sleep.
10737 //
10738 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10739 }
10740
10741 lck_mtx_unlock(&nspace_handler_lock);
10742
10743 if (error) {
10744 printf("nspace-handler-update: did not find token %u\n", token);
10745 }
10746 }
10747 break;
10748
10749 case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
10750 uint32_t token, val;
10751 int i;
10752
10753 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10754 break;
10755 }
10756
10757 if (!nspace_is_special_process(p)) {
10758 error = EINVAL;
10759 break;
10760 }
10761
10762 token = ((uint32_t *)data)[0];
10763 val = ((uint32_t *)data)[1];
10764
10765 lck_mtx_lock(&nspace_handler_lock);
10766
10767 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
10768 if (nspace_items[i].token == token) {
10769 break; /* exit for loop, not case statement */
10770 }
10771 }
10772
10773 if (i >= MAX_NSPACE_ITEMS) {
10774 printf("nspace-handler-unblock: did not find token %u\n", token);
10775 error = ENOENT;
10776 } else {
10777 if (val == 0 && nspace_items[i].vp) {
10778 vnode_lock_spin(nspace_items[i].vp);
10779 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10780 vnode_unlock(nspace_items[i].vp);
10781 }
10782
10783 nspace_items[i].vp = NULL;
10784 nspace_items[i].arg = NULL;
10785 nspace_items[i].op = 0;
10786 nspace_items[i].vid = 0;
10787 nspace_items[i].flags = NSPACE_ITEM_DONE;
10788 nspace_items[i].token = 0;
10789
10790 wakeup((caddr_t)&(nspace_items[i].vp));
10791 }
10792
10793 lck_mtx_unlock(&nspace_handler_lock);
10794 }
10795 break;
10796
10797 case FSIOC_NAMESPACE_HANDLER_CANCEL: {
10798 uint32_t token, val;
10799 int i;
10800
10801 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10802 break;
10803 }
10804
10805 if (!nspace_is_special_process(p)) {
10806 error = EINVAL;
10807 break;
10808 }
10809
10810 token = ((uint32_t *)data)[0];
10811 val = ((uint32_t *)data)[1];
10812
10813 lck_mtx_lock(&nspace_handler_lock);
10814
10815 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
10816 if (nspace_items[i].token == token) {
10817 break; /* exit for loop, not case stmt */
10818 }
10819 }
10820
10821 if (i >= MAX_NSPACE_ITEMS) {
10822 printf("nspace-handler-cancel: did not find token %u\n", token);
10823 error = ENOENT;
10824 } else {
10825 if (nspace_items[i].vp) {
10826 vnode_lock_spin(nspace_items[i].vp);
10827 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10828 vnode_unlock(nspace_items[i].vp);
10829 }
10830
10831 nspace_items[i].vp = NULL;
10832 nspace_items[i].arg = NULL;
10833 nspace_items[i].vid = 0;
10834 nspace_items[i].token = val;
10835 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10836 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10837
10838 wakeup((caddr_t)&(nspace_items[i].vp));
10839 }
10840
10841 lck_mtx_unlock(&nspace_handler_lock);
10842 }
10843 break;
10844
10845 case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10846 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10847 break;
10848 }
10849
10850 // we explicitly do not do the namespace_handler_proc check here
10851
10852 lck_mtx_lock(&nspace_handler_lock);
10853 snapshot_timestamp = ((uint32_t *)data)[0];
10854 wakeup(&nspace_item_idx);
10855 lck_mtx_unlock(&nspace_handler_lock);
10856 printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10857 }
10858 break;
10859
10860 case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10861 {
10862 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10863 break;
10864 }
10865
10866 lck_mtx_lock(&nspace_handler_lock);
10867 nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10868 lck_mtx_unlock(&nspace_handler_lock);
10869 printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10870 nspace_allow_virtual_devs ? "" : " NOT");
10871 error = 0;
10872 }
10873 break;
10874
10875 case FSIOC_SET_FSTYPENAME_OVERRIDE:
10876 {
10877 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10878 break;
10879 }
10880 if (vp->v_mount) {
10881 mount_lock(vp->v_mount);
10882 if (data[0] != 0) {
10883 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10884 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10885 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10886 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10887 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10888 }
10889 } else {
10890 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10891 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10892 }
10893 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10894 vp->v_mount->fstypename_override[0] = '\0';
10895 }
10896 mount_unlock(vp->v_mount);
10897 }
10898 }
10899 break;
10900
10901 case DISK_CONDITIONER_IOC_GET: {
10902 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
10903 }
10904 break;
10905
10906 case DISK_CONDITIONER_IOC_SET: {
10907 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
10908 }
10909 break;
10910
10911 case FSIOC_FD_ONLY_OPEN_ONCE: {
10912 if (vnode_usecount(vp) > 1) {
10913 error = EBUSY;
10914 } else {
10915 error = 0;
10916 }
10917 }
10918 break;
10919
10920 default: {
10921 /* other, known commands shouldn't be passed down here */
10922 switch (cmd) {
10923 case F_PUNCHHOLE:
10924 case F_TRIM_ACTIVE_FILE:
10925 case F_RDADVISE:
10926 case F_TRANSCODEKEY:
10927 case F_GETPROTECTIONLEVEL:
10928 case F_GETDEFAULTPROTLEVEL:
10929 case F_MAKECOMPRESSED:
10930 case F_SET_GREEDY_MODE:
10931 case F_SETSTATICCONTENT:
10932 case F_SETIOTYPE:
10933 case F_SETBACKINGSTORE:
10934 case F_GETPATH_MTMINFO:
10935 case APFSIOC_REVERT_TO_SNAPSHOT:
10936 case FSIOC_FIOSEEKHOLE:
10937 case FSIOC_FIOSEEKDATA:
10938 case HFS_GET_BOOT_INFO:
10939 case HFS_SET_BOOT_INFO:
10940 case FIOPINSWAP:
10941 case F_CHKCLEAN:
10942 case F_FULLFSYNC:
10943 case F_BARRIERFSYNC:
10944 case F_FREEZE_FS:
10945 case F_THAW_FS:
10946 error = EINVAL;
10947 goto outdrop;
10948 }
10949 /* Invoke the filesystem-specific code */
10950 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
10951 }
10952 } /* end switch stmt */
10953
10954 /*
10955 * if no errors, copy any data to user. Size was
10956 * already set and checked above.
10957 */
10958 if (error == 0 && (cmd & IOC_OUT) && size) {
10959 error = copyout(data, udata, size);
10960 }
10961
10962 outdrop:
10963 if (memp) {
10964 kfree(memp, size);
10965 }
10966
10967 return error;
10968 }
10969
10970 /* ARGSUSED */
10971 int
10972 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10973 {
10974 int error;
10975 struct nameidata nd;
10976 u_long nameiflags;
10977 vnode_t vp = NULL;
10978 vfs_context_t ctx = vfs_context_current();
10979
10980 AUDIT_ARG(cmd, uap->cmd);
10981 AUDIT_ARG(value32, uap->options);
10982 /* Get the vnode for the file we are getting info on: */
10983 nameiflags = 0;
10984 //
10985 // if we come through fsctl() then the file is by definition not open.
10986 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
10987 // lest the caller mistakenly think the only open is their own (when in
10988 // reality it's someone else's).
10989 //
10990 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
10991 return EINVAL;
10992 }
10993 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10994 nameiflags |= FOLLOW;
10995 }
10996 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10997 UIO_USERSPACE, uap->path, ctx);
10998 if ((error = namei(&nd))) {
10999 goto done;
11000 }
11001 vp = nd.ni_vp;
11002 nameidone(&nd);
11003
11004 #if CONFIG_MACF
11005 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11006 if (error) {
11007 goto done;
11008 }
11009 #endif
11010
11011 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11012
11013 done:
11014 if (vp) {
11015 vnode_put(vp);
11016 }
11017 return error;
11018 }
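/*
 * Userland usage sketch for the entry point above (illustrative only; it
 * assumes the fsctl(2) wrapper plus the FSIOC_SYNC_VOLUME and FSCTL_SYNC_WAIT
 * definitions from <sys/fsctl.h>, and the volume path is arbitrary):
 *
 *     #include <sys/fsctl.h>
 *     #include <stdio.h>
 *
 *     // Ask the volume containing "/Volumes/Data" to sync, waiting for the
 *     // writes to complete.
 *     uint32_t sync_flags = FSCTL_SYNC_WAIT;
 *     if (fsctl("/Volumes/Data", FSIOC_SYNC_VOLUME, &sync_flags, 0) == -1) {
 *         perror("fsctl(FSIOC_SYNC_VOLUME)");
 *     }
 *
 * The path is resolved by namei() above, MACF gets a chance to veto the
 * command, and the request is then dispatched through fsctl_internal().
 */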
11019 /* ARGSUSED */
11020 int
11021 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11022 {
11023 int error;
11024 vnode_t vp = NULL;
11025 vfs_context_t ctx = vfs_context_current();
11026 int fd = -1;
11027
11028 AUDIT_ARG(fd, uap->fd);
11029 AUDIT_ARG(cmd, uap->cmd);
11030 AUDIT_ARG(value32, uap->options);
11031
11032 /* Get the vnode for the file we are getting info on: */
11033 if ((error = file_vnode(uap->fd, &vp))) {
11034 return error;
11035 }
11036 fd = uap->fd;
11037 if ((error = vnode_getwithref(vp))) {
11038 file_drop(fd);
11039 return error;
11040 }
11041
11042 #if CONFIG_MACF
11043 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11044 file_drop(fd);
11045 vnode_put(vp);
11046 return error;
11047 }
11048 #endif
11049
11050 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11051
11052 file_drop(fd);
11053
11054 /* validate vp; fsctl_internal() can drop the iocount and reset vp to NULL */
11055 if (vp) {
11056 vnode_put(vp);
11057 }
11058
11059 return error;
11060 }
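/*
 * Sketch of the fd-based flavor (illustrative; FSIOC_FD_ONLY_OPEN_ONCE may
 * not be exported by every SDK header, so treat the selector and its exact
 * argument shape as assumptions here):
 *
 *     #include <sys/fsctl.h>
 *     #include <fcntl.h>
 *     #include <errno.h>
 *
 *     int fd = open("/tmp/somefile", O_RDONLY);
 *     if (fd >= 0) {
 *         uint32_t dummy = 0;   // selector takes no meaningful input
 *         // EBUSY: the vnode has more than one use reference, typically
 *         // because another file descriptor also has it open.
 *         if (ffsctl(fd, FSIOC_FD_ONLY_OPEN_ONCE, &dummy, 0) == -1 && errno == EBUSY) {
 *             // someone else has the file open too
 *         }
 *         close(fd);
 *     }
 *
 * This is also why the path-based fsctl() above rejects this selector: a
 * path lookup does not itself open the file, so "open exactly once" could
 * never be attributed to the caller.
 */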
11061 /* end of fsctl system call */
11062
11063 /*
11064 * Retrieve the data of an extended attribute.
11065 */
11066 int
11067 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
11068 {
11069 vnode_t vp;
11070 struct nameidata nd;
11071 char attrname[XATTR_MAXNAMELEN + 1];
11072 vfs_context_t ctx = vfs_context_current();
11073 uio_t auio = NULL;
11074 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11075 size_t attrsize = 0;
11076 size_t namelen;
11077 u_int32_t nameiflags;
11078 int error;
11079 char uio_buf[UIO_SIZEOF(1)];
11080
11081 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11082 return EINVAL;
11083 }
11084
11085 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11086 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
11087 if ((error = namei(&nd))) {
11088 return error;
11089 }
11090 vp = nd.ni_vp;
11091 nameidone(&nd);
11092
11093 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11094 if (error != 0) {
11095 goto out;
11096 }
11097 if (xattr_protected(attrname)) {
11098 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
11099 error = EPERM;
11100 goto out;
11101 }
11102 }
11103 /*
11104 * the specific check for 0xffffffff is a hack to preserve
11105 * binary compatibility in K64 with applications that discovered
11106 * that passing in a buf pointer and a size of -1 resulted in
11107 * just the size of the indicated extended attribute being returned.
11108 * this isn't part of the documented behavior, but because of the
11109 * original implementation's check for "uap->size > 0", this behavior
11110 * was allowed. In K32 that check turned into a signed comparison
11111 * even though uap->size is unsigned... in K64, we blow by that
11112 * check because uap->size is unsigned and doesn't get sign smeared
11113 * in the munger for a 32 bit user app. we also need to add a
11114 * check to limit the maximum size of the buffer being passed in...
11115 * unfortunately, the underlying filesystems seem to just malloc
11116 * the requested size even if the actual extended attribute is tiny.
11117 * because that malloc is for kernel wired memory, we have to put a
11118 * sane limit on it.
11119 *
11120 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
11121 * U64 running on K64 will yield -1 (64 bits wide)
11122 * U32/U64 running on K32 will yield -1 (32 bits wide)
11123 */
11124 if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
11125 goto no_uio;
11126 }
11127
11128 if (uap->value) {
11129 if (uap->size > (size_t)XATTR_MAXSIZE) {
11130 uap->size = XATTR_MAXSIZE;
11131 }
11132
11133 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11134 &uio_buf[0], sizeof(uio_buf));
11135 uio_addiov(auio, uap->value, uap->size);
11136 }
11137 no_uio:
11138 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
11139 out:
11140 vnode_put(vp);
11141
11142 if (auio) {
11143 *retval = uap->size - uio_resid(auio);
11144 } else {
11145 *retval = (user_ssize_t)attrsize;
11146 }
11147
11148 return error;
11149 }
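/*
 * Typical userland pattern for the syscall above (sketch; it uses the
 * <sys/xattr.h> wrappers, and the path and attribute name are arbitrary
 * examples):
 *
 *     #include <sys/xattr.h>
 *     #include <stdlib.h>
 *
 *     // 1) ask for the size only: a NULL buffer / zero size returns attrsize
 *     ssize_t len = getxattr("/tmp/file", "com.example.note", NULL, 0, 0, XATTR_NOFOLLOW);
 *     if (len >= 0) {
 *         // 2) fetch the value into a right-sized buffer
 *         char *buf = malloc(len);
 *         if (buf && getxattr("/tmp/file", "com.example.note", buf, len, 0, XATTR_NOFOLLOW) >= 0) {
 *             // use buf[0..len)
 *         }
 *         free(buf);
 *     }
 *
 * The size-only query is the documented way to do what the 0xffffffff /
 * (size_t)-1 compatibility hack above preserves for older binaries.
 */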
11150
11151 /*
11152 * Retrieve the data of an extended attribute.
11153 */
11154 int
11155 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
11156 {
11157 vnode_t vp;
11158 char attrname[XATTR_MAXNAMELEN + 1];
11159 uio_t auio = NULL;
11160 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11161 size_t attrsize = 0;
11162 size_t namelen;
11163 int error;
11164 char uio_buf[UIO_SIZEOF(1)];
11165
11166 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11167 return EINVAL;
11168 }
11169
11170 if ((error = file_vnode(uap->fd, &vp))) {
11171 return error;
11172 }
11173 if ((error = vnode_getwithref(vp))) {
11174 file_drop(uap->fd);
11175 return error;
11176 }
11177 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11178 if (error != 0) {
11179 goto out;
11180 }
11181 if (xattr_protected(attrname)) {
11182 error = EPERM;
11183 goto out;
11184 }
11185 if (uap->value && uap->size > 0) {
11186 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11187 &uio_buf[0], sizeof(uio_buf));
11188 uio_addiov(auio, uap->value, uap->size);
11189 }
11190
11191 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
11192 out:
11193 (void)vnode_put(vp);
11194 file_drop(uap->fd);
11195
11196 if (auio) {
11197 *retval = uap->size - uio_resid(auio);
11198 } else {
11199 *retval = (user_ssize_t)attrsize;
11200 }
11201 return error;
11202 }
11203
11204 /*
11205 * Set the data of an extended attribute.
11206 */
11207 int
11208 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
11209 {
11210 vnode_t vp;
11211 struct nameidata nd;
11212 char attrname[XATTR_MAXNAMELEN + 1];
11213 vfs_context_t ctx = vfs_context_current();
11214 uio_t auio = NULL;
11215 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11216 size_t namelen;
11217 u_int32_t nameiflags;
11218 int error;
11219 char uio_buf[UIO_SIZEOF(1)];
11220
11221 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11222 return EINVAL;
11223 }
11224
11225 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11226 if (error != 0) {
11227 if (error == EPERM) {
11228 /* if the string won't fit in attrname, copyinstr emits EPERM */
11229 return ENAMETOOLONG;
11230 }
11231 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11232 return error;
11233 }
11234 if (xattr_protected(attrname)) {
11235 return EPERM;
11236 }
11237 if (uap->size != 0 && uap->value == 0) {
11238 return EINVAL;
11239 }
11240
11241 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11242 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
11243 if ((error = namei(&nd))) {
11244 return error;
11245 }
11246 vp = nd.ni_vp;
11247 nameidone(&nd);
11248
11249 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11250 &uio_buf[0], sizeof(uio_buf));
11251 uio_addiov(auio, uap->value, uap->size);
11252
11253 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
11254 #if CONFIG_FSE
11255 if (error == 0) {
11256 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11257 FSE_ARG_VNODE, vp,
11258 FSE_ARG_DONE);
11259 }
11260 #endif
11261 vnode_put(vp);
11262 *retval = 0;
11263 return error;
11264 }
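/*
 * Matching userland sketch for the setter (the attribute name and value are
 * arbitrary examples):
 *
 *     #include <sys/xattr.h>
 *     #include <string.h>
 *
 *     const char *val = "hello";
 *     if (setxattr("/tmp/file", "com.example.note", val, strlen(val), 0, 0) == -1) {
 *         // ENAMETOOLONG: the attribute name exceeded XATTR_MAXNAMELEN
 *         // EPERM: typically a protected name (e.g. the com.apple.system. space)
 *     }
 */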
11265
11266 /*
11267 * Set the data of an extended attribute.
11268 */
11269 int
11270 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11271 {
11272 vnode_t vp;
11273 char attrname[XATTR_MAXNAMELEN + 1];
11274 uio_t auio = NULL;
11275 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11276 size_t namelen;
11277 int error;
11278 char uio_buf[UIO_SIZEOF(1)];
11279 #if CONFIG_FSE
11280 vfs_context_t ctx = vfs_context_current();
11281 #endif
11282
11283 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11284 return EINVAL;
11285 }
11286
11287 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11288 if (error != 0) {
11289 if (error == EPERM) {
11290 /* if the string won't fit in attrname, copyinstr emits EPERM */
11291 return ENAMETOOLONG;
11292 }
11293 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11294 return error;
11295 }
11296 if (xattr_protected(attrname)) {
11297 return EPERM;
11298 }
11299 if (uap->size != 0 && uap->value == 0) {
11300 return EINVAL;
11301 }
11302 if ((error = file_vnode(uap->fd, &vp))) {
11303 return error;
11304 }
11305 if ((error = vnode_getwithref(vp))) {
11306 file_drop(uap->fd);
11307 return error;
11308 }
11309 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11310 &uio_buf[0], sizeof(uio_buf));
11311 uio_addiov(auio, uap->value, uap->size);
11312
11313 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11314 #if CONFIG_FSE
11315 if (error == 0) {
11316 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11317 FSE_ARG_VNODE, vp,
11318 FSE_ARG_DONE);
11319 }
11320 #endif
11321 vnode_put(vp);
11322 file_drop(uap->fd);
11323 *retval = 0;
11324 return error;
11325 }
11326
11327 /*
11328 * Remove an extended attribute.
11329 * XXX Code duplication here.
11330 */
11331 int
11332 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
11333 {
11334 vnode_t vp;
11335 struct nameidata nd;
11336 char attrname[XATTR_MAXNAMELEN + 1];
11337 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11338 vfs_context_t ctx = vfs_context_current();
11339 size_t namelen;
11340 u_int32_t nameiflags;
11341 int error;
11342
11343 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11344 return EINVAL;
11345 }
11346
11347 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11348 if (error != 0) {
11349 return error;
11350 }
11351 if (xattr_protected(attrname)) {
11352 return EPERM;
11353 }
11354 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11355 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
11356 if ((error = namei(&nd))) {
11357 return error;
11358 }
11359 vp = nd.ni_vp;
11360 nameidone(&nd);
11361
11362 error = vn_removexattr(vp, attrname, uap->options, ctx);
11363 #if CONFIG_FSE
11364 if (error == 0) {
11365 add_fsevent(FSE_XATTR_REMOVED, ctx,
11366 FSE_ARG_VNODE, vp,
11367 FSE_ARG_DONE);
11368 }
11369 #endif
11370 vnode_put(vp);
11371 *retval = 0;
11372 return error;
11373 }
11374
11375 /*
11376 * Remove an extended attribute.
11377 * XXX Code duplication here.
11378 */
11379 int
11380 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
11381 {
11382 vnode_t vp;
11383 char attrname[XATTR_MAXNAMELEN + 1];
11384 size_t namelen;
11385 int error;
11386 #if CONFIG_FSE
11387 vfs_context_t ctx = vfs_context_current();
11388 #endif
11389
11390 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11391 return EINVAL;
11392 }
11393
11394 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11395 if (error != 0) {
11396 return error;
11397 }
11398 if (xattr_protected(attrname)) {
11399 return EPERM;
11400 }
11401 if ((error = file_vnode(uap->fd, &vp))) {
11402 return error;
11403 }
11404 if ((error = vnode_getwithref(vp))) {
11405 file_drop(uap->fd);
11406 return error;
11407 }
11408
11409 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
11410 #if CONFIG_FSE
11411 if (error == 0) {
11412 add_fsevent(FSE_XATTR_REMOVED, ctx,
11413 FSE_ARG_VNODE, vp,
11414 FSE_ARG_DONE);
11415 }
11416 #endif
11417 vnode_put(vp);
11418 file_drop(uap->fd);
11419 *retval = 0;
11420 return error;
11421 }
11422
11423 /*
11424 * Retrieve the list of extended attribute names.
11425 * XXX Code duplication here.
11426 */
11427 int
11428 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11429 {
11430 vnode_t vp;
11431 struct nameidata nd;
11432 vfs_context_t ctx = vfs_context_current();
11433 uio_t auio = NULL;
11434 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11435 size_t attrsize = 0;
11436 u_int32_t nameiflags;
11437 int error;
11438 char uio_buf[UIO_SIZEOF(1)];
11439
11440 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11441 return EINVAL;
11442 }
11443
11444 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11445 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11446 if ((error = namei(&nd))) {
11447 return error;
11448 }
11449 vp = nd.ni_vp;
11450 nameidone(&nd);
11451 if (uap->namebuf != 0 && uap->bufsize > 0) {
11452 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11453 &uio_buf[0], sizeof(uio_buf));
11454 uio_addiov(auio, uap->namebuf, uap->bufsize);
11455 }
11456
11457 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11458
11459 vnode_put(vp);
11460 if (auio) {
11461 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11462 } else {
11463 *retval = (user_ssize_t)attrsize;
11464 }
11465 return error;
11466 }
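/*
 * The name buffer filled in above is a sequence of NUL-terminated strings
 * packed back to back; a userland caller walks it like this (sketch, using
 * the <sys/xattr.h> wrapper with an arbitrary path):
 *
 *     #include <sys/xattr.h>
 *     #include <stdio.h>
 *     #include <stdlib.h>
 *     #include <string.h>
 *
 *     ssize_t len = listxattr("/tmp/file", NULL, 0, XATTR_NOFOLLOW);
 *     if (len > 0) {
 *         char *names = malloc(len);
 *         if (names && (len = listxattr("/tmp/file", names, len, XATTR_NOFOLLOW)) > 0) {
 *             for (char *p = names; p < names + len; p += strlen(p) + 1) {
 *                 printf("%s\n", p);
 *             }
 *         }
 *         free(names);
 *     }
 */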
11467
11468 /*
11469 * Retrieve the list of extended attribute names.
11470 * XXX Code duplication here.
11471 */
11472 int
11473 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11474 {
11475 vnode_t vp;
11476 uio_t auio = NULL;
11477 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11478 size_t attrsize = 0;
11479 int error;
11480 char uio_buf[UIO_SIZEOF(1)];
11481
11482 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11483 return EINVAL;
11484 }
11485
11486 if ((error = file_vnode(uap->fd, &vp))) {
11487 return error;
11488 }
11489 if ((error = vnode_getwithref(vp))) {
11490 file_drop(uap->fd);
11491 return error;
11492 }
11493 if (uap->namebuf != 0 && uap->bufsize > 0) {
11494 auio = uio_createwithbuffer(1, 0, spacetype,
11495 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11496 uio_addiov(auio, uap->namebuf, uap->bufsize);
11497 }
11498
11499 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11500
11501 vnode_put(vp);
11502 file_drop(uap->fd);
11503 if (auio) {
11504 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11505 } else {
11506 *retval = (user_ssize_t)attrsize;
11507 }
11508 return error;
11509 }
11510
11511 static int
11512 fsgetpath_internal(
11513 vfs_context_t ctx, int volfs_id, uint64_t objid,
11514 vm_size_t bufsize, caddr_t buf, int *pathlen)
11515 {
11516 int error;
11517 struct mount *mp = NULL;
11518 vnode_t vp;
11519 int length;
11520 int bpflags;
11521 /* maximum number of times to retry build_path */
11522 unsigned int retries = 0x10;
11523
11524 if (bufsize > PAGE_SIZE) {
11525 return EINVAL;
11526 }
11527
11528 if (buf == NULL) {
11529 return ENOMEM;
11530 }
11531
11532 retry:
11533 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11534 /* unexpected failure */
11535 return ENOTSUP;
11536 }
11537
11538 unionget:
11539 if (objid == 2) {
11540 error = VFS_ROOT(mp, &vp, ctx);
11541 } else {
11542 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11543 }
11544
11545 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11546 /*
11547 * If the fileid isn't found and we're in a union
11548 * mount volume, then see if the fileid is in the
11549 * mounted-on volume.
11550 */
11551 struct mount *tmp = mp;
11552 mp = vnode_mount(tmp->mnt_vnodecovered);
11553 vfs_unbusy(tmp);
11554 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11555 goto unionget;
11556 }
11557 } else {
11558 vfs_unbusy(mp);
11559 }
11560
11561 if (error) {
11562 return error;
11563 }
11564
11565 #if CONFIG_MACF
11566 error = mac_vnode_check_fsgetpath(ctx, vp);
11567 if (error) {
11568 vnode_put(vp);
11569 return error;
11570 }
11571 #endif
11572
11573 /* Obtain the absolute path to this vnode. */
11574 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11575 bpflags |= BUILDPATH_CHECK_MOVED;
11576 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11577 vnode_put(vp);
11578
11579 if (error) {
11580 /* there was a race building the path, try a few more times */
11581 if (error == EAGAIN) {
11582 --retries;
11583 if (retries > 0) {
11584 goto retry;
11585 }
11586
11587 error = ENOENT;
11588 }
11589 goto out;
11590 }
11591
11592 AUDIT_ARG(text, buf);
11593
11594 if (kdebug_enable) {
11595 long dbg_parms[NUMPARMS];
11596 int dbg_namelen;
11597
11598 dbg_namelen = (int)sizeof(dbg_parms);
11599
11600 if (length < dbg_namelen) {
11601 memcpy((char *)dbg_parms, buf, length);
11602 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11603
11604 dbg_namelen = length;
11605 } else {
11606 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11607 }
11608
11609 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11610 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11611 }
11612
11613 *pathlen = length; /* may be superseded by error */
11614
11615 out:
11616 return error;
11617 }
11618
11619 /*
11620 * Obtain the full pathname of a file system object by id.
11621 */
11622 int
11623 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11624 {
11625 vfs_context_t ctx = vfs_context_current();
11626 fsid_t fsid;
11627 char *realpath;
11628 int length;
11629 int error;
11630
11631 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11632 return error;
11633 }
11634 AUDIT_ARG(value32, fsid.val[0]);
11635 AUDIT_ARG(value64, uap->objid);
11636 /* Restrict output buffer size for now. */
11637
11638 if (uap->bufsize > PAGE_SIZE) {
11639 return EINVAL;
11640 }
11641 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
11642 if (realpath == NULL) {
11643 return ENOMEM;
11644 }
11645
11646 error = fsgetpath_internal(
11647 ctx, fsid.val[0], uap->objid,
11648 uap->bufsize, realpath, &length);
11649
11650 if (error) {
11651 goto out;
11652 }
11653
11654 error = copyout((caddr_t)realpath, uap->buf, length);
11655
11656 *retval = (user_ssize_t)length; /* may be superseded by error */
11657 out:
11658 if (realpath) {
11659 FREE(realpath, M_TEMP);
11660 }
11661 return error;
11662 }
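/*
 * Userland sketch (assumes the fsgetpath(2) wrapper available on recent
 * macOS SDKs; header placement varies, <sys/attr.h> is assumed here. The
 * fsid comes from statfs(2) and the object id is just an inode number from
 * stat(2); the path is an arbitrary example):
 *
 *     #include <sys/attr.h>
 *     #include <sys/mount.h>
 *     #include <sys/stat.h>
 *     #include <sys/param.h>
 *
 *     struct statfs sfs;
 *     struct stat st;
 *     char path[MAXPATHLEN];
 *
 *     if (statfs("/tmp/file", &sfs) == 0 && stat("/tmp/file", &st) == 0) {
 *         ssize_t len = fsgetpath(path, sizeof(path), &sfs.f_fsid, (uint64_t)st.st_ino);
 *         // on success, path holds an absolute path for that file id
 *     }
 *
 * Note that the kernel side above caps bufsize at PAGE_SIZE.
 */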
11663
11664 /*
11665 * Common routine to handle various flavors of statfs data heading out
11666 * to user space.
11667 *
11668 * Returns: 0 Success
11669 * EFAULT
11670 */
11671 static int
11672 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11673 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
11674 boolean_t partial_copy)
11675 {
11676 int error;
11677 int my_size, copy_size;
11678
11679 if (is_64_bit) {
11680 struct user64_statfs sfs;
11681 my_size = copy_size = sizeof(sfs);
11682 bzero(&sfs, my_size);
11683 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11684 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11685 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11686 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
11687 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
11688 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
11689 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
11690 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
11691 sfs.f_files = (user64_long_t)sfsp->f_files;
11692 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
11693 sfs.f_fsid = sfsp->f_fsid;
11694 sfs.f_owner = sfsp->f_owner;
11695 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11696 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11697 } else {
11698 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11699 }
11700 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11701 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11702
11703 if (partial_copy) {
11704 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11705 }
11706 error = copyout((caddr_t)&sfs, bufp, copy_size);
11707 } else {
11708 struct user32_statfs sfs;
11709
11710 my_size = copy_size = sizeof(sfs);
11711 bzero(&sfs, my_size);
11712
11713 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11714 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11715 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11716
11717 /*
11718 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
11719 * have to fudge the numbers here in that case. We inflate the blocksize in order
11720 * to reflect the filesystem size as best we can.
11721 */
11722 if ((sfsp->f_blocks > INT_MAX)
11723 /* Hack for 4061702. I think the real fix is for Carbon to
11724 * look for some volume capability and not depend on hidden
11725 * semantics agreed between a FS and carbon.
11726 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
11727 * for Carbon to set bNoVolumeSizes volume attribute.
11728 * Without this the webdavfs files cannot be copied onto
11729 * disk as they look huge. This change should not affect
11730 * XSAN, as it should not be setting these to -1.
11731 */
11732 && (sfsp->f_blocks != 0xffffffffffffffffULL)
11733 && (sfsp->f_bfree != 0xffffffffffffffffULL)
11734 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
11735 int shift;
11736
11737 /*
11738 * Work out how far we have to shift the block count down to make it fit.
11739 * Note that it's possible to have to shift so far that the resulting
11740 * blocksize would be unreportably large. At that point, we will clip
11741 * any values that don't fit.
11742 *
11743 * For safety's sake, we also ensure that f_iosize is never reported as
11744 * being smaller than f_bsize.
11745 */
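/*
 * Worked example (illustrative): a volume with 4 KiB blocks and 3 * 2^31
 * of them (about 24 TiB) needs shift == 2, so a 32-bit caller sees
 * 3 * 2^29 blocks of 16 KiB -- the same 24 TiB total. Any count that still
 * does not fit after shifting is clipped to INT_MAX below.
 */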
11746 for (shift = 0; shift < 32; shift++) {
11747 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
11748 break;
11749 }
11750 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
11751 break;
11752 }
11753 }
11754 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
11755 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
11756 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
11757 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
11758 #undef __SHIFT_OR_CLIP
11759 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
11760 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
11761 } else {
11762 /* filesystem is small enough to be reported honestly */
11763 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
11764 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
11765 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
11766 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
11767 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
11768 }
11769 sfs.f_files = (user32_long_t)sfsp->f_files;
11770 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
11771 sfs.f_fsid = sfsp->f_fsid;
11772 sfs.f_owner = sfsp->f_owner;
11773 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11774 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11775 } else {
11776 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11777 }
11778 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11779 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11780
11781 if (partial_copy) {
11782 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11783 }
11784 error = copyout((caddr_t)&sfs, bufp, copy_size);
11785 }
11786
11787 if (sizep != NULL) {
11788 *sizep = my_size;
11789 }
11790 return error;
11791 }
11792
11793 /*
11794 * copy stat structure into user_stat structure.
11795 */
11796 void
11797 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11798 {
11799 bzero(usbp, sizeof(*usbp));
11800
11801 usbp->st_dev = sbp->st_dev;
11802 usbp->st_ino = sbp->st_ino;
11803 usbp->st_mode = sbp->st_mode;
11804 usbp->st_nlink = sbp->st_nlink;
11805 usbp->st_uid = sbp->st_uid;
11806 usbp->st_gid = sbp->st_gid;
11807 usbp->st_rdev = sbp->st_rdev;
11808 #ifndef _POSIX_C_SOURCE
11809 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11810 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11811 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11812 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11813 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11814 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11815 #else
11816 usbp->st_atime = sbp->st_atime;
11817 usbp->st_atimensec = sbp->st_atimensec;
11818 usbp->st_mtime = sbp->st_mtime;
11819 usbp->st_mtimensec = sbp->st_mtimensec;
11820 usbp->st_ctime = sbp->st_ctime;
11821 usbp->st_ctimensec = sbp->st_ctimensec;
11822 #endif
11823 usbp->st_size = sbp->st_size;
11824 usbp->st_blocks = sbp->st_blocks;
11825 usbp->st_blksize = sbp->st_blksize;
11826 usbp->st_flags = sbp->st_flags;
11827 usbp->st_gen = sbp->st_gen;
11828 usbp->st_lspare = sbp->st_lspare;
11829 usbp->st_qspare[0] = sbp->st_qspare[0];
11830 usbp->st_qspare[1] = sbp->st_qspare[1];
11831 }
11832
11833 void
11834 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
11835 {
11836 bzero(usbp, sizeof(*usbp));
11837
11838 usbp->st_dev = sbp->st_dev;
11839 usbp->st_ino = sbp->st_ino;
11840 usbp->st_mode = sbp->st_mode;
11841 usbp->st_nlink = sbp->st_nlink;
11842 usbp->st_uid = sbp->st_uid;
11843 usbp->st_gid = sbp->st_gid;
11844 usbp->st_rdev = sbp->st_rdev;
11845 #ifndef _POSIX_C_SOURCE
11846 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11847 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11848 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11849 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11850 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11851 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11852 #else
11853 usbp->st_atime = sbp->st_atime;
11854 usbp->st_atimensec = sbp->st_atimensec;
11855 usbp->st_mtime = sbp->st_mtime;
11856 usbp->st_mtimensec = sbp->st_mtimensec;
11857 usbp->st_ctime = sbp->st_ctime;
11858 usbp->st_ctimensec = sbp->st_ctimensec;
11859 #endif
11860 usbp->st_size = sbp->st_size;
11861 usbp->st_blocks = sbp->st_blocks;
11862 usbp->st_blksize = sbp->st_blksize;
11863 usbp->st_flags = sbp->st_flags;
11864 usbp->st_gen = sbp->st_gen;
11865 usbp->st_lspare = sbp->st_lspare;
11866 usbp->st_qspare[0] = sbp->st_qspare[0];
11867 usbp->st_qspare[1] = sbp->st_qspare[1];
11868 }
11869
11870 /*
11871 * copy stat64 structure into user_stat64 structure.
11872 */
11873 void
11874 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11875 {
11876 bzero(usbp, sizeof(*usbp));
11877
11878 usbp->st_dev = sbp->st_dev;
11879 usbp->st_ino = sbp->st_ino;
11880 usbp->st_mode = sbp->st_mode;
11881 usbp->st_nlink = sbp->st_nlink;
11882 usbp->st_uid = sbp->st_uid;
11883 usbp->st_gid = sbp->st_gid;
11884 usbp->st_rdev = sbp->st_rdev;
11885 #ifndef _POSIX_C_SOURCE
11886 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11887 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11888 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11889 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11890 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11891 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11892 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11893 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11894 #else
11895 usbp->st_atime = sbp->st_atime;
11896 usbp->st_atimensec = sbp->st_atimensec;
11897 usbp->st_mtime = sbp->st_mtime;
11898 usbp->st_mtimensec = sbp->st_mtimensec;
11899 usbp->st_ctime = sbp->st_ctime;
11900 usbp->st_ctimensec = sbp->st_ctimensec;
11901 usbp->st_birthtime = sbp->st_birthtime;
11902 usbp->st_birthtimensec = sbp->st_birthtimensec;
11903 #endif
11904 usbp->st_size = sbp->st_size;
11905 usbp->st_blocks = sbp->st_blocks;
11906 usbp->st_blksize = sbp->st_blksize;
11907 usbp->st_flags = sbp->st_flags;
11908 usbp->st_gen = sbp->st_gen;
11909 usbp->st_lspare = sbp->st_lspare;
11910 usbp->st_qspare[0] = sbp->st_qspare[0];
11911 usbp->st_qspare[1] = sbp->st_qspare[1];
11912 }
11913
11914 void
11915 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
11916 {
11917 bzero(usbp, sizeof(*usbp));
11918
11919 usbp->st_dev = sbp->st_dev;
11920 usbp->st_ino = sbp->st_ino;
11921 usbp->st_mode = sbp->st_mode;
11922 usbp->st_nlink = sbp->st_nlink;
11923 usbp->st_uid = sbp->st_uid;
11924 usbp->st_gid = sbp->st_gid;
11925 usbp->st_rdev = sbp->st_rdev;
11926 #ifndef _POSIX_C_SOURCE
11927 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11928 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11929 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11930 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11931 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11932 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11933 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11934 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11935 #else
11936 usbp->st_atime = sbp->st_atime;
11937 usbp->st_atimensec = sbp->st_atimensec;
11938 usbp->st_mtime = sbp->st_mtime;
11939 usbp->st_mtimensec = sbp->st_mtimensec;
11940 usbp->st_ctime = sbp->st_ctime;
11941 usbp->st_ctimensec = sbp->st_ctimensec;
11942 usbp->st_birthtime = sbp->st_birthtime;
11943 usbp->st_birthtimensec = sbp->st_birthtimensec;
11944 #endif
11945 usbp->st_size = sbp->st_size;
11946 usbp->st_blocks = sbp->st_blocks;
11947 usbp->st_blksize = sbp->st_blksize;
11948 usbp->st_flags = sbp->st_flags;
11949 usbp->st_gen = sbp->st_gen;
11950 usbp->st_lspare = sbp->st_lspare;
11951 usbp->st_qspare[0] = sbp->st_qspare[0];
11952 usbp->st_qspare[1] = sbp->st_qspare[1];
11953 }
11954
11955 /*
11956 * Purge buffer cache for simulating cold starts
11957 */
11958 static int
11959 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11960 {
11961 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11962
11963 return VNODE_RETURNED;
11964 }
11965
11966 static int
11967 vfs_purge_callback(mount_t mp, __unused void * arg)
11968 {
11969 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11970
11971 return VFS_RETURNED;
11972 }
11973
11974 int
11975 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11976 {
11977 if (!kauth_cred_issuser(kauth_cred_get())) {
11978 return EPERM;
11979 }
11980
11981 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
11982
11983 return 0;
11984 }
11985
11986 /*
11987 * gets the vnode associated with the (unnamed) snapshot directory
11988 * for a Filesystem. The snapshot directory vnode is returned with
11989 * an iocount on it.
11990 */
11991 int
11992 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11993 {
11994 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
11995 }
11996
11997 /*
11998 * Get the snapshot vnode.
11999 *
12000 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
12001 * needs nameidone() on ndp.
12002 *
12003 * If the snapshot vnode exists it is returned in ndp->ni_vp.
12004 *
12005 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
12006 * not needed.
12007 */
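/*
 * Expected caller pattern (a sketch mirroring the contract above; the
 * snapshot_* helpers below follow the same shape):
 *
 *     vnode_t rvp, snapdvp;
 *     struct nameidata nd;
 *
 *     error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &nd, LOOKUP, OP_LOOKUP, ctx);
 *     if (error == 0) {
 *         // ... use nd.ni_vp (the snapshot vnode, if it exists), then:
 *         if (nd.ni_vp) {
 *             vnode_put(nd.ni_vp);
 *         }
 *         nameidone(&nd);
 *         vnode_put(snapdvp);
 *         vnode_put(rvp);
 *     }
 */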
12008 static int
12009 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
12010 user_addr_t name, struct nameidata *ndp, int32_t op,
12011 #if !CONFIG_TRIGGERS
12012 __unused
12013 #endif
12014 enum path_operation pathop,
12015 vfs_context_t ctx)
12016 {
12017 int error, i;
12018 caddr_t name_buf;
12019 size_t name_len;
12020 struct vfs_attr vfa;
12021
12022 *sdvpp = NULLVP;
12023 *rvpp = NULLVP;
12024
12025 error = vnode_getfromfd(ctx, dirfd, rvpp);
12026 if (error) {
12027 return error;
12028 }
12029
12030 if (!vnode_isvroot(*rvpp)) {
12031 error = EINVAL;
12032 goto out;
12033 }
12034
12035 /* Make sure the filesystem supports snapshots */
12036 VFSATTR_INIT(&vfa);
12037 VFSATTR_WANTED(&vfa, f_capabilities);
12038 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
12039 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
12040 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
12041 VOL_CAP_INT_SNAPSHOT)) ||
12042 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
12043 VOL_CAP_INT_SNAPSHOT))) {
12044 error = ENOTSUP;
12045 goto out;
12046 }
12047
12048 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
12049 if (error) {
12050 goto out;
12051 }
12052
12053 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12054 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12055 if (error) {
12056 goto out1;
12057 }
12058
12059 /*
12060 * Some sanity checks: name can't be empty, "." or "..", or have slashes.
12061 * (the length returned by copyinstr includes the terminating NUL)
12062 */
12063 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
12064 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
12065 error = EINVAL;
12066 goto out1;
12067 }
12068 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
12069 ;
12070 }
12071 if (i < (int)name_len) {
12072 error = EINVAL;
12073 goto out1;
12074 }
12075
12076 #if CONFIG_MACF
12077 if (op == CREATE) {
12078 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
12079 name_buf);
12080 } else if (op == DELETE) {
12081 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
12082 name_buf);
12083 }
12084 if (error) {
12085 goto out1;
12086 }
12087 #endif
12088
12089 /* Check if the snapshot already exists ... */
12090 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
12091 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
12092 ndp->ni_dvp = *sdvpp;
12093
12094 error = namei(ndp);
12095 out1:
12096 FREE(name_buf, M_TEMP);
12097 out:
12098 if (error) {
12099 if (*sdvpp) {
12100 vnode_put(*sdvpp);
12101 *sdvpp = NULLVP;
12102 }
12103 if (*rvpp) {
12104 vnode_put(*rvpp);
12105 *rvpp = NULLVP;
12106 }
12107 }
12108 return error;
12109 }
12110
12111 /*
12112 * create a filesystem snapshot (for supporting filesystems)
12113 *
12114 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
12115 * We get to the (unnamed) snapshot directory vnode and create the vnode
12116 * for the snapshot in it.
12117 *
12118 * Restrictions:
12119 *
12120 * a) Passed in name for snapshot cannot have slashes.
12121 * b) name can't be "." or ".."
12122 *
12123 * Since this requires superuser privileges, vnode_authorize calls are not
12124 * made.
12125 */
12126 static int
12127 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
12128 vfs_context_t ctx)
12129 {
12130 vnode_t rvp, snapdvp;
12131 int error;
12132 struct nameidata namend;
12133
12134 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
12135 OP_LINK, ctx);
12136 if (error) {
12137 return error;
12138 }
12139
12140 if (namend.ni_vp) {
12141 vnode_put(namend.ni_vp);
12142 error = EEXIST;
12143 } else {
12144 struct vnode_attr va;
12145 vnode_t vp = NULLVP;
12146
12147 VATTR_INIT(&va);
12148 VATTR_SET(&va, va_type, VREG);
12149 VATTR_SET(&va, va_mode, 0);
12150
12151 error = vn_create(snapdvp, &vp, &namend, &va,
12152 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
12153 if (!error && vp) {
12154 vnode_put(vp);
12155 }
12156 }
12157
12158 nameidone(&namend);
12159 vnode_put(snapdvp);
12160 vnode_put(rvp);
12161 return error;
12162 }
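/*
 * From userland this is normally reached through fs_snapshot_create(2)
 * (sketch; the <sys/snapshot.h> wrapper, its privilege/entitlement
 * requirements, and the snapshot name "mybackup" are assumptions here):
 *
 *     #include <sys/snapshot.h>
 *     #include <fcntl.h>
 *
 *     int dirfd = open("/", O_RDONLY);
 *     if (dirfd >= 0) {
 *         if (fs_snapshot_create(dirfd, "mybackup", 0) == -1) {
 *             // EEXIST: a snapshot with that name already exists
 *             // EINVAL: name contained '/', was "." or "..", or dirfd is not a volume root
 *         }
 *         close(dirfd);
 *     }
 */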
12163
12164 /*
12165 * Delete a Filesystem snapshot
12166 *
12167 * get the vnode for the unnamed snapshot directory and the snapshot and
12168 * delete the snapshot.
12169 */
12170 static int
12171 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12172 vfs_context_t ctx)
12173 {
12174 vnode_t rvp, snapdvp;
12175 int error;
12176 struct nameidata namend;
12177
12178 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12179 OP_UNLINK, ctx);
12180 if (error) {
12181 goto out;
12182 }
12183
12184 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12185 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12186
12187 vnode_put(namend.ni_vp);
12188 nameidone(&namend);
12189 vnode_put(snapdvp);
12190 vnode_put(rvp);
12191 out:
12192 return error;
12193 }
12194
12195 /*
12196 * Revert a filesystem to a snapshot
12197 *
12198 * Marks the filesystem to revert to the given snapshot on next mount.
12199 */
12200 static int
12201 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
12202 vfs_context_t ctx)
12203 {
12204 int error;
12205 vnode_t rvp;
12206 mount_t mp;
12207 struct fs_snapshot_revert_args revert_data;
12208 struct componentname cnp;
12209 caddr_t name_buf;
12210 size_t name_len;
12211
12212 error = vnode_getfromfd(ctx, dirfd, &rvp);
12213 if (error) {
12214 return error;
12215 }
12216 mp = vnode_mount(rvp);
12217
12218 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12219 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12220 if (error) {
12221 FREE(name_buf, M_TEMP);
12222 vnode_put(rvp);
12223 return error;
12224 }
12225
12226 #if CONFIG_MACF
12227 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
12228 if (error) {
12229 FREE(name_buf, M_TEMP);
12230 vnode_put(rvp);
12231 return error;
12232 }
12233 #endif
12234
12235 /*
12236 * Grab mount_iterref so that we can release the vnode,
12237 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
12238 */
12239 error = mount_iterref(mp, 0);
12240 vnode_put(rvp);
12241 if (error) {
12242 FREE(name_buf, M_TEMP);
12243 return error;
12244 }
12245
12246 memset(&cnp, 0, sizeof(cnp));
12247 cnp.cn_pnbuf = (char *)name_buf;
12248 cnp.cn_nameiop = LOOKUP;
12249 cnp.cn_flags = ISLASTCN | HASBUF;
12250 cnp.cn_pnlen = MAXPATHLEN;
12251 cnp.cn_nameptr = cnp.cn_pnbuf;
12252 cnp.cn_namelen = (int)name_len;
12253 revert_data.sr_cnp = &cnp;
12254
12255 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
12256 mount_iterdrop(mp);
12257 FREE(name_buf, M_TEMP);
12258
12259 if (error) {
12260 /* If there was any error, try again using VNOP_IOCTL */
12261
12262 vnode_t snapdvp;
12263 struct nameidata namend;
12264
12265 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
12266 OP_LOOKUP, ctx);
12267 if (error) {
12268 return error;
12269 }
12270
12271
12272 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
12273 0, ctx);
12274
12275 vnode_put(namend.ni_vp);
12276 nameidone(&namend);
12277 vnode_put(snapdvp);
12278 vnode_put(rvp);
12279 }
12280
12281 return error;
12282 }
12283
12284 /*
12285 * rename a Filesystem snapshot
12286 *
12287 * get the vnode for the unnamed snapshot directory and the snapshot and
12288 * rename the snapshot. This is a very specialised (and simple) case of
12289 * rename(2) (which has to deal with a lot more complications). It differs
12290 * slightly from rename(2) in that EEXIST is returned if the new name exists.
12291 */
12292 static int
12293 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
12294 __unused uint32_t flags, vfs_context_t ctx)
12295 {
12296 vnode_t rvp, snapdvp;
12297 int error, i;
12298 caddr_t newname_buf;
12299 size_t name_len;
12300 vnode_t fvp;
12301 struct nameidata *fromnd, *tond;
12302 /* carving out a chunk for structs that are too big to be on stack. */
12303 struct {
12304 struct nameidata from_node;
12305 struct nameidata to_node;
12306 } * __rename_data;
12307
12308 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
12309 fromnd = &__rename_data->from_node;
12310 tond = &__rename_data->to_node;
12311
12312 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
12313 OP_UNLINK, ctx);
12314 if (error) {
12315 goto out;
12316 }
12317 fvp = fromnd->ni_vp;
12318
12319 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12320 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
12321 if (error) {
12322 goto out1;
12323 }
12324
12325 /*
12326 * Some sanity checks: the new name can't be empty, "." or "..", or
12327 * contain slashes.
12328 * (The length returned by copyinstr includes the terminating NUL.)
12329 *
12330 * The FS rename VNOP is supposed to handle this, but we catch it
12331 * here as well.
12332 */
12333 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
12334 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
12335 error = EINVAL;
12336 goto out1;
12337 }
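	/* Scan for a '/' anywhere in the new name; reject the name if one is found. */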
12338 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
12339 ;
12340 }
12341 if (i < (int)name_len) {
12342 error = EINVAL;
12343 goto out1;
12344 }
12345
12346 #if CONFIG_MACF
12347 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
12348 newname_buf);
12349 if (error) {
12350 goto out1;
12351 }
12352 #endif
12353
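	/*
	 * Set up the lookup for the new name relative to the snapshot
	 * directory (USEDVP with ni_dvp = snapdvp). The name has already
	 * been copied into the kernel, hence UIO_SYSSPACE.
	 */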
12354 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
12355 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
12356 tond->ni_dvp = snapdvp;
12357
12358 error = namei(tond);
12359 if (error) {
12360 goto out2;
12361 } else if (tond->ni_vp) {
12362 /*
12363 * snapshot rename behaves differently than rename(2) - if the
12364 * new name exists, EEXIST is returned.
12365 */
12366 vnode_put(tond->ni_vp);
12367 error = EEXIST;
12368 goto out2;
12369 }
12370
12371 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
12372 &tond->ni_cnd, ctx);
12373
12374 out2:
12375 nameidone(tond);
12376 out1:
12377 FREE(newname_buf, M_TEMP);
12378 vnode_put(fvp);
12379 vnode_put(snapdvp);
12380 vnode_put(rvp);
12381 nameidone(fromnd);
12382 out:
12383 FREE(__rename_data, M_TEMP);
12384 return error;
12385 }
12386
12387 /*
12388 * Mount a filesystem snapshot
12389 *
12390 * Get the vnode for the unnamed snapshot directory and the snapshot and
12391 * mount the snapshot.
12392 */
12393 static int
12394 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
12395 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
12396 {
12397 vnode_t rvp, snapdvp, snapvp, vp, pvp;
12398 int error;
12399 struct nameidata *snapndp, *dirndp;
12400 /* carving out a chunk for structs that are too big to be on the stack. */
12401 struct {
12402 struct nameidata snapnd;
12403 struct nameidata dirnd;
12404 } * __snapshot_mount_data;
12405
12406 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
12407 M_TEMP, M_WAITOK);
12408 snapndp = &__snapshot_mount_data->snapnd;
12409 dirndp = &__snapshot_mount_data->dirnd;
12410
12411 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
12412 OP_LOOKUP, ctx);
12413 if (error) {
12414 goto out;
12415 }
12416
12417 snapvp = snapndp->ni_vp;
12418 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
12419 error = EIO;
12420 goto out1;
12421 }
12422
12423 /* Get the vnode to be covered */
12424 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
12425 UIO_USERSPACE, directory, ctx);
12426 error = namei(dirndp);
12427 if (error) {
12428 goto out1;
12429 }
12430
12431 vp = dirndp->ni_vp;
12432 pvp = dirndp->ni_dvp;
12433
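	/* Don't allow a snapshot to be mounted over the root of the root filesystem. */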
12434 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
12435 error = EINVAL;
12436 } else {
12437 mount_t mp = vnode_mount(rvp);
12438 struct fs_snapshot_mount_args smnt_data;
12439
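		/*
		 * Hand the snapshot's source mount and component name to
		 * mount_common(); KERNEL_MOUNT_SNAPSHOT marks this as a
		 * snapshot mount.
		 */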
12440 smnt_data.sm_mp = mp;
12441 smnt_data.sm_cnp = &snapndp->ni_cnd;
12442 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
12443 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
12444 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
12445 }
12446
12447 vnode_put(vp);
12448 vnode_put(pvp);
12449 nameidone(dirndp);
12450 out1:
12451 vnode_put(snapvp);
12452 vnode_put(snapdvp);
12453 vnode_put(rvp);
12454 nameidone(snapndp);
12455 out:
12456 FREE(__snapshot_mount_data, M_TEMP);
12457 return error;
12458 }
12459
12460 /*
12461 * Root from a snapshot of the filesystem
12462 *
12463 * Marks the filesystem to root from the given snapshot on next boot.
12464 */
12465 static int
12466 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12467 vfs_context_t ctx)
12468 {
12469 int error;
12470 vnode_t rvp;
12471 mount_t mp;
12472 struct fs_snapshot_root_args root_data;
12473 struct componentname cnp;
12474 caddr_t name_buf;
12475 size_t name_len;
12476
12477 error = vnode_getfromfd(ctx, dirfd, &rvp);
12478 if (error) {
12479 return error;
12480 }
12481 mp = vnode_mount(rvp);
12482
12483 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12484 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12485 if (error) {
12486 FREE(name_buf, M_TEMP);
12487 vnode_put(rvp);
12488 return error;
12489 }
12490
12491 // XXX MAC checks ?
12492
12493 /*
12494 * Grab mount_iterref so that we can release the vnode,
12495 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12496 */
12497 error = mount_iterref(mp, 0);
12498 vnode_put(rvp);
12499 if (error) {
12500 FREE(name_buf, M_TEMP);
12501 return error;
12502 }
12503
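	/*
	 * As in snapshot_revert(), hand-build a minimal componentname
	 * carrying just the snapshot name for the VFSIOC_ROOT_SNAPSHOT ioctl.
	 */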
12504 memset(&cnp, 0, sizeof(cnp));
12505 cnp.cn_pnbuf = (char *)name_buf;
12506 cnp.cn_nameiop = LOOKUP;
12507 cnp.cn_flags = ISLASTCN | HASBUF;
12508 cnp.cn_pnlen = MAXPATHLEN;
12509 cnp.cn_nameptr = cnp.cn_pnbuf;
12510 cnp.cn_namelen = (int)name_len;
12511 root_data.sr_cnp = &cnp;
12512
12513 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12514
12515 mount_iterdrop(mp);
12516 FREE(name_buf, M_TEMP);
12517
12518 return error;
12519 }
12520
12521 /*
12522 * FS snapshot operations dispatcher
12523 */
12524 int
12525 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12526 __unused int32_t *retval)
12527 {
12528 int error;
12529 vfs_context_t ctx = vfs_context_current();
12530
12531 AUDIT_ARG(fd, uap->dirfd);
12532 AUDIT_ARG(value32, uap->op);
12533
12534 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12535 if (error) {
12536 return error;
12537 }
12538
12539 /*
12540 * Enforce user authorization for snapshot modification operations
12541 */
12542 if ((uap->op != SNAPSHOT_OP_MOUNT) &&
12543 (uap->op != SNAPSHOT_OP_ROOT)) {
12544 vnode_t dvp = NULLVP;
12545 vnode_t devvp = NULLVP;
12546 mount_t mp;
12547
12548 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
12549 if (error) {
12550 return error;
12551 }
12552 mp = vnode_mount(dvp);
12553 devvp = mp->mnt_devvp;
12554
12555 /* get an iocount on devvp */
12556 if (devvp == NULLVP) {
12557 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
12558 /* for mounts that aren't block devices */
12559 if (error == ENOENT) {
12560 error = ENXIO;
12561 }
12562 } else {
12563 error = vnode_getwithref(devvp);
12564 }
12565
12566 if (error) {
12567 vnode_put(dvp);
12568 return error;
12569 }
12570
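		/*
		 * Snapshot modification requires either superuser privileges
		 * or write access to the mount's backing device vnode.
		 */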
12571 if ((vfs_context_issuser(ctx) == 0) &&
12572 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
12573 error = EPERM;
12574 }
12575 vnode_put(dvp);
12576 vnode_put(devvp);
12577
12578 if (error) {
12579 return error;
12580 }
12581 }
12582
12583 switch (uap->op) {
12584 case SNAPSHOT_OP_CREATE:
12585 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12586 break;
12587 case SNAPSHOT_OP_DELETE:
12588 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12589 break;
12590 case SNAPSHOT_OP_RENAME:
12591 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12592 uap->flags, ctx);
12593 break;
12594 case SNAPSHOT_OP_MOUNT:
12595 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12596 uap->data, uap->flags, ctx);
12597 break;
12598 case SNAPSHOT_OP_REVERT:
12599 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12600 break;
12601 #if CONFIG_MNT_ROOTSNAP
12602 case SNAPSHOT_OP_ROOT:
12603 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12604 break;
12605 #endif /* CONFIG_MNT_ROOTSNAP */
12606 default:
12607 error = ENOSYS;
12608 }
12609
12610 return error;
12611 }
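
/*
 * Illustrative userspace sketch of how the dispatcher above is reached,
 * kept under #if 0 since it is not part of the kernel build. The
 * fs_snapshot_create() wrapper, the <sys/snapshot.h> header and the example
 * volume path are assumptions based on the private snapshot interface and
 * may differ by release; only SNAPSHOT_OP_CREATE is taken from this file.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/snapshot.h>	/* assumed private header providing fs_snapshot_create() */

static int
make_snapshot_example(const char *volume, const char *snapname)
{
	/* any fd referencing the target volume will do, e.g. its mount point */
	int dirfd = open(volume, O_RDONLY);
	int error;

	if (dirfd < 0) {
		return -1;
	}
	/* ends up in fs_snapshot() above with op == SNAPSHOT_OP_CREATE */
	error = fs_snapshot_create(dirfd, snapname, 0);
	close(dirfd);
	return error;
}
#endif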