apple/xnu.git: bsd/vfs/vfs_syscalls.c (blob dccc77bd61fddc787e9cffa80600b6c40e86cf55)
1 /*
2 * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/xattr.h>
98 #include <sys/fcntl.h>
99 #include <sys/fsctl.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/disk.h>
102 #include <sys/content_protection.h>
103 #include <sys/clonefile.h>
104 #include <sys/snapshot.h>
105 #include <sys/priv.h>
106 #include <machine/cons.h>
107 #include <machine/limits.h>
108 #include <miscfs/specfs/specdev.h>
109
110 #include <vfs/vfs_disk_conditioner.h>
111
112 #include <security/audit/audit.h>
113 #include <bsm/audit_kevents.h>
114
115 #include <mach/mach_types.h>
116 #include <kern/kern_types.h>
117 #include <kern/kalloc.h>
118 #include <kern/task.h>
119
120 #include <vm/vm_pageout.h>
121 #include <vm/vm_protos.h>
122
123 #include <libkern/OSAtomic.h>
124 #include <pexpert/pexpert.h>
125 #include <IOKit/IOBSD.h>
126
127 #if ROUTEFS
128 #include <miscfs/routefs/routefs.h>
129 #endif /* ROUTEFS */
130
131 #if CONFIG_MACF
132 #include <security/mac.h>
133 #include <security/mac_framework.h>
134 #endif
135
136 #if CONFIG_FSE
137 #define GET_PATH(x) \
138 (x) = get_pathbuff();
139 #define RELEASE_PATH(x) \
140 release_pathbuff(x);
141 #else
142 #define GET_PATH(x) \
143 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
144 #define RELEASE_PATH(x) \
145 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
146 #endif /* CONFIG_FSE */
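/*
 * Illustrative sketch, not part of the original source: GET_PATH() and
 * RELEASE_PATH() above bracket a temporary MAXPATHLEN buffer (from the
 * fsevents path-buffer pool when CONFIG_FSE is set, otherwise from the
 * M_NAMEI zone).  A minimal usage pattern, with a hypothetical helper
 * name, might look like the following.
 */
#if 0	/* example only, never compiled */
static void
example_log_vnode_path(vnode_t vp)
{
	char *path;
	int len = MAXPATHLEN;

	GET_PATH(path);
	/* vn_getpath() fills the caller-supplied buffer with the vnode's path */
	if (vn_getpath(vp, path, &len) == 0) {
		printf("example: vnode path is %s\n", path);
	}
	RELEASE_PATH(path);
}
#endif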
147
148 #ifndef HFS_GET_BOOT_INFO
149 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
150 #endif
151
152 #ifndef HFS_SET_BOOT_INFO
153 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
154 #endif
155
156 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
157 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
158 #endif
159
160 extern void disk_conditioner_unmount(mount_t mp);
161
162 /* struct for checkdirs iteration */
163 struct cdirargs {
164 vnode_t olddp;
165 vnode_t newdp;
166 };
167 /* callback for checkdirs iteration */
168 static int checkdirs_callback(proc_t p, void * arg);
169
170 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
171 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
172 void enablequotas(struct mount *mp, vfs_context_t ctx);
173 static int getfsstat_callback(mount_t mp, void * arg);
174 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
175 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
176 static int sync_callback(mount_t, void *);
177 static void hibernate_sync_thread(void *, __unused wait_result_t);
178 static int hibernate_sync_async(int);
179 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
180 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
181 boolean_t partial_copy);
182 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
183 user_addr_t bufp);
184 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
185 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
186 struct componentname *cnp, user_addr_t fsmountargs,
187 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
188 vfs_context_t ctx);
189 void vfs_notify_mount(vnode_t pdvp);
190
191 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
192
193 struct fd_vn_data * fg_vn_data_alloc(void);
194
195 /*
 196  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}.
 197  * Concurrent lookups (or lookups by id) on hard links can cause
 198  * vn_getpath (which does not re-enter the filesystem, as vn_getpath_fsenter
 199  * does) to return ENOENT, because the path cannot be produced from the name
 200  * cache alone.  We have no option but to retry and hope that one namei ->
 201  * reverse-path generation completes without an intervening lookup or
 202  * lookup-by-id on the hard-linked item.  This is only an issue for the MAC
 203  * hooks that cannot re-enter the filesystem: currently rename, unlink and rmdir.
204 */
205 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
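/*
 * Hedged sketch, not taken from this file's actual control flow: callers of
 * the non-reentrant MAC authorization hooks typically retry when ENOENT is
 * returned because the path could not be produced from the name cache alone.
 * The surrounding variables and the specific hook shown here are illustrative.
 */
#if 0	/* example only, never compiled */
	int retry_count = 0;
	int error;

retry:
	error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
	if (error == ENOENT && retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
		/* hope for a namei -> reverse-path generation without an intervening lookup */
		retry_count++;
		goto retry;
	}
#endif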
206
207 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
208
209 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
210
211 #ifdef CONFIG_IMGSRC_ACCESS
212 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
213 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
214 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
215 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
216 static void mount_end_update(mount_t mp);
217 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
218 #endif /* CONFIG_IMGSRC_ACCESS */
219
220 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
221
222 __private_extern__
223 int sync_internal(void);
224
225 __private_extern__
226 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
227
228 extern lck_grp_t *fd_vn_lck_grp;
229 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
230 extern lck_attr_t *fd_vn_lck_attr;
231
232 /*
 233  * incremented each time a mount or unmount operation occurs;
234 * used to invalidate the cached value of the rootvp in the
235 * mount structure utilized by cache_lookup_path
236 */
237 uint32_t mount_generation = 0;
238
239 /* counts number of mount and unmount operations */
240 unsigned int vfs_nummntops=0;
241
242 extern const struct fileops vnops;
243 #if CONFIG_APPLEDOUBLE
244 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
245 #endif /* CONFIG_APPLEDOUBLE */
246
247 /*
248 * Virtual File System System Calls
249 */
250
251 #if NFSCLIENT || DEVFS || ROUTEFS
252 /*
 253  * Private in-kernel mounting SPI (not exported to userspace)
254 */
255 __private_extern__
256 boolean_t
257 vfs_iskernelmount(mount_t mp)
258 {
259 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
260 }
261
262 __private_extern__
263 int
264 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
265 void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
266 {
267 struct nameidata nd;
268 boolean_t did_namei;
269 int error;
270
271 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
272 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
273
274 /*
275 * Get the vnode to be covered if it's not supplied
276 */
277 if (vp == NULLVP) {
278 error = namei(&nd);
279 if (error)
280 return (error);
281 vp = nd.ni_vp;
282 pvp = nd.ni_dvp;
283 did_namei = TRUE;
284 } else {
285 char *pnbuf = CAST_DOWN(char *, path);
286
287 nd.ni_cnd.cn_pnbuf = pnbuf;
288 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
289 did_namei = FALSE;
290 }
291
292 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
293 syscall_flags, kern_flags, NULL, TRUE, ctx);
294
295 if (did_namei) {
296 vnode_put(vp);
297 vnode_put(pvp);
298 nameidone(&nd);
299 }
300
301 return (error);
302 }
 303 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
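/*
 * Hedged sketch, not part of the original source: an in-kernel subsystem
 * could use kernel_mount() above roughly as follows.  The filesystem name,
 * target path, flags and kernel flags here are illustrative only, and real
 * callers pass filesystem-specific mount arguments through 'data'.
 */
#if 0	/* example only, never compiled */
	int err;

	err = kernel_mount("devfs", NULLVP, NULLVP, "/dev", NULL, 0,
	    MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH, vfs_context_kernel());
	if (err != 0) {
		printf("example: kernel_mount failed with %d\n", err);
	}
#endif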
304
305 /*
306 * Mount a file system.
307 */
308 /* ARGSUSED */
309 int
310 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
311 {
312 struct __mac_mount_args muap;
313
314 muap.type = uap->type;
315 muap.path = uap->path;
316 muap.flags = uap->flags;
317 muap.data = uap->data;
318 muap.mac_p = USER_ADDR_NULL;
319 return (__mac_mount(p, &muap, retval));
320 }
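/*
 * For reference, a hedged userspace sketch of reaching this syscall through
 * the mount(2) wrapper declared in <sys/mount.h>.  The volume path is made
 * up, and the 'data' argument is filesystem specific (many filesystems
 * require a non-NULL argument structure; NULL is used here only to keep the
 * example short).
 */
#if 0	/* example only, never compiled */
#include <sys/mount.h>
#include <stdio.h>

int
main(void)
{
	/* Switch an already-mounted volume to read-only via an update mount. */
	if (mount("apfs", "/Volumes/Example", MNT_UPDATE | MNT_RDONLY, NULL) != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
#endif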
321
322 int
323 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
324 {
325 struct componentname cn;
326 vfs_context_t ctx = vfs_context_current();
327 size_t dummy = 0;
328 int error;
329 int flags = uap->flags;
330 char fstypename[MFSNAMELEN];
331 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
332 vnode_t pvp;
333 vnode_t vp;
334
335 AUDIT_ARG(fd, uap->fd);
336 AUDIT_ARG(fflags, flags);
337 /* fstypename will get audited by mount_common */
338
339 /* Sanity check the flags */
340 if (flags & (MNT_IMGSRC_BY_INDEX|MNT_ROOTFS)) {
341 return (ENOTSUP);
342 }
343
344 if (flags & MNT_UNION) {
345 return (EPERM);
346 }
347
348 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
349 if (error) {
350 return (error);
351 }
352
353 if ((error = file_vnode(uap->fd, &vp)) != 0) {
354 return (error);
355 }
356
357 if ((error = vnode_getwithref(vp)) != 0) {
358 file_drop(uap->fd);
359 return (error);
360 }
361
362 pvp = vnode_getparent(vp);
363 if (pvp == NULL) {
364 vnode_put(vp);
365 file_drop(uap->fd);
366 return (EINVAL);
367 }
368
369 memset(&cn, 0, sizeof(struct componentname));
370 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
371 cn.cn_pnlen = MAXPATHLEN;
372
373 if((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
374 FREE(cn.cn_pnbuf, M_TEMP);
375 vnode_put(pvp);
376 vnode_put(vp);
377 file_drop(uap->fd);
378 return (error);
379 }
380
381 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
382
383 FREE(cn.cn_pnbuf, M_TEMP);
384 vnode_put(pvp);
385 vnode_put(vp);
386 file_drop(uap->fd);
387
388 return (error);
389 }
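/*
 * Hedged userspace sketch for the fd-based variant above.  The wrapper
 * prototype shown in the comment below is an assumption (it is not confirmed
 * by this file), and the volume path, filesystem name and flags are
 * illustrative only.
 */
#if 0	/* example only, never compiled */
#include <sys/mount.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	/* Assumed prototype: int fmount(const char *fstype, int fd, int flags, void *data); */
	int fd = open("/Volumes/Example", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fmount("apfs", fd, MNT_RDONLY, NULL) != 0) {
		perror("fmount");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
#endif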
390
391 void
392 vfs_notify_mount(vnode_t pdvp)
393 {
394 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
395 lock_vnode_and_post(pdvp, NOTE_WRITE);
396 }
397
398 /*
399 * __mac_mount:
400 * Mount a file system taking into account MAC label behavior.
401 * See mount(2) man page for more information
402 *
403 * Parameters: p Process requesting the mount
404 * uap User argument descriptor (see below)
405 * retval (ignored)
406 *
407 * Indirect: uap->type Filesystem type
408 * uap->path Path to mount
409 * uap->data Mount arguments
410 * uap->mac_p MAC info
411 * uap->flags Mount flags
412 *
413 *
414 * Returns: 0 Success
415 * !0 Not success
416 */
417 boolean_t root_fs_upgrade_try = FALSE;
418
419 int
420 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
421 {
422 vnode_t pvp = NULL;
423 vnode_t vp = NULL;
424 int need_nameidone = 0;
425 vfs_context_t ctx = vfs_context_current();
426 char fstypename[MFSNAMELEN];
427 struct nameidata nd;
428 size_t dummy=0;
429 char *labelstr = NULL;
430 int flags = uap->flags;
431 int error;
432 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
433 boolean_t is_64bit = IS_64BIT_PROCESS(p);
434 #else
435 #pragma unused(p)
436 #endif
437 /*
438 * Get the fs type name from user space
439 */
440 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
441 if (error)
442 return (error);
443
444 /*
445 * Get the vnode to be covered
446 */
447 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
448 UIO_USERSPACE, uap->path, ctx);
449 error = namei(&nd);
450 if (error) {
451 goto out;
452 }
453 need_nameidone = 1;
454 vp = nd.ni_vp;
455 pvp = nd.ni_dvp;
456
457 #ifdef CONFIG_IMGSRC_ACCESS
458 /* Mounting image source cannot be batched with other operations */
459 if (flags == MNT_IMGSRC_BY_INDEX) {
460 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
461 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
462 goto out;
463 }
464 #endif /* CONFIG_IMGSRC_ACCESS */
465
466 #if CONFIG_MACF
467 /*
468 * Get the label string (if any) from user space
469 */
470 if (uap->mac_p != USER_ADDR_NULL) {
471 struct user_mac mac;
472 size_t ulen = 0;
473
474 if (is_64bit) {
475 struct user64_mac mac64;
476 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
477 mac.m_buflen = mac64.m_buflen;
478 mac.m_string = mac64.m_string;
479 } else {
480 struct user32_mac mac32;
481 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
482 mac.m_buflen = mac32.m_buflen;
483 mac.m_string = mac32.m_string;
484 }
485 if (error)
486 goto out;
487 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
488 (mac.m_buflen < 2)) {
489 error = EINVAL;
490 goto out;
491 }
492 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
493 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
494 if (error) {
495 goto out;
496 }
497 AUDIT_ARG(mac_string, labelstr);
498 }
499 #endif /* CONFIG_MACF */
500
501 AUDIT_ARG(fflags, flags);
502
503 #if SECURE_KERNEL
504 if (flags & MNT_UNION) {
505 /* No union mounts on release kernels */
506 error = EPERM;
507 goto out;
508 }
509 #endif
510
511 if ((vp->v_flag & VROOT) &&
512 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
513 if (!(flags & MNT_UNION)) {
514 flags |= MNT_UPDATE;
515 }
516 else {
517 /*
 518 			 * For a union mount on '/', treat it as a fresh
 519 			 * mount instead of an update.
 520 			 * Otherwise, union mounting on '/' used to panic the
 521 			 * system, since mnt_vnodecovered was found to
 522 			 * be NULL for '/', and unionlookup requires it
 523 			 * after getting ENOENT on a union mount.
524 */
525 flags = (flags & ~(MNT_UPDATE));
526 }
527
528 #if SECURE_KERNEL
529 if ((flags & MNT_RDONLY) == 0) {
530 /* Release kernels are not allowed to mount "/" as rw */
531 error = EPERM;
532 goto out;
533 }
534 #endif
535 /*
536 * See 7392553 for more details on why this check exists.
537 * Suffice to say: If this check is ON and something tries
538 * to mount the rootFS RW, we'll turn off the codesign
539 * bitmap optimization.
540 */
541 #if CHECK_CS_VALIDATION_BITMAP
542 if ((flags & MNT_RDONLY) == 0 ) {
543 root_fs_upgrade_try = TRUE;
544 }
545 #endif
546 }
547
548 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
549 labelstr, FALSE, ctx);
550
551 out:
552
553 #if CONFIG_MACF
554 if (labelstr)
555 FREE(labelstr, M_MACTEMP);
556 #endif /* CONFIG_MACF */
557
558 if (vp) {
559 vnode_put(vp);
560 }
561 if (pvp) {
562 vnode_put(pvp);
563 }
564 if (need_nameidone) {
565 nameidone(&nd);
566 }
567
568 return (error);
569 }
570
571 /*
572 * common mount implementation (final stage of mounting)
573
574 * Arguments:
 575  * fstypename	file system type (i.e. its VFS name)
576 * pvp parent of covered vnode
577 * vp covered vnode
 578  * cnp		component name (i.e. path) of covered vnode
579 * flags generic mount flags
580 * fsmountargs file system specific data
581 * labelstr optional MAC label
582 * kernelmount TRUE for mounts initiated from inside the kernel
583 * ctx caller's context
584 */
585 static int
586 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
587 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
588 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
589 {
590 #if !CONFIG_MACF
591 #pragma unused(labelstr)
592 #endif
593 struct vnode *devvp = NULLVP;
594 struct vnode *device_vnode = NULLVP;
595 #if CONFIG_MACF
596 struct vnode *rvp;
597 #endif
598 struct mount *mp;
599 struct vfstable *vfsp = (struct vfstable *)0;
600 struct proc *p = vfs_context_proc(ctx);
601 int error, flag = 0;
602 user_addr_t devpath = USER_ADDR_NULL;
603 int ronly = 0;
604 int mntalloc = 0;
605 boolean_t vfsp_ref = FALSE;
606 boolean_t is_rwlock_locked = FALSE;
607 boolean_t did_rele = FALSE;
608 boolean_t have_usecount = FALSE;
609
610 /*
611 * Process an update for an existing mount
612 */
613 if (flags & MNT_UPDATE) {
614 if ((vp->v_flag & VROOT) == 0) {
615 error = EINVAL;
616 goto out1;
617 }
618 mp = vp->v_mount;
619
620 /* unmount in progress return error */
621 mount_lock_spin(mp);
622 if (mp->mnt_lflag & MNT_LUNMOUNT) {
623 mount_unlock(mp);
624 error = EBUSY;
625 goto out1;
626 }
627 mount_unlock(mp);
628 lck_rw_lock_exclusive(&mp->mnt_rwlock);
629 is_rwlock_locked = TRUE;
630 /*
631 * We only allow the filesystem to be reloaded if it
632 * is currently mounted read-only.
633 */
634 if ((flags & MNT_RELOAD) &&
635 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
636 error = ENOTSUP;
637 goto out1;
638 }
639
640 /*
641 * If content protection is enabled, update mounts are not
642 * allowed to turn it off.
643 */
644 if ((mp->mnt_flag & MNT_CPROTECT) &&
645 ((flags & MNT_CPROTECT) == 0)) {
646 error = EINVAL;
647 goto out1;
648 }
649
650 #ifdef CONFIG_IMGSRC_ACCESS
651 /* Can't downgrade the backer of the root FS */
652 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
653 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
654 error = ENOTSUP;
655 goto out1;
656 }
657 #endif /* CONFIG_IMGSRC_ACCESS */
658
659 /*
660 * Only root, or the user that did the original mount is
661 * permitted to update it.
662 */
663 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
664 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
665 goto out1;
666 }
667 #if CONFIG_MACF
668 error = mac_mount_check_remount(ctx, mp);
669 if (error != 0) {
670 goto out1;
671 }
672 #endif
673 /*
674 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
675 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
676 */
677 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
678 flags |= MNT_NOSUID | MNT_NODEV;
679 if (mp->mnt_flag & MNT_NOEXEC)
680 flags |= MNT_NOEXEC;
681 }
682 flag = mp->mnt_flag;
683
684
685
686 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
687
688 vfsp = mp->mnt_vtable;
689 goto update;
690 }
691
692 /*
693 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
694 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
695 */
696 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
697 flags |= MNT_NOSUID | MNT_NODEV;
698 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
699 flags |= MNT_NOEXEC;
700 }
701
702 /* XXXAUDIT: Should we capture the type on the error path as well? */
703 AUDIT_ARG(text, fstypename);
704 mount_list_lock();
705 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
706 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
707 vfsp->vfc_refcount++;
708 vfsp_ref = TRUE;
709 break;
710 }
711 mount_list_unlock();
712 if (vfsp == NULL) {
713 error = ENODEV;
714 goto out1;
715 }
716
717 /*
718 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
719 */
720 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
721 error = EINVAL; /* unsupported request */
722 goto out1;
723 }
724
725 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
726 if (error != 0) {
727 goto out1;
728 }
729
730 /*
731 * Allocate and initialize the filesystem (mount_t)
732 */
733 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
734 M_MOUNT, M_WAITOK);
735 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
736 mntalloc = 1;
737
738 /* Initialize the default IO constraints */
739 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
740 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
741 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
742 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
743 mp->mnt_devblocksize = DEV_BSIZE;
744 mp->mnt_alignmentmask = PAGE_MASK;
745 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
746 mp->mnt_ioscale = 1;
747 mp->mnt_ioflags = 0;
748 mp->mnt_realrootvp = NULLVP;
749 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
750
751 TAILQ_INIT(&mp->mnt_vnodelist);
752 TAILQ_INIT(&mp->mnt_workerqueue);
753 TAILQ_INIT(&mp->mnt_newvnodes);
754 mount_lock_init(mp);
755 lck_rw_lock_exclusive(&mp->mnt_rwlock);
756 is_rwlock_locked = TRUE;
757 mp->mnt_op = vfsp->vfc_vfsops;
758 mp->mnt_vtable = vfsp;
759 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
760 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
761 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
762 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
763 mp->mnt_vnodecovered = vp;
764 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
765 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
766 mp->mnt_devbsdunit = 0;
767
768 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
769 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
770
771 #if NFSCLIENT || DEVFS || ROUTEFS
772 if (kernelmount)
773 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
774 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
775 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
 776 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
777
778 update:
779
780 /*
781 * Set the mount level flags.
782 */
783 if (flags & MNT_RDONLY)
784 mp->mnt_flag |= MNT_RDONLY;
785 else if (mp->mnt_flag & MNT_RDONLY) {
786 // disallow read/write upgrades of file systems that
787 // had the TYPENAME_OVERRIDE feature set.
788 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
789 error = EPERM;
790 goto out1;
791 }
792 mp->mnt_kern_flag |= MNTK_WANTRDWR;
793 }
794 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
795 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
796 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
797 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
798 MNT_QUARANTINE | MNT_CPROTECT);
799
800 #if SECURE_KERNEL
801 #if !CONFIG_MNT_SUID
802 /*
 803 	 * On release builds of iOS-based platforms, always enforce NOSUID on
804 * all mounts. We do this here because we can catch update mounts as well as
805 * non-update mounts in this case.
806 */
807 mp->mnt_flag |= (MNT_NOSUID);
808 #endif
809 #endif
810
811 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
812 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
813 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
814 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
815 MNT_QUARANTINE | MNT_CPROTECT);
816
817 #if CONFIG_MACF
818 if (flags & MNT_MULTILABEL) {
819 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
820 error = EINVAL;
821 goto out1;
822 }
823 mp->mnt_flag |= MNT_MULTILABEL;
824 }
825 #endif
826 /*
827 * Process device path for local file systems if requested
828 */
829 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
830 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
831 if (vfs_context_is64bit(ctx)) {
832 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
833 goto out1;
834 fsmountargs += sizeof(devpath);
835 } else {
836 user32_addr_t tmp;
837 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
838 goto out1;
839 /* munge into LP64 addr */
840 devpath = CAST_USER_ADDR_T(tmp);
841 fsmountargs += sizeof(tmp);
842 }
843
844 /* Lookup device and authorize access to it */
845 if ((devpath)) {
846 struct nameidata nd;
847
848 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
849 if ( (error = namei(&nd)) )
850 goto out1;
851
852 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
853 devvp = nd.ni_vp;
854
855 nameidone(&nd);
856
857 if (devvp->v_type != VBLK) {
858 error = ENOTBLK;
859 goto out2;
860 }
861 if (major(devvp->v_rdev) >= nblkdev) {
862 error = ENXIO;
863 goto out2;
864 }
865 /*
866 * If mount by non-root, then verify that user has necessary
867 * permissions on the device.
868 */
869 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
870 mode_t accessmode = KAUTH_VNODE_READ_DATA;
871
872 if ((mp->mnt_flag & MNT_RDONLY) == 0)
873 accessmode |= KAUTH_VNODE_WRITE_DATA;
874 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
875 goto out2;
876 }
877 }
878 /* On first mount, preflight and open device */
879 if (devpath && ((flags & MNT_UPDATE) == 0)) {
880 if ( (error = vnode_ref(devvp)) )
881 goto out2;
882 /*
883 * Disallow multiple mounts of the same device.
884 * Disallow mounting of a device that is currently in use
885 * (except for root, which might share swap device for miniroot).
886 * Flush out any old buffers remaining from a previous use.
887 */
888 if ( (error = vfs_mountedon(devvp)) )
889 goto out3;
890
891 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
892 error = EBUSY;
893 goto out3;
894 }
895 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
896 error = ENOTBLK;
897 goto out3;
898 }
899 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
900 goto out3;
901
902 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
903 #if CONFIG_MACF
904 error = mac_vnode_check_open(ctx,
905 devvp,
906 ronly ? FREAD : FREAD|FWRITE);
907 if (error)
908 goto out3;
909 #endif /* MAC */
910 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
911 goto out3;
912
913 mp->mnt_devvp = devvp;
914 device_vnode = devvp;
915
916 } else if ((mp->mnt_flag & MNT_RDONLY) &&
917 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
918 (device_vnode = mp->mnt_devvp)) {
919 dev_t dev;
920 int maj;
921 /*
922 * If upgrade to read-write by non-root, then verify
923 * that user has necessary permissions on the device.
924 */
925 vnode_getalways(device_vnode);
926
927 if (suser(vfs_context_ucred(ctx), NULL) &&
928 (error = vnode_authorize(device_vnode, NULL,
929 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
930 ctx)) != 0) {
931 vnode_put(device_vnode);
932 goto out2;
933 }
934
935 /* Tell the device that we're upgrading */
936 dev = (dev_t)device_vnode->v_rdev;
937 maj = major(dev);
938
939 if ((u_int)maj >= (u_int)nblkdev)
940 panic("Volume mounted on a device with invalid major number.");
941
942 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
943 vnode_put(device_vnode);
944 device_vnode = NULLVP;
945 if (error != 0) {
946 goto out2;
947 }
948 }
949 }
950 #if CONFIG_MACF
951 if ((flags & MNT_UPDATE) == 0) {
952 mac_mount_label_init(mp);
953 mac_mount_label_associate(ctx, mp);
954 }
955 if (labelstr) {
956 if ((flags & MNT_UPDATE) != 0) {
957 error = mac_mount_check_label_update(ctx, mp);
958 if (error != 0)
959 goto out3;
960 }
961 }
962 #endif
963 /*
964 * Mount the filesystem.
965 */
966 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
967 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
968 (caddr_t)fsmountargs, 0, ctx);
969 } else {
970 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
971 }
972
973 if (flags & MNT_UPDATE) {
974 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
975 mp->mnt_flag &= ~MNT_RDONLY;
976 mp->mnt_flag &=~
977 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
978 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
979 if (error)
980 mp->mnt_flag = flag; /* restore flag value */
981 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
982 lck_rw_done(&mp->mnt_rwlock);
983 is_rwlock_locked = FALSE;
984 if (!error)
985 enablequotas(mp, ctx);
986 goto exit;
987 }
988
989 /*
990 * Put the new filesystem on the mount list after root.
991 */
992 if (error == 0) {
993 struct vfs_attr vfsattr;
994 #if CONFIG_MACF
995 if (vfs_flags(mp) & MNT_MULTILABEL) {
996 error = VFS_ROOT(mp, &rvp, ctx);
997 if (error) {
998 printf("%s() VFS_ROOT returned %d\n", __func__, error);
999 goto out3;
1000 }
1001 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1002 /*
1003 * drop reference provided by VFS_ROOT
1004 */
1005 vnode_put(rvp);
1006
1007 if (error)
1008 goto out3;
1009 }
1010 #endif /* MAC */
1011
1012 vnode_lock_spin(vp);
1013 CLR(vp->v_flag, VMOUNT);
1014 vp->v_mountedhere = mp;
1015 vnode_unlock(vp);
1016
1017 /*
1018 * taking the name_cache_lock exclusively will
 1019 		 * ensure that everyone is out of the fast path who
 1020 		 * might be trying to use a now stale copy of
 1021 		 * vp->v_mountedhere->mnt_realrootvp;
1022 * bumping mount_generation causes the cached values
1023 * to be invalidated
1024 */
1025 name_cache_lock();
1026 mount_generation++;
1027 name_cache_unlock();
1028
1029 error = vnode_ref(vp);
1030 if (error != 0) {
1031 goto out4;
1032 }
1033
1034 have_usecount = TRUE;
1035
1036 error = checkdirs(vp, ctx);
1037 if (error != 0) {
1038 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1039 goto out4;
1040 }
1041 /*
 1042 		 * there is no cleanup code here, so the return value is cast to void;
 1043 		 * we need to revisit this
1044 */
1045 (void)VFS_START(mp, 0, ctx);
1046
1047 if (mount_list_add(mp) != 0) {
1048 /*
1049 * The system is shutting down trying to umount
1050 * everything, so fail with a plausible errno.
1051 */
1052 error = EBUSY;
1053 goto out4;
1054 }
1055 lck_rw_done(&mp->mnt_rwlock);
1056 is_rwlock_locked = FALSE;
1057
1058 /* Check if this mounted file system supports EAs or named streams. */
1059 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1060 VFSATTR_INIT(&vfsattr);
1061 VFSATTR_WANTED(&vfsattr, f_capabilities);
1062 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1063 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1064 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1065 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1066 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1067 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1068 }
1069 #if NAMEDSTREAMS
1070 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1071 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1072 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1073 }
1074 #endif
1075 /* Check if this file system supports path from id lookups. */
1076 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1077 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1078 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1079 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1080 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1081 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1082 }
1083
1084 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1085 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1086 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1087 }
1088 }
1089 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1090 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1091 }
1092 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1093 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1094 }
1095 /* increment the operations count */
1096 OSAddAtomic(1, &vfs_nummntops);
1097 enablequotas(mp, ctx);
1098
1099 if (device_vnode) {
1100 device_vnode->v_specflags |= SI_MOUNTEDON;
1101
1102 /*
1103 * cache the IO attributes for the underlying physical media...
1104 * an error return indicates the underlying driver doesn't
1105 * support all the queries necessary... however, reasonable
1106 * defaults will have been set, so no reason to bail or care
1107 */
1108 vfs_init_io_attributes(device_vnode, mp);
1109 }
1110
1111 /* Now that mount is setup, notify the listeners */
1112 vfs_notify_mount(pvp);
1113 IOBSDMountChange(mp, kIOMountChangeMount);
1114
1115 } else {
1116 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1117 if (mp->mnt_vnodelist.tqh_first != NULL) {
1118 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1119 mp->mnt_vtable->vfc_name, error);
1120 }
1121
1122 vnode_lock_spin(vp);
1123 CLR(vp->v_flag, VMOUNT);
1124 vnode_unlock(vp);
1125 mount_list_lock();
1126 mp->mnt_vtable->vfc_refcount--;
1127 mount_list_unlock();
1128
1129 if (device_vnode ) {
1130 vnode_rele(device_vnode);
1131 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1132 }
1133 lck_rw_done(&mp->mnt_rwlock);
1134 is_rwlock_locked = FALSE;
1135
1136 /*
1137 * if we get here, we have a mount structure that needs to be freed,
1138 * but since the coveredvp hasn't yet been updated to point at it,
1139 * no need to worry about other threads holding a crossref on this mp
1140 * so it's ok to just free it
1141 */
1142 mount_lock_destroy(mp);
1143 #if CONFIG_MACF
1144 mac_mount_label_destroy(mp);
1145 #endif
1146 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1147 }
1148 exit:
1149 /*
1150 * drop I/O count on the device vp if there was one
1151 */
1152 if (devpath && devvp)
1153 vnode_put(devvp);
1154
1155 return(error);
1156
1157 /* Error condition exits */
1158 out4:
1159 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1160
1161 /*
1162 * If the mount has been placed on the covered vp,
1163 * it may have been discovered by now, so we have
1164 * to treat this just like an unmount
1165 */
1166 mount_lock_spin(mp);
1167 mp->mnt_lflag |= MNT_LDEAD;
1168 mount_unlock(mp);
1169
1170 if (device_vnode != NULLVP) {
1171 vnode_rele(device_vnode);
1172 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1173 ctx);
1174 did_rele = TRUE;
1175 }
1176
1177 vnode_lock_spin(vp);
1178
1179 mp->mnt_crossref++;
1180 vp->v_mountedhere = (mount_t) 0;
1181
1182 vnode_unlock(vp);
1183
1184 if (have_usecount) {
1185 vnode_rele(vp);
1186 }
1187 out3:
1188 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1189 vnode_rele(devvp);
1190 out2:
1191 if (devpath && devvp)
1192 vnode_put(devvp);
1193 out1:
1194 /* Release mnt_rwlock only when it was taken */
1195 if (is_rwlock_locked == TRUE) {
1196 lck_rw_done(&mp->mnt_rwlock);
1197 }
1198
1199 if (mntalloc) {
1200 if (mp->mnt_crossref)
1201 mount_dropcrossref(mp, vp, 0);
1202 else {
1203 mount_lock_destroy(mp);
1204 #if CONFIG_MACF
1205 mac_mount_label_destroy(mp);
1206 #endif
1207 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1208 }
1209 }
1210 if (vfsp_ref) {
1211 mount_list_lock();
1212 vfsp->vfc_refcount--;
1213 mount_list_unlock();
1214 }
1215
1216 return(error);
1217 }
1218
1219 /*
1220 * Flush in-core data, check for competing mount attempts,
1221 * and set VMOUNT
1222 */
1223 int
1224 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1225 {
1226 #if !CONFIG_MACF
1227 #pragma unused(cnp,fsname)
1228 #endif
1229 struct vnode_attr va;
1230 int error;
1231
1232 if (!skip_auth) {
1233 /*
1234 * If the user is not root, ensure that they own the directory
1235 * onto which we are attempting to mount.
1236 */
1237 VATTR_INIT(&va);
1238 VATTR_WANTED(&va, va_uid);
1239 if ((error = vnode_getattr(vp, &va, ctx)) ||
1240 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1241 (!vfs_context_issuser(ctx)))) {
1242 error = EPERM;
1243 goto out;
1244 }
1245 }
1246
1247 if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1248 goto out;
1249
1250 if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1251 goto out;
1252
1253 if (vp->v_type != VDIR) {
1254 error = ENOTDIR;
1255 goto out;
1256 }
1257
1258 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1259 error = EBUSY;
1260 goto out;
1261 }
1262
1263 #if CONFIG_MACF
1264 error = mac_mount_check_mount(ctx, vp,
1265 cnp, fsname);
1266 if (error != 0)
1267 goto out;
1268 #endif
1269
1270 vnode_lock_spin(vp);
1271 SET(vp->v_flag, VMOUNT);
1272 vnode_unlock(vp);
1273
1274 out:
1275 return error;
1276 }
1277
1278 #if CONFIG_IMGSRC_ACCESS
1279
1280 #if DEBUG
1281 #define IMGSRC_DEBUG(args...) printf(args)
1282 #else
1283 #define IMGSRC_DEBUG(args...) do { } while(0)
1284 #endif
1285
1286 static int
1287 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1288 {
1289 struct nameidata nd;
1290 vnode_t vp, realdevvp;
1291 mode_t accessmode;
1292 int error;
1293
1294 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1295 if ( (error = namei(&nd)) ) {
1296 IMGSRC_DEBUG("namei() failed with %d\n", error);
1297 return error;
1298 }
1299
1300 vp = nd.ni_vp;
1301
1302 if (!vnode_isblk(vp)) {
1303 IMGSRC_DEBUG("Not block device.\n");
1304 error = ENOTBLK;
1305 goto out;
1306 }
1307
1308 realdevvp = mp->mnt_devvp;
1309 if (realdevvp == NULLVP) {
1310 IMGSRC_DEBUG("No device backs the mount.\n");
1311 error = ENXIO;
1312 goto out;
1313 }
1314
1315 error = vnode_getwithref(realdevvp);
1316 if (error != 0) {
 1317 		IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1318 goto out;
1319 }
1320
1321 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1322 IMGSRC_DEBUG("Wrong dev_t.\n");
1323 error = ENXIO;
1324 goto out1;
1325 }
1326
1327 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1328
1329 /*
1330 * If mount by non-root, then verify that user has necessary
1331 * permissions on the device.
1332 */
1333 if (!vfs_context_issuser(ctx)) {
1334 accessmode = KAUTH_VNODE_READ_DATA;
1335 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1336 accessmode |= KAUTH_VNODE_WRITE_DATA;
1337 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1338 IMGSRC_DEBUG("Access denied.\n");
1339 goto out1;
1340 }
1341 }
1342
1343 *devvpp = vp;
1344
1345 out1:
1346 vnode_put(realdevvp);
1347 out:
1348 nameidone(&nd);
1349 if (error) {
1350 vnode_put(vp);
1351 }
1352
1353 return error;
1354 }
1355
1356 /*
1357 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1358 * and call checkdirs()
1359 */
1360 static int
1361 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1362 {
1363 int error;
1364
1365 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1366
1367 vnode_lock_spin(vp);
1368 CLR(vp->v_flag, VMOUNT);
1369 vp->v_mountedhere = mp;
1370 vnode_unlock(vp);
1371
1372 /*
1373 * taking the name_cache_lock exclusively will
 1374 	 * ensure that everyone is out of the fast path who
 1375 	 * might be trying to use a now stale copy of
 1376 	 * vp->v_mountedhere->mnt_realrootvp;
1377 * bumping mount_generation causes the cached values
1378 * to be invalidated
1379 */
1380 name_cache_lock();
1381 mount_generation++;
1382 name_cache_unlock();
1383
1384 error = vnode_ref(vp);
1385 if (error != 0) {
1386 goto out;
1387 }
1388
1389 error = checkdirs(vp, ctx);
1390 if (error != 0) {
1391 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1392 vnode_rele(vp);
1393 goto out;
1394 }
1395
1396 out:
1397 if (error != 0) {
1398 mp->mnt_vnodecovered = NULLVP;
1399 }
1400 return error;
1401 }
1402
1403 static void
1404 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1405 {
1406 vnode_rele(vp);
1407 vnode_lock_spin(vp);
1408 vp->v_mountedhere = (mount_t)NULL;
1409 vnode_unlock(vp);
1410
1411 mp->mnt_vnodecovered = NULLVP;
1412 }
1413
1414 static int
1415 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1416 {
1417 int error;
1418
1419 /* unmount in progress return error */
1420 mount_lock_spin(mp);
1421 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1422 mount_unlock(mp);
1423 return EBUSY;
1424 }
1425 mount_unlock(mp);
1426 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1427
1428 /*
1429 * We only allow the filesystem to be reloaded if it
1430 * is currently mounted read-only.
1431 */
1432 if ((flags & MNT_RELOAD) &&
1433 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1434 error = ENOTSUP;
1435 goto out;
1436 }
1437
1438 /*
1439 * Only root, or the user that did the original mount is
1440 * permitted to update it.
1441 */
1442 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1443 (!vfs_context_issuser(ctx))) {
1444 error = EPERM;
1445 goto out;
1446 }
1447 #if CONFIG_MACF
1448 error = mac_mount_check_remount(ctx, mp);
1449 if (error != 0) {
1450 goto out;
1451 }
1452 #endif
1453
1454 out:
1455 if (error) {
1456 lck_rw_done(&mp->mnt_rwlock);
1457 }
1458
1459 return error;
1460 }
1461
1462 static void
1463 mount_end_update(mount_t mp)
1464 {
1465 lck_rw_done(&mp->mnt_rwlock);
1466 }
1467
1468 static int
1469 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1470 {
1471 vnode_t vp;
1472
1473 if (height >= MAX_IMAGEBOOT_NESTING) {
1474 return EINVAL;
1475 }
1476
1477 vp = imgsrc_rootvnodes[height];
1478 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1479 *rvpp = vp;
1480 return 0;
1481 } else {
1482 return ENOENT;
1483 }
1484 }
1485
1486 static int
1487 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1488 const char *fsname, vfs_context_t ctx,
1489 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1490 {
1491 int error;
1492 mount_t mp;
1493 boolean_t placed = FALSE;
1494 vnode_t devvp = NULLVP;
1495 struct vfstable *vfsp;
1496 user_addr_t devpath;
1497 char *old_mntonname;
1498 vnode_t rvp;
1499 uint32_t height;
1500 uint32_t flags;
1501
1502 /* If we didn't imageboot, nothing to move */
1503 if (imgsrc_rootvnodes[0] == NULLVP) {
1504 return EINVAL;
1505 }
1506
1507 /* Only root can do this */
1508 if (!vfs_context_issuser(ctx)) {
1509 return EPERM;
1510 }
1511
1512 IMGSRC_DEBUG("looking for root vnode.\n");
1513
1514 /*
1515 * Get root vnode of filesystem we're moving.
1516 */
1517 if (by_index) {
1518 if (is64bit) {
1519 struct user64_mnt_imgsrc_args mia64;
1520 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1521 if (error != 0) {
1522 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1523 return error;
1524 }
1525
1526 height = mia64.mi_height;
1527 flags = mia64.mi_flags;
1528 devpath = mia64.mi_devpath;
1529 } else {
1530 struct user32_mnt_imgsrc_args mia32;
1531 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1532 if (error != 0) {
1533 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1534 return error;
1535 }
1536
1537 height = mia32.mi_height;
1538 flags = mia32.mi_flags;
1539 devpath = mia32.mi_devpath;
1540 }
1541 } else {
1542 /*
1543 * For binary compatibility--assumes one level of nesting.
1544 */
1545 if (is64bit) {
1546 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1547 return error;
1548 } else {
1549 user32_addr_t tmp;
1550 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1551 return error;
1552
1553 /* munge into LP64 addr */
1554 devpath = CAST_USER_ADDR_T(tmp);
1555 }
1556
1557 height = 0;
1558 flags = 0;
1559 }
1560
1561 if (flags != 0) {
1562 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1563 return EINVAL;
1564 }
1565
1566 error = get_imgsrc_rootvnode(height, &rvp);
1567 if (error != 0) {
1568 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1569 return error;
1570 }
1571
1572 IMGSRC_DEBUG("got root vnode.\n");
1573
1574 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1575
1576 /* Can only move once */
1577 mp = vnode_mount(rvp);
1578 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1579 IMGSRC_DEBUG("Already moved.\n");
1580 error = EBUSY;
1581 goto out0;
1582 }
1583
 1584 	IMGSRC_DEBUG("Starting update.\n");
1585
1586 /* Get exclusive rwlock on mount, authorize update on mp */
1587 error = mount_begin_update(mp , ctx, 0);
1588 if (error != 0) {
 1589 		IMGSRC_DEBUG("Starting update failed with %d\n", error);
1590 goto out0;
1591 }
1592
1593 /*
1594 * It can only be moved once. Flag is set under the rwlock,
1595 * so we're now safe to proceed.
1596 */
1597 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1598 IMGSRC_DEBUG("Already moved [2]\n");
1599 goto out1;
1600 }
1601
1602
1603 IMGSRC_DEBUG("Preparing coveredvp.\n");
1604
1605 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1606 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1607 if (error != 0) {
1608 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1609 goto out1;
1610 }
1611
1612 IMGSRC_DEBUG("Covered vp OK.\n");
1613
 1614 	/* Sanity check the name the caller has provided */
1615 vfsp = mp->mnt_vtable;
1616 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1617 IMGSRC_DEBUG("Wrong fs name.\n");
1618 error = EINVAL;
1619 goto out2;
1620 }
1621
1622 /* Check the device vnode and update mount-from name, for local filesystems */
1623 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1624 IMGSRC_DEBUG("Local, doing device validation.\n");
1625
1626 if (devpath != USER_ADDR_NULL) {
1627 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1628 if (error) {
1629 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1630 goto out2;
1631 }
1632
1633 vnode_put(devvp);
1634 }
1635 }
1636
1637 /*
1638 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1639 * and increment the name cache's mount generation
1640 */
1641
1642 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1643 error = place_mount_and_checkdirs(mp, vp, ctx);
1644 if (error != 0) {
1645 goto out2;
1646 }
1647
1648 placed = TRUE;
1649
1650 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1651 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1652
1653 /* Forbid future moves */
1654 mount_lock(mp);
1655 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1656 mount_unlock(mp);
1657
1658 /* Finally, add to mount list, completely ready to go */
1659 if (mount_list_add(mp) != 0) {
1660 /*
1661 * The system is shutting down trying to umount
1662 * everything, so fail with a plausible errno.
1663 */
1664 error = EBUSY;
1665 goto out3;
1666 }
1667
1668 mount_end_update(mp);
1669 vnode_put(rvp);
1670 FREE(old_mntonname, M_TEMP);
1671
1672 vfs_notify_mount(pvp);
1673
1674 return 0;
1675 out3:
1676 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1677
1678 mount_lock(mp);
1679 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1680 mount_unlock(mp);
1681
1682 out2:
1683 /*
1684 * Placing the mp on the vnode clears VMOUNT,
1685 * so cleanup is different after that point
1686 */
1687 if (placed) {
1688 /* Rele the vp, clear VMOUNT and v_mountedhere */
1689 undo_place_on_covered_vp(mp, vp);
1690 } else {
1691 vnode_lock_spin(vp);
1692 CLR(vp->v_flag, VMOUNT);
1693 vnode_unlock(vp);
1694 }
1695 out1:
1696 mount_end_update(mp);
1697
1698 out0:
1699 vnode_put(rvp);
1700 FREE(old_mntonname, M_TEMP);
1701 return error;
1702 }
1703
1704 #endif /* CONFIG_IMGSRC_ACCESS */
1705
1706 void
1707 enablequotas(struct mount *mp, vfs_context_t ctx)
1708 {
1709 struct nameidata qnd;
1710 int type;
1711 char qfpath[MAXPATHLEN];
1712 const char *qfname = QUOTAFILENAME;
1713 const char *qfopsname = QUOTAOPSNAME;
1714 const char *qfextension[] = INITQFNAMES;
1715
 1716 	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1717 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1718 return;
1719 }
1720 /*
1721 * Enable filesystem disk quotas if necessary.
 1722 	 * We ignore errors, as this should not interfere with the final mount
1723 */
1724 for (type=0; type < MAXQUOTAS; type++) {
1725 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1726 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1727 CAST_USER_ADDR_T(qfpath), ctx);
1728 if (namei(&qnd) != 0)
1729 continue; /* option file to trigger quotas is not present */
1730 vnode_put(qnd.ni_vp);
1731 nameidone(&qnd);
1732 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1733
1734 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1735 }
1736 return;
1737 }
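/*
 * Illustrative note, not part of the original source: with the conventional
 * <sys/quota.h> definitions (QUOTAOPSNAME ".quota.ops", QUOTAFILENAME
 * ".quota", INITQFNAMES naming the "user" and "group" quota types; assumed
 * here rather than verified against this tree), the loop above probes a
 * volume mounted at /Volumes/Example for:
 *
 *   trigger files:  /Volumes/Example/.quota.ops.user   and  .quota.ops.group
 *   quota files:    /Volumes/Example/.quota.user       and  .quota.group
 *
 * and calls VFS_QUOTACTL(Q_QUOTAON) only for the types whose trigger file exists.
 */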
1738
1739
1740 static int
1741 checkdirs_callback(proc_t p, void * arg)
1742 {
1743 struct cdirargs * cdrp = (struct cdirargs * )arg;
1744 vnode_t olddp = cdrp->olddp;
1745 vnode_t newdp = cdrp->newdp;
1746 struct filedesc *fdp;
1747 vnode_t tvp;
1748 vnode_t fdp_cvp;
1749 vnode_t fdp_rvp;
1750 int cdir_changed = 0;
1751 int rdir_changed = 0;
1752
1753 /*
1754 * XXX Also needs to iterate each thread in the process to see if it
1755 * XXX is using a per-thread current working directory, and, if so,
1756 * XXX update that as well.
1757 */
1758
1759 proc_fdlock(p);
1760 fdp = p->p_fd;
1761 if (fdp == (struct filedesc *)0) {
1762 proc_fdunlock(p);
1763 return(PROC_RETURNED);
1764 }
1765 fdp_cvp = fdp->fd_cdir;
1766 fdp_rvp = fdp->fd_rdir;
1767 proc_fdunlock(p);
1768
1769 if (fdp_cvp == olddp) {
1770 vnode_ref(newdp);
1771 tvp = fdp->fd_cdir;
1772 fdp_cvp = newdp;
1773 cdir_changed = 1;
1774 vnode_rele(tvp);
1775 }
1776 if (fdp_rvp == olddp) {
1777 vnode_ref(newdp);
1778 tvp = fdp->fd_rdir;
1779 fdp_rvp = newdp;
1780 rdir_changed = 1;
1781 vnode_rele(tvp);
1782 }
1783 if (cdir_changed || rdir_changed) {
1784 proc_fdlock(p);
1785 fdp->fd_cdir = fdp_cvp;
1786 fdp->fd_rdir = fdp_rvp;
1787 proc_fdunlock(p);
1788 }
1789 return(PROC_RETURNED);
1790 }
1791
1792
1793
1794 /*
1795 * Scan all active processes to see if any of them have a current
1796 * or root directory onto which the new filesystem has just been
1797 * mounted. If so, replace them with the new mount point.
1798 */
1799 static int
1800 checkdirs(vnode_t olddp, vfs_context_t ctx)
1801 {
1802 vnode_t newdp;
1803 vnode_t tvp;
1804 int err;
1805 struct cdirargs cdr;
1806
1807 if (olddp->v_usecount == 1)
1808 return(0);
1809 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1810
1811 if (err != 0) {
1812 #if DIAGNOSTIC
1813 panic("mount: lost mount: error %d", err);
1814 #endif
1815 return(err);
1816 }
1817
1818 cdr.olddp = olddp;
1819 cdr.newdp = newdp;
1820 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1821 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1822
1823 if (rootvnode == olddp) {
1824 vnode_ref(newdp);
1825 tvp = rootvnode;
1826 rootvnode = newdp;
1827 vnode_rele(tvp);
1828 }
1829
1830 vnode_put(newdp);
1831 return(0);
1832 }
1833
1834 /*
1835 * Unmount a file system.
1836 *
1837 * Note: unmount takes a path to the vnode mounted on as argument,
 1838  * not the special file (as before).
1839 */
1840 /* ARGSUSED */
1841 int
1842 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1843 {
1844 vnode_t vp;
1845 struct mount *mp;
1846 int error;
1847 struct nameidata nd;
1848 vfs_context_t ctx = vfs_context_current();
1849
1850 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1851 UIO_USERSPACE, uap->path, ctx);
1852 error = namei(&nd);
1853 if (error)
1854 return (error);
1855 vp = nd.ni_vp;
1856 mp = vp->v_mount;
1857 nameidone(&nd);
1858
1859 #if CONFIG_MACF
1860 error = mac_mount_check_umount(ctx, mp);
1861 if (error != 0) {
1862 vnode_put(vp);
1863 return (error);
1864 }
1865 #endif
1866 /*
1867 * Must be the root of the filesystem
1868 */
1869 if ((vp->v_flag & VROOT) == 0) {
1870 vnode_put(vp);
1871 return (EINVAL);
1872 }
1873 mount_ref(mp, 0);
1874 vnode_put(vp);
1875 /* safedounmount consumes the mount ref */
1876 return (safedounmount(mp, uap->flags, ctx));
1877 }
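/*
 * Hedged userspace sketch of reaching this syscall through the unmount(2)
 * wrapper declared in <sys/mount.h>.  The mount point is made up; note that
 * the argument is the directory the filesystem is mounted on, not the device
 * special file.
 */
#if 0	/* example only, never compiled */
#include <sys/mount.h>
#include <stdio.h>

int
main(void)
{
	if (unmount("/Volumes/Example", MNT_FORCE) != 0) {
		perror("unmount");
		return 1;
	}
	return 0;
}
#endif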
1878
1879 int
1880 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1881 {
1882 mount_t mp;
1883
1884 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1885 if (mp == (mount_t)0) {
1886 return(ENOENT);
1887 }
1888 mount_ref(mp, 0);
1889 mount_iterdrop(mp);
1890 /* safedounmount consumes the mount ref */
1891 return(safedounmount(mp, flags, ctx));
1892 }
1893
1894
1895 /*
1896 * The mount struct comes with a mount ref which will be consumed.
 1897  * Do the actual file system unmount and prevent some common foot-shooting.
1898 */
1899 int
1900 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1901 {
1902 int error;
1903 proc_t p = vfs_context_proc(ctx);
1904
1905 /*
 1906 	 * If the file system is not responding, MNT_NOBLOCK
 1907 	 * is set, and this is not a forced unmount, then return EBUSY.
1908 */
1909 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1910 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1911 error = EBUSY;
1912 goto out;
1913 }
1914
1915 /*
1916 * Skip authorization if the mount is tagged as permissive and
1917 * this is not a forced-unmount attempt.
1918 */
1919 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1920 /*
1921 * Only root, or the user that did the original mount is
1922 * permitted to unmount this filesystem.
1923 */
1924 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1925 (error = suser(kauth_cred_get(), &p->p_acflag)))
1926 goto out;
1927 }
1928 /*
1929 * Don't allow unmounting the root file system.
1930 */
1931 if (mp->mnt_flag & MNT_ROOTFS) {
1932 error = EBUSY; /* the root is always busy */
1933 goto out;
1934 }
1935
1936 #ifdef CONFIG_IMGSRC_ACCESS
1937 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1938 error = EBUSY;
1939 goto out;
1940 }
1941 #endif /* CONFIG_IMGSRC_ACCESS */
1942
1943 return (dounmount(mp, flags, 1, ctx));
1944
1945 out:
1946 mount_drop(mp, 0);
1947 return(error);
1948 }
1949
1950 /*
1951 * Do the actual file system unmount.
1952 */
1953 int
1954 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1955 {
1956 vnode_t coveredvp = (vnode_t)0;
1957 int error;
1958 int needwakeup = 0;
1959 int forcedunmount = 0;
1960 int lflags = 0;
1961 struct vnode *devvp = NULLVP;
1962 #if CONFIG_TRIGGERS
1963 proc_t p = vfs_context_proc(ctx);
1964 int did_vflush = 0;
1965 int pflags_save = 0;
1966 #endif /* CONFIG_TRIGGERS */
1967
1968 #if CONFIG_FSE
1969 if (!(flags & MNT_FORCE)) {
1970 fsevent_unmount(mp, ctx); /* has to come first! */
1971 }
1972 #endif
1973
1974 mount_lock(mp);
1975
1976 /*
1977 * If already an unmount in progress just return EBUSY.
1978 * Even a forced unmount cannot override.
1979 */
1980 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1981 if (withref != 0)
1982 mount_drop(mp, 1);
1983 mount_unlock(mp);
1984 return (EBUSY);
1985 }
1986
1987 if (flags & MNT_FORCE) {
1988 forcedunmount = 1;
1989 mp->mnt_lflag |= MNT_LFORCE;
1990 }
1991
1992 #if CONFIG_TRIGGERS
1993 if (flags & MNT_NOBLOCK && p != kernproc)
1994 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1995 #endif
1996
1997 mp->mnt_kern_flag |= MNTK_UNMOUNT;
1998 mp->mnt_lflag |= MNT_LUNMOUNT;
1999 mp->mnt_flag &=~ MNT_ASYNC;
2000 /*
2001 * anyone currently in the fast path that
2002 * trips over the cached rootvp will be
2003 * dumped out and forced into the slow path
2004 * to regenerate a new cached value
2005 */
2006 mp->mnt_realrootvp = NULLVP;
2007 mount_unlock(mp);
2008
2009 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2010 /*
2011 * Force unmount any mounts in this filesystem.
2012 * If any unmounts fail, just leave them dangling.
2013 * Avoids recursion.
2014 */
2015 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2016 }
2017
2018 /*
2019 * taking the name_cache_lock exclusively will
2020 * ensure that everyone is out of the fast path who
2021 * might be trying to use a now-stale copy of
2022 * vp->v_mountedhere->mnt_realrootvp;
2023 * bumping mount_generation causes the cached values
2024 * to be invalidated
2025 */
2026 name_cache_lock();
2027 mount_generation++;
2028 name_cache_unlock();
2029
2030
2031 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2032 if (withref != 0)
2033 mount_drop(mp, 0);
2034 error = 0;
2035 if (forcedunmount == 0) {
2036 ubc_umount(mp); /* release cached vnodes */
2037 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2038 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2039 if (error) {
2040 mount_lock(mp);
2041 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2042 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2043 mp->mnt_lflag &= ~MNT_LFORCE;
2044 goto out;
2045 }
2046 }
2047 }
2048
2049 /* free disk_conditioner_info structure for this mount */
2050 disk_conditioner_unmount(mp);
2051
2052 IOBSDMountChange(mp, kIOMountChangeUnmount);
2053
2054 #if CONFIG_TRIGGERS
2055 vfs_nested_trigger_unmounts(mp, flags, ctx);
2056 did_vflush = 1;
2057 #endif
2058 if (forcedunmount)
2059 lflags |= FORCECLOSE;
2060 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2061 if ((forcedunmount == 0) && error) {
2062 mount_lock(mp);
2063 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2064 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2065 mp->mnt_lflag &= ~MNT_LFORCE;
2066 goto out;
2067 }
2068
2069 /* make sure no one is in the mount iterations or lookups */
2070 mount_iterdrain(mp);
2071
2072 error = VFS_UNMOUNT(mp, flags, ctx);
2073 if (error) {
2074 mount_iterreset(mp);
2075 mount_lock(mp);
2076 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2077 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2078 mp->mnt_lflag &= ~MNT_LFORCE;
2079 goto out;
2080 }
2081
2082 /* increment the operations count */
2083 if (!error)
2084 OSAddAtomic(1, &vfs_nummntops);
2085
2086 if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2087 /* hold an io reference and drop the usecount before close */
2088 devvp = mp->mnt_devvp;
2089 vnode_getalways(devvp);
2090 vnode_rele(devvp);
2091 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
2092 ctx);
2093 vnode_clearmountedon(devvp);
2094 vnode_put(devvp);
2095 }
2096 lck_rw_done(&mp->mnt_rwlock);
2097 mount_list_remove(mp);
2098 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2099
2100 /* clear the mount point hook in the vp, but do not drop the ref yet */
2101 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2102 /*
2103 * The covered vnode needs special handling. Trying to get an
2104 * iocount must not block here as this may lead to deadlocks
2105 * if the Filesystem to which the covered vnode belongs is
2106 * undergoing forced unmounts. Since we hold a usecount, the
2107 * vnode cannot be reused (it can, however, still be terminated)
2108 */
2109 vnode_getalways(coveredvp);
2110 vnode_lock_spin(coveredvp);
2111
2112 mp->mnt_crossref++;
2113 coveredvp->v_mountedhere = (struct mount *)0;
2114 CLR(coveredvp->v_flag, VMOUNT);
2115
2116 vnode_unlock(coveredvp);
2117 vnode_put(coveredvp);
2118 }
2119
2120 mount_list_lock();
2121 mp->mnt_vtable->vfc_refcount--;
2122 mount_list_unlock();
2123
2124 cache_purgevfs(mp); /* remove cache entries for this file sys */
2125 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2126 mount_lock(mp);
2127 mp->mnt_lflag |= MNT_LDEAD;
2128
2129 if (mp->mnt_lflag & MNT_LWAIT) {
2130 /*
2131 * do the wakeup here
2132 * in case we block in mount_refdrain
2133 * which will drop the mount lock
2134 * and allow anyone blocked in vfs_busy
2135 * to wakeup and see the LDEAD state
2136 */
2137 mp->mnt_lflag &= ~MNT_LWAIT;
2138 wakeup((caddr_t)mp);
2139 }
2140 mount_refdrain(mp);
2141 out:
2142 if (mp->mnt_lflag & MNT_LWAIT) {
2143 mp->mnt_lflag &= ~MNT_LWAIT;
2144 needwakeup = 1;
2145 }
2146
2147 #if CONFIG_TRIGGERS
2148 if (flags & MNT_NOBLOCK && p != kernproc) {
2149 // Restore P_NOREMOTEHANG bit to its previous value
2150 if ((pflags_save & P_NOREMOTEHANG) == 0)
2151 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2152 }
2153
2154 /*
2155 * Callback and context are set together under the mount lock, and
2156 * never cleared, so we're safe to examine them here, drop the lock,
2157 * and call out.
2158 */
2159 if (mp->mnt_triggercallback != NULL) {
2160 mount_unlock(mp);
2161 if (error == 0) {
2162 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2163 } else if (did_vflush) {
2164 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2165 }
2166 } else {
2167 mount_unlock(mp);
2168 }
2169 #else
2170 mount_unlock(mp);
2171 #endif /* CONFIG_TRIGGERS */
2172
2173 lck_rw_done(&mp->mnt_rwlock);
2174
2175 if (needwakeup)
2176 wakeup((caddr_t)mp);
2177
2178 if (!error) {
2179 if ((coveredvp != NULLVP)) {
2180 vnode_t pvp = NULLVP;
2181
2182 /*
2183 * The covered vnode needs special handling. Trying to
2184 * get an iocount must not block here as this may lead
2185 * to deadlocks if the Filesystem to which the covered
2186 * vnode belongs is undergoing forced unmounts. Since we
2187 * hold a usecount, the vnode cannot be reused
2188 * (it can, however, still be terminated).
2189 */
2190 vnode_getalways(coveredvp);
2191
2192 mount_dropcrossref(mp, coveredvp, 0);
2193 /*
2194 * We'll _try_ to detect if this really needs to be
2195 * done. The coveredvp can only be in termination (or
2196 * terminated) if the coveredvp's mount point is in a
2197 * forced unmount (or has been) since we still hold the
2198 * ref.
2199 */
2200 if (!vnode_isrecycled(coveredvp)) {
2201 pvp = vnode_getparent(coveredvp);
2202 #if CONFIG_TRIGGERS
2203 if (coveredvp->v_resolve) {
2204 vnode_trigger_rearm(coveredvp, ctx);
2205 }
2206 #endif
2207 }
2208
2209 vnode_rele(coveredvp);
2210 vnode_put(coveredvp);
2211 coveredvp = NULLVP;
2212
2213 if (pvp) {
2214 lock_vnode_and_post(pvp, NOTE_WRITE);
2215 vnode_put(pvp);
2216 }
2217 } else if (mp->mnt_flag & MNT_ROOTFS) {
2218 mount_lock_destroy(mp);
2219 #if CONFIG_MACF
2220 mac_mount_label_destroy(mp);
2221 #endif
2222 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2223 } else
2224 panic("dounmount: no coveredvp");
2225 }
2226 return (error);
2227 }
2228
2229 /*
2230 * Unmount any mounts in this filesystem.
2231 */
2232 void
2233 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2234 {
2235 mount_t smp;
2236 fsid_t *fsids, fsid;
2237 int fsids_sz;
2238 int count = 0, i, m = 0;
2239 vnode_t vp;
2240
2241 mount_list_lock();
2242
2243 // Get an array to hold the submounts' fsids.
2244 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2245 count++;
2246 fsids_sz = count * sizeof(fsid_t);
2247 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2248 if (fsids == NULL) {
2249 mount_list_unlock();
2250 goto out;
2251 }
2252 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2253
2254 /*
2255 * Fill the array with submount fsids.
2256 * Since mounts are always added to the tail of the mount list, the
2257 * list is always in mount order.
2258 * For each mount check if the mounted-on vnode belongs to a
2259 * mount that's already added to our array of mounts to be unmounted.
2260 */
2261 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2262 vp = smp->mnt_vnodecovered;
2263 if (vp == NULL)
2264 continue;
2265 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2266 for (i = 0; i <= m; i++) {
2267 if (fsids[i].val[0] == fsid.val[0] &&
2268 fsids[i].val[1] == fsid.val[1]) {
2269 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2270 break;
2271 }
2272 }
2273 }
2274 mount_list_unlock();
2275
2276 // Unmount the submounts in reverse order. Ignore errors.
2277 for (i = m; i > 0; i--) {
2278 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2279 if (smp) {
2280 mount_ref(smp, 0);
2281 mount_iterdrop(smp);
2282 (void) dounmount(smp, flags, 1, ctx);
2283 }
2284 }
2285 out:
2286 if (fsids)
2287 FREE(fsids, M_TEMP);
2288 }
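/*
 * Worked example (hypothetical mount layout): with /Volumes/A mounted on the
 * target filesystem, /Volumes/A/x mounted on A, and /Volumes/A/x/y mounted on
 * x, the fsids array built above ends up ordered [A, x, y].  The reverse loop
 * above then unmounts y first, then x, and leaves A (index 0) for the
 * caller's own dounmount().
 */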
2289
2290 void
2291 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2292 {
2293 vnode_lock(dp);
2294 mp->mnt_crossref--;
2295
2296 if (mp->mnt_crossref < 0)
2297 panic("mount cross refs -ve");
2298
2299 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2300
2301 if (need_put)
2302 vnode_put_locked(dp);
2303 vnode_unlock(dp);
2304
2305 mount_lock_destroy(mp);
2306 #if CONFIG_MACF
2307 mac_mount_label_destroy(mp);
2308 #endif
2309 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2310 return;
2311 }
2312 if (need_put)
2313 vnode_put_locked(dp);
2314 vnode_unlock(dp);
2315 }
2316
2317
2318 /*
2319 * Sync each mounted filesystem.
2320 */
2321 #if DIAGNOSTIC
2322 int syncprt = 0;
2323 #endif
2324
2325 int print_vmpage_stat=0;
2326 int sync_timeout = 60; // Sync time limit (sec)
2327
2328
2329 static int
2330 sync_callback(mount_t mp, __unused void *arg)
2331 {
2332 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2333 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2334
2335 mp->mnt_flag &= ~MNT_ASYNC;
2336 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2337 if (asyncflag)
2338 mp->mnt_flag |= MNT_ASYNC;
2339 }
2340
2341 return (VFS_RETURNED);
2342 }
2343
2344 /* ARGSUSED */
2345 int
2346 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2347 {
2348 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2349
2350 if (print_vmpage_stat) {
2351 vm_countdirtypages();
2352 }
2353
2354 #if DIAGNOSTIC
2355 if (syncprt)
2356 vfs_bufstats();
2357 #endif /* DIAGNOSTIC */
2358 return 0;
2359 }
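/*
 * Illustrative user-space sketch: sync(2) schedules writes for every mounted
 * filesystem but, because the iteration above passes MNT_NOWAIT, it does not
 * wait for them to complete before returning.
 *
 *	#include <unistd.h>
 *
 *	void
 *	flush_everything(void)
 *	{
 *		sync();		// queue dirty data on all mounts; returns without waiting
 *	}
 */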
2360
2361 static void
2362 hibernate_sync_thread(void *arg, __unused wait_result_t wr)
2363 {
2364 int *timeout = (int *) arg;
2365
2366 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2367
2368 if (timeout)
2369 wakeup((caddr_t) timeout);
2370 if (print_vmpage_stat) {
2371 vm_countdirtypages();
2372 }
2373
2374 #if DIAGNOSTIC
2375 if (syncprt)
2376 vfs_bufstats();
2377 #endif /* DIAGNOSTIC */
2378 }
2379
2380 /*
2381 * Sync in a separate thread so we can time out if it blocks.
2382 */
2383 static int
2384 hibernate_sync_async(int timeout)
2385 {
2386 thread_t thd;
2387 int error;
2388 struct timespec ts = {timeout, 0};
2389
2390 lck_mtx_lock(sync_mtx_lck);
2391 if (kernel_thread_start(hibernate_sync_thread, &timeout, &thd) != KERN_SUCCESS) {
2392 printf("hibernate_sync_thread failed\n");
2393 lck_mtx_unlock(sync_mtx_lck);
2394 return (0);
2395 }
2396
2397 error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "hibernate_sync_thread", &ts);
2398 if (error) {
2399 printf("sync timed out: %d sec\n", timeout);
2400 }
2401 thread_deallocate(thd);
2402
2403 return (0);
2404 }
2405
2406 /*
2407 * An in-kernel sync for power management to call.
2408 */
2409 __private_extern__ int
2410 sync_internal(void)
2411 {
2412 (void) hibernate_sync_async(sync_timeout);
2413
2414 return 0;
2415 } /* end of sync_internal call */
2416
2417 /*
2418 * Change filesystem quotas.
2419 */
2420 #if QUOTA
2421 int
2422 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2423 {
2424 struct mount *mp;
2425 int error, quota_cmd, quota_status;
2426 caddr_t datap;
2427 size_t fnamelen;
2428 struct nameidata nd;
2429 vfs_context_t ctx = vfs_context_current();
2430 struct dqblk my_dqblk;
2431
2432 AUDIT_ARG(uid, uap->uid);
2433 AUDIT_ARG(cmd, uap->cmd);
2434 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2435 uap->path, ctx);
2436 error = namei(&nd);
2437 if (error)
2438 return (error);
2439 mp = nd.ni_vp->v_mount;
2440 vnode_put(nd.ni_vp);
2441 nameidone(&nd);
2442
2443 /* copyin any data we will need for downstream code */
2444 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2445
2446 switch (quota_cmd) {
2447 case Q_QUOTAON:
2448 /* uap->arg specifies a file from which to take the quotas */
2449 fnamelen = MAXPATHLEN;
2450 datap = kalloc(MAXPATHLEN);
2451 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2452 break;
2453 case Q_GETQUOTA:
2454 /* uap->arg is a pointer to a dqblk structure. */
2455 datap = (caddr_t) &my_dqblk;
2456 break;
2457 case Q_SETQUOTA:
2458 case Q_SETUSE:
2459 /* uap->arg is a pointer to a dqblk structure. */
2460 datap = (caddr_t) &my_dqblk;
2461 if (proc_is64bit(p)) {
2462 struct user_dqblk my_dqblk64;
2463 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2464 if (error == 0) {
2465 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2466 }
2467 }
2468 else {
2469 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2470 }
2471 break;
2472 case Q_QUOTASTAT:
2473 /* uap->arg is a pointer to an integer */
2474 datap = (caddr_t) &quota_status;
2475 break;
2476 default:
2477 datap = NULL;
2478 break;
2479 } /* switch */
2480
2481 if (error == 0) {
2482 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2483 }
2484
2485 switch (quota_cmd) {
2486 case Q_QUOTAON:
2487 if (datap != NULL)
2488 kfree(datap, MAXPATHLEN);
2489 break;
2490 case Q_GETQUOTA:
2491 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2492 if (error == 0) {
2493 if (proc_is64bit(p)) {
2494 struct user_dqblk my_dqblk64;
2495
2496 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2497 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2498 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2499 }
2500 else {
2501 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2502 }
2503 }
2504 break;
2505 case Q_QUOTASTAT:
2506 /* uap->arg is a pointer to an integer */
2507 if (error == 0) {
2508 error = copyout(datap, uap->arg, sizeof(quota_status));
2509 }
2510 break;
2511 default:
2512 break;
2513 } /* switch */
2514
2515 return (error);
2516 }
2517 #else
2518 int
2519 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2520 {
2521 return (EOPNOTSUPP);
2522 }
2523 #endif /* QUOTA */
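/*
 * Illustrative user-space sketch (assumes quotas are enabled on the volume;
 * the mount point and uid are hypothetical): fetch the user quota record for
 * a mount, which reaches the Q_GETQUOTA path above.
 *
 *	#include <sys/types.h>
 *	#include <sys/quota.h>
 *	#include <stdio.h>
 *
 *	int
 *	show_quota(const char *mountpoint, uid_t uid)
 *	{
 *		struct dqblk dqb;
 *
 *		if (quotactl(mountpoint, QCMD(Q_GETQUOTA, USRQUOTA), uid,
 *		    (caddr_t)&dqb) == -1) {
 *			perror("quotactl");
 *			return -1;
 *		}
 *		return 0;
 *	}
 */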
2524
2525 /*
2526 * Get filesystem statistics.
2527 *
2528 * Returns: 0 Success
2529 * namei:???
2530 * vfs_update_vfsstat:???
2531 * munge_statfs:EFAULT
2532 */
2533 /* ARGSUSED */
2534 int
2535 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2536 {
2537 struct mount *mp;
2538 struct vfsstatfs *sp;
2539 int error;
2540 struct nameidata nd;
2541 vfs_context_t ctx = vfs_context_current();
2542 vnode_t vp;
2543
2544 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2545 UIO_USERSPACE, uap->path, ctx);
2546 error = namei(&nd);
2547 if (error != 0)
2548 return (error);
2549 vp = nd.ni_vp;
2550 mp = vp->v_mount;
2551 sp = &mp->mnt_vfsstat;
2552 nameidone(&nd);
2553
2554 #if CONFIG_MACF
2555 error = mac_mount_check_stat(ctx, mp);
2556 if (error != 0)
2557 return (error);
2558 #endif
2559
2560 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2561 if (error != 0) {
2562 vnode_put(vp);
2563 return (error);
2564 }
2565
2566 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2567 vnode_put(vp);
2568 return (error);
2569 }
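/*
 * Illustrative user-space sketch (the path is hypothetical): statfs(2) fills
 * in a struct statfs for the filesystem containing the given path.
 *
 *	#include <sys/param.h>
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	void
 *	report_fs(const char *path)
 *	{
 *		struct statfs sfs;
 *
 *		if (statfs(path, &sfs) == 0)
 *			printf("%s: %s mounted on %s, %u-byte blocks\n",
 *			    path, sfs.f_fstypename, sfs.f_mntonname,
 *			    (unsigned int)sfs.f_bsize);
 *	}
 */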
2570
2571 /*
2572 * Get filesystem statistics.
2573 */
2574 /* ARGSUSED */
2575 int
2576 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2577 {
2578 vnode_t vp;
2579 struct mount *mp;
2580 struct vfsstatfs *sp;
2581 int error;
2582
2583 AUDIT_ARG(fd, uap->fd);
2584
2585 if ( (error = file_vnode(uap->fd, &vp)) )
2586 return (error);
2587
2588 error = vnode_getwithref(vp);
2589 if (error) {
2590 file_drop(uap->fd);
2591 return (error);
2592 }
2593
2594 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2595
2596 mp = vp->v_mount;
2597 if (!mp) {
2598 error = EBADF;
2599 goto out;
2600 }
2601
2602 #if CONFIG_MACF
2603 error = mac_mount_check_stat(vfs_context_current(), mp);
2604 if (error != 0)
2605 goto out;
2606 #endif
2607
2608 sp = &mp->mnt_vfsstat;
2609 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2610 goto out;
2611 }
2612
2613 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2614
2615 out:
2616 file_drop(uap->fd);
2617 vnode_put(vp);
2618
2619 return (error);
2620 }
2621
2622 /*
2623 * Common routine to handle copying of statfs64 data to user space
2624 */
2625 static int
2626 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2627 {
2628 int error;
2629 struct statfs64 sfs;
2630
2631 bzero(&sfs, sizeof(sfs));
2632
2633 sfs.f_bsize = sfsp->f_bsize;
2634 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2635 sfs.f_blocks = sfsp->f_blocks;
2636 sfs.f_bfree = sfsp->f_bfree;
2637 sfs.f_bavail = sfsp->f_bavail;
2638 sfs.f_files = sfsp->f_files;
2639 sfs.f_ffree = sfsp->f_ffree;
2640 sfs.f_fsid = sfsp->f_fsid;
2641 sfs.f_owner = sfsp->f_owner;
2642 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2643 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2644 sfs.f_fssubtype = sfsp->f_fssubtype;
2645 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2646 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2647 } else {
2648 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2649 }
2650 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2651 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2652
2653 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2654
2655 return(error);
2656 }
2657
2658 /*
2659 * Get file system statistics in 64-bit mode
2660 */
2661 int
2662 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2663 {
2664 struct mount *mp;
2665 struct vfsstatfs *sp;
2666 int error;
2667 struct nameidata nd;
2668 vfs_context_t ctxp = vfs_context_current();
2669 vnode_t vp;
2670
2671 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2672 UIO_USERSPACE, uap->path, ctxp);
2673 error = namei(&nd);
2674 if (error != 0)
2675 return (error);
2676 vp = nd.ni_vp;
2677 mp = vp->v_mount;
2678 sp = &mp->mnt_vfsstat;
2679 nameidone(&nd);
2680
2681 #if CONFIG_MACF
2682 error = mac_mount_check_stat(ctxp, mp);
2683 if (error != 0)
2684 return (error);
2685 #endif
2686
2687 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2688 if (error != 0) {
2689 vnode_put(vp);
2690 return (error);
2691 }
2692
2693 error = statfs64_common(mp, sp, uap->buf);
2694 vnode_put(vp);
2695
2696 return (error);
2697 }
2698
2699 /*
2700 * Get file system statistics in 64-bit mode
2701 */
2702 int
2703 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2704 {
2705 struct vnode *vp;
2706 struct mount *mp;
2707 struct vfsstatfs *sp;
2708 int error;
2709
2710 AUDIT_ARG(fd, uap->fd);
2711
2712 if ( (error = file_vnode(uap->fd, &vp)) )
2713 return (error);
2714
2715 error = vnode_getwithref(vp);
2716 if (error) {
2717 file_drop(uap->fd);
2718 return (error);
2719 }
2720
2721 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2722
2723 mp = vp->v_mount;
2724 if (!mp) {
2725 error = EBADF;
2726 goto out;
2727 }
2728
2729 #if CONFIG_MACF
2730 error = mac_mount_check_stat(vfs_context_current(), mp);
2731 if (error != 0)
2732 goto out;
2733 #endif
2734
2735 sp = &mp->mnt_vfsstat;
2736 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2737 goto out;
2738 }
2739
2740 error = statfs64_common(mp, sp, uap->buf);
2741
2742 out:
2743 file_drop(uap->fd);
2744 vnode_put(vp);
2745
2746 return (error);
2747 }
2748
2749 struct getfsstat_struct {
2750 user_addr_t sfsp;
2751 user_addr_t *mp;
2752 int count;
2753 int maxcount;
2754 int flags;
2755 int error;
2756 };
2757
2758
2759 static int
2760 getfsstat_callback(mount_t mp, void * arg)
2761 {
2762
2763 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2764 struct vfsstatfs *sp;
2765 int error, my_size;
2766 vfs_context_t ctx = vfs_context_current();
2767
2768 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2769 #if CONFIG_MACF
2770 error = mac_mount_check_stat(ctx, mp);
2771 if (error != 0) {
2772 fstp->error = error;
2773 return(VFS_RETURNED_DONE);
2774 }
2775 #endif
2776 sp = &mp->mnt_vfsstat;
2777 /*
2778 * If MNT_NOWAIT is specified, do not refresh the
2779 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2780 */
2781 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2782 (error = vfs_update_vfsstat(mp, ctx,
2783 VFS_USER_EVENT))) {
2784 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2785 return(VFS_RETURNED);
2786 }
2787
2788 /*
2789 * Need to handle LP64 version of struct statfs
2790 */
2791 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2792 if (error) {
2793 fstp->error = error;
2794 return(VFS_RETURNED_DONE);
2795 }
2796 fstp->sfsp += my_size;
2797
2798 if (fstp->mp) {
2799 #if CONFIG_MACF
2800 error = mac_mount_label_get(mp, *fstp->mp);
2801 if (error) {
2802 fstp->error = error;
2803 return(VFS_RETURNED_DONE);
2804 }
2805 #endif
2806 fstp->mp++;
2807 }
2808 }
2809 fstp->count++;
2810 return(VFS_RETURNED);
2811 }
2812
2813 /*
2814 * Get statistics on all filesystems.
2815 */
2816 int
2817 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2818 {
2819 struct __mac_getfsstat_args muap;
2820
2821 muap.buf = uap->buf;
2822 muap.bufsize = uap->bufsize;
2823 muap.mac = USER_ADDR_NULL;
2824 muap.macsize = 0;
2825 muap.flags = uap->flags;
2826
2827 return (__mac_getfsstat(p, &muap, retval));
2828 }
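/*
 * Illustrative user-space sketch: list all mounted filesystems.  MNT_NOWAIT
 * returns the cached statistics without asking each filesystem to refresh
 * them, matching the fast path in getfsstat_callback() above.  A NULL buffer
 * asks only for the count of mounts.
 *
 *	#include <sys/param.h>
 *	#include <sys/ucred.h>
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	void
 *	list_mounts(void)
 *	{
 *		int n = getfsstat(NULL, 0, MNT_NOWAIT);
 *		if (n <= 0)
 *			return;
 *		struct statfs *buf = calloc(n, sizeof(*buf));
 *		if (buf == NULL)
 *			return;
 *		n = getfsstat(buf, n * (int)sizeof(*buf), MNT_NOWAIT);
 *		for (int i = 0; i < n; i++)
 *			printf("%s on %s (%s)\n", buf[i].f_mntfromname,
 *			    buf[i].f_mntonname, buf[i].f_fstypename);
 *		free(buf);
 *	}
 */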
2829
2830 /*
2831 * __mac_getfsstat: Get MAC-related file system statistics
2832 *
2833 * Parameters: p (ignored)
2834 * uap User argument descriptor (see below)
2835 * retval Count of file system statistics (N stats)
2836 *
2837 * Indirect: uap->bufsize Buffer size
2838 * uap->macsize MAC info size
2839 * uap->buf Buffer where information will be returned
2840 * uap->mac MAC info
2841 * uap->flags File system flags
2842 *
2843 *
2844 * Returns: 0 Success
2845 * !0 Not success
2846 *
2847 */
2848 int
2849 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2850 {
2851 user_addr_t sfsp;
2852 user_addr_t *mp;
2853 size_t count, maxcount, bufsize, macsize;
2854 struct getfsstat_struct fst;
2855
2856 bufsize = (size_t) uap->bufsize;
2857 macsize = (size_t) uap->macsize;
2858
2859 if (IS_64BIT_PROCESS(p)) {
2860 maxcount = bufsize / sizeof(struct user64_statfs);
2861 }
2862 else {
2863 maxcount = bufsize / sizeof(struct user32_statfs);
2864 }
2865 sfsp = uap->buf;
2866 count = 0;
2867
2868 mp = NULL;
2869
2870 #if CONFIG_MACF
2871 if (uap->mac != USER_ADDR_NULL) {
2872 u_int32_t *mp0;
2873 int error;
2874 unsigned int i;
2875
2876 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2877 if (count != maxcount)
2878 return (EINVAL);
2879
2880 /* Copy in the array */
2881 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2882 if (mp0 == NULL) {
2883 return (ENOMEM);
2884 }
2885
2886 error = copyin(uap->mac, mp0, macsize);
2887 if (error) {
2888 FREE(mp0, M_MACTEMP);
2889 return (error);
2890 }
2891
2892 /* Normalize to an array of user_addr_t */
2893 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2894 if (mp == NULL) {
2895 FREE(mp0, M_MACTEMP);
2896 return (ENOMEM);
2897 }
2898
2899 for (i = 0; i < count; i++) {
2900 if (IS_64BIT_PROCESS(p))
2901 mp[i] = ((user_addr_t *)mp0)[i];
2902 else
2903 mp[i] = (user_addr_t)mp0[i];
2904 }
2905 FREE(mp0, M_MACTEMP);
2906 }
2907 #endif
2908
2909
2910 fst.sfsp = sfsp;
2911 fst.mp = mp;
2912 fst.flags = uap->flags;
2913 fst.count = 0;
2914 fst.error = 0;
2915 fst.maxcount = maxcount;
2916
2917
2918 vfs_iterate(0, getfsstat_callback, &fst);
2919
2920 if (mp)
2921 FREE(mp, M_MACTEMP);
2922
2923 if (fst.error ) {
2924 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2925 return(fst.error);
2926 }
2927
2928 if (fst.sfsp && fst.count > fst.maxcount)
2929 *retval = fst.maxcount;
2930 else
2931 *retval = fst.count;
2932 return (0);
2933 }
2934
2935 static int
2936 getfsstat64_callback(mount_t mp, void * arg)
2937 {
2938 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2939 struct vfsstatfs *sp;
2940 int error;
2941
2942 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2943 #if CONFIG_MACF
2944 error = mac_mount_check_stat(vfs_context_current(), mp);
2945 if (error != 0) {
2946 fstp->error = error;
2947 return(VFS_RETURNED_DONE);
2948 }
2949 #endif
2950 sp = &mp->mnt_vfsstat;
2951 /*
2952 * If MNT_NOWAIT is specified, do not refresh the fsstat
2953 * cache. MNT_WAIT overrides MNT_NOWAIT.
2954 *
2955 * We treat MNT_DWAIT as MNT_WAIT for all instances of
2956 * getfsstat, since the constants are out of the same
2957 * namespace.
2958 */
2959 if (((fstp->flags & MNT_NOWAIT) == 0 ||
2960 (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2961 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2962 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2963 return(VFS_RETURNED);
2964 }
2965
2966 error = statfs64_common(mp, sp, fstp->sfsp);
2967 if (error) {
2968 fstp->error = error;
2969 return(VFS_RETURNED_DONE);
2970 }
2971 fstp->sfsp += sizeof(struct statfs64);
2972 }
2973 fstp->count++;
2974 return(VFS_RETURNED);
2975 }
2976
2977 /*
2978 * Get statistics on all file systems in 64 bit mode.
2979 */
2980 int
2981 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2982 {
2983 user_addr_t sfsp;
2984 int count, maxcount;
2985 struct getfsstat_struct fst;
2986
2987 maxcount = uap->bufsize / sizeof(struct statfs64);
2988
2989 sfsp = uap->buf;
2990 count = 0;
2991
2992 fst.sfsp = sfsp;
2993 fst.flags = uap->flags;
2994 fst.count = 0;
2995 fst.error = 0;
2996 fst.maxcount = maxcount;
2997
2998 vfs_iterate(0, getfsstat64_callback, &fst);
2999
3000 if (fst.error ) {
3001 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3002 return(fst.error);
3003 }
3004
3005 if (fst.sfsp && fst.count > fst.maxcount)
3006 *retval = fst.maxcount;
3007 else
3008 *retval = fst.count;
3009
3010 return (0);
3011 }
3012
3013 /*
3014 * Gets the vnode associated with the file descriptor passed
3015 * as input.
3016 *
3017 * INPUT
3018 * ctx - vfs context of caller
3019 * fd - file descriptor for which vnode is required.
3020 * vpp - Pointer to pointer to vnode to be returned.
3021 *
3022 * The vnode is returned with an iocount so any vnode obtained
3023 * by this call needs a vnode_put
3024 *
3025 */
3026 int
3027 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3028 {
3029 int error;
3030 vnode_t vp;
3031 struct fileproc *fp;
3032 proc_t p = vfs_context_proc(ctx);
3033
3034 *vpp = NULLVP;
3035
3036 error = fp_getfvp(p, fd, &fp, &vp);
3037 if (error)
3038 return (error);
3039
3040 error = vnode_getwithref(vp);
3041 if (error) {
3042 (void)fp_drop(p, fd, fp, 0);
3043 return (error);
3044 }
3045
3046 (void)fp_drop(p, fd, fp, 0);
3047 *vpp = vp;
3048 return (error);
3049 }
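/*
 * Hypothetical in-kernel usage sketch (ctx and fd are assumed to be a valid
 * vfs_context_t and file descriptor): any vnode obtained through
 * vnode_getfromfd() carries an iocount and must be released with vnode_put().
 *
 *	vnode_t vp;
 *	int err = vnode_getfromfd(ctx, fd, &vp);
 *	if (err == 0) {
 *		// ... operate on vp while holding the iocount ...
 *		vnode_put(vp);
 *	}
 */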
3050
3051 /*
3052 * Wrapper function around namei to start lookup from a directory
3053 * specified by a file descriptor (dirfd).
3054 *
3055 * In addition to all the errors returned by namei, this call can
3056 * return ENOTDIR if the file descriptor does not refer to a directory,
3057 * and EBADF if the file descriptor is not valid.
3058 */
3059 int
3060 nameiat(struct nameidata *ndp, int dirfd)
3061 {
3062 if ((dirfd != AT_FDCWD) &&
3063 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3064 !(ndp->ni_cnd.cn_flags & USEDVP)) {
3065 int error = 0;
3066 char c;
3067
3068 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3069 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3070 if (error)
3071 return (error);
3072 } else {
3073 c = *((char *)(ndp->ni_dirp));
3074 }
3075
3076 if (c != '/') {
3077 vnode_t dvp_at;
3078
3079 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3080 &dvp_at);
3081 if (error)
3082 return (error);
3083
3084 if (vnode_vtype(dvp_at) != VDIR) {
3085 vnode_put(dvp_at);
3086 return (ENOTDIR);
3087 }
3088
3089 ndp->ni_dvp = dvp_at;
3090 ndp->ni_cnd.cn_flags |= USEDVP;
3091 error = namei(ndp);
3092 ndp->ni_cnd.cn_flags &= ~USEDVP;
3093 vnode_put(dvp_at);
3094 return (error);
3095 }
3096 }
3097
3098 return (namei(ndp));
3099 }
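/*
 * Hypothetical sketch of how an *at handler drives nameiat() (path, ctx and
 * dirfd are assumed locals): set up the nameidata exactly as for namei(),
 * then pass the directory fd alongside it.  AT_FDCWD behaves like plain
 * namei().
 *
 *	struct nameidata nd;
 *	vnode_t vp;
 *
 *	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1,
 *	    UIO_USERSPACE, path, ctx);
 *	error = nameiat(&nd, dirfd);
 *	if (error == 0) {
 *		vp = nd.ni_vp;
 *		nameidone(&nd);
 *		// ... use vp ...
 *		vnode_put(vp);
 *	}
 */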
3100
3101 /*
3102 * Change current working directory to a given file descriptor.
3103 */
3104 /* ARGSUSED */
3105 static int
3106 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3107 {
3108 struct filedesc *fdp = p->p_fd;
3109 vnode_t vp;
3110 vnode_t tdp;
3111 vnode_t tvp;
3112 struct mount *mp;
3113 int error;
3114 vfs_context_t ctx = vfs_context_current();
3115
3116 AUDIT_ARG(fd, uap->fd);
3117 if (per_thread && uap->fd == -1) {
3118 /*
3119 * Switching back from per-thread to per process CWD; verify we
3120 * in fact have one before proceeding. The only success case
3121 * for this code path is to return 0 preemptively after zapping
3122 * the thread structure contents.
3123 */
3124 thread_t th = vfs_context_thread(ctx);
3125 if (th) {
3126 uthread_t uth = get_bsdthread_info(th);
3127 tvp = uth->uu_cdir;
3128 uth->uu_cdir = NULLVP;
3129 if (tvp != NULLVP) {
3130 vnode_rele(tvp);
3131 return (0);
3132 }
3133 }
3134 return (EBADF);
3135 }
3136
3137 if ( (error = file_vnode(uap->fd, &vp)) )
3138 return(error);
3139 if ( (error = vnode_getwithref(vp)) ) {
3140 file_drop(uap->fd);
3141 return(error);
3142 }
3143
3144 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3145
3146 if (vp->v_type != VDIR) {
3147 error = ENOTDIR;
3148 goto out;
3149 }
3150
3151 #if CONFIG_MACF
3152 error = mac_vnode_check_chdir(ctx, vp);
3153 if (error)
3154 goto out;
3155 #endif
3156 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3157 if (error)
3158 goto out;
3159
3160 while (!error && (mp = vp->v_mountedhere) != NULL) {
3161 if (vfs_busy(mp, LK_NOWAIT)) {
3162 error = EACCES;
3163 goto out;
3164 }
3165 error = VFS_ROOT(mp, &tdp, ctx);
3166 vfs_unbusy(mp);
3167 if (error)
3168 break;
3169 vnode_put(vp);
3170 vp = tdp;
3171 }
3172 if (error)
3173 goto out;
3174 if ( (error = vnode_ref(vp)) )
3175 goto out;
3176 vnode_put(vp);
3177
3178 if (per_thread) {
3179 thread_t th = vfs_context_thread(ctx);
3180 if (th) {
3181 uthread_t uth = get_bsdthread_info(th);
3182 tvp = uth->uu_cdir;
3183 uth->uu_cdir = vp;
3184 OSBitOrAtomic(P_THCWD, &p->p_flag);
3185 } else {
3186 vnode_rele(vp);
3187 return (ENOENT);
3188 }
3189 } else {
3190 proc_fdlock(p);
3191 tvp = fdp->fd_cdir;
3192 fdp->fd_cdir = vp;
3193 proc_fdunlock(p);
3194 }
3195
3196 if (tvp)
3197 vnode_rele(tvp);
3198 file_drop(uap->fd);
3199
3200 return (0);
3201 out:
3202 vnode_put(vp);
3203 file_drop(uap->fd);
3204
3205 return(error);
3206 }
3207
3208 int
3209 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3210 {
3211 return common_fchdir(p, uap, 0);
3212 }
3213
3214 int
3215 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3216 {
3217 return common_fchdir(p, (void *)uap, 1);
3218 }
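/*
 * Illustrative user-space sketch (the target directory is hypothetical):
 * save and restore the working directory around some operation by holding a
 * directory fd and using fchdir(2) to switch back.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	with_tmp_cwd(void)
 *	{
 *		int saved = open(".", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
 *		if (saved == -1)
 *			return -1;
 *		if (chdir("/tmp") == -1) {
 *			close(saved);
 *			return -1;
 *		}
 *		// ... work relative to /tmp ...
 *		(void)fchdir(saved);	// restore the previous cwd
 *		close(saved);
 *		return 0;
 *	}
 */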
3219
3220 /*
3221 * Change current working directory (".").
3222 *
3223 * Returns: 0 Success
3224 * change_dir:ENOTDIR
3225 * change_dir:???
3226 * vnode_ref:ENOENT No such file or directory
3227 */
3228 /* ARGSUSED */
3229 static int
3230 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3231 {
3232 struct filedesc *fdp = p->p_fd;
3233 int error;
3234 struct nameidata nd;
3235 vnode_t tvp;
3236 vfs_context_t ctx = vfs_context_current();
3237
3238 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3239 UIO_USERSPACE, uap->path, ctx);
3240 error = change_dir(&nd, ctx);
3241 if (error)
3242 return (error);
3243 if ( (error = vnode_ref(nd.ni_vp)) ) {
3244 vnode_put(nd.ni_vp);
3245 return (error);
3246 }
3247 /*
3248 * drop the iocount we picked up in change_dir
3249 */
3250 vnode_put(nd.ni_vp);
3251
3252 if (per_thread) {
3253 thread_t th = vfs_context_thread(ctx);
3254 if (th) {
3255 uthread_t uth = get_bsdthread_info(th);
3256 tvp = uth->uu_cdir;
3257 uth->uu_cdir = nd.ni_vp;
3258 OSBitOrAtomic(P_THCWD, &p->p_flag);
3259 } else {
3260 vnode_rele(nd.ni_vp);
3261 return (ENOENT);
3262 }
3263 } else {
3264 proc_fdlock(p);
3265 tvp = fdp->fd_cdir;
3266 fdp->fd_cdir = nd.ni_vp;
3267 proc_fdunlock(p);
3268 }
3269
3270 if (tvp)
3271 vnode_rele(tvp);
3272
3273 return (0);
3274 }
3275
3276
3277 /*
3278 * chdir
3279 *
3280 * Change current working directory (".") for the entire process
3281 *
3282 * Parameters: p Process requesting the call
3283 * uap User argument descriptor (see below)
3284 * retval (ignored)
3285 *
3286 * Indirect parameters: uap->path Directory path
3287 *
3288 * Returns: 0 Success
3289 * common_chdir: ENOTDIR
3290 * common_chdir: ENOENT No such file or directory
3291 * common_chdir: ???
3292 *
3293 */
3294 int
3295 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3296 {
3297 return common_chdir(p, (void *)uap, 0);
3298 }
3299
3300 /*
3301 * __pthread_chdir
3302 *
3303 * Change current working directory (".") for a single thread
3304 *
3305 * Parameters: p Process requesting the call
3306 * uap User argument descriptor (see below)
3307 * retval (ignored)
3308 *
3309 * Indirect parameters: uap->path Directory path
3310 *
3311 * Returns: 0 Success
3312 * common_chdir: ENOTDIR
3313 * common_chdir: ENOENT No such file or directory
3314 * common_chdir: ???
3315 *
3316 */
3317 int
3318 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3319 {
3320 return common_chdir(p, (void *)uap, 1);
3321 }
3322
3323
3324 /*
3325 * Change notion of root (``/'') directory.
3326 */
3327 /* ARGSUSED */
3328 int
3329 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3330 {
3331 struct filedesc *fdp = p->p_fd;
3332 int error;
3333 struct nameidata nd;
3334 vnode_t tvp;
3335 vfs_context_t ctx = vfs_context_current();
3336
3337 if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3338 return (error);
3339
3340 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3341 UIO_USERSPACE, uap->path, ctx);
3342 error = change_dir(&nd, ctx);
3343 if (error)
3344 return (error);
3345
3346 #if CONFIG_MACF
3347 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3348 &nd.ni_cnd);
3349 if (error) {
3350 vnode_put(nd.ni_vp);
3351 return (error);
3352 }
3353 #endif
3354
3355 if ( (error = vnode_ref(nd.ni_vp)) ) {
3356 vnode_put(nd.ni_vp);
3357 return (error);
3358 }
3359 vnode_put(nd.ni_vp);
3360
3361 proc_fdlock(p);
3362 tvp = fdp->fd_rdir;
3363 fdp->fd_rdir = nd.ni_vp;
3364 fdp->fd_flags |= FD_CHROOT;
3365 proc_fdunlock(p);
3366
3367 if (tvp != NULL)
3368 vnode_rele(tvp);
3369
3370 return (0);
3371 }
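/*
 * Illustrative user-space sketch (must run as root; the new root path is
 * hypothetical): change the root directory and then the working directory so
 * that "/" resolves inside the new root.
 *
 *	#include <unistd.h>
 *
 *	int
 *	enter_jail(const char *newroot)
 *	{
 *		if (chroot(newroot) == -1)
 *			return -1;
 *		if (chdir("/") == -1)
 *			return -1;
 *		return 0;
 *	}
 */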
3372
3373 /*
3374 * Common routine for chroot and chdir.
3375 *
3376 * Returns: 0 Success
3377 * ENOTDIR Not a directory
3378 * namei:??? [anything namei can return]
3379 * vnode_authorize:??? [anything vnode_authorize can return]
3380 */
3381 static int
3382 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3383 {
3384 vnode_t vp;
3385 int error;
3386
3387 if ((error = namei(ndp)))
3388 return (error);
3389 nameidone(ndp);
3390 vp = ndp->ni_vp;
3391
3392 if (vp->v_type != VDIR) {
3393 vnode_put(vp);
3394 return (ENOTDIR);
3395 }
3396
3397 #if CONFIG_MACF
3398 error = mac_vnode_check_chdir(ctx, vp);
3399 if (error) {
3400 vnode_put(vp);
3401 return (error);
3402 }
3403 #endif
3404
3405 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3406 if (error) {
3407 vnode_put(vp);
3408 return (error);
3409 }
3410
3411 return (error);
3412 }
3413
3414 /*
3415 * Allocate the vnode data (for directories) to be associated with the file glob.
3416 */
3417 struct fd_vn_data *
3418 fg_vn_data_alloc(void)
3419 {
3420 struct fd_vn_data *fvdata;
3421
3422 /* Allocate per fd vnode data */
3423 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3424 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3425 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3426 return fvdata;
3427 }
3428
3429 /*
3430 * Free the vnode data (for directories) associated with the file glob.
3431 */
3432 void
3433 fg_vn_data_free(void *fgvndata)
3434 {
3435 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3436
3437 if (fvdata->fv_buf)
3438 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3439 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3440 FREE(fvdata, M_FD_VN_DATA);
3441 }
3442
3443 /*
3444 * Check permissions, allocate an open file structure,
3445 * and call the device open routine if any.
3446 *
3447 * Returns: 0 Success
3448 * EINVAL
3449 * EINTR
3450 * falloc:ENFILE
3451 * falloc:EMFILE
3452 * falloc:ENOMEM
3453 * vn_open_auth:???
3454 * dupfdopen:???
3455 * VNOP_ADVLOCK:???
3456 * vnode_setsize:???
3457 *
3458 * XXX Need to implement uid, gid
3459 */
3460 int
3461 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3462 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3463 int32_t *retval)
3464 {
3465 proc_t p = vfs_context_proc(ctx);
3466 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3467 struct fileproc *fp;
3468 vnode_t vp;
3469 int flags, oflags;
3470 int type, indx, error;
3471 struct flock lf;
3472 struct vfs_context context;
3473
3474 oflags = uflags;
3475
3476 if ((oflags & O_ACCMODE) == O_ACCMODE)
3477 return(EINVAL);
3478
3479 flags = FFLAGS(uflags);
3480 CLR(flags, FENCRYPTED);
3481 CLR(flags, FUNENCRYPTED);
3482
3483 AUDIT_ARG(fflags, oflags);
3484 AUDIT_ARG(mode, vap->va_mode);
3485
3486 if ((error = falloc_withalloc(p,
3487 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3488 return (error);
3489 }
3490 uu->uu_dupfd = -indx - 1;
3491
3492 if ((error = vn_open_auth(ndp, &flags, vap))) {
3493 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){ /* XXX from fdopen */
3494 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3495 fp_drop(p, indx, NULL, 0);
3496 *retval = indx;
3497 return (0);
3498 }
3499 }
3500 if (error == ERESTART)
3501 error = EINTR;
3502 fp_free(p, indx, fp);
3503 return (error);
3504 }
3505 uu->uu_dupfd = 0;
3506 vp = ndp->ni_vp;
3507
3508 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3509 fp->f_fglob->fg_ops = &vnops;
3510 fp->f_fglob->fg_data = (caddr_t)vp;
3511
3512 if (flags & (O_EXLOCK | O_SHLOCK)) {
3513 lf.l_whence = SEEK_SET;
3514 lf.l_start = 0;
3515 lf.l_len = 0;
3516 if (flags & O_EXLOCK)
3517 lf.l_type = F_WRLCK;
3518 else
3519 lf.l_type = F_RDLCK;
3520 type = F_FLOCK;
3521 if ((flags & FNONBLOCK) == 0)
3522 type |= F_WAIT;
3523 #if CONFIG_MACF
3524 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3525 F_SETLK, &lf);
3526 if (error)
3527 goto bad;
3528 #endif
3529 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3530 goto bad;
3531 fp->f_fglob->fg_flag |= FHASLOCK;
3532 }
3533
3534 #if DEVELOPMENT || DEBUG
3535 /*
3536 * XXX VSWAP: Check for entitlements or special flag here
3537 * so we can restrict access appropriately.
3538 */
3539 #else /* DEVELOPMENT || DEBUG */
3540
3541 if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
3542 /* block attempt to write/truncate swapfile */
3543 error = EPERM;
3544 goto bad;
3545 }
3546 #endif /* DEVELOPMENT || DEBUG */
3547
3548 /* try to truncate by setting the size attribute */
3549 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3550 goto bad;
3551
3552 /*
3553 * For directories we hold some additional information in the fd.
3554 */
3555 if (vnode_vtype(vp) == VDIR) {
3556 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3557 } else {
3558 fp->f_fglob->fg_vn_data = NULL;
3559 }
3560
3561 vnode_put(vp);
3562
3563 /*
3564 * The first terminal open (without O_NOCTTY) by a session leader
3565 * results in it being set as the controlling terminal.
3566 */
3567 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3568 !(flags & O_NOCTTY)) {
3569 int tmp = 0;
3570
3571 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3572 (caddr_t)&tmp, ctx);
3573 }
3574
3575 proc_fdlock(p);
3576 if (flags & O_CLOEXEC)
3577 *fdflags(p, indx) |= UF_EXCLOSE;
3578 if (flags & O_CLOFORK)
3579 *fdflags(p, indx) |= UF_FORKCLOSE;
3580 procfdtbl_releasefd(p, indx, NULL);
3581
3582 #if CONFIG_SECLUDED_MEMORY
3583 if (secluded_for_filecache &&
3584 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3585 vnode_vtype(vp) == VREG) {
3586 memory_object_control_t moc;
3587
3588 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3589
3590 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3591 /* nothing to do... */
3592 } else if (fp->f_fglob->fg_flag & FWRITE) {
3593 /* writable -> no longer eligible for secluded pages */
3594 memory_object_mark_eligible_for_secluded(moc,
3595 FALSE);
3596 } else if (secluded_for_filecache == 1) {
3597 char pathname[32] = { 0, };
3598 size_t copied;
3599 /* XXX FBDP: better way to detect /Applications/ ? */
3600 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3601 copyinstr(ndp->ni_dirp,
3602 pathname,
3603 sizeof (pathname),
3604 &copied);
3605 } else {
3606 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3607 pathname,
3608 sizeof (pathname),
3609 &copied);
3610 }
3611 pathname[sizeof (pathname) - 1] = '\0';
3612 if (strncmp(pathname,
3613 "/Applications/",
3614 strlen("/Applications/")) == 0 &&
3615 strncmp(pathname,
3616 "/Applications/Camera.app/",
3617 strlen("/Applications/Camera.app/")) != 0) {
3618 /*
3619 * not writable
3620 * AND from "/Applications/"
3621 * AND not from "/Applications/Camera.app/"
3622 * ==> eligible for secluded
3623 */
3624 memory_object_mark_eligible_for_secluded(moc,
3625 TRUE);
3626 }
3627 } else if (secluded_for_filecache == 2) {
3628 #if __arm64__
3629 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
3630 #elif __arm__
3631 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
3632 #else
3633 /* not implemented... */
3634 #endif
3635 if (!strncmp(vp->v_name,
3636 DYLD_SHARED_CACHE_NAME,
3637 strlen(DYLD_SHARED_CACHE_NAME)) ||
3638 !strncmp(vp->v_name,
3639 "dyld",
3640 strlen(vp->v_name)) ||
3641 !strncmp(vp->v_name,
3642 "launchd",
3643 strlen(vp->v_name)) ||
3644 !strncmp(vp->v_name,
3645 "Camera",
3646 strlen(vp->v_name)) ||
3647 !strncmp(vp->v_name,
3648 "mediaserverd",
3649 strlen(vp->v_name))) {
3650 /*
3651 * This file matters when launching Camera:
3652 * do not store its contents in the secluded
3653 * pool that will be drained on Camera launch.
3654 */
3655 memory_object_mark_eligible_for_secluded(moc,
3656 FALSE);
3657 }
3658 }
3659 }
3660 #endif /* CONFIG_SECLUDED_MEMORY */
3661
3662 fp_drop(p, indx, fp, 1);
3663 proc_fdunlock(p);
3664
3665 *retval = indx;
3666
3667 return (0);
3668 bad:
3669 context = *vfs_context_current();
3670 context.vc_ucred = fp->f_fglob->fg_cred;
3671
3672 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3673 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3674 lf.l_whence = SEEK_SET;
3675 lf.l_start = 0;
3676 lf.l_len = 0;
3677 lf.l_type = F_UNLCK;
3678
3679 (void)VNOP_ADVLOCK(
3680 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3681 }
3682
3683 vn_close(vp, fp->f_fglob->fg_flag, &context);
3684 vnode_put(vp);
3685 fp_free(p, indx, fp);
3686
3687 return (error);
3688 }
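/*
 * Illustrative user-space sketch of the flag handling above (path and mode
 * are hypothetical): O_EXLOCK asks open1() to take an exclusive flock-style
 * advisory lock before the descriptor is returned, and O_CLOEXEC marks the
 * new descriptor close-on-exec.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	open_locked(const char *path)
 *	{
 *		int fd = open(path, O_RDWR | O_CREAT | O_EXLOCK | O_CLOEXEC, 0644);
 *		// With O_NONBLOCK the lock attempt fails immediately instead of
 *		// waiting (see the F_WAIT logic above).
 *		return fd;
 *	}
 */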
3689
3690 /*
3691 * While most of the *at syscall handlers can call nameiat() which
3692 * is a wrapper around namei, the use of namei and initialisation
3693 * of nameidata are far removed and in different functions - namei
3694 * gets called in vn_open_auth for open1. So we'll just do here what
3695 * nameiat() does.
3696 */
3697 static int
3698 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3699 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3700 int dirfd)
3701 {
3702 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3703 int error;
3704 char c;
3705
3706 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3707 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3708 if (error)
3709 return (error);
3710 } else {
3711 c = *((char *)(ndp->ni_dirp));
3712 }
3713
3714 if (c != '/') {
3715 vnode_t dvp_at;
3716
3717 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3718 &dvp_at);
3719 if (error)
3720 return (error);
3721
3722 if (vnode_vtype(dvp_at) != VDIR) {
3723 vnode_put(dvp_at);
3724 return (ENOTDIR);
3725 }
3726
3727 ndp->ni_dvp = dvp_at;
3728 ndp->ni_cnd.cn_flags |= USEDVP;
3729 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3730 retval);
3731 vnode_put(dvp_at);
3732 return (error);
3733 }
3734 }
3735
3736 return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3737 }
3738
3739 /*
3740 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3741 *
3742 * Parameters: p Process requesting the open
3743 * uap User argument descriptor (see below)
3744 * retval Pointer to an area to receive the
3745 * return value from the system call
3746 *
3747 * Indirect: uap->path Path to open (same as 'open')
3748 * uap->flags Flags to open (same as 'open'
3749 * uap->uid UID to set, if creating
3750 * uap->gid GID to set, if creating
3751 * uap->mode File mode, if creating (same as 'open')
3752 * uap->xsecurity ACL to set, if creating
3753 *
3754 * Returns: 0 Success
3755 * !0 errno value
3756 *
3757 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3758 *
3759 * XXX: We should enumerate the possible errno values here, and where
3760 * in the code they originated.
3761 */
3762 int
3763 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3764 {
3765 struct filedesc *fdp = p->p_fd;
3766 int ciferror;
3767 kauth_filesec_t xsecdst;
3768 struct vnode_attr va;
3769 struct nameidata nd;
3770 int cmode;
3771
3772 AUDIT_ARG(owner, uap->uid, uap->gid);
3773
3774 xsecdst = NULL;
3775 if ((uap->xsecurity != USER_ADDR_NULL) &&
3776 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3777 return ciferror;
3778
3779 VATTR_INIT(&va);
3780 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3781 VATTR_SET(&va, va_mode, cmode);
3782 if (uap->uid != KAUTH_UID_NONE)
3783 VATTR_SET(&va, va_uid, uap->uid);
3784 if (uap->gid != KAUTH_GID_NONE)
3785 VATTR_SET(&va, va_gid, uap->gid);
3786 if (xsecdst != NULL)
3787 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3788
3789 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3790 uap->path, vfs_context_current());
3791
3792 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3793 fileproc_alloc_init, NULL, retval);
3794 if (xsecdst != NULL)
3795 kauth_filesec_free(xsecdst);
3796
3797 return ciferror;
3798 }
3799
3800 /*
3801 * Go through the data-protected atomically controlled open (2)
3802 *
3803 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3804 */
3805 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3806 int flags = uap->flags;
3807 int class = uap->class;
3808 int dpflags = uap->dpflags;
3809
3810 /*
3811 * Follow the same path as normal open(2)
3812 * Look up the item if it exists, and acquire the vnode.
3813 */
3814 struct filedesc *fdp = p->p_fd;
3815 struct vnode_attr va;
3816 struct nameidata nd;
3817 int cmode;
3818 int error;
3819
3820 VATTR_INIT(&va);
3821 /* Mask off all but regular access permissions */
3822 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3823 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3824
3825 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3826 uap->path, vfs_context_current());
3827
3828 /*
3829 * Initialize the extra fields in vnode_attr to pass down our
3830 * extra fields.
3831 * 1. target cprotect class.
3832 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3833 */
3834 if (flags & O_CREAT) {
3835 /* lower level kernel code validates that the class is valid before applying it. */
3836 if (class != PROTECTION_CLASS_DEFAULT) {
3837 /*
3838 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3839 * file behave the same as open (2)
3840 */
3841 VATTR_SET(&va, va_dataprotect_class, class);
3842 }
3843 }
3844
3845 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3846 if ( flags & (O_RDWR | O_WRONLY)) {
3847 /* Not allowed to write raw encrypted bytes */
3848 return EINVAL;
3849 }
3850 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3851 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3852 }
3853 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3854 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3855 }
3856 }
3857
3858 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3859 fileproc_alloc_init, NULL, retval);
3860
3861 return error;
3862 }
3863
3864 static int
3865 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3866 int fd, enum uio_seg segflg, int *retval)
3867 {
3868 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3869 struct vnode_attr va;
3870 struct nameidata nd;
3871 int cmode;
3872
3873 VATTR_INIT(&va);
3874 /* Mask off all but regular access permissions */
3875 cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3876 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3877
3878 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3879 segflg, path, ctx);
3880
3881 return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3882 retval, fd));
3883 }
3884
3885 int
3886 open(proc_t p, struct open_args *uap, int32_t *retval)
3887 {
3888 __pthread_testcancel(1);
3889 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3890 }
3891
3892 int
3893 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3894 int32_t *retval)
3895 {
3896 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3897 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3898 }
3899
3900 int
3901 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3902 int32_t *retval)
3903 {
3904 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3905 uap->mode, uap->fd, UIO_USERSPACE, retval));
3906 }
3907
3908 int
3909 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3910 {
3911 __pthread_testcancel(1);
3912 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3913 }
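/*
 * Illustrative user-space sketch (the directory and file names are
 * hypothetical): openat(2) resolves a relative path against a directory fd
 * rather than the process working directory; an absolute path ignores the fd,
 * as in open1at() above.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	open_in_dir(const char *dir, const char *name)
 *	{
 *		int dfd = open(dir, O_RDONLY | O_DIRECTORY);
 *		if (dfd == -1)
 *			return -1;
 *		int fd = openat(dfd, name, O_RDONLY);
 *		close(dfd);
 *		return fd;
 *	}
 */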
3914
3915 /*
3916 * openbyid_np: open a file given a file system id and a file system object id
3917 * The HFS file system object id is an fsobj_id_t {uint32, uint32};
3918 * for file systems that don't support object ids it is a node id (uint64_t).
3919 *
3920 * Parameters: p Process requesting the open
3921 * uap User argument descriptor (see below)
3922 * retval Pointer to an area to receive the
3923 * return value from the system call
3924 *
3925 * Indirect: uap->path Path to open (same as 'open')
3926 *
3927 * uap->fsid id of target file system
3928 * uap->objid id of target file system object
3929 * uap->flags Flags to open (same as 'open')
3930 *
3931 * Returns: 0 Success
3932 * !0 errno value
3933 *
3934 *
3935 * XXX: We should enumerate the possible errno values here, and where
3936 * in the code they originated.
3937 */
3938 int
3939 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
3940 {
3941 fsid_t fsid;
3942 uint64_t objid;
3943 int error;
3944 char *buf = NULL;
3945 int buflen = MAXPATHLEN;
3946 int pathlen = 0;
3947 vfs_context_t ctx = vfs_context_current();
3948
3949 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
3950 return (error);
3951 }
3952
3953 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
3954 return (error);
3955 }
3956
3957 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
3958 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
3959 return (error);
3960 }
3961
3962 AUDIT_ARG(value32, fsid.val[0]);
3963 AUDIT_ARG(value64, objid);
3964
3965 /* resolve path from fsid, objid */
3966 do {
3967 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
3968 if (buf == NULL) {
3969 return (ENOMEM);
3970 }
3971
3972 error = fsgetpath_internal(
3973 ctx, fsid.val[0], objid,
3974 buflen, buf, &pathlen);
3975
3976 if (error) {
3977 FREE(buf, M_TEMP);
3978 buf = NULL;
3979 }
3980 } while (error == ENOSPC && (buflen += MAXPATHLEN));
3981
3982 if (error) {
3983 return error;
3984 }
3985
3986 buf[pathlen] = 0;
3987
3988 error = openat_internal(
3989 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
3990
3991 FREE(buf, M_TEMP);
3992
3993 return error;
3994 }
3995
3996
3997 /*
3998 * Create a special file.
3999 */
4000 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4001
4002 int
4003 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4004 {
4005 struct vnode_attr va;
4006 vfs_context_t ctx = vfs_context_current();
4007 int error;
4008 struct nameidata nd;
4009 vnode_t vp, dvp;
4010
4011 VATTR_INIT(&va);
4012 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4013 VATTR_SET(&va, va_rdev, uap->dev);
4014
4015 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4016 if ((uap->mode & S_IFMT) == S_IFIFO)
4017 return(mkfifo1(ctx, uap->path, &va));
4018
4019 AUDIT_ARG(mode, uap->mode);
4020 AUDIT_ARG(value32, uap->dev);
4021
4022 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
4023 return (error);
4024 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4025 UIO_USERSPACE, uap->path, ctx);
4026 error = namei(&nd);
4027 if (error)
4028 return (error);
4029 dvp = nd.ni_dvp;
4030 vp = nd.ni_vp;
4031
4032 if (vp != NULL) {
4033 error = EEXIST;
4034 goto out;
4035 }
4036
4037 switch (uap->mode & S_IFMT) {
4038 case S_IFCHR:
4039 VATTR_SET(&va, va_type, VCHR);
4040 break;
4041 case S_IFBLK:
4042 VATTR_SET(&va, va_type, VBLK);
4043 break;
4044 default:
4045 error = EINVAL;
4046 goto out;
4047 }
4048
4049 #if CONFIG_MACF
4050 error = mac_vnode_check_create(ctx,
4051 nd.ni_dvp, &nd.ni_cnd, &va);
4052 if (error)
4053 goto out;
4054 #endif
4055
4056 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4057 goto out;
4058
4059 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
4060 goto out;
4061
4062 if (vp) {
4063 int update_flags = 0;
4064
4065 // Make sure the name & parent pointers are hooked up
4066 if (vp->v_name == NULL)
4067 update_flags |= VNODE_UPDATE_NAME;
4068 if (vp->v_parent == NULLVP)
4069 update_flags |= VNODE_UPDATE_PARENT;
4070
4071 if (update_flags)
4072 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4073
4074 #if CONFIG_FSE
4075 add_fsevent(FSE_CREATE_FILE, ctx,
4076 FSE_ARG_VNODE, vp,
4077 FSE_ARG_DONE);
4078 #endif
4079 }
4080
4081 out:
4082 /*
4083 * nameidone has to happen before we vnode_put(dvp)
4084 * since it may need to release the fs_nodelock on the dvp
4085 */
4086 nameidone(&nd);
4087
4088 if (vp)
4089 vnode_put(vp);
4090 vnode_put(dvp);
4091
4092 return (error);
4093 }
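/*
 * Illustrative user-space sketch (root only; the path and device numbers are
 * hypothetical): create a character device node.  A mode of S_IFIFO would be
 * routed to mkfifo1() by the handler above instead.
 *
 *	#include <sys/types.h>
 *	#include <sys/stat.h>
 *
 *	int
 *	make_char_node(void)
 *	{
 *		return mknod("/tmp/mynode", S_IFCHR | 0600, makedev(3, 2));
 *	}
 */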
4094
4095 /*
4096 * Create a named pipe.
4097 *
4098 * Returns: 0 Success
4099 * EEXIST
4100 * namei:???
4101 * vnode_authorize:???
4102 * vn_create:???
4103 */
4104 static int
4105 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4106 {
4107 vnode_t vp, dvp;
4108 int error;
4109 struct nameidata nd;
4110
4111 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4112 UIO_USERSPACE, upath, ctx);
4113 error = namei(&nd);
4114 if (error)
4115 return (error);
4116 dvp = nd.ni_dvp;
4117 vp = nd.ni_vp;
4118
4119 /* check that this is a new file and authorize addition */
4120 if (vp != NULL) {
4121 error = EEXIST;
4122 goto out;
4123 }
4124 VATTR_SET(vap, va_type, VFIFO);
4125
4126 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
4127 goto out;
4128
4129 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4130 out:
4131 /*
4132 * nameidone has to happen before we vnode_put(dvp)
4133 * since it may need to release the fs_nodelock on the dvp
4134 */
4135 nameidone(&nd);
4136
4137 if (vp)
4138 vnode_put(vp);
4139 vnode_put(dvp);
4140
4141 return error;
4142 }
4143
4144
4145 /*
4146 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4147 *
4148 * Parameters: p Process requesting the open
4149 * uap User argument descriptor (see below)
4150 * retval (Ignored)
4151 *
4152 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4153 * uap->uid UID to set
4154 * uap->gid GID to set
4155 * uap->mode File mode to set (same as 'mkfifo')
4156 * uap->xsecurity ACL to set, if creating
4157 *
4158 * Returns: 0 Success
4159 * !0 errno value
4160 *
4161 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4162 *
4163 * XXX: We should enumerate the possible errno values here, and where
4164 * in the code they originated.
4165 */
4166 int
4167 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4168 {
4169 int ciferror;
4170 kauth_filesec_t xsecdst;
4171 struct vnode_attr va;
4172
4173 AUDIT_ARG(owner, uap->uid, uap->gid);
4174
4175 xsecdst = KAUTH_FILESEC_NONE;
4176 if (uap->xsecurity != USER_ADDR_NULL) {
4177 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
4178 return ciferror;
4179 }
4180
4181 VATTR_INIT(&va);
4182 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4183 if (uap->uid != KAUTH_UID_NONE)
4184 VATTR_SET(&va, va_uid, uap->uid);
4185 if (uap->gid != KAUTH_GID_NONE)
4186 VATTR_SET(&va, va_gid, uap->gid);
4187 if (xsecdst != KAUTH_FILESEC_NONE)
4188 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4189
4190 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4191
4192 if (xsecdst != KAUTH_FILESEC_NONE)
4193 kauth_filesec_free(xsecdst);
4194 return ciferror;
4195 }
4196
4197 /* ARGSUSED */
4198 int
4199 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4200 {
4201 struct vnode_attr va;
4202
4203 VATTR_INIT(&va);
4204 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4205
4206 return(mkfifo1(vfs_context_current(), uap->path, &va));
4207 }
4208
4209
4210 static char *
4211 my_strrchr(char *p, int ch)
4212 {
4213 char *save;
4214
4215 for (save = NULL;; ++p) {
4216 if (*p == ch)
4217 save = p;
4218 if (!*p)
4219 return(save);
4220 }
4221 /* NOTREACHED */
4222 }
4223
4224 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4225
4226 int
4227 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4228 {
4229 int ret, len = _len;
4230
4231 *truncated_path = 0;
4232 ret = vn_getpath(dvp, path, &len);
4233 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4234 if (leafname) {
4235 path[len-1] = '/';
4236 len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
4237 if (len > MAXPATHLEN) {
4238 char *ptr;
4239
4240 // the string got truncated!
4241 *truncated_path = 1;
4242 ptr = my_strrchr(path, '/');
4243 if (ptr) {
4244 *ptr = '\0'; // chop off the string at the last directory component
4245 }
4246 len = strlen(path) + 1;
4247 }
4248 }
4249 } else if (ret == 0) {
4250 *truncated_path = 1;
4251 } else if (ret != 0) {
4252 struct vnode *mydvp=dvp;
4253
4254 if (ret != ENOSPC) {
4255 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4256 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4257 }
4258 *truncated_path = 1;
4259
4260 do {
4261 if (mydvp->v_parent != NULL) {
4262 mydvp = mydvp->v_parent;
4263 } else if (mydvp->v_mount) {
4264 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4265 break;
4266 } else {
4267 // no parent and no mount point? only thing is to punt and say "/" changed
4268 strlcpy(path, "/", _len);
4269 len = 2;
4270 mydvp = NULL;
4271 }
4272
4273 if (mydvp == NULL) {
4274 break;
4275 }
4276
4277 len = _len;
4278 ret = vn_getpath(mydvp, path, &len);
4279 } while (ret == ENOSPC);
4280 }
4281
4282 return len;
4283 }
4284
4285
4286 /*
4287 * Make a hard file link.
4288 *
4289 * Returns: 0 Success
4290 * EPERM
4291 * EEXIST
4292 * EXDEV
4293 * namei:???
4294 * vnode_authorize:???
4295 * VNOP_LINK:???
4296 */
4297 /* ARGSUSED */
4298 static int
4299 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4300 user_addr_t link, int flag, enum uio_seg segflg)
4301 {
4302 vnode_t vp, dvp, lvp;
4303 struct nameidata nd;
4304 int follow;
4305 int error;
4306 #if CONFIG_FSE
4307 fse_info finfo;
4308 #endif
4309 int need_event, has_listeners;
4310 char *target_path = NULL;
4311 int truncated=0;
4312
4313 vp = dvp = lvp = NULLVP;
4314
4315 /* look up the object we are linking to */
4316 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4317 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4318 segflg, path, ctx);
4319
4320 error = nameiat(&nd, fd1);
4321 if (error)
4322 return (error);
4323 vp = nd.ni_vp;
4324
4325 nameidone(&nd);
4326
4327 /*
4328 * Normally, linking to directories is not supported.
4329 * However, some file systems may have limited support.
4330 */
4331 if (vp->v_type == VDIR) {
4332 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4333 error = EPERM; /* POSIX */
4334 goto out;
4335 }
4336
4337 /* Linking to a directory requires ownership. */
4338 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4339 struct vnode_attr dva;
4340
4341 VATTR_INIT(&dva);
4342 VATTR_WANTED(&dva, va_uid);
4343 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4344 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4345 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4346 error = EACCES;
4347 goto out;
4348 }
4349 }
4350 }
4351
4352 /* lookup the target node */
4353 #if CONFIG_TRIGGERS
4354 nd.ni_op = OP_LINK;
4355 #endif
4356 nd.ni_cnd.cn_nameiop = CREATE;
4357 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4358 nd.ni_dirp = link;
4359 error = nameiat(&nd, fd2);
4360 if (error != 0)
4361 goto out;
4362 dvp = nd.ni_dvp;
4363 lvp = nd.ni_vp;
4364
4365 #if CONFIG_MACF
4366 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4367 goto out2;
4368 #endif
4369
4370 /* or to anything that kauth doesn't want us to (e.g. immutable items) */
4371 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4372 goto out2;
4373
4374 /* target node must not exist */
4375 if (lvp != NULLVP) {
4376 error = EEXIST;
4377 goto out2;
4378 }
4379 /* cannot link across mountpoints */
4380 if (vnode_mount(vp) != vnode_mount(dvp)) {
4381 error = EXDEV;
4382 goto out2;
4383 }
4384
4385 /* authorize creation of the target node */
4386 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4387 goto out2;
4388
4389 /* and finally make the link */
4390 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4391 if (error)
4392 goto out2;
4393
4394 #if CONFIG_MACF
4395 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4396 #endif
4397
4398 #if CONFIG_FSE
4399 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4400 #else
4401 need_event = 0;
4402 #endif
4403 has_listeners = kauth_authorize_fileop_has_listeners();
4404
4405 if (need_event || has_listeners) {
4406 char *link_to_path = NULL;
4407 int len, link_name_len;
4408
4409 /* build the path to the new link file */
4410 GET_PATH(target_path);
4411 if (target_path == NULL) {
4412 error = ENOMEM;
4413 goto out2;
4414 }
4415
4416 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4417
4418 if (has_listeners) {
4419 /* build the path to file we are linking to */
4420 GET_PATH(link_to_path);
4421 if (link_to_path == NULL) {
4422 error = ENOMEM;
4423 goto out2;
4424 }
4425
4426 link_name_len = MAXPATHLEN;
4427 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4428 /*
4429 * Call out to allow 3rd party notification of the link creation.
4430 * Ignore result of kauth_authorize_fileop call.
4431 */
4432 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4433 (uintptr_t)link_to_path,
4434 (uintptr_t)target_path);
4435 }
4436 if (link_to_path != NULL) {
4437 RELEASE_PATH(link_to_path);
4438 }
4439 }
4440 #if CONFIG_FSE
4441 if (need_event) {
4442 /* construct fsevent */
4443 if (get_fse_info(vp, &finfo, ctx) == 0) {
4444 if (truncated) {
4445 finfo.mode |= FSE_TRUNCATED_PATH;
4446 }
4447
4448 // build the path to the destination of the link
4449 add_fsevent(FSE_CREATE_FILE, ctx,
4450 FSE_ARG_STRING, len, target_path,
4451 FSE_ARG_FINFO, &finfo,
4452 FSE_ARG_DONE);
4453 }
4454 if (vp->v_parent) {
4455 add_fsevent(FSE_STAT_CHANGED, ctx,
4456 FSE_ARG_VNODE, vp->v_parent,
4457 FSE_ARG_DONE);
4458 }
4459 }
4460 #endif
4461 }
4462 out2:
4463 /*
4464 * nameidone has to happen before we vnode_put(dvp)
4465 * since it may need to release the fs_nodelock on the dvp
4466 */
4467 nameidone(&nd);
4468 if (target_path != NULL) {
4469 RELEASE_PATH(target_path);
4470 }
4471 out:
4472 if (lvp)
4473 vnode_put(lvp);
4474 if (dvp)
4475 vnode_put(dvp);
4476 vnode_put(vp);
4477 return (error);
4478 }
4479
4480 int
4481 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4482 {
4483 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4484 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4485 }
4486
4487 int
4488 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4489 {
4490 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4491 return (EINVAL);
4492
4493 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4494 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4495 }
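/*
 * Illustrative userspace sketch (not part of this source file), using
 * hypothetical names "old" and "new": linkat() resolves "old" relative
 * to fd1 (following a trailing symlink only when AT_SYMLINK_FOLLOW is
 * set) and creates "new" relative to fd2; EXDEV is returned when the
 * two paths land on different mounts, as enforced above.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (linkat(AT_FDCWD, "old", AT_FDCWD, "new", AT_SYMLINK_FOLLOW) == -1)
 *		perror("linkat");
 */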
4496
4497 /*
4498 * Make a symbolic link.
4499 *
4500 * We could add support for ACLs here too...
4501 */
4502 /* ARGSUSED */
4503 static int
4504 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4505 user_addr_t link, enum uio_seg segflg)
4506 {
4507 struct vnode_attr va;
4508 char *path;
4509 int error;
4510 struct nameidata nd;
4511 vnode_t vp, dvp;
4512 size_t dummy=0;
4513 proc_t p;
4514
4515 error = 0;
4516 if (UIO_SEG_IS_USER_SPACE(segflg)) {
4517 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4518 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4519 } else {
4520 path = (char *)path_data;
4521 }
4522 if (error)
4523 goto out;
4524 AUDIT_ARG(text, path); /* This is the link string */
4525
4526 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4527 segflg, link, ctx);
4528
4529 error = nameiat(&nd, fd);
4530 if (error)
4531 goto out;
4532 dvp = nd.ni_dvp;
4533 vp = nd.ni_vp;
4534
4535 p = vfs_context_proc(ctx);
4536 VATTR_INIT(&va);
4537 VATTR_SET(&va, va_type, VLNK);
4538 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4539
4540 #if CONFIG_MACF
4541 error = mac_vnode_check_create(ctx,
4542 dvp, &nd.ni_cnd, &va);
4543 #endif
4544 if (error != 0) {
4545 goto skipit;
4546 }
4547
4548 if (vp != NULL) {
4549 error = EEXIST;
4550 goto skipit;
4551 }
4552
4553 /* authorize */
4554 if (error == 0)
4555 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4556 /* get default ownership, etc. */
4557 if (error == 0)
4558 error = vnode_authattr_new(dvp, &va, 0, ctx);
4559 if (error == 0)
4560 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4561
4562 #if CONFIG_MACF
4563 if (error == 0 && vp)
4564 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4565 #endif
4566
4567 /* do fallback attribute handling */
4568 if (error == 0 && vp)
4569 error = vnode_setattr_fallback(vp, &va, ctx);
4570
4571 if (error == 0) {
4572 int update_flags = 0;
4573
4574 /* check if a new vnode was created, else try to get one */
4575 if (vp == NULL) {
4576 nd.ni_cnd.cn_nameiop = LOOKUP;
4577 #if CONFIG_TRIGGERS
4578 nd.ni_op = OP_LOOKUP;
4579 #endif
4580 nd.ni_cnd.cn_flags = 0;
4581 error = nameiat(&nd, fd);
4582 vp = nd.ni_vp;
4583
4584 if (vp == NULL)
4585 goto skipit;
4586 }
4587
4588 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4589 /* call out to allow 3rd party notification of the symlink creation.
4590 * Ignore result of kauth_authorize_fileop call.
4591 */
4592 if (kauth_authorize_fileop_has_listeners() &&
4593 namei(&nd) == 0) {
4594 char *new_link_path = NULL;
4595 int len;
4596
4597 /* build the path to the new link file */
4598 new_link_path = get_pathbuff();
4599 len = MAXPATHLEN;
4600 vn_getpath(dvp, new_link_path, &len);
4601 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4602 new_link_path[len - 1] = '/';
4603 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4604 }
4605
4606 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4607 (uintptr_t)path, (uintptr_t)new_link_path);
4608 if (new_link_path != NULL)
4609 release_pathbuff(new_link_path);
4610 }
4611 #endif
4612 // Make sure the name & parent pointers are hooked up
4613 if (vp->v_name == NULL)
4614 update_flags |= VNODE_UPDATE_NAME;
4615 if (vp->v_parent == NULLVP)
4616 update_flags |= VNODE_UPDATE_PARENT;
4617
4618 if (update_flags)
4619 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4620
4621 #if CONFIG_FSE
4622 add_fsevent(FSE_CREATE_FILE, ctx,
4623 FSE_ARG_VNODE, vp,
4624 FSE_ARG_DONE);
4625 #endif
4626 }
4627
4628 skipit:
4629 /*
4630 * nameidone has to happen before we vnode_put(dvp)
4631 * since it may need to release the fs_nodelock on the dvp
4632 */
4633 nameidone(&nd);
4634
4635 if (vp)
4636 vnode_put(vp);
4637 vnode_put(dvp);
4638 out:
4639 if (path && (path != (char *)path_data))
4640 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4641
4642 return (error);
4643 }
4644
4645 int
4646 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4647 {
4648 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4649 uap->link, UIO_USERSPACE));
4650 }
4651
4652 int
4653 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4654 __unused int32_t *retval)
4655 {
4656 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4657 uap->path2, UIO_USERSPACE));
4658 }
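/*
 * Illustrative userspace sketch (not part of this source file): for
 * symlinkat() the first argument is the link contents (copied in above
 * with copyinstr()), and the second path, resolved relative to fd, is
 * where the link gets created. Names are hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (symlinkat("../target/file", AT_FDCWD, "alias") == -1)
 *		perror("symlinkat");
 */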
4659
4660 /*
4661 * Delete a whiteout from the filesystem.
4662 * No longer supported.
4663 */
4664 int
4665 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4666 {
4667 return (ENOTSUP);
4668 }
4669
4670 /*
4671 * Delete a name from the filesystem.
4672 */
4673 /* ARGSUSED */
4674 static int
4675 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4676 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4677 {
4678 struct nameidata nd;
4679 vnode_t vp, dvp;
4680 int error;
4681 struct componentname *cnp;
4682 char *path = NULL;
4683 int len=0;
4684 #if CONFIG_FSE
4685 fse_info finfo;
4686 struct vnode_attr va;
4687 #endif
4688 int flags;
4689 int need_event;
4690 int has_listeners;
4691 int truncated_path;
4692 int batched;
4693 struct vnode_attr *vap;
4694 int do_retry;
4695 int retry_count = 0;
4696 int cn_flags;
4697
4698 cn_flags = LOCKPARENT;
4699 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4700 cn_flags |= AUDITVNPATH1;
4701 /* If a starting dvp is passed, it trumps any fd passed. */
4702 if (start_dvp)
4703 cn_flags |= USEDVP;
4704
4705 #if NAMEDRSRCFORK
4706 /* unlink or delete is allowed on rsrc forks and named streams */
4707 cn_flags |= CN_ALLOWRSRCFORK;
4708 #endif
4709
4710 retry:
4711 do_retry = 0;
4712 flags = 0;
4713 need_event = 0;
4714 has_listeners = 0;
4715 truncated_path = 0;
4716 vap = NULL;
4717
4718 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4719
4720 nd.ni_dvp = start_dvp;
4721 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4722 cnp = &nd.ni_cnd;
4723
4724 continue_lookup:
4725 error = nameiat(&nd, fd);
4726 if (error)
4727 return (error);
4728
4729 dvp = nd.ni_dvp;
4730 vp = nd.ni_vp;
4731
4732
4733 /* With Carbon delete semantics, busy files cannot be deleted */
4734 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4735 flags |= VNODE_REMOVE_NODELETEBUSY;
4736 }
4737
4738 /* Skip any potential upcalls if told to. */
4739 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4740 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4741 }
4742
4743 if (vp) {
4744 batched = vnode_compound_remove_available(vp);
4745 /*
4746 * The root of a mounted filesystem cannot be deleted.
4747 */
4748 if (vp->v_flag & VROOT) {
4749 error = EBUSY;
4750 }
4751
4752 #if DEVELOPMENT || DEBUG
4753 /*
4754 * XXX VSWAP: Check for entitlements or special flag here
4755 * so we can restrict access appropriately.
4756 */
4757 #else /* DEVELOPMENT || DEBUG */
4758
4759 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
4760 error = EPERM;
4761 goto out;
4762 }
4763 #endif /* DEVELOPMENT || DEBUG */
4764
4765 if (!batched) {
4766 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4767 if (error) {
4768 if (error == ENOENT) {
4769 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4770 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4771 do_retry = 1;
4772 retry_count++;
4773 }
4774 }
4775 goto out;
4776 }
4777 }
4778 } else {
4779 batched = 1;
4780
4781 if (!vnode_compound_remove_available(dvp)) {
4782 panic("No vp, but no compound remove?");
4783 }
4784 }
4785
4786 #if CONFIG_FSE
4787 need_event = need_fsevent(FSE_DELETE, dvp);
4788 if (need_event) {
4789 if (!batched) {
4790 if ((vp->v_flag & VISHARDLINK) == 0) {
4791 /* XXX need to get these data in batched VNOP */
4792 get_fse_info(vp, &finfo, ctx);
4793 }
4794 } else {
4795 error = vfs_get_notify_attributes(&va);
4796 if (error) {
4797 goto out;
4798 }
4799
4800 vap = &va;
4801 }
4802 }
4803 #endif
4804 has_listeners = kauth_authorize_fileop_has_listeners();
4805 if (need_event || has_listeners) {
4806 if (path == NULL) {
4807 GET_PATH(path);
4808 if (path == NULL) {
4809 error = ENOMEM;
4810 goto out;
4811 }
4812 }
4813 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4814 }
4815
4816 #if NAMEDRSRCFORK
4817 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4818 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4819 else
4820 #endif
4821 {
4822 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4823 vp = nd.ni_vp;
4824 if (error == EKEEPLOOKING) {
4825 if (!batched) {
4826 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4827 }
4828
4829 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4830 panic("EKEEPLOOKING, but continue flag not set?");
4831 }
4832
4833 if (vnode_isdir(vp)) {
4834 error = EISDIR;
4835 goto out;
4836 }
4837 goto continue_lookup;
4838 } else if (error == ENOENT && batched) {
4839 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4840 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4841 /*
4842 * For compound VNOPs, the authorization callback may
4843 * return ENOENT in case of racing hardlink lookups
4844 * hitting the name cache; redrive the lookup.
4845 */
4846 do_retry = 1;
4847 retry_count += 1;
4848 goto out;
4849 }
4850 }
4851 }
4852
4853 /*
4854 * Call out to allow 3rd party notification of delete.
4855 * Ignore result of kauth_authorize_fileop call.
4856 */
4857 if (!error) {
4858 if (has_listeners) {
4859 kauth_authorize_fileop(vfs_context_ucred(ctx),
4860 KAUTH_FILEOP_DELETE,
4861 (uintptr_t)vp,
4862 (uintptr_t)path);
4863 }
4864
4865 if (vp->v_flag & VISHARDLINK) {
4866 //
4867 // if a hardlink gets deleted we want to blow away the
4868 // v_parent link because the path that got us to this
4869 // instance of the link is no longer valid. this will
4870 // force the next call to get the path to ask the file
4871 // system instead of just following the v_parent link.
4872 //
4873 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4874 }
4875
4876 #if CONFIG_FSE
4877 if (need_event) {
4878 if (vp->v_flag & VISHARDLINK) {
4879 get_fse_info(vp, &finfo, ctx);
4880 } else if (vap) {
4881 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4882 }
4883 if (truncated_path) {
4884 finfo.mode |= FSE_TRUNCATED_PATH;
4885 }
4886 add_fsevent(FSE_DELETE, ctx,
4887 FSE_ARG_STRING, len, path,
4888 FSE_ARG_FINFO, &finfo,
4889 FSE_ARG_DONE);
4890 }
4891 #endif
4892 }
4893
4894 out:
4895 if (path != NULL)
4896 RELEASE_PATH(path);
4897
4898 #if NAMEDRSRCFORK
4899 /* recycle the deleted rsrc fork vnode to force a reclaim, which
4900 * will cause its shadow file to go away if necessary.
4901 */
4902 if (vp && (vnode_isnamedstream(vp)) &&
4903 (vp->v_parent != NULLVP) &&
4904 vnode_isshadow(vp)) {
4905 vnode_recycle(vp);
4906 }
4907 #endif
4908 /*
4909 * nameidone has to happen before we vnode_put(dvp)
4910 * since it may need to release the fs_nodelock on the dvp
4911 */
4912 nameidone(&nd);
4913 vnode_put(dvp);
4914 if (vp) {
4915 vnode_put(vp);
4916 }
4917
4918 if (do_retry) {
4919 goto retry;
4920 }
4921
4922 return (error);
4923 }
4924
4925 int
4926 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4927 enum uio_seg segflg, int unlink_flags)
4928 {
4929 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4930 unlink_flags));
4931 }
4932
4933 /*
4934 * Delete a name from the filesystem using Carbon semantics.
4935 */
4936 int
4937 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4938 {
4939 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4940 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4941 }
4942
4943 /*
4944 * Delete a name from the filesystem using POSIX semantics.
4945 */
4946 int
4947 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4948 {
4949 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4950 uap->path, UIO_USERSPACE, 0));
4951 }
4952
4953 int
4954 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4955 {
4956 if (uap->flag & ~AT_REMOVEDIR)
4957 return (EINVAL);
4958
4959 if (uap->flag & AT_REMOVEDIR)
4960 return (rmdirat_internal(vfs_context_current(), uap->fd,
4961 uap->path, UIO_USERSPACE));
4962 else
4963 return (unlinkat_internal(vfs_context_current(), uap->fd,
4964 NULLVP, uap->path, UIO_USERSPACE, 0));
4965 }
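/*
 * Illustrative userspace sketch (not part of this source file):
 * unlinkat() with AT_REMOVEDIR takes the rmdirat_internal() path above,
 * otherwise unlinkat_internal(). Directory and names are hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int dfd = open("/tmp/workdir", O_RDONLY | O_DIRECTORY);
 *	if (dfd != -1) {
 *		(void)unlinkat(dfd, "scratch.txt", 0);		// remove a file
 *		(void)unlinkat(dfd, "subdir", AT_REMOVEDIR);	// remove an empty directory
 *		close(dfd);
 *	}
 */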
4966
4967 /*
4968 * Reposition read/write file offset.
4969 */
4970 int
4971 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4972 {
4973 struct fileproc *fp;
4974 vnode_t vp;
4975 struct vfs_context *ctx;
4976 off_t offset = uap->offset, file_size;
4977 int error;
4978
4979 if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4980 if (error == ENOTSUP)
4981 return (ESPIPE);
4982 return (error);
4983 }
4984 if (vnode_isfifo(vp)) {
4985 file_drop(uap->fd);
4986 return(ESPIPE);
4987 }
4988
4989
4990 ctx = vfs_context_current();
4991 #if CONFIG_MACF
4992 if (uap->whence == L_INCR && uap->offset == 0)
4993 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4994 fp->f_fglob);
4995 else
4996 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4997 fp->f_fglob);
4998 if (error) {
4999 file_drop(uap->fd);
5000 return (error);
5001 }
5002 #endif
5003 if ( (error = vnode_getwithref(vp)) ) {
5004 file_drop(uap->fd);
5005 return(error);
5006 }
5007
5008 switch (uap->whence) {
5009 case L_INCR:
5010 offset += fp->f_fglob->fg_offset;
5011 break;
5012 case L_XTND:
5013 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
5014 break;
5015 offset += file_size;
5016 break;
5017 case L_SET:
5018 break;
5019 case SEEK_HOLE:
5020 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5021 break;
5022 case SEEK_DATA:
5023 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5024 break;
5025 default:
5026 error = EINVAL;
5027 }
5028 if (error == 0) {
5029 if (uap->offset > 0 && offset < 0) {
5030 /* Incremented/relative move past max size */
5031 error = EOVERFLOW;
5032 } else {
5033 /*
5034 * Allow negative offsets on character devices, per
5035 * POSIX 1003.1-2001. Most likely for writing disk
5036 * labels.
5037 */
5038 if (offset < 0 && vp->v_type != VCHR) {
5039 /* Decremented/relative move before start */
5040 error = EINVAL;
5041 } else {
5042 /* Success */
5043 fp->f_fglob->fg_offset = offset;
5044 *retval = fp->f_fglob->fg_offset;
5045 }
5046 }
5047 }
5048
5049 /*
5050 * An lseek can affect whether data is "available to read." Use
5051 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5052 */
5053 post_event_if_success(vp, error, NOTE_NONE);
5054 (void)vnode_put(vp);
5055 file_drop(uap->fd);
5056 return (error);
5057 }
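/*
 * Illustrative userspace sketch (not part of this source file): besides
 * the classic whence values, the switch above forwards SEEK_HOLE and
 * SEEK_DATA to the filesystem via FSIOC_FIOSEEKHOLE/FSIOC_FIOSEEKDATA,
 * so sparse regions can be located without reading the file. The file
 * name is hypothetical and the constants are assumed to be exposed by
 * the SDK in use.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/tmp/sparse.dat", O_RDONLY);
 *	off_t data = lseek(fd, 0, SEEK_DATA);		// first data byte at/after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE);	// end of that data run
 */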
5058
5059
5060 /*
5061 * Check access permissions.
5062 *
5063 * Returns: 0 Success
5064 * vnode_authorize:???
5065 */
5066 static int
5067 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5068 {
5069 kauth_action_t action;
5070 int error;
5071
5072 /*
5073 * If just the regular access bits, convert them to something
5074 * that vnode_authorize will understand.
5075 */
5076 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5077 action = 0;
5078 if (uflags & R_OK)
5079 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5080 if (uflags & W_OK) {
5081 if (vnode_isdir(vp)) {
5082 action |= KAUTH_VNODE_ADD_FILE |
5083 KAUTH_VNODE_ADD_SUBDIRECTORY;
5084 /* might want delete rights here too */
5085 } else {
5086 action |= KAUTH_VNODE_WRITE_DATA;
5087 }
5088 }
5089 if (uflags & X_OK) {
5090 if (vnode_isdir(vp)) {
5091 action |= KAUTH_VNODE_SEARCH;
5092 } else {
5093 action |= KAUTH_VNODE_EXECUTE;
5094 }
5095 }
5096 } else {
5097 /* take advantage of definition of uflags */
5098 action = uflags >> 8;
5099 }
5100
5101 #if CONFIG_MACF
5102 error = mac_vnode_check_access(ctx, vp, uflags);
5103 if (error)
5104 return (error);
5105 #endif /* MAC */
5106
5107 /* action == 0 means only check for existence */
5108 if (action != 0) {
5109 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5110 } else {
5111 error = 0;
5112 }
5113
5114 return(error);
5115 }
5116
5117
5118
5119 /*
5120 * access_extended: Check access permissions in bulk.
5121 *
5122 * Description: uap->entries Pointer to an array of accessx
5123 * descriptor structs, plus one or
5124 * more NULL terminated strings (see
5125 * "Notes" section below).
5126 * uap->size Size of the area pointed to by
5127 * uap->entries.
5128 * uap->results Pointer to the results array.
5129 *
5130 * Returns: 0 Success
5131 * ENOMEM Insufficient memory
5132 * EINVAL Invalid arguments
5133 * namei:EFAULT Bad address
5134 * namei:ENAMETOOLONG Filename too long
5135 * namei:ENOENT No such file or directory
5136 * namei:ELOOP Too many levels of symbolic links
5137 * namei:EBADF Bad file descriptor
5138 * namei:ENOTDIR Not a directory
5139 * namei:???
5140 * access1:
5141 *
5142 * Implicit returns:
5143 * uap->results Array contents modified
5144 *
5145 * Notes: The uap->entries are structured as an arbitrary length array
5146 * of accessx descriptors, followed by one or more NULL terminated
5147 * strings
5148 *
5149 * struct accessx_descriptor[0]
5150 * ...
5151 * struct accessx_descriptor[n]
5152 * char name_data[0];
5153 *
5154 * We determine the entry count by walking the buffer containing
5155 * the uap->entries argument descriptor. For each descriptor we
5156 * see, the valid values for the offset ad_name_offset will be
5157 * in the byte range:
5158 *
5159 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5160 * to
5161 * [ uap->entries + uap->size - 2 ]
5162 *
5163 * since we must have at least one string, and the string must
5164 * be at least one character plus the NULL terminator in length.
5165 *
5166 * XXX: Need to support the check-as uid argument
5167 */
5168 int
5169 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5170 {
5171 struct accessx_descriptor *input = NULL;
5172 errno_t *result = NULL;
5173 errno_t error = 0;
5174 int wantdelete = 0;
5175 unsigned int desc_max, desc_actual, i, j;
5176 struct vfs_context context;
5177 struct nameidata nd;
5178 int niopts;
5179 vnode_t vp = NULL;
5180 vnode_t dvp = NULL;
5181 #define ACCESSX_MAX_DESCR_ON_STACK 10
5182 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5183
5184 context.vc_ucred = NULL;
5185
5186 /*
5187 * Validate parameters; if valid, copy the descriptor array and string
5188 * arguments into local memory. Before proceeding, the following
5189 * conditions must have been met:
5190 *
5191 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5192 * o There must be sufficient room in the request for at least one
5193 * descriptor and a one byte NUL terminated string.
5194 * o The allocation of local storage must not fail.
5195 */
5196 if (uap->size > ACCESSX_MAX_TABLESIZE)
5197 return(ENOMEM);
5198 if (uap->size < (sizeof(struct accessx_descriptor) + 2))
5199 return(EINVAL);
5200 if (uap->size <= sizeof (stack_input)) {
5201 input = stack_input;
5202 } else {
5203 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5204 if (input == NULL) {
5205 error = ENOMEM;
5206 goto out;
5207 }
5208 }
5209 error = copyin(uap->entries, input, uap->size);
5210 if (error)
5211 goto out;
5212
5213 AUDIT_ARG(opaque, input, uap->size);
5214
5215 /*
5216 * Force NUL termination of the copyin buffer to avoid namei() running
5217 * off the end. If the caller passes us bogus data, they may get a
5218 * bogus result.
5219 */
5220 ((char *)input)[uap->size - 1] = 0;
5221
5222 /*
5223 * Access is defined as checking against the process' real identity,
5224 * even if operations are checking the effective identity. This
5225 * requires that we use a local vfs context.
5226 */
5227 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5228 context.vc_thread = current_thread();
5229
5230 /*
5231 * Find out how many entries we have, so we can allocate the result
5232 * array by walking the list and adjusting the count downward by the
5233 * earliest string offset we see.
5234 */
5235 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5236 desc_actual = desc_max;
5237 for (i = 0; i < desc_actual; i++) {
5238 /*
5239 * Take the offset to the name string for this entry and
5240 * convert to an input array index, which would be one off
5241 * the end of the array if this entry was the lowest-addressed
5242 * name string.
5243 */
5244 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5245
5246 /*
5247 * An offset greater than the max allowable offset is an error.
5248 * It is also an error for any valid entry to point
5249 * to a location prior to the end of the current entry, if
5250 * it's not a reference to the string of the previous entry.
5251 */
5252 if (j > desc_max || (j != 0 && j <= i)) {
5253 error = EINVAL;
5254 goto out;
5255 }
5256
5257 /* Also do not let ad_name_offset point to something beyond the size of the input */
5258 if (input[i].ad_name_offset >= uap->size) {
5259 error = EINVAL;
5260 goto out;
5261 }
5262
5263 /*
5264 * An offset of 0 means use the previous descriptor's offset;
5265 * this is used to chain multiple requests for the same file
5266 * to avoid multiple lookups.
5267 */
5268 if (j == 0) {
5269 /* This is not valid for the first entry */
5270 if (i == 0) {
5271 error = EINVAL;
5272 goto out;
5273 }
5274 continue;
5275 }
5276
5277 /*
5278 * If the offset of the string for this descriptor is before
5279 * what we believe is the current actual last descriptor,
5280 * then we need to adjust our estimate downward; this permits
5281 * the string table following the last descriptor to be out
5282 * of order relative to the descriptor list.
5283 */
5284 if (j < desc_actual)
5285 desc_actual = j;
5286 }
5287
5288 /*
5289 * We limit the actual number of descriptors we are willing to process
5290 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5291 * requested exceeds this limit, the request fails with ENOMEM.
5292 */
5293 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5294 error = ENOMEM;
5295 goto out;
5296 }
5297 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
5298 if (result == NULL) {
5299 error = ENOMEM;
5300 goto out;
5301 }
5302
5303 /*
5304 * Do the work by iterating over the descriptor entries we know to
5305 * at least appear to contain valid data.
5306 */
5307 error = 0;
5308 for (i = 0; i < desc_actual; i++) {
5309 /*
5310 * If the ad_name_offset is 0, then we use the previous
5311 * results to make the check; otherwise, we are looking up
5312 * a new file name.
5313 */
5314 if (input[i].ad_name_offset != 0) {
5315 /* discard old vnodes */
5316 if (vp) {
5317 vnode_put(vp);
5318 vp = NULL;
5319 }
5320 if (dvp) {
5321 vnode_put(dvp);
5322 dvp = NULL;
5323 }
5324
5325 /*
5326 * Scan forward in the descriptor list to see if we
5327 * need the parent vnode. We will need it if we are
5328 * deleting, since we must have rights to remove
5329 * entries in the parent directory, as well as the
5330 * rights to delete the object itself.
5331 */
5332 wantdelete = input[i].ad_flags & _DELETE_OK;
5333 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5334 if (input[j].ad_flags & _DELETE_OK)
5335 wantdelete = 1;
5336
5337 niopts = FOLLOW | AUDITVNPATH1;
5338
5339 /* need parent for vnode_authorize for deletion test */
5340 if (wantdelete)
5341 niopts |= WANTPARENT;
5342
5343 /* do the lookup */
5344 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5345 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5346 &context);
5347 error = namei(&nd);
5348 if (!error) {
5349 vp = nd.ni_vp;
5350 if (wantdelete)
5351 dvp = nd.ni_dvp;
5352 }
5353 nameidone(&nd);
5354 }
5355
5356 /*
5357 * Handle lookup errors.
5358 */
5359 switch(error) {
5360 case ENOENT:
5361 case EACCES:
5362 case EPERM:
5363 case ENOTDIR:
5364 result[i] = error;
5365 break;
5366 case 0:
5367 /* run this access check */
5368 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5369 break;
5370 default:
5371 /* fatal lookup error */
5372
5373 goto out;
5374 }
5375 }
5376
5377 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5378
5379 /* copy out results */
5380 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5381
5382 out:
5383 if (input && input != stack_input)
5384 FREE(input, M_TEMP);
5385 if (result)
5386 FREE(result, M_TEMP);
5387 if (vp)
5388 vnode_put(vp);
5389 if (dvp)
5390 vnode_put(dvp);
5391 if (IS_VALID_CRED(context.vc_ucred))
5392 kauth_cred_unref(&context.vc_ucred);
5393 return(error);
5394 }
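/*
 * Illustrative userspace sketch (not part of this source file): the
 * entries buffer is a packed array of accessx_descriptor structs
 * followed by the NUL-terminated strings they reference through
 * ad_name_offset, exactly as laid out in the block comment above. The
 * accessx_np() wrapper, the _READ_OK/_WRITE_OK flag names and the path
 * are assumptions here, not taken from this file.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	struct {
 *		struct accessx_descriptor d[2];
 *		char names[32];
 *	} req;
 *	int results[2];
 *
 *	memset(&req, 0, sizeof(req));
 *	req.d[0].ad_name_offset = sizeof(req.d);	// string follows the descriptors
 *	req.d[0].ad_flags = _READ_OK;
 *	req.d[1].ad_name_offset = 0;			// 0 = reuse the previous entry's name
 *	req.d[1].ad_flags = _WRITE_OK;
 *	strlcpy(req.names, "/tmp/example", sizeof(req.names));
 *
 *	accessx_np(&req.d[0], sizeof(req.d) + strlen(req.names) + 1,
 *	    results, -1);
 */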
5395
5396
5397 /*
5398 * Returns: 0 Success
5399 * namei:EFAULT Bad address
5400 * namei:ENAMETOOLONG Filename too long
5401 * namei:ENOENT No such file or directory
5402 * namei:ELOOP Too many levels of symbolic links
5403 * namei:EBADF Bad file descriptor
5404 * namei:ENOTDIR Not a directory
5405 * namei:???
5406 * access1:
5407 */
5408 static int
5409 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5410 int flag, enum uio_seg segflg)
5411 {
5412 int error;
5413 struct nameidata nd;
5414 int niopts;
5415 struct vfs_context context;
5416 #if NAMEDRSRCFORK
5417 int is_namedstream = 0;
5418 #endif
5419
5420 /*
5421 * Unless the AT_EACCESS option is used, access is defined as checking
5422 * against the process' real identity, even if operations are checking
5423 * the effective identity. So we need to tweak the credential
5424 * in the context for that case.
5425 */
5426 if (!(flag & AT_EACCESS))
5427 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5428 else
5429 context.vc_ucred = ctx->vc_ucred;
5430 context.vc_thread = ctx->vc_thread;
5431
5432
5433 niopts = FOLLOW | AUDITVNPATH1;
5434 /* need parent for vnode_authorize for deletion test */
5435 if (amode & _DELETE_OK)
5436 niopts |= WANTPARENT;
5437 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5438 path, &context);
5439
5440 #if NAMEDRSRCFORK
5441 /* access(F_OK) calls are allowed for resource forks. */
5442 if (amode == F_OK)
5443 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5444 #endif
5445 error = nameiat(&nd, fd);
5446 if (error)
5447 goto out;
5448
5449 #if NAMEDRSRCFORK
5450 /* Grab reference on the shadow stream file vnode to
5451 * force an inactive on release which will mark it
5452 * for recycle.
5453 */
5454 if (vnode_isnamedstream(nd.ni_vp) &&
5455 (nd.ni_vp->v_parent != NULLVP) &&
5456 vnode_isshadow(nd.ni_vp)) {
5457 is_namedstream = 1;
5458 vnode_ref(nd.ni_vp);
5459 }
5460 #endif
5461
5462 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5463
5464 #if NAMEDRSRCFORK
5465 if (is_namedstream) {
5466 vnode_rele(nd.ni_vp);
5467 }
5468 #endif
5469
5470 vnode_put(nd.ni_vp);
5471 if (amode & _DELETE_OK)
5472 vnode_put(nd.ni_dvp);
5473 nameidone(&nd);
5474
5475 out:
5476 if (!(flag & AT_EACCESS))
5477 kauth_cred_unref(&context.vc_ucred);
5478 return (error);
5479 }
5480
5481 int
5482 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5483 {
5484 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5485 uap->path, uap->flags, 0, UIO_USERSPACE));
5486 }
5487
5488 int
5489 faccessat(__unused proc_t p, struct faccessat_args *uap,
5490 __unused int32_t *retval)
5491 {
5492 if (uap->flag & ~AT_EACCESS)
5493 return (EINVAL);
5494
5495 return (faccessat_internal(vfs_context_current(), uap->fd,
5496 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5497 }
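/*
 * Illustrative userspace sketch (not part of this source file): without
 * AT_EACCESS the check runs against the real uid/gid (the credential
 * swap above); with it, against the effective identity. The path is
 * hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	if (faccessat(AT_FDCWD, "/etc/example.conf", R_OK | W_OK, AT_EACCESS) == -1)
 *		perror("faccessat");
 */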
5498
5499 /*
5500 * Returns: 0 Success
5501 * EFAULT
5502 * copyout:EFAULT
5503 * namei:???
5504 * vn_stat:???
5505 */
5506 static int
5507 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5508 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5509 enum uio_seg segflg, int fd, int flag)
5510 {
5511 struct nameidata nd;
5512 int follow;
5513 union {
5514 struct stat sb;
5515 struct stat64 sb64;
5516 } source = {};
5517 union {
5518 struct user64_stat user64_sb;
5519 struct user32_stat user32_sb;
5520 struct user64_stat64 user64_sb64;
5521 struct user32_stat64 user32_sb64;
5522 } dest = {};
5523 caddr_t sbp;
5524 int error, my_size;
5525 kauth_filesec_t fsec;
5526 size_t xsecurity_bufsize;
5527 void * statptr;
5528
5529 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5530 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5531 segflg, path, ctx);
5532
5533 #if NAMEDRSRCFORK
5534 int is_namedstream = 0;
5535 /* stat calls are allowed for resource forks. */
5536 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5537 #endif
5538 error = nameiat(&nd, fd);
5539 if (error)
5540 return (error);
5541 fsec = KAUTH_FILESEC_NONE;
5542
5543 statptr = (void *)&source;
5544
5545 #if NAMEDRSRCFORK
5546 /* Grab reference on the shadow stream file vnode to
5547 * force an inactive on release which will mark it
5548 * for recycle.
5549 */
5550 if (vnode_isnamedstream(nd.ni_vp) &&
5551 (nd.ni_vp->v_parent != NULLVP) &&
5552 vnode_isshadow(nd.ni_vp)) {
5553 is_namedstream = 1;
5554 vnode_ref(nd.ni_vp);
5555 }
5556 #endif
5557
5558 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5559
5560 #if NAMEDRSRCFORK
5561 if (is_namedstream) {
5562 vnode_rele(nd.ni_vp);
5563 }
5564 #endif
5565 vnode_put(nd.ni_vp);
5566 nameidone(&nd);
5567
5568 if (error)
5569 return (error);
5570 /* Zap spare fields */
5571 if (isstat64 != 0) {
5572 source.sb64.st_lspare = 0;
5573 source.sb64.st_qspare[0] = 0LL;
5574 source.sb64.st_qspare[1] = 0LL;
5575 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5576 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5577 my_size = sizeof(dest.user64_sb64);
5578 sbp = (caddr_t)&dest.user64_sb64;
5579 } else {
5580 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5581 my_size = sizeof(dest.user32_sb64);
5582 sbp = (caddr_t)&dest.user32_sb64;
5583 }
5584 /*
5585 * Check if we raced (post lookup) against the last unlink of a file.
5586 */
5587 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5588 source.sb64.st_nlink = 1;
5589 }
5590 } else {
5591 source.sb.st_lspare = 0;
5592 source.sb.st_qspare[0] = 0LL;
5593 source.sb.st_qspare[1] = 0LL;
5594 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5595 munge_user64_stat(&source.sb, &dest.user64_sb);
5596 my_size = sizeof(dest.user64_sb);
5597 sbp = (caddr_t)&dest.user64_sb;
5598 } else {
5599 munge_user32_stat(&source.sb, &dest.user32_sb);
5600 my_size = sizeof(dest.user32_sb);
5601 sbp = (caddr_t)&dest.user32_sb;
5602 }
5603
5604 /*
5605 * Check if we raced (post lookup) against the last unlink of a file.
5606 */
5607 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5608 source.sb.st_nlink = 1;
5609 }
5610 }
5611 if ((error = copyout(sbp, ub, my_size)) != 0)
5612 goto out;
5613
5614 /* caller wants extended security information? */
5615 if (xsecurity != USER_ADDR_NULL) {
5616
5617 /* did we get any? */
5618 if (fsec == KAUTH_FILESEC_NONE) {
5619 if (susize(xsecurity_size, 0) != 0) {
5620 error = EFAULT;
5621 goto out;
5622 }
5623 } else {
5624 /* find the user buffer size */
5625 xsecurity_bufsize = fusize(xsecurity_size);
5626
5627 /* copy out the actual data size */
5628 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5629 error = EFAULT;
5630 goto out;
5631 }
5632
5633 /* if the caller supplied enough room, copy out to it */
5634 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5635 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5636 }
5637 }
5638 out:
5639 if (fsec != KAUTH_FILESEC_NONE)
5640 kauth_filesec_free(fsec);
5641 return (error);
5642 }
5643
5644 /*
5645 * stat_extended: Get file status; with extended security (ACL).
5646 *
5647 * Parameters: p (ignored)
5648 * uap User argument descriptor (see below)
5649 * retval (ignored)
5650 *
5651 * Indirect: uap->path Path of file to get status from
5652 * uap->ub User buffer (holds file status info)
5653 * uap->xsecurity ACL to get (extended security)
5654 * uap->xsecurity_size Size of ACL
5655 *
5656 * Returns: 0 Success
5657 * !0 errno value
5658 *
5659 */
5660 int
5661 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5662 __unused int32_t *retval)
5663 {
5664 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5665 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5666 0));
5667 }
5668
5669 /*
5670 * Returns: 0 Success
5671 * fstatat_internal:??? [see fstatat_internal() in this file]
5672 */
5673 int
5674 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5675 {
5676 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5677 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5678 }
5679
5680 int
5681 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5682 {
5683 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5684 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5685 }
5686
5687 /*
5688 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5689 *
5690 * Parameters: p (ignored)
5691 * uap User argument descriptor (see below)
5692 * retval (ignored)
5693 *
5694 * Indirect: uap->path Path of file to get status from
5695 * uap->ub User buffer (holds file status info)
5696 * uap->xsecurity ACL to get (extended security)
5697 * uap->xsecurity_size Size of ACL
5698 *
5699 * Returns: 0 Success
5700 * !0 errno value
5701 *
5702 */
5703 int
5704 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5705 {
5706 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5707 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5708 0));
5709 }
5710
5711 /*
5712 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5713 *
5714 * Parameters: p (ignored)
5715 * uap User argument descriptor (see below)
5716 * retval (ignored)
5717 *
5718 * Indirect: uap->path Path of file to get status from
5719 * uap->ub User buffer (holds file status info)
5720 * uap->xsecurity ACL to get (extended security)
5721 * uap->xsecurity_size Size of ACL
5722 *
5723 * Returns: 0 Success
5724 * !0 errno value
5725 *
5726 */
5727 int
5728 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5729 {
5730 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5731 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5732 AT_SYMLINK_NOFOLLOW));
5733 }
5734
5735 /*
5736 * Get file status; this version does not follow links.
5737 */
5738 int
5739 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5740 {
5741 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5742 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5743 }
5744
5745 int
5746 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5747 {
5748 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5749 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5750 }
5751
5752 /*
5753 * lstat64_extended: Get file status; can handle large inode numbers; does not
5754 * follow links; with extended security (ACL).
5755 *
5756 * Parameters: p (ignored)
5757 * uap User argument descriptor (see below)
5758 * retval (ignored)
5759 *
5760 * Indirect: uap->path Path of file to get status from
5761 * uap->ub User buffer (holds file status info)
5762 * uap->xsecurity ACL to get (extended security)
5763 * uap->xsecurity_size Size of ACL
5764 *
5765 * Returns: 0 Success
5766 * !0 errno value
5767 *
5768 */
5769 int
5770 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5771 {
5772 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5773 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5774 AT_SYMLINK_NOFOLLOW));
5775 }
5776
5777 int
5778 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5779 {
5780 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5781 return (EINVAL);
5782
5783 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5784 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5785 }
5786
5787 int
5788 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5789 __unused int32_t *retval)
5790 {
5791 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5792 return (EINVAL);
5793
5794 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5795 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5796 }
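/*
 * Illustrative userspace sketch (not part of this source file): the
 * whole stat family above funnels into fstatat_internal(), with
 * AT_SYMLINK_NOFOLLOW selecting lstat semantics. The name is
 * hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *
 *	struct stat st;
 *	if (fstatat(AT_FDCWD, "alias", &st, AT_SYMLINK_NOFOLLOW) == 0 &&
 *	    S_ISLNK(st.st_mode)) {
 *		// the path itself is a symlink
 *	}
 */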
5797
5798 /*
5799 * Get configurable pathname variables.
5800 *
5801 * Returns: 0 Success
5802 * namei:???
5803 * vn_pathconf:???
5804 *
5805 * Notes: Global implementation constants are intended to be
5806 * implemented in this function directly; all other constants
5807 * are per-FS implementation, and therefore must be handled in
5808 * each respective FS, instead.
5809 *
5810 * XXX We implement some things globally right now that should actually be
5811 * XXX per-FS; we will need to deal with this at some point.
5812 */
5813 /* ARGSUSED */
5814 int
5815 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5816 {
5817 int error;
5818 struct nameidata nd;
5819 vfs_context_t ctx = vfs_context_current();
5820
5821 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5822 UIO_USERSPACE, uap->path, ctx);
5823 error = namei(&nd);
5824 if (error)
5825 return (error);
5826
5827 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5828
5829 vnode_put(nd.ni_vp);
5830 nameidone(&nd);
5831 return (error);
5832 }
5833
5834 /*
5835 * Return target name of a symbolic link.
5836 */
5837 /* ARGSUSED */
5838 static int
5839 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5840 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5841 int *retval)
5842 {
5843 vnode_t vp;
5844 uio_t auio;
5845 int error;
5846 struct nameidata nd;
5847 char uio_buf[ UIO_SIZEOF(1) ];
5848
5849 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5850 seg, path, ctx);
5851
5852 error = nameiat(&nd, fd);
5853 if (error)
5854 return (error);
5855 vp = nd.ni_vp;
5856
5857 nameidone(&nd);
5858
5859 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5860 &uio_buf[0], sizeof(uio_buf));
5861 uio_addiov(auio, buf, bufsize);
5862 if (vp->v_type != VLNK) {
5863 error = EINVAL;
5864 } else {
5865 #if CONFIG_MACF
5866 error = mac_vnode_check_readlink(ctx, vp);
5867 #endif
5868 if (error == 0)
5869 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5870 ctx);
5871 if (error == 0)
5872 error = VNOP_READLINK(vp, auio, ctx);
5873 }
5874 vnode_put(vp);
5875
5876 *retval = bufsize - (int)uio_resid(auio);
5877 return (error);
5878 }
5879
5880 int
5881 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5882 {
5883 enum uio_seg procseg;
5884
5885 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5886 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5887 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5888 uap->count, procseg, retval));
5889 }
5890
5891 int
5892 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5893 {
5894 enum uio_seg procseg;
5895
5896 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5897 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5898 procseg, uap->buf, uap->bufsize, procseg, retval));
5899 }
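/*
 * Illustrative userspace sketch (not part of this source file): the
 * retval set above is the byte count placed in the buffer; readlink(2)
 * does not NUL-terminate, so the caller does. The name is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <sys/param.h>
 *	#include <unistd.h>
 *
 *	char buf[MAXPATHLEN];
 *	ssize_t n = readlinkat(AT_FDCWD, "alias", buf, sizeof(buf) - 1);
 *	if (n >= 0)
 *		buf[n] = '\0';
 */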
5900
5901 /*
5902 * Change file flags.
5903 *
5904 * NOTE: this will vnode_put() `vp'
5905 */
5906 static int
5907 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5908 {
5909 struct vnode_attr va;
5910 kauth_action_t action;
5911 int error;
5912
5913 VATTR_INIT(&va);
5914 VATTR_SET(&va, va_flags, flags);
5915
5916 #if CONFIG_MACF
5917 error = mac_vnode_check_setflags(ctx, vp, flags);
5918 if (error)
5919 goto out;
5920 #endif
5921
5922 /* request authorisation, disregard immutability */
5923 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5924 goto out;
5925 /*
5926 * Request that the auth layer disregard those file flags it's allowed to when
5927 * authorizing this operation; we need to do this in order to be able to
5928 * clear immutable flags.
5929 */
5930 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5931 goto out;
5932 error = vnode_setattr(vp, &va, ctx);
5933
5934 #if CONFIG_MACF
5935 if (error == 0)
5936 mac_vnode_notify_setflags(ctx, vp, flags);
5937 #endif
5938
5939 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5940 error = ENOTSUP;
5941 }
5942 out:
5943 vnode_put(vp);
5944 return(error);
5945 }
5946
5947 /*
5948 * Change flags of a file given a path name.
5949 */
5950 /* ARGSUSED */
5951 int
5952 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5953 {
5954 vnode_t vp;
5955 vfs_context_t ctx = vfs_context_current();
5956 int error;
5957 struct nameidata nd;
5958
5959 AUDIT_ARG(fflags, uap->flags);
5960 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5961 UIO_USERSPACE, uap->path, ctx);
5962 error = namei(&nd);
5963 if (error)
5964 return (error);
5965 vp = nd.ni_vp;
5966 nameidone(&nd);
5967
5968 /* we don't vnode_put() here because chflags1 does internally */
5969 error = chflags1(vp, uap->flags, ctx);
5970
5971 return(error);
5972 }
5973
5974 /*
5975 * Change flags of a file given a file descriptor.
5976 */
5977 /* ARGSUSED */
5978 int
5979 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5980 {
5981 vnode_t vp;
5982 int error;
5983
5984 AUDIT_ARG(fd, uap->fd);
5985 AUDIT_ARG(fflags, uap->flags);
5986 if ( (error = file_vnode(uap->fd, &vp)) )
5987 return (error);
5988
5989 if ((error = vnode_getwithref(vp))) {
5990 file_drop(uap->fd);
5991 return(error);
5992 }
5993
5994 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5995
5996 /* we don't vnode_put() here because chflags1 does internally */
5997 error = chflags1(vp, uap->flags, vfs_context_current());
5998
5999 file_drop(uap->fd);
6000 return (error);
6001 }
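/*
 * Illustrative userspace sketch (not part of this source file):
 * chflags()/fchflags() land in chflags1() above, which asks the auth
 * layer to disregard immutability so immutable flags can be cleared as
 * well as set. The path is hypothetical.
 *
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	(void)chflags("/tmp/example", UF_IMMUTABLE);	// set the user immutable flag
 *	(void)chflags("/tmp/example", 0);		// clear it again
 */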
6002
6003 /*
6004 * Change security information on a filesystem object.
6005 *
6006 * Returns: 0 Success
6007 * EPERM Operation not permitted
6008 * vnode_authattr:??? [anything vnode_authattr can return]
6009 * vnode_authorize:??? [anything vnode_authorize can return]
6010 * vnode_setattr:??? [anything vnode_setattr can return]
6011 *
6012 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6013 * translated to EPERM before being returned.
6014 */
6015 static int
6016 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6017 {
6018 kauth_action_t action;
6019 int error;
6020
6021 AUDIT_ARG(mode, vap->va_mode);
6022 /* XXX audit new args */
6023
6024 #if NAMEDSTREAMS
6025 /* chmod calls are not allowed for resource forks. */
6026 if (vp->v_flag & VISNAMEDSTREAM) {
6027 return (EPERM);
6028 }
6029 #endif
6030
6031 #if CONFIG_MACF
6032 if (VATTR_IS_ACTIVE(vap, va_mode) &&
6033 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
6034 return (error);
6035
6036 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6037 if ((error = mac_vnode_check_setowner(ctx, vp,
6038 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6039 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
6040 return (error);
6041 }
6042
6043 if (VATTR_IS_ACTIVE(vap, va_acl) &&
6044 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
6045 return (error);
6046 #endif
6047
6048 /* make sure that the caller is allowed to set this security information */
6049 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6050 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6051 if (error == EACCES)
6052 error = EPERM;
6053 return(error);
6054 }
6055
6056 if ((error = vnode_setattr(vp, vap, ctx)) != 0)
6057 return (error);
6058
6059 #if CONFIG_MACF
6060 if (VATTR_IS_ACTIVE(vap, va_mode))
6061 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6062
6063 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
6064 mac_vnode_notify_setowner(ctx, vp,
6065 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6066 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6067
6068 if (VATTR_IS_ACTIVE(vap, va_acl))
6069 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6070 #endif
6071
6072 return (error);
6073 }
6074
6075
6076 /*
6077 * Change mode of a file given a path name.
6078 *
6079 * Returns: 0 Success
6080 * namei:??? [anything namei can return]
6081 * chmod_vnode:??? [anything chmod_vnode can return]
6082 */
6083 static int
6084 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6085 int fd, int flag, enum uio_seg segflg)
6086 {
6087 struct nameidata nd;
6088 int follow, error;
6089
6090 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6091 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6092 segflg, path, ctx);
6093 if ((error = nameiat(&nd, fd)))
6094 return (error);
6095 error = chmod_vnode(ctx, nd.ni_vp, vap);
6096 vnode_put(nd.ni_vp);
6097 nameidone(&nd);
6098 return(error);
6099 }
6100
6101 /*
6102 * chmod_extended: Change the mode of a file given a path name; with extended
6103 * argument list (including extended security (ACL)).
6104 *
6105 * Parameters: p Process requesting the mode change
6106 * uap User argument descriptor (see below)
6107 * retval (ignored)
6108 *
6109 * Indirect: uap->path Path to object (same as 'chmod')
6110 * uap->uid UID to set
6111 * uap->gid GID to set
6112 * uap->mode File mode to set (same as 'chmod')
6113 * uap->xsecurity ACL to set (or delete)
6114 *
6115 * Returns: 0 Success
6116 * !0 errno value
6117 *
6118 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6119 *
6120 * XXX: We should enumerate the possible errno values here, and where
6121 * in the code they originated.
6122 */
6123 int
6124 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6125 {
6126 int error;
6127 struct vnode_attr va;
6128 kauth_filesec_t xsecdst;
6129
6130 AUDIT_ARG(owner, uap->uid, uap->gid);
6131
6132 VATTR_INIT(&va);
6133 if (uap->mode != -1)
6134 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6135 if (uap->uid != KAUTH_UID_NONE)
6136 VATTR_SET(&va, va_uid, uap->uid);
6137 if (uap->gid != KAUTH_GID_NONE)
6138 VATTR_SET(&va, va_gid, uap->gid);
6139
6140 xsecdst = NULL;
6141 switch(uap->xsecurity) {
6142 /* explicit remove request */
6143 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6144 VATTR_SET(&va, va_acl, NULL);
6145 break;
6146 /* not being set */
6147 case USER_ADDR_NULL:
6148 break;
6149 default:
6150 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6151 return(error);
6152 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6153 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6154 }
6155
6156 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6157 UIO_USERSPACE);
6158
6159 if (xsecdst != NULL)
6160 kauth_filesec_free(xsecdst);
6161 return(error);
6162 }
6163
6164 /*
6165 * Returns: 0 Success
6166 * chmodat:??? [anything chmodat can return]
6167 */
6168 static int
6169 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6170 int flag, enum uio_seg segflg)
6171 {
6172 struct vnode_attr va;
6173
6174 VATTR_INIT(&va);
6175 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6176
6177 return (chmodat(ctx, path, &va, fd, flag, segflg));
6178 }
6179
6180 int
6181 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6182 {
6183 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6184 AT_FDCWD, 0, UIO_USERSPACE));
6185 }
6186
6187 int
6188 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6189 {
6190 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6191 return (EINVAL);
6192
6193 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6194 uap->fd, uap->flag, UIO_USERSPACE));
6195 }
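/*
 * Illustrative userspace sketch (not part of this source file): chmod()
 * and fchmodat() both come through fchmodat_internal()/chmodat() above,
 * and the mode is masked with ALLPERMS before being applied. The path
 * is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/stat.h>
 *
 *	if (fchmodat(AT_FDCWD, "/tmp/example", 0640, 0) == -1)
 *		perror("fchmodat");
 */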
6196
6197 /*
6198 * Change mode of a file given a file descriptor.
6199 */
6200 static int
6201 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6202 {
6203 vnode_t vp;
6204 int error;
6205
6206 AUDIT_ARG(fd, fd);
6207
6208 if ((error = file_vnode(fd, &vp)) != 0)
6209 return (error);
6210 if ((error = vnode_getwithref(vp)) != 0) {
6211 file_drop(fd);
6212 return(error);
6213 }
6214 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6215
6216 error = chmod_vnode(vfs_context_current(), vp, vap);
6217 (void)vnode_put(vp);
6218 file_drop(fd);
6219
6220 return (error);
6221 }
6222
6223 /*
6224 * fchmod_extended: Change mode of a file given a file descriptor; with
6225 * extended argument list (including extended security (ACL)).
6226 *
6227 * Parameters: p Process requesting to change file mode
6228 * uap User argument descriptor (see below)
6229 * retval (ignored)
6230 *
6231 * Indirect: uap->mode File mode to set (same as 'chmod')
6232 * uap->uid UID to set
6233 * uap->gid GID to set
6234 * uap->xsecurity ACL to set (or delete)
6235 * uap->fd File descriptor of file to change mode
6236 *
6237 * Returns: 0 Success
6238 * !0 errno value
6239 *
6240 */
6241 int
6242 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6243 {
6244 int error;
6245 struct vnode_attr va;
6246 kauth_filesec_t xsecdst;
6247
6248 AUDIT_ARG(owner, uap->uid, uap->gid);
6249
6250 VATTR_INIT(&va);
6251 if (uap->mode != -1)
6252 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6253 if (uap->uid != KAUTH_UID_NONE)
6254 VATTR_SET(&va, va_uid, uap->uid);
6255 if (uap->gid != KAUTH_GID_NONE)
6256 VATTR_SET(&va, va_gid, uap->gid);
6257
6258 xsecdst = NULL;
6259 switch(uap->xsecurity) {
6260 case USER_ADDR_NULL:
6261 VATTR_SET(&va, va_acl, NULL);
6262 break;
6263 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6264 VATTR_SET(&va, va_acl, NULL);
6265 break;
6266 /* not being set */
6267 case CAST_USER_ADDR_T(-1):
6268 break;
6269 default:
6270 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6271 return(error);
6272 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6273 }
6274
6275 error = fchmod1(p, uap->fd, &va);
6276
6277
6278 switch(uap->xsecurity) {
6279 case USER_ADDR_NULL:
6280 case CAST_USER_ADDR_T(-1):
6281 break;
6282 default:
6283 if (xsecdst != NULL)
6284 kauth_filesec_free(xsecdst);
6285 }
6286 return(error);
6287 }
6288
6289 int
6290 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6291 {
6292 struct vnode_attr va;
6293
6294 VATTR_INIT(&va);
6295 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6296
6297 return(fchmod1(p, uap->fd, &va));
6298 }
6299
6300
6301 /*
6302 * Set ownership given a path name.
6303 */
6304 /* ARGSUSED */
6305 static int
6306 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6307 gid_t gid, int flag, enum uio_seg segflg)
6308 {
6309 vnode_t vp;
6310 struct vnode_attr va;
6311 int error;
6312 struct nameidata nd;
6313 int follow;
6314 kauth_action_t action;
6315
6316 AUDIT_ARG(owner, uid, gid);
6317
6318 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6319 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6320 path, ctx);
6321 error = nameiat(&nd, fd);
6322 if (error)
6323 return (error);
6324 vp = nd.ni_vp;
6325
6326 nameidone(&nd);
6327
6328 VATTR_INIT(&va);
6329 if (uid != (uid_t)VNOVAL)
6330 VATTR_SET(&va, va_uid, uid);
6331 if (gid != (gid_t)VNOVAL)
6332 VATTR_SET(&va, va_gid, gid);
6333
6334 #if CONFIG_MACF
6335 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6336 if (error)
6337 goto out;
6338 #endif
6339
6340 /* preflight and authorize attribute changes */
6341 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6342 goto out;
6343 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6344 goto out;
6345 error = vnode_setattr(vp, &va, ctx);
6346
6347 #if CONFIG_MACF
6348 if (error == 0)
6349 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6350 #endif
6351
6352 out:
6353 /*
6354 * EACCES is only allowed from namei(); permissions failure should
6355 * return EPERM, so we need to translate the error code.
6356 */
6357 if (error == EACCES)
6358 error = EPERM;
6359
6360 vnode_put(vp);
6361 return (error);
6362 }
6363
6364 int
6365 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6366 {
6367 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6368 uap->uid, uap->gid, 0, UIO_USERSPACE));
6369 }
6370
6371 int
6372 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6373 {
6374 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6375 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6376 }
6377
6378 int
6379 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6380 {
6381 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6382 return (EINVAL);
6383
6384 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6385 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6386 }
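/*
 * Illustrative userspace sketch (values hypothetical): the two calls below
 * are equivalent ways of changing ownership of a symlink itself rather than
 * its target, exercising the AT_SYMLINK_NOFOLLOW path above; any other bit
 * in 'flag' is rejected with EINVAL.
 *
 *	#include <unistd.h>	// lchown(), fchownat()
 *	#include <fcntl.h>	// AT_FDCWD, AT_SYMLINK_NOFOLLOW
 *
 *	(void)lchown("/tmp/mylink", 501, 20);
 *	(void)fchownat(AT_FDCWD, "/tmp/mylink", 501, 20, AT_SYMLINK_NOFOLLOW);
 */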
6387
6388 /*
6389 * Set ownership given a file descriptor.
6390 */
6391 /* ARGSUSED */
6392 int
6393 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6394 {
6395 struct vnode_attr va;
6396 vfs_context_t ctx = vfs_context_current();
6397 vnode_t vp;
6398 int error;
6399 kauth_action_t action;
6400
6401 AUDIT_ARG(owner, uap->uid, uap->gid);
6402 AUDIT_ARG(fd, uap->fd);
6403
6404 if ( (error = file_vnode(uap->fd, &vp)) )
6405 return (error);
6406
6407 if ( (error = vnode_getwithref(vp)) ) {
6408 file_drop(uap->fd);
6409 return(error);
6410 }
6411 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6412
6413 VATTR_INIT(&va);
6414 if (uap->uid != VNOVAL)
6415 VATTR_SET(&va, va_uid, uap->uid);
6416 if (uap->gid != VNOVAL)
6417 VATTR_SET(&va, va_gid, uap->gid);
6418
6419 #if NAMEDSTREAMS
6420 /* chown calls are not allowed for resource forks. */
6421 if (vp->v_flag & VISNAMEDSTREAM) {
6422 error = EPERM;
6423 goto out;
6424 }
6425 #endif
6426
6427 #if CONFIG_MACF
6428 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6429 if (error)
6430 goto out;
6431 #endif
6432
6433 /* preflight and authorize attribute changes */
6434 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6435 goto out;
6436 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6437 if (error == EACCES)
6438 error = EPERM;
6439 goto out;
6440 }
6441 error = vnode_setattr(vp, &va, ctx);
6442
6443 #if CONFIG_MACF
6444 if (error == 0)
6445 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
6446 #endif
6447
6448 out:
6449 (void)vnode_put(vp);
6450 file_drop(uap->fd);
6451 return (error);
6452 }
6453
6454 static int
6455 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6456 {
6457 int error;
6458
6459 if (usrtvp == USER_ADDR_NULL) {
6460 struct timeval old_tv;
6461 /* XXX Y2038 bug because of microtime argument */
6462 microtime(&old_tv);
6463 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6464 tsp[1] = tsp[0];
6465 } else {
6466 if (IS_64BIT_PROCESS(current_proc())) {
6467 struct user64_timeval tv[2];
6468 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6469 if (error)
6470 return (error);
6471 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6472 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6473 } else {
6474 struct user32_timeval tv[2];
6475 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6476 if (error)
6477 return (error);
6478 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6479 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6480 }
6481 }
6482 return 0;
6483 }
6484
6485 static int
6486 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6487 int nullflag)
6488 {
6489 int error;
6490 struct vnode_attr va;
6491 kauth_action_t action;
6492
6493 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6494
6495 VATTR_INIT(&va);
6496 VATTR_SET(&va, va_access_time, ts[0]);
6497 VATTR_SET(&va, va_modify_time, ts[1]);
6498 if (nullflag)
6499 va.va_vaflags |= VA_UTIMES_NULL;
6500
6501 #if NAMEDSTREAMS
6502 /* utimes calls are not allowed for resource forks. */
6503 if (vp->v_flag & VISNAMEDSTREAM) {
6504 error = EPERM;
6505 goto out;
6506 }
6507 #endif
6508
6509 #if CONFIG_MACF
6510 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6511 if (error)
6512 goto out;
6513 #endif
6514 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6515 if (!nullflag && error == EACCES)
6516 error = EPERM;
6517 goto out;
6518 }
6519
6520 /* since we may not need to auth anything, check here */
6521 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6522 if (!nullflag && error == EACCES)
6523 error = EPERM;
6524 goto out;
6525 }
6526 error = vnode_setattr(vp, &va, ctx);
6527
6528 #if CONFIG_MACF
6529 if (error == 0)
6530 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
6531 #endif
6532
6533 out:
6534 return error;
6535 }
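/*
 * Illustrative userspace sketch: a NULL times pointer to utimes(2) takes
 * the nullflag/VA_UTIMES_NULL path in setutimes() above (both timestamps
 * set to "now", permitted with write access alone), while an explicit
 * array requires ownership or equivalent privilege.
 *
 *	#include <sys/time.h>	// utimes(), gettimeofday()
 *
 *	struct timeval tv[2];
 *	gettimeofday(&tv[0], NULL);		// new access time
 *	tv[1] = tv[0];				// new modification time
 *	(void)utimes("/tmp/file", tv);		// explicit times
 *	(void)utimes("/tmp/file", NULL);	// "now"; VA_UTIMES_NULL
 */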
6536
6537 /*
6538 * Set the access and modification times of a file.
6539 */
6540 /* ARGSUSED */
6541 int
6542 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6543 {
6544 struct timespec ts[2];
6545 user_addr_t usrtvp;
6546 int error;
6547 struct nameidata nd;
6548 vfs_context_t ctx = vfs_context_current();
6549
6550 /*
6551 * AUDIT: Needed to change the order of operations to do the
6552 * name lookup first because auditing wants the path.
6553 */
6554 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6555 UIO_USERSPACE, uap->path, ctx);
6556 error = namei(&nd);
6557 if (error)
6558 return (error);
6559 nameidone(&nd);
6560
6561 /*
6562 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
6563 * the current time instead.
6564 */
6565 usrtvp = uap->tptr;
6566 if ((error = getutimes(usrtvp, ts)) != 0)
6567 goto out;
6568
6569 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6570
6571 out:
6572 vnode_put(nd.ni_vp);
6573 return (error);
6574 }
6575
6576 /*
6577 * Set the access and modification times of a file.
6578 */
6579 /* ARGSUSED */
6580 int
6581 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6582 {
6583 struct timespec ts[2];
6584 vnode_t vp;
6585 user_addr_t usrtvp;
6586 int error;
6587
6588 AUDIT_ARG(fd, uap->fd);
6589 usrtvp = uap->tptr;
6590 if ((error = getutimes(usrtvp, ts)) != 0)
6591 return (error);
6592 if ((error = file_vnode(uap->fd, &vp)) != 0)
6593 return (error);
6594 if((error = vnode_getwithref(vp))) {
6595 file_drop(uap->fd);
6596 return(error);
6597 }
6598
6599 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6600 vnode_put(vp);
6601 file_drop(uap->fd);
6602 return(error);
6603 }
6604
6605 /*
6606 * Truncate a file given its path name.
6607 */
6608 /* ARGSUSED */
6609 int
6610 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6611 {
6612 vnode_t vp;
6613 struct vnode_attr va;
6614 vfs_context_t ctx = vfs_context_current();
6615 int error;
6616 struct nameidata nd;
6617 kauth_action_t action;
6618
6619 if (uap->length < 0)
6620 return(EINVAL);
6621 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6622 UIO_USERSPACE, uap->path, ctx);
6623 if ((error = namei(&nd)))
6624 return (error);
6625 vp = nd.ni_vp;
6626
6627 nameidone(&nd);
6628
6629 VATTR_INIT(&va);
6630 VATTR_SET(&va, va_data_size, uap->length);
6631
6632 #if CONFIG_MACF
6633 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6634 if (error)
6635 goto out;
6636 #endif
6637
6638 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6639 goto out;
6640 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6641 goto out;
6642 error = vnode_setattr(vp, &va, ctx);
6643
6644 #if CONFIG_MACF
6645 if (error == 0)
6646 mac_vnode_notify_truncate(ctx, NOCRED, vp);
6647 #endif
6648
6649 out:
6650 vnode_put(vp);
6651 return (error);
6652 }
6653
6654 /*
6655 * Truncate a file given a file descriptor.
6656 */
6657 /* ARGSUSED */
6658 int
6659 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6660 {
6661 vfs_context_t ctx = vfs_context_current();
6662 struct vnode_attr va;
6663 vnode_t vp;
6664 struct fileproc *fp;
6665 int error ;
6666 int fd = uap->fd;
6667
6668 AUDIT_ARG(fd, uap->fd);
6669 if (uap->length < 0)
6670 return(EINVAL);
6671
6672 if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6673 return(error);
6674 }
6675
6676 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6677 case DTYPE_PSXSHM:
6678 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6679 goto out;
6680 case DTYPE_VNODE:
6681 break;
6682 default:
6683 error = EINVAL;
6684 goto out;
6685 }
6686
6687 vp = (vnode_t)fp->f_fglob->fg_data;
6688
6689 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6690 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6691 error = EINVAL;
6692 goto out;
6693 }
6694
6695 if ((error = vnode_getwithref(vp)) != 0) {
6696 goto out;
6697 }
6698
6699 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6700
6701 #if CONFIG_MACF
6702 error = mac_vnode_check_truncate(ctx,
6703 fp->f_fglob->fg_cred, vp);
6704 if (error) {
6705 (void)vnode_put(vp);
6706 goto out;
6707 }
6708 #endif
6709 VATTR_INIT(&va);
6710 VATTR_SET(&va, va_data_size, uap->length);
6711 error = vnode_setattr(vp, &va, ctx);
6712
6713 #if CONFIG_MACF
6714 if (error == 0)
6715 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
6716 #endif
6717
6718 (void)vnode_put(vp);
6719 out:
6720 file_drop(fd);
6721 return (error);
6722 }
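/*
 * Illustrative userspace sketch: ftruncate(2) is also how a POSIX shared
 * memory object is sized, which is why the DTYPE_PSXSHM case above is
 * routed to pshm_truncate() instead of vnode_setattr(). The object name
 * below is hypothetical.
 *
 *	#include <sys/mman.h>	// shm_open()
 *	#include <fcntl.h>	// O_RDWR, O_CREAT
 *	#include <unistd.h>	// ftruncate()
 *
 *	int fd = shm_open("/myshm", O_RDWR | O_CREAT, 0600);
 *	if (fd >= 0 && ftruncate(fd, 4096) == 0) {
 *		// the object is now 4096 bytes and can be mmap()ed
 *	}
 */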
6723
6724
6725 /*
6726 * Sync an open file with synchronized I/O _file_ integrity completion
6727 */
6728 /* ARGSUSED */
6729 int
6730 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6731 {
6732 __pthread_testcancel(1);
6733 return(fsync_common(p, uap, MNT_WAIT));
6734 }
6735
6736
6737 /*
6738 * Sync an open file with synchronized I/O _file_ integrity completion
6739 *
6740 * Notes: This is a legacy support function that does not test for
6741 * thread cancellation points.
6742 */
6743 /* ARGSUSED */
6744 int
6745 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6746 {
6747 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6748 }
6749
6750
6751 /*
6752 * Sync an open file with synchronized I/O _data_ integrity completion
6753 */
6754 /* ARGSUSED */
6755 int
6756 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6757 {
6758 __pthread_testcancel(1);
6759 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6760 }
6761
6762
6763 /*
6764 * fsync_common
6765 *
6766 * Common fsync code to support both synchronized I/O file integrity completion
6767 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6768 *
6769 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6770 * will only guarantee that the file data contents are retrievable. If
6771 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
6772 * additionally requires that metadata unnecessary for retrieving the file
6773 * data contents, such as atime, mtime, ctime, etc., be committed to stable
6774 * storage.
6775 *
6776 * Parameters: p The process
6777 * uap->fd The descriptor to synchronize
6778 * flags The data integrity flags
6779 *
6780 * Returns: int Success
6781 * fp_getfvp:EBADF Bad file descriptor
6782 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6783 * VNOP_FSYNC:??? unspecified
6784 *
6785 * Notes: We use struct fsync_args because it is a short name, and all
6786 * caller argument structures are otherwise identical.
6787 */
6788 static int
6789 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6790 {
6791 vnode_t vp;
6792 struct fileproc *fp;
6793 vfs_context_t ctx = vfs_context_current();
6794 int error;
6795
6796 AUDIT_ARG(fd, uap->fd);
6797
6798 if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6799 return (error);
6800 if ( (error = vnode_getwithref(vp)) ) {
6801 file_drop(uap->fd);
6802 return(error);
6803 }
6804
6805 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6806
6807 error = VNOP_FSYNC(vp, flags, ctx);
6808
6809 #if NAMEDRSRCFORK
6810 /* Sync resource fork shadow file if necessary. */
6811 if ((error == 0) &&
6812 (vp->v_flag & VISNAMEDSTREAM) &&
6813 (vp->v_parent != NULLVP) &&
6814 vnode_isshadow(vp) &&
6815 (fp->f_flags & FP_WRITTEN)) {
6816 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6817 }
6818 #endif
6819
6820 (void)vnode_put(vp);
6821 file_drop(uap->fd);
6822 return (error);
6823 }
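/*
 * Illustrative userspace sketch: both calls below land in fsync_common();
 * fsync(2) maps to MNT_WAIT (file integrity, metadata included) and
 * fdatasync(2) maps to MNT_DWAIT (data integrity only), per the comment
 * above.
 *
 *	#include <unistd.h>	// fsync(), fdatasync()
 *
 *	void
 *	flush_file(int fd, int data_only)
 *	{
 *		if (data_only)
 *			(void)fdatasync(fd);	// file data retrievable
 *		else
 *			(void)fsync(fd);	// data plus metadata (mtime, ...)
 *	}
 */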
6824
6825 /*
6826 * Duplicate files. Source must be a file, target must be a file or
6827 * must not exist.
6828 *
6829 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6830 * perform inheritance correctly.
6831 */
6832 /* ARGSUSED */
6833 int
6834 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6835 {
6836 vnode_t tvp, fvp, tdvp, sdvp;
6837 struct nameidata fromnd, tond;
6838 int error;
6839 vfs_context_t ctx = vfs_context_current();
6840 #if CONFIG_MACF
6841 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
6842 struct vnode_attr va;
6843 #endif
6844
6845 /* Check that the flags are valid. */
6846
6847 if (uap->flags & ~CPF_MASK) {
6848 return(EINVAL);
6849 }
6850
6851 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6852 UIO_USERSPACE, uap->from, ctx);
6853 if ((error = namei(&fromnd)))
6854 return (error);
6855 fvp = fromnd.ni_vp;
6856
6857 NDINIT(&tond, CREATE, OP_LINK,
6858 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6859 UIO_USERSPACE, uap->to, ctx);
6860 if ((error = namei(&tond))) {
6861 goto out1;
6862 }
6863 tdvp = tond.ni_dvp;
6864 tvp = tond.ni_vp;
6865
6866 if (tvp != NULL) {
6867 if (!(uap->flags & CPF_OVERWRITE)) {
6868 error = EEXIST;
6869 goto out;
6870 }
6871 }
6872
6873 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6874 error = EISDIR;
6875 goto out;
6876 }
6877
6878 /* This calls existing MAC hooks for open */
6879 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
6880 NULL))) {
6881 goto out;
6882 }
6883
6884 if (tvp) {
6885 /*
6886 * See unlinkat_internal for an explanation of the potential
6887 * ENOENT from the MAC hook but the gist is that the MAC hook
6888 * can fail because vn_getpath isn't able to return the full
6889 * path. We choose to ignore this failure.
6890 */
6891 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
6892 if (error && error != ENOENT)
6893 goto out;
6894 error = 0;
6895 }
6896
6897 #if CONFIG_MACF
6898 VATTR_INIT(&va);
6899 VATTR_SET(&va, va_type, fvp->v_type);
6900 /* Mask off all but regular access permissions */
6901 VATTR_SET(&va, va_mode,
6902 ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
6903 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
6904 if (error)
6905 goto out;
6906 #endif /* CONFIG_MACF */
6907
6908 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6909 goto out;
6910
6911 if (fvp == tdvp)
6912 error = EINVAL;
6913 /*
6914 * If source is the same as the destination (that is the
6915 * same inode number) then there is nothing to do.
6916 * (fixed to have POSIX semantics - CSM 3/2/98)
6917 */
6918 if (fvp == tvp)
6919 error = -1;
6920 if (!error)
6921 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6922 out:
6923 sdvp = tond.ni_startdir;
6924 /*
6925 * nameidone has to happen before we vnode_put(tdvp)
6926 * since it may need to release the fs_nodelock on the tdvp
6927 */
6928 nameidone(&tond);
6929
6930 if (tvp)
6931 vnode_put(tvp);
6932 vnode_put(tdvp);
6933 vnode_put(sdvp);
6934 out1:
6935 vnode_put(fvp);
6936
6937 nameidone(&fromnd);
6938
6939 if (error == -1)
6940 return (0);
6941 return (error);
6942 }
6943
6944 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
6945
6946 /*
6947 * Helper function for doing clones. The caller is expected to provide an
6948 * iocounted source vnode and release it.
6949 */
6950 static int
6951 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
6952 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
6953 {
6954 vnode_t tvp, tdvp;
6955 struct nameidata tond;
6956 int error;
6957 int follow;
6958 boolean_t free_src_acl;
6959 boolean_t attr_cleanup;
6960 enum vtype v_type;
6961 kauth_action_t action;
6962 struct componentname *cnp;
6963 uint32_t defaulted;
6964 struct vnode_attr va;
6965 struct vnode_attr nva;
6966 uint32_t vnop_flags;
6967
6968 v_type = vnode_vtype(fvp);
6969 switch (v_type) {
6970 case VLNK:
6971 /* FALLTHRU */
6972 case VREG:
6973 action = KAUTH_VNODE_ADD_FILE;
6974 break;
6975 case VDIR:
6976 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
6977 fvp->v_mountedhere) {
6978 return (EINVAL);
6979 }
6980 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
6981 break;
6982 default:
6983 return (EINVAL);
6984 }
6985
6986 AUDIT_ARG(fd2, dst_dirfd);
6987 AUDIT_ARG(value32, flags);
6988
6989 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6990 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
6991 UIO_USERSPACE, dst, ctx);
6992 if ((error = nameiat(&tond, dst_dirfd)))
6993 return (error);
6994 cnp = &tond.ni_cnd;
6995 tdvp = tond.ni_dvp;
6996 tvp = tond.ni_vp;
6997
6998 free_src_acl = FALSE;
6999 attr_cleanup = FALSE;
7000
7001 if (tvp != NULL) {
7002 error = EEXIST;
7003 goto out;
7004 }
7005
7006 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7007 error = EXDEV;
7008 goto out;
7009 }
7010
7011 #if CONFIG_MACF
7012 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
7013 goto out;
7014 #endif
7015 if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
7016 goto out;
7017
7018 action = KAUTH_VNODE_GENERIC_READ_BITS;
7019 if (data_read_authorised)
7020 action &= ~KAUTH_VNODE_READ_DATA;
7021 if ((error = vnode_authorize(fvp, NULL, action, ctx)))
7022 goto out;
7023
7024 /*
7025 * Certain attributes may need to be changed from the source; we ask for
7026 * those here.
7027 */
7028 VATTR_INIT(&va);
7029 VATTR_WANTED(&va, va_uid);
7030 VATTR_WANTED(&va, va_gid);
7031 VATTR_WANTED(&va, va_mode);
7032 VATTR_WANTED(&va, va_flags);
7033 VATTR_WANTED(&va, va_acl);
7034
7035 if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
7036 goto out;
7037
7038 VATTR_INIT(&nva);
7039 VATTR_SET(&nva, va_type, v_type);
7040 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7041 VATTR_SET(&nva, va_acl, va.va_acl);
7042 free_src_acl = TRUE;
7043 }
7044
7045 /* Handle ACL inheritance, initialize vap. */
7046 if (v_type == VLNK) {
7047 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7048 } else {
7049 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7050 if (error)
7051 goto out;
7052 attr_cleanup = TRUE;
7053 }
7054
7055 vnop_flags = VNODE_CLONEFILE_DEFAULT;
7056 /*
7057 * We've got initial values for all security parameters.
7058 * If we are superuser, then we can change owners to be the
7059 * same as the source. Both superuser and the owner have default
7060 * WRITE_SECURITY privileges so all other fields can be taken
7061 * from source as well.
7062 */
7063 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7064 if (VATTR_IS_SUPPORTED(&va, va_uid))
7065 VATTR_SET(&nva, va_uid, va.va_uid);
7066 if (VATTR_IS_SUPPORTED(&va, va_gid))
7067 VATTR_SET(&nva, va_gid, va.va_gid);
7068 } else {
7069 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7070 }
7071
7072 if (VATTR_IS_SUPPORTED(&va, va_mode))
7073 VATTR_SET(&nva, va_mode, va.va_mode);
7074 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7075 VATTR_SET(&nva, va_flags,
7076 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7077 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7078 }
7079
7080 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7081
7082 if (!error && tvp) {
7083 int update_flags = 0;
7084 #if CONFIG_FSE
7085 int fsevent;
7086 #endif /* CONFIG_FSE */
7087
7088 #if CONFIG_MACF
7089 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7090 VNODE_LABEL_CREATE, ctx);
7091 #endif
7092 /*
7093 * If some of the requested attributes weren't handled by the
7094 * VNOP, use our fallback code.
7095 */
7096 if (!VATTR_ALL_SUPPORTED(&va))
7097 (void)vnode_setattr_fallback(tvp, &nva, ctx);
7098
7099 // Make sure the name & parent pointers are hooked up
7100 if (tvp->v_name == NULL)
7101 update_flags |= VNODE_UPDATE_NAME;
7102 if (tvp->v_parent == NULLVP)
7103 update_flags |= VNODE_UPDATE_PARENT;
7104
7105 if (update_flags) {
7106 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7107 cnp->cn_namelen, cnp->cn_hash, update_flags);
7108 }
7109
7110 #if CONFIG_FSE
7111 switch (vnode_vtype(tvp)) {
7112 case VLNK:
7113 /* FALLTHRU */
7114 case VREG:
7115 fsevent = FSE_CREATE_FILE;
7116 break;
7117 case VDIR:
7118 fsevent = FSE_CREATE_DIR;
7119 break;
7120 default:
7121 goto out;
7122 }
7123
7124 if (need_fsevent(fsevent, tvp)) {
7125 /*
7126 * The following is a sequence of three explicit events.
7127 * A pair of FSE_CLONE events representing the source and destination
7128 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7129 * fseventsd may coalesce the destination clone and create events
7130 * into a single event resulting in the following sequence for a client
7131 * FSE_CLONE (src)
7132 * FSE_CLONE | FSE_CREATE (dst)
7133 */
7134 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7135 FSE_ARG_DONE);
7136 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7137 FSE_ARG_DONE);
7138 }
7139 #endif /* CONFIG_FSE */
7140 }
7141
7142 out:
7143 if (attr_cleanup)
7144 vn_attribute_cleanup(&nva, defaulted);
7145 if (free_src_acl && va.va_acl)
7146 kauth_acl_free(va.va_acl);
7147 nameidone(&tond);
7148 if (tvp)
7149 vnode_put(tvp);
7150 vnode_put(tdvp);
7151 return (error);
7152 }
7153
7154 /*
7155 * clone files or directories, target must not exist.
7156 */
7157 /* ARGSUSED */
7158 int
7159 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7160 __unused int32_t *retval)
7161 {
7162 vnode_t fvp;
7163 struct nameidata fromnd;
7164 int follow;
7165 int error;
7166 vfs_context_t ctx = vfs_context_current();
7167
7168 /* Check that the flags are valid. */
7169 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7170 return (EINVAL);
7171
7172 AUDIT_ARG(fd, uap->src_dirfd);
7173
7174 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7175 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7176 UIO_USERSPACE, uap->src, ctx);
7177 if ((error = nameiat(&fromnd, uap->src_dirfd)))
7178 return (error);
7179
7180 fvp = fromnd.ni_vp;
7181 nameidone(&fromnd);
7182
7183 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7184 uap->flags, ctx);
7185
7186 vnode_put(fvp);
7187 return (error);
7188 }
7189
7190 int
7191 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7192 __unused int32_t *retval)
7193 {
7194 vnode_t fvp;
7195 struct fileproc *fp;
7196 int error;
7197 vfs_context_t ctx = vfs_context_current();
7198
7199 /* Check that the flags are valid. */
7200 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7201 return (EINVAL);
7202
7203 AUDIT_ARG(fd, uap->src_fd);
7204 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7205 if (error)
7206 return (error);
7207
7208 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7209 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7210 error = EBADF;
7211 goto out;
7212 }
7213
7214 if ((error = vnode_getwithref(fvp)))
7215 goto out;
7216
7217 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7218
7219 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7220 uap->flags, ctx);
7221
7222 vnode_put(fvp);
7223 out:
7224 file_drop(uap->src_fd);
7225 return (error);
7226 }
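/*
 * Illustrative userspace sketch: clonefileat(2) (assumed to be declared in
 * <sys/clonefile.h>) drives clonefile_internal() above. The clone must stay
 * on a single volume (EXDEV otherwise) and the target must not already
 * exist (EEXIST otherwise); paths below are hypothetical.
 *
 *	#include <sys/clonefile.h>	// clonefileat(), CLONE_* flags
 *	#include <fcntl.h>		// AT_FDCWD
 *
 *	// Clone without following a symlink source and without copying the
 *	// source's ownership onto the new file.
 *	(void)clonefileat(AT_FDCWD, "src.dat", AT_FDCWD, "dst.dat",
 *	    CLONE_NOFOLLOW | CLONE_NOOWNERCOPY);
 */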
7227
7228 /*
7229 * Rename files. Source and destination must either both be directories,
7230 * or both not be directories. If target is a directory, it must be empty.
7231 */
7232 /* ARGSUSED */
7233 static int
7234 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7235 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7236 {
7237 if (flags & ~VFS_RENAME_FLAGS_MASK)
7238 return EINVAL;
7239
7240 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
7241 return EINVAL;
7242
7243 vnode_t tvp, tdvp;
7244 vnode_t fvp, fdvp;
7245 struct nameidata *fromnd, *tond;
7246 int error;
7247 int do_retry;
7248 int retry_count;
7249 int mntrename;
7250 int need_event;
7251 const char *oname = NULL;
7252 char *from_name = NULL, *to_name = NULL;
7253 int from_len=0, to_len=0;
7254 int holding_mntlock;
7255 mount_t locked_mp = NULL;
7256 vnode_t oparent = NULLVP;
7257 #if CONFIG_FSE
7258 fse_info from_finfo, to_finfo;
7259 #endif
7260 int from_truncated=0, to_truncated;
7261 int batched = 0;
7262 struct vnode_attr *fvap, *tvap;
7263 int continuing = 0;
7264 /* carving out a chunk for structs that are too big to be on stack. */
7265 struct {
7266 struct nameidata from_node, to_node;
7267 struct vnode_attr fv_attr, tv_attr;
7268 } * __rename_data;
7269 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7270 fromnd = &__rename_data->from_node;
7271 tond = &__rename_data->to_node;
7272
7273 holding_mntlock = 0;
7274 do_retry = 0;
7275 retry_count = 0;
7276 retry:
7277 fvp = tvp = NULL;
7278 fdvp = tdvp = NULL;
7279 fvap = tvap = NULL;
7280 mntrename = FALSE;
7281
7282 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7283 segflg, from, ctx);
7284 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7285
7286 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7287 segflg, to, ctx);
7288 tond->ni_flag = NAMEI_COMPOUNDRENAME;
7289
7290 continue_lookup:
7291 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7292 if ( (error = nameiat(fromnd, fromfd)) )
7293 goto out1;
7294 fdvp = fromnd->ni_dvp;
7295 fvp = fromnd->ni_vp;
7296
7297 if (fvp && fvp->v_type == VDIR)
7298 tond->ni_cnd.cn_flags |= WILLBEDIR;
7299 }
7300
7301 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7302 if ( (error = nameiat(tond, tofd)) ) {
7303 /*
7304 * Translate error code for rename("dir1", "dir2/.").
7305 */
7306 if (error == EISDIR && fvp->v_type == VDIR)
7307 error = EINVAL;
7308 goto out1;
7309 }
7310 tdvp = tond->ni_dvp;
7311 tvp = tond->ni_vp;
7312 }
7313
7314 #if DEVELOPMENT || DEBUG
7315 /*
7316 * XXX VSWAP: Check for entitlements or special flag here
7317 * so we can restrict access appropriately.
7318 */
7319 #else /* DEVELOPMENT || DEBUG */
7320
7321 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
7322 error = EPERM;
7323 goto out1;
7324 }
7325
7326 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
7327 error = EPERM;
7328 goto out1;
7329 }
7330 #endif /* DEVELOPMENT || DEBUG */
7331
7332 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
7333 error = ENOENT;
7334 goto out1;
7335 }
7336
7337 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
7338 error = EEXIST;
7339 goto out1;
7340 }
7341
7342 batched = vnode_compound_rename_available(fdvp);
7343 if (!fvp) {
7344 /*
7345 * Claim: this check will never reject a valid rename.
7346 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
7347 * Suppose fdvp and tdvp are not on the same mount.
7348 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
7349 * then you can't move it to within another dir on the same mountpoint.
7350 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
7351 *
7352 * If this check passes, then we are safe to pass these vnodes to the same FS.
7353 */
7354 if (fdvp->v_mount != tdvp->v_mount) {
7355 error = EXDEV;
7356 goto out1;
7357 }
7358 goto skipped_lookup;
7359 }
7360
7361 if (!batched) {
7362 error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL);
7363 if (error) {
7364 if (error == ENOENT) {
7365 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7366 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7367 /*
7368 * We encountered a race where after doing the namei, tvp stops
7369 * being valid. If so, simply re-drive the rename call from the
7370 * top.
7371 */
7372 do_retry = 1;
7373 retry_count += 1;
7374 }
7375 }
7376 goto out1;
7377 }
7378 }
7379
7380 /*
7381 * If the source and destination are the same (i.e. they're
7382 * links to the same vnode) and the target file system is
7383 * case sensitive, then there is nothing to do.
7384 *
7385 * XXX Come back to this.
7386 */
7387 if (fvp == tvp) {
7388 int pathconf_val;
7389
7390 /*
7391 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
7392 * then assume that this file system is case sensitive.
7393 */
7394 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
7395 pathconf_val != 0) {
7396 goto out1;
7397 }
7398 }
7399
7400 /*
7401 * Allow the renaming of mount points.
7402 * - target must not exist
7403 * - target must reside in the same directory as source
7404 * - union mounts cannot be renamed
7405 * - "/" cannot be renamed
7406 *
7407 * XXX Handle this in VFS after a continued lookup (if we missed
7408 * in the cache to start off)
7409 *
7410 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
7411 * we'll skip past here. The file system is responsible for
7412 * checking that @tvp is not a descendant of @fvp and vice versa
7413 * so it should always return EINVAL if either @tvp or @fvp is the
7414 * root of a volume.
7415 */
7416 if ((fvp->v_flag & VROOT) &&
7417 (fvp->v_type == VDIR) &&
7418 (tvp == NULL) &&
7419 (fvp->v_mountedhere == NULL) &&
7420 (fdvp == tdvp) &&
7421 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
7422 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
7423 vnode_t coveredvp;
7424
7425 /* switch fvp to the covered vnode */
7426 coveredvp = fvp->v_mount->mnt_vnodecovered;
7427 if ( (vnode_getwithref(coveredvp)) ) {
7428 error = ENOENT;
7429 goto out1;
7430 }
7431 vnode_put(fvp);
7432
7433 fvp = coveredvp;
7434 mntrename = TRUE;
7435 }
7436 /*
7437 * Check for cross-device rename.
7438 */
7439 if ((fvp->v_mount != tdvp->v_mount) ||
7440 (tvp && (fvp->v_mount != tvp->v_mount))) {
7441 error = EXDEV;
7442 goto out1;
7443 }
7444
7445 /*
7446 * If source is the same as the destination (that is the
7447 * same inode number) then there is nothing to do...
7448 * EXCEPT if the underlying file system supports case
7449 * insensitivity and is case preserving. In this case
7450 * the file system needs to handle the special case of
7451 * getting the same vnode as target (tvp) and source (fvp).
7452 *
7453 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
7454 * and _PC_CASE_PRESERVING can have this exception, and they need to
7455 * handle the special case of getting the same vnode as target and
7456 * source. NOTE: Then the target is unlocked going into vnop_rename,
7457 * so not to cause locking problems. There is a single reference on tvp.
7458 *
7459 * NOTE - that fvp == tvp also occurs if they are hard linked and
7460 * that correct behaviour then is just to return success without doing
7461 * anything.
7462 *
7463 * XXX filesystem should take care of this itself, perhaps...
7464 */
7465 if (fvp == tvp && fdvp == tdvp) {
7466 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
7467 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
7468 fromnd->ni_cnd.cn_namelen)) {
7469 goto out1;
7470 }
7471 }
7472
7473 if (holding_mntlock && fvp->v_mount != locked_mp) {
7474 /*
7475 * we're holding a reference and lock
7476 * on locked_mp, but it no longer matches
7477 * what we want to do... so drop our hold
7478 */
7479 mount_unlock_renames(locked_mp);
7480 mount_drop(locked_mp, 0);
7481 holding_mntlock = 0;
7482 }
7483 if (tdvp != fdvp && fvp->v_type == VDIR) {
7484 /*
7485 * serialize renames that re-shape
7486 * the tree... if holding_mntlock is
7487 * set, then we're ready to go...
7488 * otherwise we
7489 * first need to drop the iocounts
7490 * we picked up, second take the
7491 * lock to serialize the access,
7492 * then finally start the lookup
7493 * process over with the lock held
7494 */
7495 if (!holding_mntlock) {
7496 /*
7497 * need to grab a reference on
7498 * the mount point before we
7499 * drop all the iocounts... once
7500 * the iocounts are gone, the mount
7501 * could follow
7502 */
7503 locked_mp = fvp->v_mount;
7504 mount_ref(locked_mp, 0);
7505
7506 /*
7507 * nameidone has to happen before we vnode_put(tvp)
7508 * since it may need to release the fs_nodelock on the tvp
7509 */
7510 nameidone(tond);
7511
7512 if (tvp)
7513 vnode_put(tvp);
7514 vnode_put(tdvp);
7515
7516 /*
7517 * nameidone has to happen before we vnode_put(fdvp)
7518 * since it may need to release the fs_nodelock on the fvp
7519 */
7520 nameidone(fromnd);
7521
7522 vnode_put(fvp);
7523 vnode_put(fdvp);
7524
7525 mount_lock_renames(locked_mp);
7526 holding_mntlock = 1;
7527
7528 goto retry;
7529 }
7530 } else {
7531 /*
7532 * when we dropped the iocounts to take
7533 * the lock, we allowed the identity of
7534 * the various vnodes to change... if they did,
7535 * we may no longer be dealing with a rename
7536 * that reshapes the tree... once we're holding
7537 * the iocounts, the vnodes can't change type
7538 * so we're free to drop the lock at this point
7539 * and continue on
7540 */
7541 if (holding_mntlock) {
7542 mount_unlock_renames(locked_mp);
7543 mount_drop(locked_mp, 0);
7544 holding_mntlock = 0;
7545 }
7546 }
7547
7548 // save these off so we can later verify that fvp is the same
7549 oname = fvp->v_name;
7550 oparent = fvp->v_parent;
7551
7552 skipped_lookup:
7553 #if CONFIG_FSE
7554 need_event = need_fsevent(FSE_RENAME, fdvp);
7555 if (need_event) {
7556 if (fvp) {
7557 get_fse_info(fvp, &from_finfo, ctx);
7558 } else {
7559 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
7560 if (error) {
7561 goto out1;
7562 }
7563
7564 fvap = &__rename_data->fv_attr;
7565 }
7566
7567 if (tvp) {
7568 get_fse_info(tvp, &to_finfo, ctx);
7569 } else if (batched) {
7570 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
7571 if (error) {
7572 goto out1;
7573 }
7574
7575 tvap = &__rename_data->tv_attr;
7576 }
7577 }
7578 #else
7579 need_event = 0;
7580 #endif /* CONFIG_FSE */
7581
7582 if (need_event || kauth_authorize_fileop_has_listeners()) {
7583 if (from_name == NULL) {
7584 GET_PATH(from_name);
7585 if (from_name == NULL) {
7586 error = ENOMEM;
7587 goto out1;
7588 }
7589 }
7590
7591 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
7592
7593 if (to_name == NULL) {
7594 GET_PATH(to_name);
7595 if (to_name == NULL) {
7596 error = ENOMEM;
7597 goto out1;
7598 }
7599 }
7600
7601 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
7602 }
7603 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
7604 tdvp, &tvp, &tond->ni_cnd, tvap,
7605 flags, ctx);
7606
7607 if (holding_mntlock) {
7608 /*
7609 * we can drop our serialization
7610 * lock now
7611 */
7612 mount_unlock_renames(locked_mp);
7613 mount_drop(locked_mp, 0);
7614 holding_mntlock = 0;
7615 }
7616 if (error) {
7617 if (error == EKEEPLOOKING) {
7618 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7619 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7620 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
7621 }
7622 }
7623
7624 fromnd->ni_vp = fvp;
7625 tond->ni_vp = tvp;
7626
7627 goto continue_lookup;
7628 }
7629
7630 /*
7631 * We may encounter a race in the VNOP where the destination didn't
7632 * exist when we did the namei, but it does by the time we go and
7633 * try to create the entry. In this case, we should re-drive this rename
7634 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
7635 * but other filesystems susceptible to this race could return it, too.
7636 */
7637 if (error == ERECYCLE) {
7638 do_retry = 1;
7639 }
7640
7641 /*
7642 * For compound VNOPs, the authorization callback may return
7643 * ENOENT in case of racing hardlink lookups hitting the name
7644 * cache; redrive the lookup.
7645 */
7646 if (batched && error == ENOENT) {
7647 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7648 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7649 do_retry = 1;
7650 retry_count += 1;
7651 }
7652 }
7653
7654 goto out1;
7655 }
7656
7657 /* call out to allow 3rd party notification of rename.
7658 * Ignore result of kauth_authorize_fileop call.
7659 */
7660 kauth_authorize_fileop(vfs_context_ucred(ctx),
7661 KAUTH_FILEOP_RENAME,
7662 (uintptr_t)from_name, (uintptr_t)to_name);
7663 if (flags & VFS_RENAME_SWAP) {
7664 kauth_authorize_fileop(vfs_context_ucred(ctx),
7665 KAUTH_FILEOP_RENAME,
7666 (uintptr_t)to_name, (uintptr_t)from_name);
7667 }
7668
7669 #if CONFIG_FSE
7670 if (from_name != NULL && to_name != NULL) {
7671 if (from_truncated || to_truncated) {
7672 // set it here since only the from_finfo gets reported up to user space
7673 from_finfo.mode |= FSE_TRUNCATED_PATH;
7674 }
7675
7676 if (tvap && tvp) {
7677 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
7678 }
7679 if (fvap) {
7680 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
7681 }
7682
7683 if (tvp) {
7684 add_fsevent(FSE_RENAME, ctx,
7685 FSE_ARG_STRING, from_len, from_name,
7686 FSE_ARG_FINFO, &from_finfo,
7687 FSE_ARG_STRING, to_len, to_name,
7688 FSE_ARG_FINFO, &to_finfo,
7689 FSE_ARG_DONE);
7690 if (flags & VFS_RENAME_SWAP) {
7691 /*
7692 * Strictly speaking, swap is the equivalent of
7693 * *three* renames. FSEvents clients should only take
7694 * the events as a hint, so we only bother reporting
7695 * two.
7696 */
7697 add_fsevent(FSE_RENAME, ctx,
7698 FSE_ARG_STRING, to_len, to_name,
7699 FSE_ARG_FINFO, &to_finfo,
7700 FSE_ARG_STRING, from_len, from_name,
7701 FSE_ARG_FINFO, &from_finfo,
7702 FSE_ARG_DONE);
7703 }
7704 } else {
7705 add_fsevent(FSE_RENAME, ctx,
7706 FSE_ARG_STRING, from_len, from_name,
7707 FSE_ARG_FINFO, &from_finfo,
7708 FSE_ARG_STRING, to_len, to_name,
7709 FSE_ARG_DONE);
7710 }
7711 }
7712 #endif /* CONFIG_FSE */
7713
7714 /*
7715 * update filesystem's mount point data
7716 */
7717 if (mntrename) {
7718 char *cp, *pathend, *mpname;
7719 char * tobuf;
7720 struct mount *mp;
7721 int maxlen;
7722 size_t len = 0;
7723
7724 mp = fvp->v_mountedhere;
7725
7726 if (vfs_busy(mp, LK_NOWAIT)) {
7727 error = EBUSY;
7728 goto out1;
7729 }
7730 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7731
7732 if (UIO_SEG_IS_USER_SPACE(segflg))
7733 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7734 else
7735 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7736 if (!error) {
7737 /* find current mount point prefix */
7738 pathend = &mp->mnt_vfsstat.f_mntonname[0];
7739 for (cp = pathend; *cp != '\0'; ++cp) {
7740 if (*cp == '/')
7741 pathend = cp + 1;
7742 }
7743 /* find last component of target name */
7744 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7745 if (*cp == '/')
7746 mpname = cp + 1;
7747 }
7748 /* append name to prefix */
7749 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7750 bzero(pathend, maxlen);
7751 strlcpy(pathend, mpname, maxlen);
7752 }
7753 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7754
7755 vfs_unbusy(mp);
7756 }
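/*
 * Worked example of the splice above (illustrative values only): with
 * f_mntonname "/Volumes/Old" and to = "/Volumes/New", pathend points just
 * past "/Volumes/" and mpname at "New", so statfs() reports "/Volumes/New"
 * afterwards:
 *
 *	char mnton[MAXPATHLEN] = "/Volumes/Old";
 *	const char *mpname = "New";	// last component of 'to'
 *	char *pathend = mnton + strlen("/Volumes/");
 *	strlcpy(pathend, mpname, sizeof(mnton) - (pathend - mnton));
 *	// mnton now reads "/Volumes/New"
 */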
7757 /*
7758 * fix up name & parent pointers. note that we first
7759 * check that fvp has the same name/parent pointers it
7760 * had before the rename call... this is a 'weak' check
7761 * at best...
7762 *
7763 * XXX oparent and oname may not be set in the compound vnop case
7764 */
7765 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7766 int update_flags;
7767
7768 update_flags = VNODE_UPDATE_NAME;
7769
7770 if (fdvp != tdvp)
7771 update_flags |= VNODE_UPDATE_PARENT;
7772
7773 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7774 }
7775 out1:
7776 if (to_name != NULL) {
7777 RELEASE_PATH(to_name);
7778 to_name = NULL;
7779 }
7780 if (from_name != NULL) {
7781 RELEASE_PATH(from_name);
7782 from_name = NULL;
7783 }
7784 if (holding_mntlock) {
7785 mount_unlock_renames(locked_mp);
7786 mount_drop(locked_mp, 0);
7787 holding_mntlock = 0;
7788 }
7789 if (tdvp) {
7790 /*
7791 * nameidone has to happen before we vnode_put(tdvp)
7792 * since it may need to release the fs_nodelock on the tdvp
7793 */
7794 nameidone(tond);
7795
7796 if (tvp)
7797 vnode_put(tvp);
7798 vnode_put(tdvp);
7799 }
7800 if (fdvp) {
7801 /*
7802 * nameidone has to happen before we vnode_put(fdvp)
7803 * since it may need to release the fs_nodelock on the fdvp
7804 */
7805 nameidone(fromnd);
7806
7807 if (fvp)
7808 vnode_put(fvp);
7809 vnode_put(fdvp);
7810 }
7811
7812 /*
7813 * If things changed after we did the namei, then we will re-drive
7814 * this rename call from the top.
7815 */
7816 if (do_retry) {
7817 do_retry = 0;
7818 goto retry;
7819 }
7820
7821 FREE(__rename_data, M_TEMP);
7822 return (error);
7823 }
7824
7825 int
7826 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7827 {
7828 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7829 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7830 }
7831
7832 int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7833 {
7834 return renameat_internal(
7835 vfs_context_current(),
7836 uap->fromfd, uap->from,
7837 uap->tofd, uap->to,
7838 UIO_USERSPACE, uap->flags);
7839 }
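/*
 * Illustrative userspace sketch: renameatx_np() is exposed through Libc
 * (the RENAME_SWAP/RENAME_EXCL spellings below are the assumed userspace
 * names for VFS_RENAME_SWAP/VFS_RENAME_EXCL handled above).
 *
 *	#include <stdio.h>	// renameatx_np(), RENAME_SWAP, RENAME_EXCL
 *	#include <fcntl.h>	// AT_FDCWD
 *
 *	// Atomically exchange two existing files (missing target: ENOENT).
 *	(void)renameatx_np(AT_FDCWD, "a.cfg", AT_FDCWD, "b.cfg", RENAME_SWAP);
 *
 *	// Rename only if the destination does not yet exist (else EEXIST).
 *	(void)renameatx_np(AT_FDCWD, "tmp.cfg", AT_FDCWD, "new.cfg", RENAME_EXCL);
 */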
7840
7841 int
7842 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7843 {
7844 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7845 uap->tofd, uap->to, UIO_USERSPACE, 0));
7846 }
7847
7848 /*
7849 * Make a directory file.
7850 *
7851 * Returns: 0 Success
7852 * EEXIST
7853 * namei:???
7854 * vnode_authorize:???
7855 * vn_create:???
7856 */
7857 /* ARGSUSED */
7858 static int
7859 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7860 enum uio_seg segflg)
7861 {
7862 vnode_t vp, dvp;
7863 int error;
7864 int update_flags = 0;
7865 int batched;
7866 struct nameidata nd;
7867
7868 AUDIT_ARG(mode, vap->va_mode);
7869 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7870 path, ctx);
7871 nd.ni_cnd.cn_flags |= WILLBEDIR;
7872 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7873
7874 continue_lookup:
7875 error = nameiat(&nd, fd);
7876 if (error)
7877 return (error);
7878 dvp = nd.ni_dvp;
7879 vp = nd.ni_vp;
7880
7881 if (vp != NULL) {
7882 error = EEXIST;
7883 goto out;
7884 }
7885
7886 batched = vnode_compound_mkdir_available(dvp);
7887
7888 VATTR_SET(vap, va_type, VDIR);
7889
7890 /*
7891 * XXX
7892 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7893 * only get EEXIST or EISDIR for existing path components, and not that it could see
7894 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7895 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
7896 */
7897 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7898 if (error == EACCES || error == EPERM) {
7899 int error2;
7900
7901 nameidone(&nd);
7902 vnode_put(dvp);
7903 dvp = NULLVP;
7904
7905 /*
7906 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7907 * rather than EACCES if the target exists.
7908 */
7909 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7910 path, ctx);
7911 error2 = nameiat(&nd, fd);
7912 if (error2) {
7913 goto out;
7914 } else {
7915 vp = nd.ni_vp;
7916 error = EEXIST;
7917 goto out;
7918 }
7919 }
7920
7921 goto out;
7922 }
7923
7924 /*
7925 * make the directory
7926 */
7927 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7928 if (error == EKEEPLOOKING) {
7929 nd.ni_vp = vp;
7930 goto continue_lookup;
7931 }
7932
7933 goto out;
7934 }
7935
7936 // Make sure the name & parent pointers are hooked up
7937 if (vp->v_name == NULL)
7938 update_flags |= VNODE_UPDATE_NAME;
7939 if (vp->v_parent == NULLVP)
7940 update_flags |= VNODE_UPDATE_PARENT;
7941
7942 if (update_flags)
7943 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
7944
7945 #if CONFIG_FSE
7946 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
7947 #endif
7948
7949 out:
7950 /*
7951 * nameidone has to happen before we vnode_put(dvp)
7952 * since it may need to release the fs_nodelock on the dvp
7953 */
7954 nameidone(&nd);
7955
7956 if (vp)
7957 vnode_put(vp);
7958 if (dvp)
7959 vnode_put(dvp);
7960
7961 return (error);
7962 }
7963
7964 /*
7965 * mkdir_extended: Create a directory; with extended security (ACL).
7966 *
7967 * Parameters: p Process requesting to create the directory
7968 * uap User argument descriptor (see below)
7969 * retval (ignored)
7970 *
7971 * Indirect: uap->path Path of directory to create
7972 * uap->mode Access permissions to set
7973 * uap->xsecurity ACL to set
7974 *
7975 * Returns: 0 Success
7976 * !0 Not success
7977 *
7978 */
7979 int
7980 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7981 {
7982 int ciferror;
7983 kauth_filesec_t xsecdst;
7984 struct vnode_attr va;
7985
7986 AUDIT_ARG(owner, uap->uid, uap->gid);
7987
7988 xsecdst = NULL;
7989 if ((uap->xsecurity != USER_ADDR_NULL) &&
7990 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7991 return ciferror;
7992
7993 VATTR_INIT(&va);
7994 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7995 if (xsecdst != NULL)
7996 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7997
7998 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7999 UIO_USERSPACE);
8000 if (xsecdst != NULL)
8001 kauth_filesec_free(xsecdst);
8002 return ciferror;
8003 }
8004
8005 int
8006 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8007 {
8008 struct vnode_attr va;
8009
8010 VATTR_INIT(&va);
8011 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8012
8013 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8014 UIO_USERSPACE));
8015 }
8016
8017 int
8018 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8019 {
8020 struct vnode_attr va;
8021
8022 VATTR_INIT(&va);
8023 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8024
8025 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8026 UIO_USERSPACE));
8027 }
8028
8029 static int
8030 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8031 enum uio_seg segflg)
8032 {
8033 vnode_t vp, dvp;
8034 int error;
8035 struct nameidata nd;
8036 char *path = NULL;
8037 int len=0;
8038 int has_listeners = 0;
8039 int need_event = 0;
8040 int truncated = 0;
8041 #if CONFIG_FSE
8042 struct vnode_attr va;
8043 #endif /* CONFIG_FSE */
8044 struct vnode_attr *vap = NULL;
8045 int restart_count = 0;
8046 int batched;
8047
8048 int restart_flag;
8049
8050 /*
8051 * This loop exists to restart rmdir in the unlikely case that two
8052 * processes are simultaneously trying to remove the same directory
8053 * containing orphaned appleDouble files.
8054 */
8055 do {
8056 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8057 segflg, dirpath, ctx);
8058 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8059 continue_lookup:
8060 restart_flag = 0;
8061 vap = NULL;
8062
8063 error = nameiat(&nd, fd);
8064 if (error)
8065 return (error);
8066
8067 dvp = nd.ni_dvp;
8068 vp = nd.ni_vp;
8069
8070 if (vp) {
8071 batched = vnode_compound_rmdir_available(vp);
8072
8073 if (vp->v_flag & VROOT) {
8074 /*
8075 * The root of a mounted filesystem cannot be deleted.
8076 */
8077 error = EBUSY;
8078 goto out;
8079 }
8080
8081 #if DEVELOPMENT || DEBUG
8082 /*
8083 * XXX VSWAP: Check for entitlements or special flag here
8084 * so we can restrict access appropriately.
8085 */
8086 #else /* DEVELOPMENT || DEBUG */
8087
8088 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8089 error = EPERM;
8090 goto out;
8091 }
8092 #endif /* DEVELOPMENT || DEBUG */
8093
8094 /*
8095 * Removed a check here; we used to abort if vp's vid
8096 * was not the same as what we'd seen the last time around.
8097 * I do not think that check was valid, because if we retry
8098 * and all dirents are gone, the directory could legitimately
8099 * be recycled but still be present in a situation where we would
8100 * have had permission to delete. Therefore, we won't make
8101 * an effort to preserve that check now that we may not have a
8102 * vp here.
8103 */
8104
8105 if (!batched) {
8106 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8107 if (error) {
8108 if (error == ENOENT) {
8109 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8110 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8111 restart_flag = 1;
8112 restart_count += 1;
8113 }
8114 }
8115 goto out;
8116 }
8117 }
8118 } else {
8119 batched = 1;
8120
8121 if (!vnode_compound_rmdir_available(dvp)) {
8122 panic("No error, but no compound rmdir?");
8123 }
8124 }
8125
8126 #if CONFIG_FSE
8127 fse_info finfo;
8128
8129 need_event = need_fsevent(FSE_DELETE, dvp);
8130 if (need_event) {
8131 if (!batched) {
8132 get_fse_info(vp, &finfo, ctx);
8133 } else {
8134 error = vfs_get_notify_attributes(&va);
8135 if (error) {
8136 goto out;
8137 }
8138
8139 vap = &va;
8140 }
8141 }
8142 #endif
8143 has_listeners = kauth_authorize_fileop_has_listeners();
8144 if (need_event || has_listeners) {
8145 if (path == NULL) {
8146 GET_PATH(path);
8147 if (path == NULL) {
8148 error = ENOMEM;
8149 goto out;
8150 }
8151 }
8152
8153 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
8154 #if CONFIG_FSE
8155 if (truncated) {
8156 finfo.mode |= FSE_TRUNCATED_PATH;
8157 }
8158 #endif
8159 }
8160
8161 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8162 nd.ni_vp = vp;
8163 if (vp == NULLVP) {
8164 /* Couldn't find a vnode */
8165 goto out;
8166 }
8167
8168 if (error == EKEEPLOOKING) {
8169 goto continue_lookup;
8170 } else if (batched && error == ENOENT) {
8171 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8172 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8173 /*
8174 * For compound VNOPs, the authorization callback
8175 * may return ENOENT in case of racing hard link lookups;
8176 * redrive the lookup.
8177 */
8178 restart_flag = 1;
8179 restart_count += 1;
8180 goto out;
8181 }
8182 }
8183 #if CONFIG_APPLEDOUBLE
8184 /*
8185 * Special case to remove orphaned AppleDouble
8186 * files. I don't like putting this in the kernel,
8187 * but carbon does not like putting this in carbon either,
8188 * so here we are.
8189 */
8190 if (error == ENOTEMPTY) {
8191 error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
8192 if (error == EBUSY) {
8193 goto out;
8194 }
8195
8196
8197 /*
8198 * Assuming everything went well, we will try the RMDIR again
8199 */
8200 if (!error)
8201 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8202 }
8203 #endif /* CONFIG_APPLEDOUBLE */
8204 /*
8205 * Call out to allow 3rd party notification of delete.
8206 * Ignore result of kauth_authorize_fileop call.
8207 */
8208 if (!error) {
8209 if (has_listeners) {
8210 kauth_authorize_fileop(vfs_context_ucred(ctx),
8211 KAUTH_FILEOP_DELETE,
8212 (uintptr_t)vp,
8213 (uintptr_t)path);
8214 }
8215
8216 if (vp->v_flag & VISHARDLINK) {
8217 // see the comment in unlink1() about why we update
8218 // the parent of a hard link when it is removed
8219 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
8220 }
8221
8222 #if CONFIG_FSE
8223 if (need_event) {
8224 if (vap) {
8225 vnode_get_fse_info_from_vap(vp, &finfo, vap);
8226 }
8227 add_fsevent(FSE_DELETE, ctx,
8228 FSE_ARG_STRING, len, path,
8229 FSE_ARG_FINFO, &finfo,
8230 FSE_ARG_DONE);
8231 }
8232 #endif
8233 }
8234
8235 out:
8236 if (path != NULL) {
8237 RELEASE_PATH(path);
8238 path = NULL;
8239 }
8240 /*
8241 * nameidone has to happen before we vnode_put(dvp)
8242 * since it may need to release the fs_nodelock on the dvp
8243 */
8244 nameidone(&nd);
8245 vnode_put(dvp);
8246
8247 if (vp)
8248 vnode_put(vp);
8249
8250 if (restart_flag == 0) {
8251 wakeup_one((caddr_t)vp);
8252 return (error);
8253 }
8254 tsleep(vp, PVFS, "rm AD", 1);
8255
8256 } while (restart_flag != 0);
8257
8258 return (error);
8259
8260 }
8261
8262 /*
8263 * Remove a directory file.
8264 */
8265 /* ARGSUSED */
8266 int
8267 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8268 {
8269 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8270 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8271 }
8272
8273 /* Get direntry length padded to 8 byte alignment */
8274 #define DIRENT64_LEN(namlen) \
8275 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
8276
8277 /* Get dirent length padded to 4 byte alignment */
8278 #define DIRENT_LEN(namelen) \
8279 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
8280
8281 /* Get the end of this dirent */
8282 #define DIRENT_END(dep) \
8283 (((char *)(dep)) + (dep)->d_reclen - 1)
8284
8285 errno_t
8286 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8287 int *numdirent, vfs_context_t ctxp)
8288 {
8289 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8290 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8291 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
8292 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8293 } else {
8294 size_t bufsize;
8295 void * bufptr;
8296 uio_t auio;
8297 struct direntry *entry64;
8298 struct dirent *dep;
8299 int bytesread;
8300 int error;
8301
8302 /*
8303 * We're here because the underlying file system does not
8304 * support direntries, or the mount denies extended readdir support, so we must
8305 * fall back to dirents and convert them to direntries.
8306 *
8307 * Our kernel buffer needs to be smaller since re-packing will
8308 * expand each dirent. The worst case (when the name length
8309 * is 3 or less) corresponds to a struct direntry size of 32
8310 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8311 * (4-byte aligned). So having a buffer that is 3/8 the size
8312 * will prevent us from reading more than we can pack.
8313 *
8314 * Since this buffer is wired memory, we will limit the
8315 * buffer size to a maximum of 32K. We would really like to
8316 * use 32K in the MIN(), but we use magic number 87371 to
8317 * prevent uio_resid() * 3 / 8 from overflowing.
8318 */
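/* For reference: 3 * 87371 / 8 == 32764, so the wired buffer always stays
 * just under the 32K cap even when uio_resid() is large. */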
8319 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8320 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8321 if (bufptr == NULL) {
8322 return ENOMEM;
8323 }
8324
8325 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8326 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8327 auio->uio_offset = uio->uio_offset;
8328
8329 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8330
8331 dep = (struct dirent *)bufptr;
8332 bytesread = bufsize - uio_resid(auio);
8333
8334 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8335 M_TEMP, M_WAITOK);
8336 /*
8337 * Convert all the entries and copy them out to user's buffer.
8338 */
8339 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8340 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
8341
8342 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
8343 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
8344 printf("%s: %s: Bad dirent received from directory %s\n", __func__,
8345 vp->v_mount->mnt_vfsstat.f_mntonname,
8346 vp->v_name ? vp->v_name : "<unknown>");
8347 error = EIO;
8348 break;
8349 }
8350
8351 bzero(entry64, enbufsize);
8352 /* Convert a dirent to a dirent64. */
8353 entry64->d_ino = dep->d_ino;
8354 entry64->d_seekoff = 0;
8355 entry64->d_reclen = enbufsize;
8356 entry64->d_namlen = dep->d_namlen;
8357 entry64->d_type = dep->d_type;
8358 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8359
8360 /* Move to next entry. */
8361 dep = (struct dirent *)((char *)dep + dep->d_reclen);
8362
8363 /* Copy entry64 to user's buffer. */
8364 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8365 }
8366
8367 /* Update the real offset using the offset we got from VNOP_READDIR. */
8368 if (error == 0) {
8369 uio->uio_offset = auio->uio_offset;
8370 }
8371 uio_free(auio);
8372 FREE(bufptr, M_TEMP);
8373 FREE(entry64, M_TEMP);
8374 return (error);
8375 }
8376 }
8377
8378 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
8379
8380 /*
8381 * Read a block of directory entries in a file system independent format.
8382 */
8383 static int
8384 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
8385 off_t *offset, int flags)
8386 {
8387 vnode_t vp;
8388 struct vfs_context context = *vfs_context_current(); /* local copy */
8389 struct fileproc *fp;
8390 uio_t auio;
8391 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8392 off_t loff;
8393 int error, eofflag, numdirent;
8394 char uio_buf[ UIO_SIZEOF(1) ];
8395
8396 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
8397 if (error) {
8398 return (error);
8399 }
8400 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8401 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8402 error = EBADF;
8403 goto out;
8404 }
8405
8406 if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
8407 bufsize = GETDIRENTRIES_MAXBUFSIZE;
8408
8409 #if CONFIG_MACF
8410 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
8411 if (error)
8412 goto out;
8413 #endif
8414 if ( (error = vnode_getwithref(vp)) ) {
8415 goto out;
8416 }
8417 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8418
8419 unionread:
8420 if (vp->v_type != VDIR) {
8421 (void)vnode_put(vp);
8422 error = EINVAL;
8423 goto out;
8424 }
8425
8426 #if CONFIG_MACF
8427 error = mac_vnode_check_readdir(&context, vp);
8428 if (error != 0) {
8429 (void)vnode_put(vp);
8430 goto out;
8431 }
8432 #endif /* MAC */
8433
8434 loff = fp->f_fglob->fg_offset;
8435 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8436 uio_addiov(auio, bufp, bufsize);
8437
8438 if (flags & VNODE_READDIR_EXTENDED) {
8439 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
8440 fp->f_fglob->fg_offset = uio_offset(auio);
8441 } else {
8442 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
8443 fp->f_fglob->fg_offset = uio_offset(auio);
8444 }
8445 if (error) {
8446 (void)vnode_put(vp);
8447 goto out;
8448 }
8449
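/*
 * Nothing was returned: if this directory sits in a union mount, fall
 * through to the covered (lower) directory and retry the read there.
 */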
8450 if ((user_ssize_t)bufsize == uio_resid(auio)) {
8451 if (union_dircheckp) {
8452 error = union_dircheckp(&vp, fp, &context);
8453 if (error == -1)
8454 goto unionread;
8455 if (error) {
8456 (void)vnode_put(vp);
8457 goto out;
8458 }
8459 }
8460
8461 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
8462 struct vnode *tvp = vp;
8463 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
8464 vnode_ref(vp);
8465 fp->f_fglob->fg_data = (caddr_t) vp;
8466 fp->f_fglob->fg_offset = 0;
8467 vnode_rele(tvp);
8468 vnode_put(tvp);
8469 goto unionread;
8470 }
8471 vp = tvp;
8472 }
8473 }
8474
8475 vnode_put(vp);
8476 if (offset) {
8477 *offset = loff;
8478 }
8479
8480 *bytesread = bufsize - uio_resid(auio);
8481 out:
8482 file_drop(fd);
8483 return (error);
8484 }
8485
8486
8487 int
8488 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8489 {
8490 off_t offset;
8491 ssize_t bytesread;
8492 int error;
8493
8494 AUDIT_ARG(fd, uap->fd);
8495 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8496
8497 if (error == 0) {
8498 if (proc_is64bit(p)) {
8499 user64_long_t base = (user64_long_t)offset;
8500 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8501 } else {
8502 user32_long_t base = (user32_long_t)offset;
8503 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8504 }
8505 *retval = bytesread;
8506 }
8507 return (error);
8508 }
8509
8510 int
8511 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8512 {
8513 off_t offset;
8514 ssize_t bytesread;
8515 int error;
8516
8517 AUDIT_ARG(fd, uap->fd);
8518 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8519
8520 if (error == 0) {
8521 *retval = bytesread;
8522 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8523 }
8524 return (error);
8525 }
8526
8527
8528 /*
8529 * Set the mode mask for creation of filesystem nodes.
8530 * XXX implement xsecurity
8531 */
8532 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8533 static int
8534 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8535 {
8536 struct filedesc *fdp;
8537
8538 AUDIT_ARG(mask, newmask);
8539 proc_fdlock(p);
8540 fdp = p->p_fd;
8541 *retval = fdp->fd_cmask;
8542 fdp->fd_cmask = newmask & ALLPERMS;
8543 proc_fdunlock(p);
8544 return (0);
8545 }
8546
8547 /*
8548 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8549 *
8550 * Parameters: p Process requesting to set the umask
8551 * uap User argument descriptor (see below)
8552 * retval umask of the process (parameter p)
8553 *
8554 * Indirect: uap->newmask umask to set
8555 * uap->xsecurity ACL to set
8556 *
8557 * Returns: 0 Success
8558 * !0 Not success
8559 *
8560 */
8561 int
8562 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8563 {
8564 int ciferror;
8565 kauth_filesec_t xsecdst;
8566
8567 xsecdst = KAUTH_FILESEC_NONE;
8568 if (uap->xsecurity != USER_ADDR_NULL) {
8569 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8570 return ciferror;
8571 } else {
8572 xsecdst = KAUTH_FILESEC_NONE;
8573 }
8574
8575 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8576
8577 if (xsecdst != KAUTH_FILESEC_NONE)
8578 kauth_filesec_free(xsecdst);
8579 return ciferror;
8580 }
8581
8582 int
8583 umask(proc_t p, struct umask_args *uap, int32_t *retval)
8584 {
8585 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8586 }
8587
8588 /*
8589 * Void all references to file by ripping underlying filesystem
8590 * away from vnode.
8591 */
8592 /* ARGSUSED */
8593 int
8594 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
8595 {
8596 vnode_t vp;
8597 struct vnode_attr va;
8598 vfs_context_t ctx = vfs_context_current();
8599 int error;
8600 struct nameidata nd;
8601
8602 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
8603 uap->path, ctx);
8604 error = namei(&nd);
8605 if (error)
8606 return (error);
8607 vp = nd.ni_vp;
8608
8609 nameidone(&nd);
8610
8611 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
8612 error = ENOTSUP;
8613 goto out;
8614 }
8615
8616 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
8617 error = EBUSY;
8618 goto out;
8619 }
8620
8621 #if CONFIG_MACF
8622 error = mac_vnode_check_revoke(ctx, vp);
8623 if (error)
8624 goto out;
8625 #endif
8626
8627 VATTR_INIT(&va);
8628 VATTR_WANTED(&va, va_uid);
8629 if ((error = vnode_getattr(vp, &va, ctx)))
8630 goto out;
8631 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
8632 (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
8633 goto out;
8634 if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
8635 VNOP_REVOKE(vp, REVOKEALL, ctx);
8636 out:
8637 vnode_put(vp);
8638 return (error);
8639 }
8640
8641
8642 /*
8643 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
8644 * The following system calls are designed to support features
8645 * which are specific to the HFS & HFS Plus volume formats
8646 */
8647
8648
8649 /*
8650 * Obtain attribute information on objects in a directory while enumerating
8651 * the directory.
8652 */
8653 /* ARGSUSED */
8654 int
8655 getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
8656 {
8657 vnode_t vp;
8658 struct fileproc *fp;
8659 uio_t auio = NULL;
8660 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8661 uint32_t count, savecount;
8662 uint32_t newstate;
8663 int error, eofflag;
8664 uint32_t loff;
8665 struct attrlist attributelist;
8666 vfs_context_t ctx = vfs_context_current();
8667 int fd = uap->fd;
8668 char uio_buf[ UIO_SIZEOF(1) ];
8669 kauth_action_t action;
8670
8671 AUDIT_ARG(fd, fd);
8672
8673 /* Get the attributes into kernel space */
8674 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
8675 return(error);
8676 }
8677 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
8678 return(error);
8679 }
8680 savecount = count;
8681 if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
8682 return (error);
8683 }
8684 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8685 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8686 error = EBADF;
8687 goto out;
8688 }
8689
8690
8691 #if CONFIG_MACF
8692 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
8693 fp->f_fglob);
8694 if (error)
8695 goto out;
8696 #endif
8697
8698
8699 if ( (error = vnode_getwithref(vp)) )
8700 goto out;
8701
8702 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8703
8704 unionread:
8705 if (vp->v_type != VDIR) {
8706 (void)vnode_put(vp);
8707 error = EINVAL;
8708 goto out;
8709 }
8710
8711 #if CONFIG_MACF
8712 error = mac_vnode_check_readdir(ctx, vp);
8713 if (error != 0) {
8714 (void)vnode_put(vp);
8715 goto out;
8716 }
8717 #endif /* MAC */
8718
8719 /* set up the uio structure which will contain the users return buffer */
8720 loff = fp->f_fglob->fg_offset;
8721 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8722 uio_addiov(auio, uap->buffer, uap->buffersize);
8723
8724 /*
8725 * If the only item requested is file names, we can let that past with
8726 * just LIST_DIRECTORY. If they want any other attributes, that means
8727 * they need SEARCH as well.
8728 */
8729 action = KAUTH_VNODE_LIST_DIRECTORY;
8730 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
8731 attributelist.fileattr || attributelist.dirattr)
8732 action |= KAUTH_VNODE_SEARCH;
8733
8734 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
8735
8736 /* Believe it or not, uap->options only has 32-bits of valid
8737 * info, so truncate before extending again */
8738
8739 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
8740 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
8741 }
8742
8743 if (error) {
8744 (void) vnode_put(vp);
8745 goto out;
8746 }
8747
8748 /*
8749 * If we've got the last entry of a directory in a union mount
8750 * then reset the eofflag and pretend there's still more to come.
8751 * The next call will again set eofflag and the buffer will be empty,
8752 * so traverse to the underlying directory and do the directory
8753 * read there.
8754 */
8755 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
8756 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
8757 eofflag = 0;
8758 } else { // Empty buffer
8759 struct vnode *tvp = vp;
8760 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
8761 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
8762 fp->f_fglob->fg_data = (caddr_t) vp;
8763 fp->f_fglob->fg_offset = 0; // reset index for new dir
8764 count = savecount;
8765 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
8766 vnode_put(tvp);
8767 goto unionread;
8768 }
8769 vp = tvp;
8770 }
8771 }
8772
8773 (void)vnode_put(vp);
8774
8775 if (error)
8776 goto out;
8777 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
8778
8779 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8780 goto out;
8781 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8782 goto out;
8783 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8784 goto out;
8785
8786 *retval = eofflag; /* similar to getdirentries */
8787 error = 0;
8788 out:
8789 file_drop(fd);
8790 return (error); /* errors were returned earlier; retval here is 0 or 1 (eofflag) */
8791
8792 } /* end of getdirentriesattr system call */
8793
8794 /*
8795 * Exchange data between two files
8796 */
8797
8798 /* ARGSUSED */
8799 int
8800 exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8801 {
8802
8803 struct nameidata fnd, snd;
8804 vfs_context_t ctx = vfs_context_current();
8805 vnode_t fvp;
8806 vnode_t svp;
8807 int error;
8808 u_int32_t nameiflags;
8809 char *fpath = NULL;
8810 char *spath = NULL;
8811 int flen=0, slen=0;
8812 int from_truncated=0, to_truncated=0;
8813 #if CONFIG_FSE
8814 fse_info f_finfo, s_finfo;
8815 #endif
8816
8817 nameiflags = 0;
8818 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8819
8820 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8821 UIO_USERSPACE, uap->path1, ctx);
8822
8823 error = namei(&fnd);
8824 if (error)
8825 goto out2;
8826
8827 nameidone(&fnd);
8828 fvp = fnd.ni_vp;
8829
8830 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8831 UIO_USERSPACE, uap->path2, ctx);
8832
8833 error = namei(&snd);
8834 if (error) {
8835 vnode_put(fvp);
8836 goto out2;
8837 }
8838 nameidone(&snd);
8839 svp = snd.ni_vp;
8840
8841 /*
8842 * if the files are the same, return an inval error
8843 */
8844 if (svp == fvp) {
8845 error = EINVAL;
8846 goto out;
8847 }
8848
8849 /*
8850 * if the files are on different volumes, return an error
8851 */
8852 if (svp->v_mount != fvp->v_mount) {
8853 error = EXDEV;
8854 goto out;
8855 }
8856
8857 /* If they're not files, return an error */
8858 if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8859 error = EINVAL;
8860 goto out;
8861 }
8862
8863 #if CONFIG_MACF
8864 error = mac_vnode_check_exchangedata(ctx,
8865 fvp, svp);
8866 if (error)
8867 goto out;
8868 #endif
8869 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8870 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8871 goto out;
8872
8873 if (
8874 #if CONFIG_FSE
8875 need_fsevent(FSE_EXCHANGE, fvp) ||
8876 #endif
8877 kauth_authorize_fileop_has_listeners()) {
8878 GET_PATH(fpath);
8879 GET_PATH(spath);
8880 if (fpath == NULL || spath == NULL) {
8881 error = ENOMEM;
8882 goto out;
8883 }
8884
8885 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8886 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8887
8888 #if CONFIG_FSE
8889 get_fse_info(fvp, &f_finfo, ctx);
8890 get_fse_info(svp, &s_finfo, ctx);
8891 if (from_truncated || to_truncated) {
8892 // set it here since only the f_finfo gets reported up to user space
8893 f_finfo.mode |= FSE_TRUNCATED_PATH;
8894 }
8895 #endif
8896 }
8897 /* Ok, make the call */
8898 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8899
8900 if (error == 0) {
8901 const char *tmpname;
8902
8903 if (fpath != NULL && spath != NULL) {
8904 /* call out to allow 3rd party notification of exchangedata.
8905 * Ignore result of kauth_authorize_fileop call.
8906 */
8907 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8908 (uintptr_t)fpath, (uintptr_t)spath);
8909 }
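/*
 * The two objects have swapped contents on disk, so swap the cached
 * vnode names (and parents, when they differ) to keep the name cache
 * consistent with what is now actually stored under each vnode.
 */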
8910 name_cache_lock();
8911
8912 tmpname = fvp->v_name;
8913 fvp->v_name = svp->v_name;
8914 svp->v_name = tmpname;
8915
8916 if (fvp->v_parent != svp->v_parent) {
8917 vnode_t tmp;
8918
8919 tmp = fvp->v_parent;
8920 fvp->v_parent = svp->v_parent;
8921 svp->v_parent = tmp;
8922 }
8923 name_cache_unlock();
8924
8925 #if CONFIG_FSE
8926 if (fpath != NULL && spath != NULL) {
8927 add_fsevent(FSE_EXCHANGE, ctx,
8928 FSE_ARG_STRING, flen, fpath,
8929 FSE_ARG_FINFO, &f_finfo,
8930 FSE_ARG_STRING, slen, spath,
8931 FSE_ARG_FINFO, &s_finfo,
8932 FSE_ARG_DONE);
8933 }
8934 #endif
8935 }
8936
8937 out:
8938 if (fpath != NULL)
8939 RELEASE_PATH(fpath);
8940 if (spath != NULL)
8941 RELEASE_PATH(spath);
8942 vnode_put(svp);
8943 vnode_put(fvp);
8944 out2:
8945 return (error);
8946 }
8947
8948 /*
8949 * Return (in MB) the amount of freespace on the given vnode's volume.
8950 */
8951 uint32_t freespace_mb(vnode_t vp);
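/* f_bavail is a block count; multiplying by f_bsize gives bytes, and the
 * >> 20 below converts bytes to MiB. */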
8952
8953 uint32_t
8954 freespace_mb(vnode_t vp)
8955 {
8956 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8957 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8958 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
8959 }
8960
8961 #if CONFIG_SEARCHFS
8962
8963 /* ARGSUSED */
8964
8965 int
8966 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8967 {
8968 vnode_t vp, tvp;
8969 int i, error=0;
8970 int fserror = 0;
8971 struct nameidata nd;
8972 struct user64_fssearchblock searchblock;
8973 struct searchstate *state;
8974 struct attrlist *returnattrs;
8975 struct timeval timelimit;
8976 void *searchparams1,*searchparams2;
8977 uio_t auio = NULL;
8978 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8979 uint32_t nummatches;
8980 int mallocsize;
8981 uint32_t nameiflags;
8982 vfs_context_t ctx = vfs_context_current();
8983 char uio_buf[ UIO_SIZEOF(1) ];
8984
8985 /* Start by copying in fsearchblock parameter list */
8986 if (IS_64BIT_PROCESS(p)) {
8987 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8988 timelimit.tv_sec = searchblock.timelimit.tv_sec;
8989 timelimit.tv_usec = searchblock.timelimit.tv_usec;
8990 }
8991 else {
8992 struct user32_fssearchblock tmp_searchblock;
8993
8994 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8995 // munge into 64-bit version
8996 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8997 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8998 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8999 searchblock.maxmatches = tmp_searchblock.maxmatches;
9000 /*
9001 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
9002 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
9003 */
9004 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9005 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9006 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9007 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9008 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9009 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9010 searchblock.searchattrs = tmp_searchblock.searchattrs;
9011 }
9012 if (error)
9013 return(error);
9014
9015 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9016 */
9017 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9018 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
9019 return(EINVAL);
9020
9021 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
9022 /* It all has to go into local memory and it's not that big so we might as well put it all together. */
9023 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
9024 /* block. */
9025 /* */
9026 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
9027 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
9028 /* assumes the size is still 556 bytes it will continue to work */
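/*
 * Resulting layout of the single allocation (offsets computed below):
 *   [ searchparams1 | searchparams2 | struct attrlist | struct searchstate | 8 spare bytes ]
 */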
9029
9030 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9031 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
9032
9033 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9034
9035 /* Now set up the various pointers to the correct place in our newly allocated memory */
9036
9037 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9038 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9039 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
9040
9041 /* Now copy in the stuff given our local variables. */
9042
9043 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
9044 goto freeandexit;
9045
9046 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
9047 goto freeandexit;
9048
9049 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
9050 goto freeandexit;
9051
9052 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
9053 goto freeandexit;
9054
9055 /*
9056 * When searching a union mount, need to set the
9057 * start flag at the first call on each layer to
9058 * reset state for the new volume.
9059 */
9060 if (uap->options & SRCHFS_START)
9061 state->ss_union_layer = 0;
9062 else
9063 uap->options |= state->ss_union_flags;
9064 state->ss_union_flags = 0;
9065
9066 /*
9067 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
9068 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
9069 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
9070 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
9071 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
9072 */
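/*
 * Expected layout of searchparams1 when ATTR_CMN_NAME is requested:
 *   [ u_int32_t buffer length ][ attrreference_t ][ name bytes at attr_dataoffset ]
 * The checks below reject negative offsets, lengths beyond MAXPATHLEN, and
 * references that point outside the user-supplied buffer.
 */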
9073
9074 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
9075 attrreference_t* string_ref;
9076 u_int32_t* start_length;
9077 user64_size_t param_length;
9078
9079 /* validate searchparams1 */
9080 param_length = searchblock.sizeofsearchparams1;
9081 /* skip the word that specifies length of the buffer */
9082 start_length = (u_int32_t *) searchparams1;
9083 start_length = start_length + 1;
9084 string_ref = (attrreference_t *) start_length;
9085
9086 /* ensure no negative offsets or too big offsets */
9087 if (string_ref->attr_dataoffset < 0 ) {
9088 error = EINVAL;
9089 goto freeandexit;
9090 }
9091 if (string_ref->attr_length > MAXPATHLEN) {
9092 error = EINVAL;
9093 goto freeandexit;
9094 }
9095
9096 /* Check for pointer overflow in the string ref */
9097 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
9098 error = EINVAL;
9099 goto freeandexit;
9100 }
9101
9102 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
9103 error = EINVAL;
9104 goto freeandexit;
9105 }
9106 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
9107 error = EINVAL;
9108 goto freeandexit;
9109 }
9110 }
9111
9112 /* set up the uio structure which will contain the users return buffer */
9113 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9114 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
9115
9116 nameiflags = 0;
9117 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9118 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
9119 UIO_USERSPACE, uap->path, ctx);
9120
9121 error = namei(&nd);
9122 if (error)
9123 goto freeandexit;
9124 vp = nd.ni_vp;
9125 nameidone(&nd);
9126
9127 /*
9128 * Switch to the root vnode for the volume
9129 */
9130 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
9131 vnode_put(vp);
9132 if (error)
9133 goto freeandexit;
9134 vp = tvp;
9135
9136 /*
9137 * If it's a union mount, the path lookup takes
9138 * us to the top layer. But we may need to descend
9139 * to a lower layer. For non-union mounts the layer
9140 * is always zero.
9141 */
9142 for (i = 0; i < (int) state->ss_union_layer; i++) {
9143 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
9144 break;
9145 tvp = vp;
9146 vp = vp->v_mount->mnt_vnodecovered;
9147 if (vp == NULL) {
9148 vnode_put(tvp);
9149 error = ENOENT;
9150 goto freeandexit;
9151 }
9152 error = vnode_getwithref(vp);
9153 vnode_put(tvp);
9154 if (error)
9155 goto freeandexit;
9156 }
9157
9158 #if CONFIG_MACF
9159 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
9160 if (error) {
9161 vnode_put(vp);
9162 goto freeandexit;
9163 }
9164 #endif
9165
9166
9167 /*
9168 * If searchblock.maxmatches == 0, then skip the search. This has happened
9169 * before and sometimes the underlying code doesn't deal with it well.
9170 */
9171 if (searchblock.maxmatches == 0) {
9172 nummatches = 0;
9173 goto saveandexit;
9174 }
9175
9176 /*
9177 * Alright, we have everything we need, so let's make that call.
9178 *
9179 * We keep special track of the return value from the file system:
9180 * EAGAIN is an acceptable error condition that shouldn't keep us
9181 * from copying out any results...
9182 */
9183
9184 fserror = VNOP_SEARCHFS(vp,
9185 searchparams1,
9186 searchparams2,
9187 &searchblock.searchattrs,
9188 (u_long)searchblock.maxmatches,
9189 &timelimit,
9190 returnattrs,
9191 &nummatches,
9192 (u_long)uap->scriptcode,
9193 (u_long)uap->options,
9194 auio,
9195 (struct searchstate *) &state->ss_fsstate,
9196 ctx);
9197
9198 /*
9199 * If it's a union mount we need to be called again
9200 * to search the mounted-on filesystem.
9201 */
9202 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
9203 state->ss_union_flags = SRCHFS_START;
9204 state->ss_union_layer++; // search next layer down
9205 fserror = EAGAIN;
9206 }
9207
9208 saveandexit:
9209
9210 vnode_put(vp);
9211
9212 /* Now copy out the stuff that needs copying out. That means the number of matches and the
9213 search state. Everything was already put into the return buffer by the vop call. */
9214
9215 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
9216 goto freeandexit;
9217
9218 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
9219 goto freeandexit;
9220
9221 error = fserror;
9222
9223 freeandexit:
9224
9225 FREE(searchparams1,M_TEMP);
9226
9227 return(error);
9228
9229
9230 } /* end of searchfs system call */
9231
9232 #else /* CONFIG_SEARCHFS */
9233
9234 int
9235 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9236 {
9237 return (ENOTSUP);
9238 }
9239
9240 #endif /* CONFIG_SEARCHFS */
9241
9242
9243 lck_grp_attr_t * nspace_group_attr;
9244 lck_attr_t * nspace_lock_attr;
9245 lck_grp_t * nspace_mutex_group;
9246
9247 lck_mtx_t nspace_handler_lock;
9248 lck_mtx_t nspace_handler_exclusion_lock;
9249
9250 time_t snapshot_timestamp=0;
9251 int nspace_allow_virtual_devs=0;
9252
9253 void nspace_handler_init(void);
9254
9255 typedef struct nspace_item_info {
9256 struct vnode *vp;
9257 void *arg;
9258 uint64_t op;
9259 uint32_t vid;
9260 uint32_t flags;
9261 uint32_t token;
9262 uint32_t refcount;
9263 } nspace_item_info;
9264
9265 #define MAX_NSPACE_ITEMS 128
9266 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
9267 uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address
9268 uint32_t nspace_token_id=0;
9269 uint32_t nspace_handler_timeout = 15; // seconds
9270
9271 #define NSPACE_ITEM_NEW 0x0001
9272 #define NSPACE_ITEM_PROCESSING 0x0002
9273 #define NSPACE_ITEM_DEAD 0x0004
9274 #define NSPACE_ITEM_CANCELLED 0x0008
9275 #define NSPACE_ITEM_DONE 0x0010
9276 #define NSPACE_ITEM_RESET_TIMER 0x0020
9277
9278 #define NSPACE_ITEM_NSPACE_EVENT 0x0040
9279 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
9280
9281 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
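/*
 * Rough slot life cycle (as used below): a free slot has flags == 0; a new
 * request marks it NSPACE_ITEM_NEW, the handler flips it to _PROCESSING,
 * and completion or cancellation sets _DONE or _CANCELLED before the waiter
 * clears the slot for reuse.
 */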
9282
9283 //#pragma optimization_level 0
9284
9285 typedef enum {
9286 NSPACE_HANDLER_NSPACE = 0,
9287 NSPACE_HANDLER_SNAPSHOT = 1,
9288
9289 NSPACE_HANDLER_COUNT,
9290 } nspace_type_t;
9291
9292 typedef struct {
9293 uint64_t handler_tid;
9294 struct proc *handler_proc;
9295 int handler_busy;
9296 } nspace_handler_t;
9297
9298 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
9299
9300 /* namespace fsctl functions */
9301 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
9302 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
9303 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
9304 static nspace_type_t nspace_type_for_op(uint64_t op);
9305 static int nspace_is_special_process(struct proc *proc);
9306 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
9307 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
9308 static int validate_namespace_args (int is64bit, int size);
9309 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9310
9311
9312 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9313 {
9314 switch(nspace_type) {
9315 case NSPACE_HANDLER_NSPACE:
9316 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9317 case NSPACE_HANDLER_SNAPSHOT:
9318 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9319 default:
9320 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9321 return 0;
9322 }
9323 }
9324
9325 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9326 {
9327 switch(nspace_type) {
9328 case NSPACE_HANDLER_NSPACE:
9329 return NSPACE_ITEM_NSPACE_EVENT;
9330 case NSPACE_HANDLER_SNAPSHOT:
9331 return NSPACE_ITEM_SNAPSHOT_EVENT;
9332 default:
9333 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9334 return 0;
9335 }
9336 }
9337
9338 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9339 {
9340 switch(nspace_type) {
9341 case NSPACE_HANDLER_NSPACE:
9342 return FREAD | FWRITE | O_EVTONLY;
9343 case NSPACE_HANDLER_SNAPSHOT:
9344 return FREAD | O_EVTONLY;
9345 default:
9346 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9347 return 0;
9348 }
9349 }
9350
9351 static inline nspace_type_t nspace_type_for_op(uint64_t op)
9352 {
9353 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9354 case NAMESPACE_HANDLER_NSPACE_EVENT:
9355 return NSPACE_HANDLER_NSPACE;
9356 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9357 return NSPACE_HANDLER_SNAPSHOT;
9358 default:
9359 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9360 return NSPACE_HANDLER_NSPACE;
9361 }
9362 }
9363
9364 static inline int nspace_is_special_process(struct proc *proc)
9365 {
9366 int i;
9367 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9368 if (proc == nspace_handlers[i].handler_proc)
9369 return 1;
9370 }
9371 return 0;
9372 }
9373
9374 void
9375 nspace_handler_init(void)
9376 {
9377 nspace_lock_attr = lck_attr_alloc_init();
9378 nspace_group_attr = lck_grp_attr_alloc_init();
9379 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9380 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9381 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9382 memset(&nspace_items[0], 0, sizeof(nspace_items));
9383 }
9384
9385 void
9386 nspace_proc_exit(struct proc *p)
9387 {
9388 int i, event_mask = 0;
9389
9390 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9391 if (p == nspace_handlers[i].handler_proc) {
9392 event_mask |= nspace_item_flags_for_type(i);
9393 nspace_handlers[i].handler_tid = 0;
9394 nspace_handlers[i].handler_proc = NULL;
9395 }
9396 }
9397
9398 if (event_mask == 0) {
9399 return;
9400 }
9401
9402 lck_mtx_lock(&nspace_handler_lock);
9403 if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
9404 // if this process was the snapshot handler, zero snapshot_timestamp
9405 snapshot_timestamp = 0;
9406 }
9407
9408 //
9409 // unblock anyone that's waiting for the handler that died
9410 //
9411 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9412 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
9413
9414 if ( nspace_items[i].flags & event_mask ) {
9415
9416 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9417 vnode_lock_spin(nspace_items[i].vp);
9418 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9419 vnode_unlock(nspace_items[i].vp);
9420 }
9421 nspace_items[i].vp = NULL;
9422 nspace_items[i].vid = 0;
9423 nspace_items[i].flags = NSPACE_ITEM_DONE;
9424 nspace_items[i].token = 0;
9425
9426 wakeup((caddr_t)&(nspace_items[i].vp));
9427 }
9428 }
9429 }
9430
9431 wakeup((caddr_t)&nspace_item_idx);
9432 lck_mtx_unlock(&nspace_handler_lock);
9433 }
9434
9435
9436 int
9437 resolve_nspace_item(struct vnode *vp, uint64_t op)
9438 {
9439 return resolve_nspace_item_ext(vp, op, NULL);
9440 }
9441
9442 int
9443 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
9444 {
9445 int i, error, keep_waiting;
9446 struct timespec ts;
9447 nspace_type_t nspace_type = nspace_type_for_op(op);
9448
9449 // only allow namespace events on regular files, directories and symlinks.
9450 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
9451 return 0;
9452 }
9453
9454 //
9455 // if this is a snapshot event and the vnode is on a
9456 // disk image just pretend nothing happened since any
9457 // change to the disk image will cause the disk image
9458 // itself to get backed up and this avoids multi-way
9459 // deadlocks between the snapshot handler and the ever
9460 // popular diskimages-helper process. the variable
9461 // nspace_allow_virtual_devs allows this behavior to
9462 // be overridden (for use by the Mobile TimeMachine
9463 // testing infrastructure which uses disk images)
9464 //
9465 if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
9466 && (vp->v_mount != NULL)
9467 && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
9468 && !nspace_allow_virtual_devs) {
9469
9470 return 0;
9471 }
9472
9473 // if (thread_tid(current_thread()) == namespace_handler_tid) {
9474 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9475 return 0;
9476 }
9477
9478 if (nspace_is_special_process(current_proc())) {
9479 return EDEADLK;
9480 }
9481
9482 lck_mtx_lock(&nspace_handler_lock);
9483
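/*
 * Slot selection: first look for an existing entry for this (vp, op) pair
 * so concurrent callers share it; otherwise take a free slot (flags == 0);
 * if the table is full, sleep until a slot is released and retry.
 */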
9484 retry:
9485 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9486 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
9487 break;
9488 }
9489 }
9490
9491 if (i >= MAX_NSPACE_ITEMS) {
9492 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9493 if (nspace_items[i].flags == 0) {
9494 break;
9495 }
9496 }
9497 } else {
9498 nspace_items[i].refcount++;
9499 }
9500
9501 if (i >= MAX_NSPACE_ITEMS) {
9502 ts.tv_sec = nspace_handler_timeout;
9503 ts.tv_nsec = 0;
9504
9505 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
9506 if (error == 0) {
9507 // an entry got free'd up, go see if we can get a slot
9508 goto retry;
9509 } else {
9510 lck_mtx_unlock(&nspace_handler_lock);
9511 return error;
9512 }
9513 }
9514
9515 //
9516 // if it didn't already exist, add it. if it did exist
9517 // we'll get woken up when someone does a wakeup() on
9518 // the slot in the nspace_items table.
9519 //
9520 if (vp != nspace_items[i].vp) {
9521 nspace_items[i].vp = vp;
9522 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user
9523 nspace_items[i].op = op;
9524 nspace_items[i].vid = vnode_vid(vp);
9525 nspace_items[i].flags = NSPACE_ITEM_NEW;
9526 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
9527 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
9528 if (arg) {
9529 vnode_lock_spin(vp);
9530 vp->v_flag |= VNEEDSSNAPSHOT;
9531 vnode_unlock(vp);
9532 }
9533 }
9534
9535 nspace_items[i].token = 0;
9536 nspace_items[i].refcount = 1;
9537
9538 wakeup((caddr_t)&nspace_item_idx);
9539 }
9540
9541 //
9542 // Now go to sleep until the handler does a wakeup on this
9543 // slot in the nspace_items table (or we timeout).
9544 //
9545 keep_waiting = 1;
9546 while(keep_waiting) {
9547 ts.tv_sec = nspace_handler_timeout;
9548 ts.tv_nsec = 0;
9549 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
9550
9551 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
9552 error = 0;
9553 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
9554 error = nspace_items[i].token;
9555 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
9556 if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
9557 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
9558 continue;
9559 } else {
9560 error = ETIMEDOUT;
9561 }
9562 } else if (error == 0) {
9563 // hmmm, why did we get woken up?
9564 printf("woken up for token %d but it's not done, cancelled or timed out and error == 0.\n",
9565 nspace_items[i].token);
9566 }
9567
9568 if (--nspace_items[i].refcount == 0) {
9569 nspace_items[i].vp = NULL; // clear this so that no one will match on it again
9570 nspace_items[i].arg = NULL;
9571 nspace_items[i].token = 0; // clear this so that the handler will not find it anymore
9572 nspace_items[i].flags = 0; // this clears it for re-use
9573 }
9574 wakeup(&nspace_token_id);
9575 keep_waiting = 0;
9576 }
9577
9578 lck_mtx_unlock(&nspace_handler_lock);
9579
9580 return error;
9581 }
9582
9583 int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9584 {
9585 int snapshot_error = 0;
9586
9587 if (vp == NULL) {
9588 return 0;
9589 }
9590
9591 /* Swap files are special; skip them */
9592 if (vnode_isswap(vp)) {
9593 return 0;
9594 }
9595
9596 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9597 // the change time is within this epoch
9598 int error;
9599
9600 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9601 if (error == EDEADLK) {
9602 snapshot_error = 0;
9603 } else if (error) {
9604 if (error == EAGAIN) {
9605 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9606 } else if (error == EINTR) {
9607 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9608 snapshot_error = EINTR;
9609 }
9610 }
9611 }
9612
9613 return snapshot_error;
9614 }
9615
9616 int
9617 get_nspace_item_status(struct vnode *vp, int32_t *status)
9618 {
9619 int i;
9620
9621 lck_mtx_lock(&nspace_handler_lock);
9622 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9623 if (nspace_items[i].vp == vp) {
9624 break;
9625 }
9626 }
9627
9628 if (i >= MAX_NSPACE_ITEMS) {
9629 lck_mtx_unlock(&nspace_handler_lock);
9630 return ENOENT;
9631 }
9632
9633 *status = nspace_items[i].flags;
9634 lck_mtx_unlock(&nspace_handler_lock);
9635 return 0;
9636 }
9637
9638
9639 #if 0
9640 static int
9641 build_volfs_path(struct vnode *vp, char *path, int *len)
9642 {
9643 struct vnode_attr va;
9644 int ret;
9645
9646 VATTR_INIT(&va);
9647 VATTR_WANTED(&va, va_fsid);
9648 VATTR_WANTED(&va, va_fileid);
9649
9650 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
9651 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
9652 ret = -1;
9653 } else {
9654 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
9655 ret = 0;
9656 }
9657
9658 return ret;
9659 }
9660 #endif
9661
9662 //
9663 // Note: this function does NOT check permissions on all of the
9664 // parent directories leading to this vnode. It should only be
9665 // called on behalf of a root process. Otherwise a process may
9666 // get access to a file because the file itself is readable even
9667 // though its parent directories would prevent access.
9668 //
9669 static int
9670 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
9671 {
9672 int error, action;
9673
9674 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9675 return error;
9676 }
9677
9678 #if CONFIG_MACF
9679 error = mac_vnode_check_open(ctx, vp, fmode);
9680 if (error)
9681 return error;
9682 #endif
9683
9684 /* compute action to be authorized */
9685 action = 0;
9686 if (fmode & FREAD) {
9687 action |= KAUTH_VNODE_READ_DATA;
9688 }
9689 if (fmode & (FWRITE | O_TRUNC)) {
9690 /*
9691 * If we are writing, appending, and not truncating,
9692 * indicate that we are appending so that if the
9693 * UF_APPEND or SF_APPEND bits are set, we do not deny
9694 * the open.
9695 */
9696 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
9697 action |= KAUTH_VNODE_APPEND_DATA;
9698 } else {
9699 action |= KAUTH_VNODE_WRITE_DATA;
9700 }
9701 }
9702
9703 if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
9704 return error;
9705
9706
9707 //
9708 // if the vnode is tagged VOPENEVT and the current process
9709 // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
9710 // flag to the open mode so that this open won't count against
9711 // the vnode when carbon delete() does a vnode_isinuse() to see
9712 // if a file is currently in use. this allows spotlight
9713 // importers to not interfere with carbon apps that depend on
9714 // the no-delete-if-busy semantics of carbon delete().
9715 //
9716 if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
9717 fmode |= O_EVTONLY;
9718 }
9719
9720 if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
9721 return error;
9722 }
9723 if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
9724 VNOP_CLOSE(vp, fmode, ctx);
9725 return error;
9726 }
9727
9728 /* Call out to allow 3rd party notification of open.
9729 * Ignore result of kauth_authorize_fileop call.
9730 */
9731 #if CONFIG_MACF
9732 mac_vnode_notify_open(ctx, vp, fmode);
9733 #endif
9734 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
9735 (uintptr_t)vp, 0);
9736
9737
9738 return 0;
9739 }
9740
9741 static int
9742 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
9743 {
9744 int i;
9745 int error = 0;
9746 int unblock = 0;
9747 task_t curtask;
9748
9749 lck_mtx_lock(&nspace_handler_exclusion_lock);
9750 if (nspace_handlers[nspace_type].handler_busy) {
9751 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9752 return EBUSY;
9753 }
9754
9755 nspace_handlers[nspace_type].handler_busy = 1;
9756 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9757
9758 /*
9759 * Any process that gets here will be one of the namespace handlers.
9760 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
9761 * as we can cause deadlocks to occur, because the namespace handler may prevent
9762 * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE
9763 * process.
9764 */
9765 curtask = current_task();
9766 bsd_set_dependency_capable (curtask);
9767
9768 lck_mtx_lock(&nspace_handler_lock);
9769 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9770 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
9771 nspace_handlers[nspace_type].handler_proc = current_proc();
9772 }
9773
9774 if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9775 (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9776 error = EINVAL;
9777 }
9778
9779 while (error == 0) {
9780
9781 /* Try to find matching namespace item */
9782 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9783 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9784 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9785 break;
9786 }
9787 }
9788 }
9789
9790 if (i >= MAX_NSPACE_ITEMS) {
9791 /* Nothing is there yet. Wait for wake up and retry */
9792 error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9793 if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9794 /* Prevent infinite loop if snapshot handler exited */
9795 error = EINVAL;
9796 break;
9797 }
9798 continue;
9799 }
9800
9801 nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
9802 nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
9803 nspace_items[i].token = ++nspace_token_id;
9804
9805 assert(nspace_items[i].vp);
9806 struct fileproc *fp;
9807 int32_t indx;
9808 int32_t fmode;
9809 struct proc *p = current_proc();
9810 vfs_context_t ctx = vfs_context_current();
9811 struct vnode_attr va;
9812 bool vn_get_successful = false;
9813 bool vn_open_successful = false;
9814 bool fp_alloc_successful = false;
9815
9816 /*
9817 * Use vnode pointer to acquire a file descriptor for
9818 * hand-off to userland
9819 */
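/*
 * Three steps, each tracked by a flag so the cleanup label can unwind
 * exactly what succeeded: take an iocount on the vnode, open it with the
 * handler's mode, then allocate a file descriptor that wraps it.
 */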
9820 fmode = nspace_open_flags_for_type(nspace_type);
9821 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9822 if (error) goto cleanup;
9823 vn_get_successful = true;
9824
9825 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9826 if (error) goto cleanup;
9827 vn_open_successful = true;
9828
9829 error = falloc(p, &fp, &indx, ctx);
9830 if (error) goto cleanup;
9831 fp_alloc_successful = true;
9832
9833 fp->f_fglob->fg_flag = fmode;
9834 fp->f_fglob->fg_ops = &vnops;
9835 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9836
9837 proc_fdlock(p);
9838 procfdtbl_releasefd(p, indx, NULL);
9839 fp_drop(p, indx, fp, 1);
9840 proc_fdunlock(p);
9841
9842 /*
9843 * All variants of the namespace handler struct support these three fields:
9844 * token, flags, and the FD pointer
9845 */
9846 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9847 if (error) goto cleanup;
9848 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9849 if (error) goto cleanup;
9850 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9851 if (error) goto cleanup;
9852
9853 /*
9854 * Handle optional fields:
9855 * the extended version supports an info ptr (offset, length), and the
9856 * namedata version supports a unique per-link object ID.
9857 *
9858 *
9859 */
9860 if (nhd->infoptr) {
9861 uio_t uio = (uio_t)nspace_items[i].arg;
9862 uint64_t u_offset, u_length;
9863
9864 if (uio) {
9865 u_offset = uio_offset(uio);
9866 u_length = uio_resid(uio);
9867 } else {
9868 u_offset = 0;
9869 u_length = 0;
9870 }
9871 error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9872 if (error) goto cleanup;
9873 error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
9874 if (error) goto cleanup;
9875 }
9876
9877 if (nhd->objid) {
9878 VATTR_INIT(&va);
9879 VATTR_WANTED(&va, va_linkid);
9880 error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9881 if (error) goto cleanup;
9882
9883 uint64_t linkid = 0;
9884 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9885 linkid = (uint64_t)va.va_linkid;
9886 }
9887 error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
9888 }
9889 cleanup:
9890 if (error) {
9891 if (fp_alloc_successful) fp_free(p, indx, fp);
9892 if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
9893 unblock = 1;
9894 }
9895
9896 if (vn_get_successful) vnode_put(nspace_items[i].vp);
9897
9898 break;
9899 }
9900
9901 if (unblock) {
9902 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9903 vnode_lock_spin(nspace_items[i].vp);
9904 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9905 vnode_unlock(nspace_items[i].vp);
9906 }
9907 nspace_items[i].vp = NULL;
9908 nspace_items[i].vid = 0;
9909 nspace_items[i].flags = NSPACE_ITEM_DONE;
9910 nspace_items[i].token = 0;
9911
9912 wakeup((caddr_t)&(nspace_items[i].vp));
9913 }
9914
9915 if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9916 // just go through every snapshot event and unblock it immediately.
9917 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9918 for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
9919 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9920 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9921 nspace_items[i].vp = NULL;
9922 nspace_items[i].vid = 0;
9923 nspace_items[i].flags = NSPACE_ITEM_DONE;
9924 nspace_items[i].token = 0;
9925
9926 wakeup((caddr_t)&(nspace_items[i].vp));
9927 }
9928 }
9929 }
9930 }
9931 }
9932
9933 lck_mtx_unlock(&nspace_handler_lock);
9934
9935 lck_mtx_lock(&nspace_handler_exclusion_lock);
9936 nspace_handlers[nspace_type].handler_busy = 0;
9937 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9938
9939 return error;
9940 }
9941
9942 static inline int validate_namespace_args (int is64bit, int size) {
9943
9944 if (is64bit) {
9945 /* Must be one of these */
9946 if (size == sizeof(user64_namespace_handler_info)) {
9947 goto sizeok;
9948 }
9949 if (size == sizeof(user64_namespace_handler_info_ext)) {
9950 goto sizeok;
9951 }
9952 if (size == sizeof(user64_namespace_handler_data)) {
9953 goto sizeok;
9954 }
9955 return EINVAL;
9956 }
9957 else {
9958 /* 32 bit -- must be one of these */
9959 if (size == sizeof(user32_namespace_handler_info)) {
9960 goto sizeok;
9961 }
9962 if (size == sizeof(user32_namespace_handler_info_ext)) {
9963 goto sizeok;
9964 }
9965 if (size == sizeof(user32_namespace_handler_data)) {
9966 goto sizeok;
9967 }
9968 return EINVAL;
9969 }
9970
9971 sizeok:
9972
9973 return 0;
9974
9975 }
9976
9977 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9978 {
9979 int error = 0;
9980 namespace_handler_data nhd;
9981
9982 bzero (&nhd, sizeof(namespace_handler_data));
9983
9984 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9985 return error;
9986 }
9987
9988 error = validate_namespace_args (is64bit, size);
9989 if (error) {
9990 return error;
9991 }
9992
9993 /* Copy in the userland pointers into our kernel-only struct */
9994
9995 if (is64bit) {
9996 /* 64 bit userland structures */
9997 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9998 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9999 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
10000
10001 /* If the size is greater than the standard info struct, add in extra fields */
10002 if (size > (sizeof(user64_namespace_handler_info))) {
10003 if (size >= (sizeof(user64_namespace_handler_info_ext))) {
10004 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
10005 }
10006 if (size == (sizeof(user64_namespace_handler_data))) {
10007 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
10008 }
10009 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10010 }
10011 }
10012 else {
10013 /* 32 bit userland structures */
10014 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
10015 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
10016 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
10017
10018 if (size > (sizeof(user32_namespace_handler_info))) {
10019 if (size >= (sizeof(user32_namespace_handler_info_ext))) {
10020 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
10021 }
10022 if (size == (sizeof(user32_namespace_handler_data))) {
10023 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
10024 }
10025 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10026 }
10027 }
10028
10029 return wait_for_namespace_event(&nhd, nspace_type);
10030 }
10031
10032 static unsigned long
10033 fsctl_bogus_command_compat(unsigned long cmd)
10034 {
10035
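/*
 * Map legacy command values (historically issued via IOCBASECMD, i.e. with
 * the IOCPARM length bits cleared) back onto the full ioctl encodings so
 * older callers keep working.
 */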
10036 switch (cmd) {
10037 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10038 return (FSIOC_SYNC_VOLUME);
10039 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10040 return (FSIOC_ROUTEFS_SETROUTEID);
10041 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10042 return (FSIOC_SET_PACKAGE_EXTS);
10043 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
10044 return (FSIOC_NAMESPACE_HANDLER_GET);
10045 case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
10046 return (FSIOC_OLD_SNAPSHOT_HANDLER_GET);
10047 case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
10048 return (FSIOC_SNAPSHOT_HANDLER_GET_EXT);
10049 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
10050 return (FSIOC_NAMESPACE_HANDLER_UPDATE);
10051 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
10052 return (FSIOC_NAMESPACE_HANDLER_UNBLOCK);
10053 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
10054 return (FSIOC_NAMESPACE_HANDLER_CANCEL);
10055 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
10056 return (FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME);
10057 case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
10058 return (FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS);
10059 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10060 return (FSIOC_SET_FSTYPENAME_OVERRIDE);
10061 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10062 return (DISK_CONDITIONER_IOC_GET);
10063 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10064 return (DISK_CONDITIONER_IOC_SET);
10065 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10066 return (FSIOC_FIOSEEKHOLE);
10067 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10068 return (FSIOC_FIOSEEKDATA);
10069 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10070 return (SPOTLIGHT_IOC_GET_MOUNT_TIME);
10071 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10072 return (SPOTLIGHT_IOC_GET_LAST_MTIME);
10073 }
10074
10075 return (cmd);
10076 }
10077
10078 /*
10079 * Make a filesystem-specific control call:
10080 */
10081 /* ARGSUSED */
10082 static int
10083 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10084 {
10085 int error=0;
10086 boolean_t is64bit;
10087 u_int size;
10088 #define STK_PARAMS 128
10089 char stkbuf[STK_PARAMS] = {0};
10090 caddr_t data, memp;
10091 vnode_t vp = *arg_vp;
10092
10093 cmd = fsctl_bogus_command_compat(cmd);
10094
10095 size = IOCPARM_LEN(cmd);
10096 if (size > IOCPARM_MAX) return (EINVAL);
10097
10098 is64bit = proc_is64bit(p);
10099
10100 memp = NULL;
10101
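/* Use the 128-byte on-stack buffer when the ioctl payload fits; otherwise
 * heap-allocate a temporary of exactly IOCPARM_LEN(cmd) bytes. */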
10102 if (size > sizeof (stkbuf)) {
10103 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
10104 data = memp;
10105 } else {
10106 data = &stkbuf[0];
10107 };
10108
10109 if (cmd & IOC_IN) {
10110 if (size) {
10111 error = copyin(udata, data, size);
10112 if (error) {
10113 if (memp) {
10114 kfree (memp, size);
10115 }
10116 return error;
10117 }
10118 } else {
10119 if (is64bit) {
10120 *(user_addr_t *)data = udata;
10121 }
10122 else {
10123 *(uint32_t *)data = (uint32_t)udata;
10124 }
10125 };
10126 } else if ((cmd & IOC_OUT) && size) {
10127 /*
10128 * Zero the buffer so the user always
10129 * gets back something deterministic.
10130 */
10131 bzero(data, size);
10132 } else if (cmd & IOC_VOID) {
10133 if (is64bit) {
10134 *(user_addr_t *)data = udata;
10135 }
10136 else {
10137 *(uint32_t *)data = (uint32_t)udata;
10138 }
10139 }
10140
10141 /* Check to see if it's a generic command */
10142 switch (cmd) {
10143
10144 case FSIOC_SYNC_VOLUME: {
10145 mount_t mp = vp->v_mount;
10146 int arg = *(uint32_t*)data;
10147
10148 /* record vid of vp so we can drop it below. */
10149 uint32_t vvid = vp->v_id;
10150
10151 /*
10152 * Then grab mount_iterref so that we can release the vnode.
10153 * Without this, a thread may call vnode_iterate_prepare then
10154 * get into a deadlock because we've never released the root vp
10155 */
10156 error = mount_iterref (mp, 0);
10157 if (error) {
10158 break;
10159 }
10160 vnode_put(vp);
10161
10162 /* issue the sync for this volume */
10163 (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
10164
10165 /*
10166 * Then release the mount_iterref once we're done syncing; it's not
10167 * needed for the VNOP_IOCTL below
10168 */
10169 mount_iterdrop(mp);
10170
10171 if (arg & FSCTL_SYNC_FULLSYNC) {
10172 /* re-obtain vnode iocount on the root vp, if possible */
10173 error = vnode_getwithvid (vp, vvid);
10174 if (error == 0) {
10175 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
10176 vnode_put (vp);
10177 }
10178 }
10179 /* mark the argument VP as having been released */
10180 *arg_vp = NULL;
10181 }
10182 break;
10183
10184 case FSIOC_ROUTEFS_SETROUTEID: {
10185 #if ROUTEFS
10186 char routepath[MAXPATHLEN];
10187 size_t len = 0;
10188
10189 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10190 break;
10191 }
10192 bzero(routepath, MAXPATHLEN);
10193 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
10194 if (error) {
10195 break;
10196 }
10197 error = routefs_kernel_mount(routepath);
10198 if (error) {
10199 break;
10200 }
10201 #endif
10202 }
10203 break;
10204
10205 case FSIOC_SET_PACKAGE_EXTS: {
10206 user_addr_t ext_strings;
10207 uint32_t num_entries;
10208 uint32_t max_width;
10209
10210 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
10211 break;
10212
10213 if ( (is64bit && size != sizeof(user64_package_ext_info))
10214 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
10215
10216 // Either the caller is 64-bit and passed a 64-bit struct, or
10217 // it is 32-bit and passed a 32-bit struct; any other
10218 // combination is invalid.
10219 error = EINVAL;
10220 break;
10221 }
10222
10223 if (is64bit) {
10224 ext_strings = ((user64_package_ext_info *)data)->strings;
10225 num_entries = ((user64_package_ext_info *)data)->num_entries;
10226 max_width = ((user64_package_ext_info *)data)->max_width;
10227 } else {
10228 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10229 num_entries = ((user32_package_ext_info *)data)->num_entries;
10230 max_width = ((user32_package_ext_info *)data)->max_width;
10231 }
10232 error = set_package_extensions_table(ext_strings, num_entries, max_width);
10233 }
10234 break;
10235
10236 /* namespace handlers */
10237 case FSIOC_NAMESPACE_HANDLER_GET: {
10238 error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10239 }
10240 break;
10241
10242 /* Snapshot handlers */
10243 case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
10244 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10245 }
10246 break;
10247
10248 case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
10249 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10250 }
10251 break;
10252
10253 case FSIOC_NAMESPACE_HANDLER_UPDATE: {
10254 uint32_t token, val;
10255 int i;
10256
10257 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10258 break;
10259 }
10260
10261 if (!nspace_is_special_process(p)) {
10262 error = EINVAL;
10263 break;
10264 }
10265
10266 token = ((uint32_t *)data)[0];
10267 val = ((uint32_t *)data)[1];
10268
10269 lck_mtx_lock(&nspace_handler_lock);
10270
10271 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10272 if (nspace_items[i].token == token) {
10273 break; /* exit for loop, not case stmt */
10274 }
10275 }
10276
10277 if (i >= MAX_NSPACE_ITEMS) {
10278 error = ENOENT;
10279 } else {
10280 //
10281 // if this bit is set, when resolve_nspace_item() times out
10282 // it will loop and go back to sleep.
10283 //
10284 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10285 }
10286
10287 lck_mtx_unlock(&nspace_handler_lock);
10288
10289 if (error) {
10290 printf("nspace-handler-update: did not find token %u\n", token);
10291 }
10292 }
10293 break;
10294
10295 case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
10296 uint32_t token, val;
10297 int i;
10298
10299 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10300 break;
10301 }
10302
10303 if (!nspace_is_special_process(p)) {
10304 error = EINVAL;
10305 break;
10306 }
10307
10308 token = ((uint32_t *)data)[0];
10309 val = ((uint32_t *)data)[1];
10310
10311 lck_mtx_lock(&nspace_handler_lock);
10312
10313 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10314 if (nspace_items[i].token == token) {
10315 break; /* exit for loop, not case statement */
10316 }
10317 }
10318
10319 if (i >= MAX_NSPACE_ITEMS) {
10320 printf("nspace-handler-unblock: did not find token %u\n", token);
10321 error = ENOENT;
10322 } else {
10323 if (val == 0 && nspace_items[i].vp) {
10324 vnode_lock_spin(nspace_items[i].vp);
10325 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10326 vnode_unlock(nspace_items[i].vp);
10327 }
10328
10329 nspace_items[i].vp = NULL;
10330 nspace_items[i].arg = NULL;
10331 nspace_items[i].op = 0;
10332 nspace_items[i].vid = 0;
10333 nspace_items[i].flags = NSPACE_ITEM_DONE;
10334 nspace_items[i].token = 0;
10335
10336 wakeup((caddr_t)&(nspace_items[i].vp));
10337 }
10338
10339 lck_mtx_unlock(&nspace_handler_lock);
10340 }
10341 break;
10342
10343 case FSIOC_NAMESPACE_HANDLER_CANCEL: {
10344 uint32_t token, val;
10345 int i;
10346
10347 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10348 break;
10349 }
10350
10351 if (!nspace_is_special_process(p)) {
10352 error = EINVAL;
10353 break;
10354 }
10355
10356 token = ((uint32_t *)data)[0];
10357 val = ((uint32_t *)data)[1];
10358
10359 lck_mtx_lock(&nspace_handler_lock);
10360
10361 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10362 if (nspace_items[i].token == token) {
10363 break; /* exit for loop, not case stmt */
10364 }
10365 }
10366
10367 if (i >= MAX_NSPACE_ITEMS) {
10368 printf("nspace-handler-cancel: did not find token %u\n", token);
10369 error = ENOENT;
10370 } else {
10371 if (nspace_items[i].vp) {
10372 vnode_lock_spin(nspace_items[i].vp);
10373 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10374 vnode_unlock(nspace_items[i].vp);
10375 }
10376
10377 nspace_items[i].vp = NULL;
10378 nspace_items[i].arg = NULL;
10379 nspace_items[i].vid = 0;
10380 nspace_items[i].token = val;
10381 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10382 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10383
10384 wakeup((caddr_t)&(nspace_items[i].vp));
10385 }
10386
10387 lck_mtx_unlock(&nspace_handler_lock);
10388 }
10389 break;
10390
10391 case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10392 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10393 break;
10394 }
10395
10396 // we explicitly do not do the namespace_handler_proc check here
10397
10398 lck_mtx_lock(&nspace_handler_lock);
10399 snapshot_timestamp = ((uint32_t *)data)[0];
10400 wakeup(&nspace_item_idx);
10401 lck_mtx_unlock(&nspace_handler_lock);
10402 printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10403
10404 }
10405 break;
10406
10407 case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10408 {
10409 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10410 break;
10411 }
10412
10413 lck_mtx_lock(&nspace_handler_lock);
10414 nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10415 lck_mtx_unlock(&nspace_handler_lock);
10416 printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10417 nspace_allow_virtual_devs ? "" : " NOT");
10418 error = 0;
10419
10420 }
10421 break;
10422
10423 case FSIOC_SET_FSTYPENAME_OVERRIDE:
10424 {
10425 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10426 break;
10427 }
10428 if (vp->v_mount) {
10429 mount_lock(vp->v_mount);
10430 if (data[0] != 0) {
10431 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10432 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10433 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10434 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10435 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10436 }
10437 } else {
10438 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10439 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10440 }
10441 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10442 vp->v_mount->fstypename_override[0] = '\0';
10443 }
10444 mount_unlock(vp->v_mount);
10445 }
10446 }
10447 break;
10448
10449 case DISK_CONDITIONER_IOC_GET: {
10450 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
10451 }
10452 break;
10453
10454 case DISK_CONDITIONER_IOC_SET: {
10455 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
10456 }
10457 break;
10458
10459 default: {
10460 /* other, known commands shouldn't be passed down here */
10461 switch (cmd) {
10462 case F_PUNCHHOLE:
10463 case F_TRIM_ACTIVE_FILE:
10464 case F_RDADVISE:
10465 case F_TRANSCODEKEY:
10466 case F_GETPROTECTIONLEVEL:
10467 case F_GETDEFAULTPROTLEVEL:
10468 case F_MAKECOMPRESSED:
10469 case F_SET_GREEDY_MODE:
10470 case F_SETSTATICCONTENT:
10471 case F_SETIOTYPE:
10472 case F_SETBACKINGSTORE:
10473 case F_GETPATH_MTMINFO:
10474 case APFSIOC_REVERT_TO_SNAPSHOT:
10475 case FSIOC_FIOSEEKHOLE:
10476 case FSIOC_FIOSEEKDATA:
10477 case HFS_GET_BOOT_INFO:
10478 case HFS_SET_BOOT_INFO:
10479 case FIOPINSWAP:
10480 case F_CHKCLEAN:
10481 case F_FULLFSYNC:
10482 case F_BARRIERFSYNC:
10483 case F_FREEZE_FS:
10484 case F_THAW_FS:
10485 error = EINVAL;
10486 goto outdrop;
10487 }
10488 /* Invoke the filesystem-specific code */
10489 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
10490 }
10491
10492 } /* end switch stmt */
10493
10494 /*
10495 * if no errors, copy any data to user. Size was
10496 * already set and checked above.
10497 */
10498 if (error == 0 && (cmd & IOC_OUT) && size)
10499 error = copyout(data, udata, size);
10500
10501 outdrop:
10502 if (memp) {
10503 kfree(memp, size);
10504 }
10505
10506 return error;
10507 }
10508
10509 /* ARGSUSED */
10510 int
10511 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10512 {
10513 int error;
10514 struct nameidata nd;
10515 u_long nameiflags;
10516 vnode_t vp = NULL;
10517 vfs_context_t ctx = vfs_context_current();
10518
10519 AUDIT_ARG(cmd, uap->cmd);
10520 AUDIT_ARG(value32, uap->options);
10521 /* Get the vnode for the file we are getting info on: */
10522 nameiflags = 0;
10523 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
10524 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10525 UIO_USERSPACE, uap->path, ctx);
10526 if ((error = namei(&nd))) goto done;
10527 vp = nd.ni_vp;
10528 nameidone(&nd);
10529
10530 #if CONFIG_MACF
10531 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
10532 if (error) {
10533 goto done;
10534 }
10535 #endif
10536
10537 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10538
10539 done:
10540 if (vp)
10541 vnode_put(vp);
10542 return error;
10543 }
10544 /* ARGSUSED */
10545 int
10546 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
10547 {
10548 int error;
10549 vnode_t vp = NULL;
10550 vfs_context_t ctx = vfs_context_current();
10551 int fd = -1;
10552
10553 AUDIT_ARG(fd, uap->fd);
10554 AUDIT_ARG(cmd, uap->cmd);
10555 AUDIT_ARG(value32, uap->options);
10556
10557 /* Get the vnode for the file we are getting info on: */
10558 if ((error = file_vnode(uap->fd, &vp)))
10559 return error;
10560 fd = uap->fd;
10561 if ((error = vnode_getwithref(vp))) {
10562 file_drop(fd);
10563 return error;
10564 }
10565
10566 #if CONFIG_MACF
10567 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
10568 file_drop(fd);
10569 vnode_put(vp);
10570 return error;
10571 }
10572 #endif
10573
10574 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10575
10576 file_drop(fd);
10577
10578 /* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
10579 if (vp) {
10580 vnode_put(vp);
10581 }
10582
10583 return error;
10584 }
10585 /* end of fsctl system call */
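/*
 * Userspace view (illustrative sketch): the fsctl() and ffsctl() wrappers
 * declared in <sys/fsctl.h> are what funnel into fsctl_internal() above.
 * The wrapper prototypes are assumed here to match their historical
 * declarations; check <sys/fsctl.h> on the target SDK.  A minimal caller
 * that asks the volume backing a path to sync, and waits for it, might look
 * like:
 *
 *	#include <sys/fsctl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	sync_volume_of(const char *path)
 *	{
 *		uint32_t arg = FSCTL_SYNC_WAIT;	// wait for the sync to finish
 *
 *		// FSIOC_SYNC_VOLUME is handled generically in fsctl_internal()
 *		if (fsctl(path, FSIOC_SYNC_VOLUME, &arg, 0) == -1) {
 *			perror("fsctl(FSIOC_SYNC_VOLUME)");
 *			return -1;
 *		}
 *		return 0;
 *	}
 *
 * ffsctl() takes an already-open file descriptor instead of a path and is
 * otherwise identical; both end up in fsctl_internal() with an iocount held
 * on the resolved vnode.
 */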
10586
10587 /*
10588 * Retrieve the data of an extended attribute.
10589 */
10590 int
10591 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
10592 {
10593 vnode_t vp;
10594 struct nameidata nd;
10595 char attrname[XATTR_MAXNAMELEN+1];
10596 vfs_context_t ctx = vfs_context_current();
10597 uio_t auio = NULL;
10598 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10599 size_t attrsize = 0;
10600 size_t namelen;
10601 u_int32_t nameiflags;
10602 int error;
10603 char uio_buf[ UIO_SIZEOF(1) ];
10604
10605 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10606 return (EINVAL);
10607
10608 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10609 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
10610 if ((error = namei(&nd))) {
10611 return (error);
10612 }
10613 vp = nd.ni_vp;
10614 nameidone(&nd);
10615
10616 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10617 goto out;
10618 }
10619 if (xattr_protected(attrname)) {
10620 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
10621 error = EPERM;
10622 goto out;
10623 }
10624 }
10625 /*
10626 * the specific check for 0xffffffff is a hack to preserve
10627 * binary compatibility in K64 with applications that discovered
10628 * that passing in a buf pointer and a size of -1 resulted in
10629 * just the size of the indicated extended attribute being returned.
10630 * this isn't part of the documented behavior, but because of the
10631 * original implementation's check for "uap->size > 0", this behavior
10632 * was allowed. In K32 that check turned into a signed comparison
10633 * even though uap->size is unsigned... in K64, we blow by that
10634 * check because uap->size is unsigned and doesn't get sign smeared
10635 * in the munger for a 32 bit user app. we also need to add a
10636 * check to limit the maximum size of the buffer being passed in...
10637 * unfortunately, the underlying filesystems seem to just malloc
10638 * the requested size even if the actual extended attribute is tiny.
10639 * because that malloc is for kernel wired memory, we have to put a
10640 * sane limit on it.
10641 *
10642 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
10643 * U64 running on K64 will yield -1 (64 bits wide)
10644 * U32/U64 running on K32 will yield -1 (32 bits wide)
10645 */
10646 if (uap->size == 0xffffffff || uap->size == (size_t)-1)
10647 goto no_uio;
10648
10649 if (uap->value) {
10650 if (uap->size > (size_t)XATTR_MAXSIZE)
10651 uap->size = XATTR_MAXSIZE;
10652
10653 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10654 &uio_buf[0], sizeof(uio_buf));
10655 uio_addiov(auio, uap->value, uap->size);
10656 }
10657 no_uio:
10658 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
10659 out:
10660 vnode_put(vp);
10661
10662 if (auio) {
10663 *retval = uap->size - uio_resid(auio);
10664 } else {
10665 *retval = (user_ssize_t)attrsize;
10666 }
10667
10668 return (error);
10669 }
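/*
 * Userspace view (illustrative sketch): the syscall above backs the
 * getxattr() wrapper declared in <sys/xattr.h>, whose documented prototype
 * is assumed here:
 *
 *	ssize_t getxattr(const char *path, const char *name, void *value,
 *	    size_t size, u_int32_t position, int options);
 *
 * Passing value == NULL and size == 0 is the documented way to ask only for
 * the attribute's size (the 0xffffffff / -1 size discussed above is a
 * compatibility quirk, not an interface).  For example:
 *
 *	#include <sys/xattr.h>
 *
 *	ssize_t
 *	finder_info_size(const char *path)
 *	{
 *		// returns the attribute size, or -1 with errno set
 *		return getxattr(path, "com.apple.FinderInfo", NULL, 0, 0,
 *		    XATTR_NOFOLLOW);
 *	}
 */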
10670
10671 /*
10672 * Retrieve the data of an extended attribute.
10673 */
10674 int
10675 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10676 {
10677 vnode_t vp;
10678 char attrname[XATTR_MAXNAMELEN+1];
10679 uio_t auio = NULL;
10680 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10681 size_t attrsize = 0;
10682 size_t namelen;
10683 int error;
10684 char uio_buf[ UIO_SIZEOF(1) ];
10685
10686 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10687 return (EINVAL);
10688
10689 if ( (error = file_vnode(uap->fd, &vp)) ) {
10690 return (error);
10691 }
10692 if ( (error = vnode_getwithref(vp)) ) {
10693 file_drop(uap->fd);
10694 return(error);
10695 }
10696 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10697 goto out;
10698 }
10699 if (xattr_protected(attrname)) {
10700 error = EPERM;
10701 goto out;
10702 }
10703 if (uap->value && uap->size > 0) {
10704 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10705 &uio_buf[0], sizeof(uio_buf));
10706 uio_addiov(auio, uap->value, uap->size);
10707 }
10708
10709 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10710 out:
10711 (void)vnode_put(vp);
10712 file_drop(uap->fd);
10713
10714 if (auio) {
10715 *retval = uap->size - uio_resid(auio);
10716 } else {
10717 *retval = (user_ssize_t)attrsize;
10718 }
10719 return (error);
10720 }
10721
10722 /*
10723 * Set the data of an extended attribute.
10724 */
10725 int
10726 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10727 {
10728 vnode_t vp;
10729 struct nameidata nd;
10730 char attrname[XATTR_MAXNAMELEN+1];
10731 vfs_context_t ctx = vfs_context_current();
10732 uio_t auio = NULL;
10733 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10734 size_t namelen;
10735 u_int32_t nameiflags;
10736 int error;
10737 char uio_buf[ UIO_SIZEOF(1) ];
10738
10739 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10740 return (EINVAL);
10741
10742 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10743 if (error == EPERM) {
10744 /* a copyinstr failure reaches here as 1 (EPERM) via the != 0 above; treat it as a too-long name */
10745 return (ENAMETOOLONG);
10746 }
10747 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10748 return error;
10749 }
10750 if (xattr_protected(attrname))
10751 return(EPERM);
10752 if (uap->size != 0 && uap->value == 0) {
10753 return (EINVAL);
10754 }
10755
10756 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10757 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10758 if ((error = namei(&nd))) {
10759 return (error);
10760 }
10761 vp = nd.ni_vp;
10762 nameidone(&nd);
10763
10764 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10765 &uio_buf[0], sizeof(uio_buf));
10766 uio_addiov(auio, uap->value, uap->size);
10767
10768 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10769 #if CONFIG_FSE
10770 if (error == 0) {
10771 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10772 FSE_ARG_VNODE, vp,
10773 FSE_ARG_DONE);
10774 }
10775 #endif
10776 vnode_put(vp);
10777 *retval = 0;
10778 return (error);
10779 }
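/*
 * Userspace view (illustrative sketch): setxattr() and removexattr() as
 * declared in <sys/xattr.h>; the documented prototypes are assumed here:
 *
 *	int setxattr(const char *path, const char *name, const void *value,
 *	    size_t size, u_int32_t position, int options);
 *	int removexattr(const char *path, const char *name, int options);
 *
 * For example, tagging a file with a small user-defined attribute and later
 * removing it (the attribute name is made up for illustration):
 *
 *	#include <sys/xattr.h>
 *	#include <string.h>
 *
 *	int
 *	tag_file(const char *path, const char *tag)
 *	{
 *		// XATTR_CREATE makes the call fail with EEXIST if the
 *		// attribute already exists
 *		return setxattr(path, "local.example.tag", tag, strlen(tag),
 *		    0, XATTR_CREATE);
 *	}
 *
 *	int
 *	untag_file(const char *path)
 *	{
 *		return removexattr(path, "local.example.tag", 0);
 *	}
 */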
10780
10781 /*
10782 * Set the data of an extended attribute.
10783 */
10784 int
10785 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
10786 {
10787 vnode_t vp;
10788 char attrname[XATTR_MAXNAMELEN+1];
10789 uio_t auio = NULL;
10790 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10791 size_t namelen;
10792 int error;
10793 char uio_buf[ UIO_SIZEOF(1) ];
10794 #if CONFIG_FSE
10795 vfs_context_t ctx = vfs_context_current();
10796 #endif
10797
10798 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10799 return (EINVAL);
10800
10801 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10802 if (error == EPERM) {
10803 /* a copyinstr failure reaches here as 1 (EPERM) via the != 0 above; treat it as a too-long name */
10804 return (ENAMETOOLONG);
10805 }
10806 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10807 return error;
10808 }
10809 if (xattr_protected(attrname))
10810 return(EPERM);
10811 if (uap->size != 0 && uap->value == 0) {
10812 return (EINVAL);
10813 }
10814 if ( (error = file_vnode(uap->fd, &vp)) ) {
10815 return (error);
10816 }
10817 if ( (error = vnode_getwithref(vp)) ) {
10818 file_drop(uap->fd);
10819 return(error);
10820 }
10821 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10822 &uio_buf[0], sizeof(uio_buf));
10823 uio_addiov(auio, uap->value, uap->size);
10824
10825 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
10826 #if CONFIG_FSE
10827 if (error == 0) {
10828 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10829 FSE_ARG_VNODE, vp,
10830 FSE_ARG_DONE);
10831 }
10832 #endif
10833 vnode_put(vp);
10834 file_drop(uap->fd);
10835 *retval = 0;
10836 return (error);
10837 }
10838
10839 /*
10840 * Remove an extended attribute.
10841 * XXX Code duplication here.
10842 */
10843 int
10844 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10845 {
10846 vnode_t vp;
10847 struct nameidata nd;
10848 char attrname[XATTR_MAXNAMELEN+1];
10849 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10850 vfs_context_t ctx = vfs_context_current();
10851 size_t namelen;
10852 u_int32_t nameiflags;
10853 int error;
10854
10855 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10856 return (EINVAL);
10857
10858 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10859 if (error != 0) {
10860 return (error);
10861 }
10862 if (xattr_protected(attrname))
10863 return(EPERM);
10864 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10865 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10866 if ((error = namei(&nd))) {
10867 return (error);
10868 }
10869 vp = nd.ni_vp;
10870 nameidone(&nd);
10871
10872 error = vn_removexattr(vp, attrname, uap->options, ctx);
10873 #if CONFIG_FSE
10874 if (error == 0) {
10875 add_fsevent(FSE_XATTR_REMOVED, ctx,
10876 FSE_ARG_VNODE, vp,
10877 FSE_ARG_DONE);
10878 }
10879 #endif
10880 vnode_put(vp);
10881 *retval = 0;
10882 return (error);
10883 }
10884
10885 /*
10886 * Remove an extended attribute.
10887 * XXX Code duplication here.
10888 */
10889 int
10890 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10891 {
10892 vnode_t vp;
10893 char attrname[XATTR_MAXNAMELEN+1];
10894 size_t namelen;
10895 int error;
10896 #if CONFIG_FSE
10897 vfs_context_t ctx = vfs_context_current();
10898 #endif
10899
10900 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10901 return (EINVAL);
10902
10903 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10904 if (error != 0) {
10905 return (error);
10906 }
10907 if (xattr_protected(attrname))
10908 return(EPERM);
10909 if ( (error = file_vnode(uap->fd, &vp)) ) {
10910 return (error);
10911 }
10912 if ( (error = vnode_getwithref(vp)) ) {
10913 file_drop(uap->fd);
10914 return(error);
10915 }
10916
10917 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10918 #if CONFIG_FSE
10919 if (error == 0) {
10920 add_fsevent(FSE_XATTR_REMOVED, ctx,
10921 FSE_ARG_VNODE, vp,
10922 FSE_ARG_DONE);
10923 }
10924 #endif
10925 vnode_put(vp);
10926 file_drop(uap->fd);
10927 *retval = 0;
10928 return (error);
10929 }
10930
10931 /*
10932 * Retrieve the list of extended attribute names.
10933 * XXX Code duplication here.
10934 */
10935 int
10936 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10937 {
10938 vnode_t vp;
10939 struct nameidata nd;
10940 vfs_context_t ctx = vfs_context_current();
10941 uio_t auio = NULL;
10942 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10943 size_t attrsize = 0;
10944 u_int32_t nameiflags;
10945 int error;
10946 char uio_buf[ UIO_SIZEOF(1) ];
10947
10948 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10949 return (EINVAL);
10950
10951 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10952 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10953 if ((error = namei(&nd))) {
10954 return (error);
10955 }
10956 vp = nd.ni_vp;
10957 nameidone(&nd);
10958 if (uap->namebuf != 0 && uap->bufsize > 0) {
10959 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10960 &uio_buf[0], sizeof(uio_buf));
10961 uio_addiov(auio, uap->namebuf, uap->bufsize);
10962 }
10963
10964 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10965
10966 vnode_put(vp);
10967 if (auio) {
10968 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10969 } else {
10970 *retval = (user_ssize_t)attrsize;
10971 }
10972 return (error);
10973 }
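/*
 * Userspace view (illustrative sketch): vn_listxattr() fills the caller's
 * buffer with the attribute names packed back to back, each terminated by a
 * NUL.  Assuming the documented <sys/xattr.h> prototype
 *
 *	ssize_t listxattr(const char *path, char *namebuf, size_t size,
 *	    int options);
 *
 * a caller typically walks that packed list like this:
 *
 *	#include <sys/xattr.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	void
 *	print_xattr_names(const char *path)
 *	{
 *		char names[4096];	// arbitrary size for illustration
 *		ssize_t len = listxattr(path, names, sizeof(names), 0);
 *
 *		for (ssize_t off = 0; off < len; ) {
 *			printf("%s\n", &names[off]);
 *			off += strlen(&names[off]) + 1;	// skip name + NUL
 *		}
 *	}
 */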
10974
10975 /*
10976 * Retrieve the list of extended attribute names.
10977 * XXX Code duplication here.
10978 */
10979 int
10980 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10981 {
10982 vnode_t vp;
10983 uio_t auio = NULL;
10984 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10985 size_t attrsize = 0;
10986 int error;
10987 char uio_buf[ UIO_SIZEOF(1) ];
10988
10989 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10990 return (EINVAL);
10991
10992 if ( (error = file_vnode(uap->fd, &vp)) ) {
10993 return (error);
10994 }
10995 if ( (error = vnode_getwithref(vp)) ) {
10996 file_drop(uap->fd);
10997 return(error);
10998 }
10999 if (uap->namebuf != 0 && uap->bufsize > 0) {
11000 auio = uio_createwithbuffer(1, 0, spacetype,
11001 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11002 uio_addiov(auio, uap->namebuf, uap->bufsize);
11003 }
11004
11005 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11006
11007 vnode_put(vp);
11008 file_drop(uap->fd);
11009 if (auio) {
11010 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11011 } else {
11012 *retval = (user_ssize_t)attrsize;
11013 }
11014 return (error);
11015 }
11016
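/*
 * Resolve a (volfs id, object id) pair to an absolute path.  The caller
 * supplies the output buffer (at most PAGE_SIZE bytes); on success *pathlen
 * is set to the length produced by build_path().  Object id 2 is treated as
 * the volume root, and for union mounts the lookup falls through to the
 * mounted-on volume when the id is not found in the upper layer.
 */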
11017 static int fsgetpath_internal(
11018 vfs_context_t ctx, int volfs_id, uint64_t objid,
11019 vm_size_t bufsize, caddr_t buf, int *pathlen)
11020 {
11021 int error;
11022 struct mount *mp = NULL;
11023 vnode_t vp;
11024 int length;
11025 int bpflags;
11026 /* maximum number of times to retry build_path */
11027 unsigned int retries = 0x10;
11028
11029 if (bufsize > PAGE_SIZE) {
11030 return (EINVAL);
11031 }
11032
11033 if (buf == NULL) {
11034 return (ENOMEM);
11035 }
11036
11037 retry:
11038 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11039 /* unexpected failure */
11040 return ENOTSUP;
11041 }
11042
11043 unionget:
11044 if (objid == 2) {
11045 error = VFS_ROOT(mp, &vp, ctx);
11046 } else {
11047 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11048 }
11049
11050 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11051 /*
11052 * If the fileid isn't found and we're in a union
11053 * mount volume, then see if the fileid is in the
11054 * mounted-on volume.
11055 */
11056 struct mount *tmp = mp;
11057 mp = vnode_mount(tmp->mnt_vnodecovered);
11058 vfs_unbusy(tmp);
11059 if (vfs_busy(mp, LK_NOWAIT) == 0)
11060 goto unionget;
11061 } else {
11062 vfs_unbusy(mp);
11063 }
11064
11065 if (error) {
11066 return error;
11067 }
11068
11069 #if CONFIG_MACF
11070 error = mac_vnode_check_fsgetpath(ctx, vp);
11071 if (error) {
11072 vnode_put(vp);
11073 return error;
11074 }
11075 #endif
11076
11077 /* Obtain the absolute path to this vnode. */
11078 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11079 bpflags |= BUILDPATH_CHECK_MOVED;
11080 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11081 vnode_put(vp);
11082
11083 if (error) {
11084 /* there was a race building the path, try a few more times */
11085 if (error == EAGAIN) {
11086 --retries;
11087 if (retries > 0)
11088 goto retry;
11089
11090 error = ENOENT;
11091 }
11092 goto out;
11093 }
11094
11095 AUDIT_ARG(text, buf);
11096
11097 if (kdebug_enable) {
11098 long dbg_parms[NUMPARMS];
11099 int dbg_namelen;
11100
11101 dbg_namelen = (int)sizeof(dbg_parms);
11102
11103 if (length < dbg_namelen) {
11104 memcpy((char *)dbg_parms, buf, length);
11105 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11106
11107 dbg_namelen = length;
11108 } else {
11109 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11110 }
11111
11112 kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
11113 }
11114
11115 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11116
11117 out:
11118 return (error);
11119 }
11120
11121 /*
11122 * Obtain the full pathname of a file system object by id.
11123 */
11124 int
11125 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11126 {
11127 vfs_context_t ctx = vfs_context_current();
11128 fsid_t fsid;
11129 char *realpath;
11130 int length;
11131 int error;
11132
11133 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11134 return (error);
11135 }
11136 AUDIT_ARG(value32, fsid.val[0]);
11137 AUDIT_ARG(value64, uap->objid);
11138 /* Restrict output buffer size for now. */
11139
11140 if (uap->bufsize > PAGE_SIZE) {
11141 return (EINVAL);
11142 }
11143 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
11144 if (realpath == NULL) {
11145 return (ENOMEM);
11146 }
11147
11148 error = fsgetpath_internal(
11149 ctx, fsid.val[0], uap->objid,
11150 uap->bufsize, realpath, &length);
11151
11152 if (error) {
11153 goto out;
11154 }
11155
11156 error = copyout((caddr_t)realpath, uap->buf, length);
11157
11158 *retval = (user_ssize_t)length; /* may be superseded by error */
11159 out:
11160 if (realpath) {
11161 FREE(realpath, M_TEMP);
11162 }
11163 return (error);
11164 }
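/*
 * Userspace view (illustrative sketch): a wrapper with the shape below is
 * exposed to user code; the prototype and header are assumptions and should
 * be checked against the target SDK (<sys/attr.h> has historically carried
 * the declaration):
 *
 *	ssize_t fsgetpath(char *buf, size_t bufsize, fsid_t *fsid,
 *	    uint64_t obj_id);
 *
 * Given an fsid_t (e.g. statfs()'s f_fsid) and a file id (e.g. stat()'s
 * st_ino), it resolves the id back to a path, mirroring
 * fsgetpath_internal() above:
 *
 *	#include <sys/attr.h>
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	void
 *	print_path_for_id(fsid_t fsid, uint64_t obj_id)
 *	{
 *		char path[1024];	// must not exceed PAGE_SIZE, see above
 *
 *		if (fsgetpath(path, sizeof(path), &fsid, obj_id) == -1)
 *			perror("fsgetpath");
 *		else
 *			printf("%s\n", path);
 *	}
 */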
11165
11166 /*
11167 * Common routine to handle various flavors of statfs data heading out
11168 * to user space.
11169 *
11170 * Returns: 0 Success
11171 * EFAULT
11172 */
11173 static int
11174 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11175 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
11176 boolean_t partial_copy)
11177 {
11178 int error;
11179 int my_size, copy_size;
11180
11181 if (is_64_bit) {
11182 struct user64_statfs sfs;
11183 my_size = copy_size = sizeof(sfs);
11184 bzero(&sfs, my_size);
11185 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11186 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11187 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11188 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
11189 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
11190 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
11191 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
11192 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
11193 sfs.f_files = (user64_long_t)sfsp->f_files;
11194 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
11195 sfs.f_fsid = sfsp->f_fsid;
11196 sfs.f_owner = sfsp->f_owner;
11197 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11198 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11199 } else {
11200 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11201 }
11202 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11203 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11204
11205 if (partial_copy) {
11206 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11207 }
11208 error = copyout((caddr_t)&sfs, bufp, copy_size);
11209 }
11210 else {
11211 struct user32_statfs sfs;
11212
11213 my_size = copy_size = sizeof(sfs);
11214 bzero(&sfs, my_size);
11215
11216 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11217 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11218 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11219
11220 /*
11221 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
11222 * have to fudge the numbers here in that case. We inflate the blocksize in order
11223 * to reflect the filesystem size as best we can.
11224 */
11225 if ((sfsp->f_blocks > INT_MAX)
11226 /* Hack for 4061702. I think the real fix is for Carbon to
11227 * look for some volume capability and not depend on hidden
11228 * semantics agreed between a FS and Carbon.
11229 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
11230 * for Carbon to set the bNoVolumeSizes volume attribute.
11231 * Without this, webdavfs files cannot be copied onto
11232 * disk as they look huge. This change should not affect
11233 * Xsan, as it should not be setting these to -1.
11234 */
11235 && (sfsp->f_blocks != 0xffffffffffffffffULL)
11236 && (sfsp->f_bfree != 0xffffffffffffffffULL)
11237 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
11238 int shift;
11239
11240 /*
11241 * Work out how far we have to shift the block count down to make it fit.
11242 * Note that it's possible to have to shift so far that the resulting
11243 * blocksize would be unreportably large. At that point, we will clip
11244 * any values that don't fit.
11245 *
11246 * For safety's sake, we also ensure that f_iosize is never reported as
11247 * being smaller than f_bsize.
11248 */
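/*
 * Worked example (illustrative numbers): with f_bsize = 4096 and
 * f_blocks = 0x280000000 (a 40 TiB volume), f_blocks exceeds INT_MAX and
 * the loop below settles on shift = 3:
 *
 *	reported f_bsize  = 4096 << 3        = 32768
 *	reported f_blocks = 0x280000000 >> 3 = 0x50000000  (<= INT_MAX)
 *
 * The product of the reported values still describes a 40 TiB volume,
 * which is the best a 32-bit statfs consumer can be told.
 */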
11249 for (shift = 0; shift < 32; shift++) {
11250 if ((sfsp->f_blocks >> shift) <= INT_MAX)
11251 break;
11252 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
11253 break;
11254 }
11255 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
11256 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
11257 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
11258 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
11259 #undef __SHIFT_OR_CLIP
11260 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
11261 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
11262 } else {
11263 /* filesystem is small enough to be reported honestly */
11264 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
11265 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
11266 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
11267 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
11268 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
11269 }
11270 sfs.f_files = (user32_long_t)sfsp->f_files;
11271 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
11272 sfs.f_fsid = sfsp->f_fsid;
11273 sfs.f_owner = sfsp->f_owner;
11274 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11275 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11276 } else {
11277 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11278 }
11279 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11280 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11281
11282 if (partial_copy) {
11283 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11284 }
11285 error = copyout((caddr_t)&sfs, bufp, copy_size);
11286 }
11287
11288 if (sizep != NULL) {
11289 *sizep = my_size;
11290 }
11291 return(error);
11292 }
11293
11294 /*
11295 * copy stat structure into user_stat structure.
11296 */
11297 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11298 {
11299 bzero(usbp, sizeof(*usbp));
11300
11301 usbp->st_dev = sbp->st_dev;
11302 usbp->st_ino = sbp->st_ino;
11303 usbp->st_mode = sbp->st_mode;
11304 usbp->st_nlink = sbp->st_nlink;
11305 usbp->st_uid = sbp->st_uid;
11306 usbp->st_gid = sbp->st_gid;
11307 usbp->st_rdev = sbp->st_rdev;
11308 #ifndef _POSIX_C_SOURCE
11309 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11310 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11311 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11312 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11313 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11314 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11315 #else
11316 usbp->st_atime = sbp->st_atime;
11317 usbp->st_atimensec = sbp->st_atimensec;
11318 usbp->st_mtime = sbp->st_mtime;
11319 usbp->st_mtimensec = sbp->st_mtimensec;
11320 usbp->st_ctime = sbp->st_ctime;
11321 usbp->st_ctimensec = sbp->st_ctimensec;
11322 #endif
11323 usbp->st_size = sbp->st_size;
11324 usbp->st_blocks = sbp->st_blocks;
11325 usbp->st_blksize = sbp->st_blksize;
11326 usbp->st_flags = sbp->st_flags;
11327 usbp->st_gen = sbp->st_gen;
11328 usbp->st_lspare = sbp->st_lspare;
11329 usbp->st_qspare[0] = sbp->st_qspare[0];
11330 usbp->st_qspare[1] = sbp->st_qspare[1];
11331 }
11332
11333 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
11334 {
11335 bzero(usbp, sizeof(*usbp));
11336
11337 usbp->st_dev = sbp->st_dev;
11338 usbp->st_ino = sbp->st_ino;
11339 usbp->st_mode = sbp->st_mode;
11340 usbp->st_nlink = sbp->st_nlink;
11341 usbp->st_uid = sbp->st_uid;
11342 usbp->st_gid = sbp->st_gid;
11343 usbp->st_rdev = sbp->st_rdev;
11344 #ifndef _POSIX_C_SOURCE
11345 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11346 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11347 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11348 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11349 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11350 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11351 #else
11352 usbp->st_atime = sbp->st_atime;
11353 usbp->st_atimensec = sbp->st_atimensec;
11354 usbp->st_mtime = sbp->st_mtime;
11355 usbp->st_mtimensec = sbp->st_mtimensec;
11356 usbp->st_ctime = sbp->st_ctime;
11357 usbp->st_ctimensec = sbp->st_ctimensec;
11358 #endif
11359 usbp->st_size = sbp->st_size;
11360 usbp->st_blocks = sbp->st_blocks;
11361 usbp->st_blksize = sbp->st_blksize;
11362 usbp->st_flags = sbp->st_flags;
11363 usbp->st_gen = sbp->st_gen;
11364 usbp->st_lspare = sbp->st_lspare;
11365 usbp->st_qspare[0] = sbp->st_qspare[0];
11366 usbp->st_qspare[1] = sbp->st_qspare[1];
11367 }
11368
11369 /*
11370 * copy stat64 structure into user_stat64 structure.
11371 */
11372 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11373 {
11374 bzero(usbp, sizeof(*usbp));
11375
11376 usbp->st_dev = sbp->st_dev;
11377 usbp->st_ino = sbp->st_ino;
11378 usbp->st_mode = sbp->st_mode;
11379 usbp->st_nlink = sbp->st_nlink;
11380 usbp->st_uid = sbp->st_uid;
11381 usbp->st_gid = sbp->st_gid;
11382 usbp->st_rdev = sbp->st_rdev;
11383 #ifndef _POSIX_C_SOURCE
11384 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11385 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11386 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11387 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11388 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11389 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11390 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11391 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11392 #else
11393 usbp->st_atime = sbp->st_atime;
11394 usbp->st_atimensec = sbp->st_atimensec;
11395 usbp->st_mtime = sbp->st_mtime;
11396 usbp->st_mtimensec = sbp->st_mtimensec;
11397 usbp->st_ctime = sbp->st_ctime;
11398 usbp->st_ctimensec = sbp->st_ctimensec;
11399 usbp->st_birthtime = sbp->st_birthtime;
11400 usbp->st_birthtimensec = sbp->st_birthtimensec;
11401 #endif
11402 usbp->st_size = sbp->st_size;
11403 usbp->st_blocks = sbp->st_blocks;
11404 usbp->st_blksize = sbp->st_blksize;
11405 usbp->st_flags = sbp->st_flags;
11406 usbp->st_gen = sbp->st_gen;
11407 usbp->st_lspare = sbp->st_lspare;
11408 usbp->st_qspare[0] = sbp->st_qspare[0];
11409 usbp->st_qspare[1] = sbp->st_qspare[1];
11410 }
11411
11412 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
11413 {
11414 bzero(usbp, sizeof(*usbp));
11415
11416 usbp->st_dev = sbp->st_dev;
11417 usbp->st_ino = sbp->st_ino;
11418 usbp->st_mode = sbp->st_mode;
11419 usbp->st_nlink = sbp->st_nlink;
11420 usbp->st_uid = sbp->st_uid;
11421 usbp->st_gid = sbp->st_gid;
11422 usbp->st_rdev = sbp->st_rdev;
11423 #ifndef _POSIX_C_SOURCE
11424 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11425 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11426 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11427 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11428 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11429 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11430 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11431 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11432 #else
11433 usbp->st_atime = sbp->st_atime;
11434 usbp->st_atimensec = sbp->st_atimensec;
11435 usbp->st_mtime = sbp->st_mtime;
11436 usbp->st_mtimensec = sbp->st_mtimensec;
11437 usbp->st_ctime = sbp->st_ctime;
11438 usbp->st_ctimensec = sbp->st_ctimensec;
11439 usbp->st_birthtime = sbp->st_birthtime;
11440 usbp->st_birthtimensec = sbp->st_birthtimensec;
11441 #endif
11442 usbp->st_size = sbp->st_size;
11443 usbp->st_blocks = sbp->st_blocks;
11444 usbp->st_blksize = sbp->st_blksize;
11445 usbp->st_flags = sbp->st_flags;
11446 usbp->st_gen = sbp->st_gen;
11447 usbp->st_lspare = sbp->st_lspare;
11448 usbp->st_qspare[0] = sbp->st_qspare[0];
11449 usbp->st_qspare[1] = sbp->st_qspare[1];
11450 }
11451
11452 /*
11453 * Purge buffer cache for simulating cold starts
11454 */
11455 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11456 {
11457 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11458
11459 return VNODE_RETURNED;
11460 }
11461
11462 static int vfs_purge_callback(mount_t mp, __unused void * arg)
11463 {
11464 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11465
11466 return VFS_RETURNED;
11467 }
11468
11469 int
11470 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11471 {
11472 if (!kauth_cred_issuser(kauth_cred_get()))
11473 return EPERM;
11474
11475 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11476
11477 return 0;
11478 }
11479
11480 /*
11481 * gets the vnode associated with the (unnamed) snapshot directory
11482 * for a Filesystem. The snapshot directory vnode is returned with
11483 * an iocount on it.
11484 */
11485 int
11486 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11487 {
11488 return (VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx));
11489 }
11490
11491 /*
11492 * Get the snapshot vnode.
11493 *
11494 * If successful, the call returns with an iocount on *rvpp and *sdvpp, and
11495 * the caller needs to call nameidone() on ndp.
11496 *
11497 * If the snapshot vnode exists it is returned in ndp->ni_vp.
11498 *
11499 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
11500 * not needed.
11501 */
11502 static int
11503 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
11504 user_addr_t name, struct nameidata *ndp, int32_t op,
11505 #if !CONFIG_TRIGGERS
11506 __unused
11507 #endif
11508 enum path_operation pathop,
11509 vfs_context_t ctx)
11510 {
11511 int error, i;
11512 caddr_t name_buf;
11513 size_t name_len;
11514 struct vfs_attr vfa;
11515
11516 *sdvpp = NULLVP;
11517 *rvpp = NULLVP;
11518
11519 error = vnode_getfromfd(ctx, dirfd, rvpp);
11520 if (error)
11521 return (error);
11522
11523 if (!vnode_isvroot(*rvpp)) {
11524 error = EINVAL;
11525 goto out;
11526 }
11527
11528 /* Make sure the filesystem supports snapshots */
11529 VFSATTR_INIT(&vfa);
11530 VFSATTR_WANTED(&vfa, f_capabilities);
11531 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
11532 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
11533 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
11534 VOL_CAP_INT_SNAPSHOT)) ||
11535 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
11536 VOL_CAP_INT_SNAPSHOT))) {
11537 error = ENOTSUP;
11538 goto out;
11539 }
11540
11541 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
11542 if (error)
11543 goto out;
11544
11545 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11546 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11547 if (error)
11548 goto out1;
11549
11550 /*
11551 * Some sanity checks: the name can't be empty, ".", or "..", and can't contain slashes.
11552 * (The length returned by copyinstr includes the terminating NUL.)
11553 */
11554 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
11555 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
11556 error = EINVAL;
11557 goto out1;
11558 }
11559 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
11560 if (i < (int)name_len) {
11561 error = EINVAL;
11562 goto out1;
11563 }
11564
11565 #if CONFIG_MACF
11566 if (op == CREATE) {
11567 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
11568 name_buf);
11569 } else if (op == DELETE) {
11570 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
11571 name_buf);
11572 }
11573 if (error)
11574 goto out1;
11575 #endif
11576
11577 /* Check if the snapshot already exists ... */
11578 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
11579 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
11580 ndp->ni_dvp = *sdvpp;
11581
11582 error = namei(ndp);
11583 out1:
11584 FREE(name_buf, M_TEMP);
11585 out:
11586 if (error) {
11587 if (*sdvpp) {
11588 vnode_put(*sdvpp);
11589 *sdvpp = NULLVP;
11590 }
11591 if (*rvpp) {
11592 vnode_put(*rvpp);
11593 *rvpp = NULLVP;
11594 }
11595 }
11596 return (error);
11597 }
11598
11599 /*
11600 * create a filesystem snapshot (for supporting filesystems)
11601 *
11602 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
11603 * We get to the (unnamed) snapshot directory vnode and create the vnode
11604 * for the snapshot in it.
11605 *
11606 * Restrictions:
11607 *
11608 * a) The passed-in snapshot name cannot contain slashes.
11609 * b) The name can't be "." or "..".
11610 *
11611 * Since this requires superuser privileges, vnode_authorize calls are not
11612 * made.
11613 */
11614 static int
11615 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
11616 vfs_context_t ctx)
11617 {
11618 vnode_t rvp, snapdvp;
11619 int error;
11620 struct nameidata namend;
11621
11622 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
11623 OP_LINK, ctx);
11624 if (error)
11625 return (error);
11626
11627 if (namend.ni_vp) {
11628 vnode_put(namend.ni_vp);
11629 error = EEXIST;
11630 } else {
11631 struct vnode_attr va;
11632 vnode_t vp = NULLVP;
11633
11634 VATTR_INIT(&va);
11635 VATTR_SET(&va, va_type, VREG);
11636 VATTR_SET(&va, va_mode, 0);
11637
11638 error = vn_create(snapdvp, &vp, &namend, &va,
11639 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
11640 if (!error && vp)
11641 vnode_put(vp);
11642 }
11643
11644 nameidone(&namend);
11645 vnode_put(snapdvp);
11646 vnode_put(rvp);
11647 return (error);
11648 }
11649
11650 /*
11651 * Delete a Filesystem snapshot
11652 *
11653 * get the vnode for the unnamed snapshot directory and the snapshot and
11654 * delete the snapshot.
11655 */
11656 static int
11657 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11658 vfs_context_t ctx)
11659 {
11660 vnode_t rvp, snapdvp;
11661 int error;
11662 struct nameidata namend;
11663
11664 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11665 OP_UNLINK, ctx);
11666 if (error)
11667 goto out;
11668
11669 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11670 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11671
11672 vnode_put(namend.ni_vp);
11673 nameidone(&namend);
11674 vnode_put(snapdvp);
11675 vnode_put(rvp);
11676 out:
11677 return (error);
11678 }
11679
11680 /*
11681 * Revert a filesystem to a snapshot
11682 *
11683 * Marks the filesystem to revert to the given snapshot on next mount.
11684 */
11685 static int
11686 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
11687 vfs_context_t ctx)
11688 {
11689 int error;
11690 vnode_t rvp;
11691 mount_t mp;
11692 struct fs_snapshot_revert_args revert_data;
11693 struct componentname cnp;
11694 caddr_t name_buf;
11695 size_t name_len;
11696
11697 error = vnode_getfromfd(ctx, dirfd, &rvp);
11698 if (error) {
11699 return (error);
11700 }
11701 mp = vnode_mount(rvp);
11702
11703 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11704 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11705 if (error) {
11706 FREE(name_buf, M_TEMP);
11707 vnode_put(rvp);
11708 return (error);
11709 }
11710
11711 #if CONFIG_MACF
11712 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
11713 if (error) {
11714 FREE(name_buf, M_TEMP);
11715 vnode_put(rvp);
11716 return (error);
11717 }
11718 #endif
11719
11720 /*
11721 * Grab mount_iterref so that we can release the vnode,
11722 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
11723 */
11724 error = mount_iterref (mp, 0);
11725 vnode_put(rvp);
11726 if (error) {
11727 FREE(name_buf, M_TEMP);
11728 return (error);
11729 }
11730
11731 memset(&cnp, 0, sizeof(cnp));
11732 cnp.cn_pnbuf = (char *)name_buf;
11733 cnp.cn_nameiop = LOOKUP;
11734 cnp.cn_flags = ISLASTCN | HASBUF;
11735 cnp.cn_pnlen = MAXPATHLEN;
11736 cnp.cn_nameptr = cnp.cn_pnbuf;
11737 cnp.cn_namelen = (int)name_len;
11738 revert_data.sr_cnp = &cnp;
11739
11740 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
11741 mount_iterdrop(mp);
11742 FREE(name_buf, M_TEMP);
11743
11744 if (error) {
11745 /* If there was any error, try again using VNOP_IOCTL */
11746
11747 vnode_t snapdvp;
11748 struct nameidata namend;
11749
11750 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
11751 OP_LOOKUP, ctx);
11752 if (error) {
11753 return (error);
11754 }
11755
11756
11757 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
11758 0, ctx);
11759
11760 vnode_put(namend.ni_vp);
11761 nameidone(&namend);
11762 vnode_put(snapdvp);
11763 vnode_put(rvp);
11764 }
11765
11766 return (error);
11767 }
11768
11769 /*
11770 * rename a Filesystem snapshot
11771 *
11772 * get the vnode for the unnamed snapshot directory and the snapshot and
11773 * rename the snapshot. This is a very specialised (and simple) case of
11774 * rename(2) (which has to deal with a lot more complications). It differs
11775 * slightly from rename(2) in that EEXIST is returned if the new name exists.
11776 */
11777 static int
11778 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
11779 __unused uint32_t flags, vfs_context_t ctx)
11780 {
11781 vnode_t rvp, snapdvp;
11782 int error, i;
11783 caddr_t newname_buf;
11784 size_t name_len;
11785 vnode_t fvp;
11786 struct nameidata *fromnd, *tond;
11787 /* carving out a chunk for structs that are too big to be on stack. */
11788 struct {
11789 struct nameidata from_node;
11790 struct nameidata to_node;
11791 } * __rename_data;
11792
11793 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
11794 fromnd = &__rename_data->from_node;
11795 tond = &__rename_data->to_node;
11796
11797 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
11798 OP_UNLINK, ctx);
11799 if (error)
11800 goto out;
11801 fvp = fromnd->ni_vp;
11802
11803 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11804 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
11805 if (error)
11806 goto out1;
11807
11808 /*
11809 * Some sanity checks: the new name can't be empty, ".", or "..", and
11810 * can't contain slashes.
11811 * (The length returned by copyinstr includes the terminating NUL.)
11812 *
11813 * The FS rename VNOP is supposed to handle this, but we pick it
11814 * off here as well.
11815 */
11816 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
11817 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
11818 error = EINVAL;
11819 goto out1;
11820 }
11821 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
11822 if (i < (int)name_len) {
11823 error = EINVAL;
11824 goto out1;
11825 }
11826
11827 #if CONFIG_MACF
11828 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
11829 newname_buf);
11830 if (error)
11831 goto out1;
11832 #endif
11833
11834 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
11835 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
11836 tond->ni_dvp = snapdvp;
11837
11838 error = namei(tond);
11839 if (error) {
11840 goto out2;
11841 } else if (tond->ni_vp) {
11842 /*
11843 * snapshot rename behaves differently than rename(2) - if the
11844 * new name exists, EEXIST is returned.
11845 */
11846 vnode_put(tond->ni_vp);
11847 error = EEXIST;
11848 goto out2;
11849 }
11850
11851 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
11852 &tond->ni_cnd, ctx);
11853
11854 out2:
11855 nameidone(tond);
11856 out1:
11857 FREE(newname_buf, M_TEMP);
11858 vnode_put(fvp);
11859 vnode_put(snapdvp);
11860 vnode_put(rvp);
11861 nameidone(fromnd);
11862 out:
11863 FREE(__rename_data, M_TEMP);
11864 return (error);
11865 }
11866
11867 /*
11868 * Mount a Filesystem snapshot
11869 *
11870 * get the vnode for the unnamed snapshot directory and the snapshot and
11871 * mount the snapshot.
11872 */
11873 static int
11874 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
11875 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
11876 {
11877 vnode_t rvp, snapdvp, snapvp, vp, pvp;
11878 int error;
11879 struct nameidata *snapndp, *dirndp;
11880 /* carving out a chunk for structs that are too big to be on stack. */
11881 struct {
11882 struct nameidata snapnd;
11883 struct nameidata dirnd;
11884 } * __snapshot_mount_data;
11885
11886 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
11887 M_TEMP, M_WAITOK);
11888 snapndp = &__snapshot_mount_data->snapnd;
11889 dirndp = &__snapshot_mount_data->dirnd;
11890
11891 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
11892 OP_LOOKUP, ctx);
11893 if (error)
11894 goto out;
11895
11896 snapvp = snapndp->ni_vp;
11897 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
11898 error = EIO;
11899 goto out1;
11900 }
11901
11902 /* Get the vnode to be covered */
11903 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
11904 UIO_USERSPACE, directory, ctx);
11905 error = namei(dirndp);
11906 if (error)
11907 goto out1;
11908
11909 vp = dirndp->ni_vp;
11910 pvp = dirndp->ni_dvp;
11911
11912 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
11913 error = EINVAL;
11914 } else {
11915 mount_t mp = vnode_mount(rvp);
11916 struct fs_snapshot_mount_args smnt_data;
11917
11918 smnt_data.sm_mp = mp;
11919 smnt_data.sm_cnp = &snapndp->ni_cnd;
11920 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
11921 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
11922 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
11923 }
11924
11925 vnode_put(vp);
11926 vnode_put(pvp);
11927 nameidone(dirndp);
11928 out1:
11929 vnode_put(snapvp);
11930 vnode_put(snapdvp);
11931 vnode_put(rvp);
11932 nameidone(snapndp);
11933 out:
11934 FREE(__snapshot_mount_data, M_TEMP);
11935 return (error);
11936 }
11937
11938 /*
11939 * Root from a snapshot of the filesystem
11940 *
11941 * Marks the filesystem to root from the given snapshot on next boot.
11942 */
11943 static int
11944 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
11945 vfs_context_t ctx)
11946 {
11947 int error;
11948 vnode_t rvp;
11949 mount_t mp;
11950 struct fs_snapshot_root_args root_data;
11951 struct componentname cnp;
11952 caddr_t name_buf;
11953 size_t name_len;
11954
11955 error = vnode_getfromfd(ctx, dirfd, &rvp);
11956 if (error) {
11957 return (error);
11958 }
11959 mp = vnode_mount(rvp);
11960
11961 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11962 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11963 if (error) {
11964 FREE(name_buf, M_TEMP);
11965 vnode_put(rvp);
11966 return (error);
11967 }
11968
11969 // XXX MAC checks ?
11970
11971 /*
11972 * Grab mount_iterref so that we can release the vnode,
11973 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
11974 */
11975 error = mount_iterref (mp, 0);
11976 vnode_put(rvp);
11977 if (error) {
11978 FREE(name_buf, M_TEMP);
11979 return (error);
11980 }
11981
11982 memset(&cnp, 0, sizeof(cnp));
11983 cnp.cn_pnbuf = (char *)name_buf;
11984 cnp.cn_nameiop = LOOKUP;
11985 cnp.cn_flags = ISLASTCN | HASBUF;
11986 cnp.cn_pnlen = MAXPATHLEN;
11987 cnp.cn_nameptr = cnp.cn_pnbuf;
11988 cnp.cn_namelen = (int)name_len;
11989 root_data.sr_cnp = &cnp;
11990
11991 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
11992
11993 mount_iterdrop(mp);
11994 FREE(name_buf, M_TEMP);
11995
11996 return (error);
11997 }
11998
11999 /*
12000 * FS snapshot operations dispatcher
12001 */
12002 int
12003 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12004 __unused int32_t *retval)
12005 {
12006 int error;
12007 vfs_context_t ctx = vfs_context_current();
12008
12009 AUDIT_ARG(fd, uap->dirfd);
12010 AUDIT_ARG(value32, uap->op);
12011
12012 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12013 if (error)
12014 return (error);
12015
12016 switch (uap->op) {
12017 case SNAPSHOT_OP_CREATE:
12018 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12019 break;
12020 case SNAPSHOT_OP_DELETE:
12021 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12022 break;
12023 case SNAPSHOT_OP_RENAME:
12024 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12025 uap->flags, ctx);
12026 break;
12027 case SNAPSHOT_OP_MOUNT:
12028 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12029 uap->data, uap->flags, ctx);
12030 break;
12031 case SNAPSHOT_OP_REVERT:
12032 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12033 break;
12034 #if !TARGET_OS_OSX
12035 case SNAPSHOT_OP_ROOT:
12036 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12037 break;
12038 #endif /* !TARGET_OS_OSX */
12039 default:
12040 error = ENOSYS;
12041 }
12042
12043 return (error);
12044 }
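/*
 * Userspace view (illustrative sketch): recent SDKs ship thin wrappers over
 * this syscall in <sys/snapshot.h> (fs_snapshot_create(), fs_snapshot_delete(),
 * fs_snapshot_rename(), fs_snapshot_mount(), fs_snapshot_revert()).  The
 * prototypes below are assumptions based on that header, and callers must
 * also satisfy the PRIV_VFS_SNAPSHOT check enforced above:
 *
 *	int fs_snapshot_create(int dirfd, const char *name, uint32_t flags);
 *	int fs_snapshot_delete(int dirfd, const char *name, uint32_t flags);
 *
 * Creating and then removing a snapshot of the volume whose root is open at
 * dirfd might look like:
 *
 *	#include <sys/snapshot.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	snapshot_roundtrip(const char *volume_root, const char *snap_name)
 *	{
 *		int dirfd = open(volume_root, O_RDONLY);
 *		int error = -1;
 *
 *		if (dirfd == -1)
 *			return -1;
 *		// dirfd must reference the volume root (vnode_isvroot above)
 *		if (fs_snapshot_create(dirfd, snap_name, 0) == 0)
 *			error = fs_snapshot_delete(dirfd, snap_name, 0);
 *		close(dirfd);
 *		return error;
 *	}
 */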