bsd/vfs/vfs_subr.c

   1 /*
   2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  67  */
  68 /*
  69  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  70  * support for mandatory and extensible security protections.  This notice
  71  * is included in support of clause 2.2 (b) of the Apple Public License,
  72  * Version 2.0.
  73  */
  74
  75 /*
  76  * External virtual filesystem routines
  77  */
  78
  79 #include <sys/param.h>
  80 #include <sys/systm.h>
  81 #include <sys/proc_internal.h>
  82 #include <sys/kauth.h>
  83 #include <sys/mount_internal.h>
  84 #include <sys/time.h>
  85 #include <sys/lock.h>
  86 #include <sys/vnode.h>
  87 #include <sys/vnode_internal.h>
  88 #include <sys/stat.h>
  89 #include <sys/namei.h>
  90 #include <sys/ucred.h>
  91 #include <sys/buf_internal.h>
  92 #include <sys/errno.h>
  93 #include <sys/malloc.h>
  94 #include <sys/uio_internal.h>
  95 #include <sys/uio.h>
  96 #include <sys/domain.h>
  97 #include <sys/mbuf.h>
  98 #include <sys/syslog.h>
  99 #include <sys/ubc_internal.h>
 100 #include <sys/vm.h>
 101 #include <sys/sysctl.h>
 102 #include <sys/filedesc.h>
 103 #include <sys/event.h>
 104 #include <sys/kdebug.h>
 105 #include <sys/kauth.h>
 106 #include <sys/user.h>
 107 #include <sys/systm.h>
 108 #include <sys/kern_memorystatus.h>
 109 #include <sys/lockf.h>
 110 #include <miscfs/fifofs/fifo.h>
 111
 112 #include <nfs/nfs_conf.h>
 113
 114 #include <string.h>
 115 #include <machine/machine_routines.h>
 116
 117 #include <kern/assert.h>
 118 #include <mach/kern_return.h>
 119 #include <kern/thread.h>
 120 #include <kern/sched_prim.h>
 121
 122 #include <miscfs/specfs/specdev.h>
 123
 124 #include <mach/mach_types.h>
 125 #include <mach/memory_object_types.h>
 126 #include <mach/memory_object_control.h>
 127
 128 #include <kern/kalloc.h>        /* kalloc()/kfree() */
 129 #include <kern/clock.h>         /* delay_for_interval() */
 130 #include <libkern/OSAtomic.h>   /* OSAddAtomic() */
 131 #if !CONFIG_EMBEDDED
 132 #include <console/video_console.h>
 133 #endif
 134
 135 #ifdef JOE_DEBUG
 136 #include <libkern/OSDebug.h>
 137 #endif
 138
 139 #include <vm/vm_protos.h>       /* vnode_pager_vrele() */
 140
 141 #if CONFIG_MACF
 142 #include <security/mac_framework.h>
 143 #endif
 144
 145 #include <vfs/vfs_disk_conditioner.h>
 146 #include <libkern/section_keywords.h>
 147
 148 extern lck_grp_t *vnode_lck_grp;
 149 extern lck_attr_t *vnode_lck_attr;
 150
 151 #if CONFIG_TRIGGERS
 152 extern lck_grp_t *trigger_vnode_lck_grp;
 153 extern lck_attr_t *trigger_vnode_lck_attr;
 154 #endif
 155
 156 extern lck_mtx_t * mnt_list_mtx_lock;
 157
 158 enum vtype iftovt_tab[16] = {
 159         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 160         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 161 };
 162 int     vttoif_tab[9] = {
 163         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 164         S_IFSOCK, S_IFIFO, S_IFMT,
 165 };
 166
 167
 168 /* XXX These should be in a BSD accessible Mach header, but aren't. */
 169 extern void             memory_object_mark_used(
 170         memory_object_control_t         control);
 171
 172 extern void             memory_object_mark_unused(
 173         memory_object_control_t         control,
 174         boolean_t                       rage);
 175
 176 extern void             memory_object_mark_io_tracking(
 177         memory_object_control_t         control);
 178
  179 /* XXX next prototype should be from <nfs/nfs.h> */
 180 extern int       nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
 181
 182 extern int paniclog_append_noflush(const char *format, ...);
 183
  184 /* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
 185 __private_extern__ void qsort(
 186         void * array,
 187         size_t nmembers,
 188         size_t member_size,
 189         int (*)(const void *, const void *));
 190
 191 __private_extern__ void vntblinit(void);
 192 __private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
 193     enum uio_seg, int);
 194
 195 extern int system_inshutdown;
 196
 197 static void vnode_list_add(vnode_t);
 198 static void vnode_async_list_add(vnode_t);
 199 static void vnode_list_remove(vnode_t);
 200 static void vnode_list_remove_locked(vnode_t);
 201
 202 static void vnode_abort_advlocks(vnode_t);
 203 static errno_t vnode_drain(vnode_t);
 204 static void vgone(vnode_t, int flags);
 205 static void vclean(vnode_t vp, int flag);
 206 static void vnode_reclaim_internal(vnode_t, int, int, int);
 207
 208 static void vnode_dropiocount(vnode_t);
 209
 210 static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
 211 static int  vnode_reload(vnode_t);
 212 static int  vnode_isinuse_locked(vnode_t, int, int);
 213
 214 static int unmount_callback(mount_t, __unused void *);
 215
 216 static void insmntque(vnode_t vp, mount_t mp);
 217 static int mount_getvfscnt(void);
 218 static int mount_fillfsids(fsid_t *, int );
 219 static void vnode_iterate_setup(mount_t);
 220 int vnode_umount_preflight(mount_t, vnode_t, int);
 221 static int vnode_iterate_prepare(mount_t);
 222 static int vnode_iterate_reloadq(mount_t);
 223 static void vnode_iterate_clear(mount_t);
 224 static mount_t vfs_getvfs_locked(fsid_t *);
 225 static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
 226     struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
 227 static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
 228
 229 errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 230
 231 #ifdef JOE_DEBUG
 232 static void record_vp(vnode_t vp, int count);
 233 #endif
 234
 235 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
 236 extern int bootarg_no_vnode_jetsam;    /* from bsd_init.c default value is 0 */
 237 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
 238
 239 extern int bootarg_no_vnode_drain;    /* from bsd_init.c default value is 0 */
 240
 241 boolean_t root_is_CF_drive = FALSE;
 242
 243 #if CONFIG_TRIGGERS
 244 static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
 245 static void vnode_resolver_detach(vnode_t);
 246 #endif
 247
 248 TAILQ_HEAD(freelst, vnode) vnode_free_list;     /* vnode free list */
 249 TAILQ_HEAD(deadlst, vnode) vnode_dead_list;     /* vnode dead list */
 250 TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;
 251
 252
 253 TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
 254 struct timeval rage_tv;
 255 int     rage_limit = 0;
 256 int     ragevnodes = 0;
 257 static  int vfs_unmountall_started = 0;
 258
 259 #define RAGE_LIMIT_MIN  100
 260 #define RAGE_TIME_LIMIT 5
 261
 262 /*
 263  * ROSV definitions
 264  * NOTE: These are shadowed from PlatformSupport definitions, but XNU
 265  * builds standalone.
 266  */
 267 #define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data"
 268 #define PLATFORM_VM_VOLUME_MOUNT_POINT "/private/var/vm"
 269
 270
 271 struct mntlist mountlist;                       /* mounted filesystem list */
 272 static int nummounts = 0;
 273
 274 static int print_busy_vnodes = 0;                               /* print out busy vnodes */
 275
 276 #if DIAGNOSTIC
 277 #define VLISTCHECK(fun, vp, list)       \
 278         if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
 279                 panic("%s: %s vnode not on %slist", (fun), (list), (list));
 280 #else
 281 #define VLISTCHECK(fun, vp, list)
 282 #endif /* DIAGNOSTIC */
 283
 284 #define VLISTNONE(vp)   \
 285         do {    \
 286                 (vp)->v_freelist.tqe_next = (struct vnode *)0;  \
 287                 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;   \
 288         } while(0)
 289
 290 #define VONLIST(vp)     \
 291         ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
 292
 293 /* remove a vnode from free vnode list */
 294 #define VREMFREE(fun, vp)       \
 295         do {    \
 296                 VLISTCHECK((fun), (vp), "free");        \
 297                 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);       \
 298                 VLISTNONE((vp));        \
 299                 freevnodes--;   \
 300         } while(0)
 301
 302
 303 /* remove a vnode from dead vnode list */
 304 #define VREMDEAD(fun, vp)       \
 305         do {    \
 306                 VLISTCHECK((fun), (vp), "dead");        \
 307                 TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);       \
 308                 VLISTNONE((vp));        \
 309                 vp->v_listflag &= ~VLIST_DEAD;  \
 310                 deadvnodes--;   \
 311         } while(0)
 312
 313
 314 /* remove a vnode from async work vnode list */
 315 #define VREMASYNC_WORK(fun, vp) \
 316         do {    \
 317                 VLISTCHECK((fun), (vp), "async_work");  \
 318                 TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
 319                 VLISTNONE((vp));        \
 320                 vp->v_listflag &= ~VLIST_ASYNC_WORK;    \
 321                 async_work_vnodes--;    \
 322         } while(0)
 323
 324
 325 /* remove a vnode from rage vnode list */
 326 #define VREMRAGE(fun, vp)       \
 327         do {    \
 328                 if ( !(vp->v_listflag & VLIST_RAGE))                    \
 329                         panic("VREMRAGE: vp not on rage list");         \
 330                 VLISTCHECK((fun), (vp), "rage");                        \
 331                 TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);       \
 332                 VLISTNONE((vp));                \
 333                 vp->v_listflag &= ~VLIST_RAGE;  \
 334                 ragevnodes--;                   \
 335         } while(0)
 336
 337 static void async_work_continue(void);
 338
 339 /*
 340  * Initialize the vnode management data structures.
 341  */
 342 __private_extern__ void
 343 vntblinit(void)
 344 {
 345         thread_t        thread = THREAD_NULL;
 346
 347         TAILQ_INIT(&vnode_free_list);
 348         TAILQ_INIT(&vnode_rage_list);
 349         TAILQ_INIT(&vnode_dead_list);
 350         TAILQ_INIT(&vnode_async_work_list);
 351         TAILQ_INIT(&mountlist);
 352
 353         microuptime(&rage_tv);
 354         rage_limit = desiredvnodes / 100;
 355
 356         if (rage_limit < RAGE_LIMIT_MIN) {
 357                 rage_limit = RAGE_LIMIT_MIN;
 358         }
 359
 360         /*
 361          * create worker threads
 362          */
 363         kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
 364         thread_deallocate(thread);
 365 }
 366
 367 /* the timeout is in 10 msecs */
 368 int
 369 vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
 370 {
 371         int error = 0;
 372         struct timespec ts;
 373
 374         KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
 375
 376         if (vp->v_numoutput > output_target) {
 377                 slpflag |= PDROP;
 378
 379                 vnode_lock_spin(vp);
 380
 381                 while ((vp->v_numoutput > output_target) && error == 0) {
 382                         if (output_target) {
 383                                 vp->v_flag |= VTHROTTLED;
 384                         } else {
 385                                 vp->v_flag |= VBWAIT;
 386                         }
 387
 388                         ts.tv_sec = (slptimeout / 100);
 389                         ts.tv_nsec = (slptimeout % 1000)  * 10 * NSEC_PER_USEC * 1000;
 390                         error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
 391
 392                         vnode_lock_spin(vp);
 393                 }
 394                 vnode_unlock(vp);
 395         }
 396         KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
 397
 398         return error;
 399 }
 400
 401
 402 void
 403 vnode_startwrite(vnode_t vp)
 404 {
 405         OSAddAtomic(1, &vp->v_numoutput);
 406 }
 407
 408
 409 void
 410 vnode_writedone(vnode_t vp)
 411 {
 412         if (vp) {
 413                 int need_wakeup = 0;
 414
 415                 OSAddAtomic(-1, &vp->v_numoutput);
 416
 417                 vnode_lock_spin(vp);
 418
 419                 if (vp->v_numoutput < 0) {
 420                         panic("vnode_writedone: numoutput < 0");
 421                 }
 422
 423                 if ((vp->v_flag & VTHROTTLED)) {
 424                         vp->v_flag &= ~VTHROTTLED;
 425                         need_wakeup = 1;
 426                 }
 427                 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
 428                         vp->v_flag &= ~VBWAIT;
 429                         need_wakeup = 1;
 430                 }
 431                 vnode_unlock(vp);
 432
 433                 if (need_wakeup) {
 434                         wakeup((caddr_t)&vp->v_numoutput);
 435                 }
 436         }
 437 }
 438
 439
 440
 441 int
 442 vnode_hasdirtyblks(vnode_t vp)
 443 {
 444         struct cl_writebehind *wbp;
 445
 446         /*
 447          * Not taking the buf_mtxp as there is little
 448          * point doing it. Even if the lock is taken the
 449          * state can change right after that. If their
 450          * needs to be a synchronization, it must be driven
 451          * by the caller
 452          */
 453         if (vp->v_dirtyblkhd.lh_first) {
 454                 return 1;
 455         }
 456
 457         if (!UBCINFOEXISTS(vp)) {
 458                 return 0;
 459         }
 460
 461         wbp = vp->v_ubcinfo->cl_wbehind;
 462
 463         if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
 464                 return 1;
 465         }
 466
 467         return 0;
 468 }
 469
 470 int
 471 vnode_hascleanblks(vnode_t vp)
 472 {
 473         /*
 474          * Not taking the buf_mtxp as there is little
 475          * point doing it. Even if the lock is taken the
 476          * state can change right after that. If their
 477          * needs to be a synchronization, it must be driven
 478          * by the caller
 479          */
 480         if (vp->v_cleanblkhd.lh_first) {
 481                 return 1;
 482         }
 483         return 0;
 484 }
 485
 486 void
 487 vnode_iterate_setup(mount_t mp)
 488 {
 489         mp->mnt_lflag |= MNT_LITER;
 490 }
 491
 492 int
 493 vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
 494 {
 495         vnode_t vp;
 496         int ret = 0;
 497
 498         TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 499                 if (vp->v_type == VDIR) {
 500                         continue;
 501                 }
 502                 if (vp == skipvp) {
 503                         continue;
 504                 }
 505                 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
 506                         continue;
 507                 }
 508                 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
 509                         continue;
 510                 }
 511                 if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
 512                         continue;
 513                 }
 514
 515                 /* Look for busy vnode */
 516                 if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
 517                         ret = 1;
 518                         if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
 519                                 vprint("vnode_umount_preflight - busy vnode", vp);
 520                         } else {
 521                                 return ret;
 522                         }
 523                 } else if (vp->v_iocount > 0) {
 524                         /* Busy if iocount is > 0 for more than 3 seconds */
 525                         tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
 526                         if (vp->v_iocount > 0) {
 527                                 ret = 1;
 528                                 if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
 529                                         vprint("vnode_umount_preflight - busy vnode", vp);
 530                                 } else {
 531                                         return ret;
 532                                 }
 533                         }
 534                         continue;
 535                 }
 536         }
 537
 538         return ret;
 539 }
 540
 541 /*
 542  * This routine prepares iteration by moving all the vnodes to worker queue
 543  * called with mount lock held
 544  */
 545 int
 546 vnode_iterate_prepare(mount_t mp)
 547 {
 548         vnode_t vp;
 549
 550         if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
 551                 /* nothing to do */
 552                 return 0;
 553         }
 554
 555         vp = TAILQ_FIRST(&mp->mnt_vnodelist);
 556         vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
 557         mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
 558         mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
 559
 560         TAILQ_INIT(&mp->mnt_vnodelist);
 561         if (mp->mnt_newvnodes.tqh_first != NULL) {
 562                 panic("vnode_iterate_prepare: newvnode when entering vnode");
 563         }
 564         TAILQ_INIT(&mp->mnt_newvnodes);
 565
 566         return 1;
 567 }
 568
 569
 570 /* called with mount lock held */
 571 int
 572 vnode_iterate_reloadq(mount_t mp)
 573 {
 574         int moved = 0;
 575
 576         /* add the remaining entries in workerq to the end of mount vnode list */
 577         if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
 578                 struct vnode * mvp;
 579                 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
 580
 581                 /* Joining the workerque entities to mount vnode list */
 582                 if (mvp) {
 583                         mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
 584                 } else {
 585                         mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
 586                 }
 587                 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
 588                 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
 589                 TAILQ_INIT(&mp->mnt_workerqueue);
 590         }
 591
 592         /* add the newvnodes to the head of mount vnode list */
 593         if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
 594                 struct vnode * nlvp;
 595                 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
 596
 597                 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
 598                 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
 599                 if (mp->mnt_vnodelist.tqh_first) {
 600                         mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
 601                 } else {
 602                         mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
 603                 }
 604                 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
 605                 TAILQ_INIT(&mp->mnt_newvnodes);
 606                 moved = 1;
 607         }
 608
 609         return moved;
 610 }
 611
 612
 613 void
 614 vnode_iterate_clear(mount_t mp)
 615 {
 616         mp->mnt_lflag &= ~MNT_LITER;
 617 }
 618
 619 #if !CONFIG_EMBEDDED
 620
 621 #include <i386/panic_hooks.h>
 622
 623 struct vnode_iterate_panic_hook {
 624         panic_hook_t hook;
 625         mount_t mp;
 626         struct vnode *vp;
 627 };
 628
 629 static void
 630 vnode_iterate_panic_hook(panic_hook_t *hook_)
 631 {
 632         struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
 633         panic_phys_range_t range;
 634         uint64_t phys;
 635
 636         if (panic_phys_range_before(hook->mp, &phys, &range)) {
 637                 paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
 638                     hook->mp, phys, range.type, range.phys_start,
 639                     range.phys_start + range.len);
 640         } else {
 641                 paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
 642         }
 643
 644         if (panic_phys_range_before(hook->vp, &phys, &range)) {
 645                 paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
 646                     hook->vp, phys, range.type, range.phys_start,
 647                     range.phys_start + range.len);
 648         } else {
 649                 paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
 650         }
 651         panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
 652 }
 653 #endif //CONFIG_EMBEDDED
 654
 655 int
 656 vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
 657     void *arg)
 658 {
 659         struct vnode *vp;
 660         int vid, retval;
 661         int ret = 0;
 662
 663         /*
 664          * The mount iterate mutex is held for the duration of the iteration.
 665          * This can be done by a state flag on the mount structure but we can
 666          * run into priority inversion issues sometimes.
 667          * Using a mutex allows us to benefit from the priority donation
 668          * mechanisms in the kernel for locks. This mutex should never be
 669          * acquired in spin mode and it should be acquired before attempting to
 670          * acquire the mount lock.
 671          */
 672         mount_iterate_lock(mp);
 673
 674         mount_lock(mp);
 675
 676         vnode_iterate_setup(mp);
 677
 678         /* If it returns 0 then there is nothing to do */
 679         retval = vnode_iterate_prepare(mp);
 680
 681         if (retval == 0) {
 682                 vnode_iterate_clear(mp);
 683                 mount_unlock(mp);
 684                 mount_iterate_unlock(mp);
 685                 return ret;
 686         }
 687
 688 #if !CONFIG_EMBEDDED
 689         struct vnode_iterate_panic_hook hook;
 690         hook.mp = mp;
 691         hook.vp = NULL;
 692         panic_hook(&hook.hook, vnode_iterate_panic_hook);
 693 #endif
 694         /* iterate over all the vnodes */
 695         while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
 696                 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
 697 #if !CONFIG_EMBEDDED
 698                 hook.vp = vp;
 699 #endif
 700                 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
 701                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
 702                 vid = vp->v_id;
 703                 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
 704                         continue;
 705                 }
 706                 mount_unlock(mp);
 707
 708                 if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
 709                         mount_lock(mp);
 710                         continue;
 711                 }
 712                 if (flags & VNODE_RELOAD) {
 713                         /*
 714                          * we're reloading the filesystem
 715                          * cast out any inactive vnodes...
 716                          */
 717                         if (vnode_reload(vp)) {
 718                                 /* vnode will be recycled on the refcount drop */
 719                                 vnode_put(vp);
 720                                 mount_lock(mp);
 721                                 continue;
 722                         }
 723                 }
 724
 725                 retval = callout(vp, arg);
 726
 727                 switch (retval) {
 728                 case VNODE_RETURNED:
 729                 case VNODE_RETURNED_DONE:
 730                         vnode_put(vp);
 731                         if (retval == VNODE_RETURNED_DONE) {
 732                                 mount_lock(mp);
 733                                 ret = 0;
 734                                 goto out;
 735                         }
 736                         break;
 737
 738                 case VNODE_CLAIMED_DONE:
 739                         mount_lock(mp);
 740                         ret = 0;
 741                         goto out;
 742                 case VNODE_CLAIMED:
 743                 default:
 744                         break;
 745                 }
 746                 mount_lock(mp);
 747         }
 748
 749 out:
 750 #if !CONFIG_EMBEDDED
 751         panic_unhook(&hook.hook);
 752 #endif
 753         (void)vnode_iterate_reloadq(mp);
 754         vnode_iterate_clear(mp);
 755         mount_unlock(mp);
 756         mount_iterate_unlock(mp);
 757         return ret;
 758 }
 759
 760 void
 761 mount_lock_renames(mount_t mp)
 762 {
 763         lck_mtx_lock(&mp->mnt_renamelock);
 764 }
 765
 766 void
 767 mount_unlock_renames(mount_t mp)
 768 {
 769         lck_mtx_unlock(&mp->mnt_renamelock);
 770 }
 771
 772 void
 773 mount_iterate_lock(mount_t mp)
 774 {
 775         lck_mtx_lock(&mp->mnt_iter_lock);
 776 }
 777
 778 void
 779 mount_iterate_unlock(mount_t mp)
 780 {
 781         lck_mtx_unlock(&mp->mnt_iter_lock);
 782 }
 783
 784 void
 785 mount_lock(mount_t mp)
 786 {
 787         lck_mtx_lock(&mp->mnt_mlock);
 788 }
 789
 790 void
 791 mount_lock_spin(mount_t mp)
 792 {
 793         lck_mtx_lock_spin(&mp->mnt_mlock);
 794 }
 795
 796 void
 797 mount_unlock(mount_t mp)
 798 {
 799         lck_mtx_unlock(&mp->mnt_mlock);
 800 }
 801
 802
 803 void
 804 mount_ref(mount_t mp, int locked)
 805 {
 806         if (!locked) {
 807                 mount_lock_spin(mp);
 808         }
 809
 810         mp->mnt_count++;
 811
 812         if (!locked) {
 813                 mount_unlock(mp);
 814         }
 815 }
 816
 817
 818 void
 819 mount_drop(mount_t mp, int locked)
 820 {
 821         if (!locked) {
 822                 mount_lock_spin(mp);
 823         }
 824
 825         mp->mnt_count--;
 826
 827         if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
 828                 wakeup(&mp->mnt_lflag);
 829         }
 830
 831         if (!locked) {
 832                 mount_unlock(mp);
 833         }
 834 }
 835
 836
 837 int
 838 mount_iterref(mount_t mp, int locked)
 839 {
 840         int retval = 0;
 841
 842         if (!locked) {
 843                 mount_list_lock();
 844         }
 845         if (mp->mnt_iterref < 0) {
 846                 retval = 1;
 847         } else {
 848                 mp->mnt_iterref++;
 849         }
 850         if (!locked) {
 851                 mount_list_unlock();
 852         }
 853         return retval;
 854 }
 855
 856 int
 857 mount_isdrained(mount_t mp, int locked)
 858 {
 859         int retval;
 860
 861         if (!locked) {
 862                 mount_list_lock();
 863         }
 864         if (mp->mnt_iterref < 0) {
 865                 retval = 1;
 866         } else {
 867                 retval = 0;
 868         }
 869         if (!locked) {
 870                 mount_list_unlock();
 871         }
 872         return retval;
 873 }
 874
 875 void
 876 mount_iterdrop(mount_t mp)
 877 {
 878         mount_list_lock();
 879         mp->mnt_iterref--;
 880         wakeup(&mp->mnt_iterref);
 881         mount_list_unlock();
 882 }
 883
 884 void
 885 mount_iterdrain(mount_t mp)
 886 {
 887         mount_list_lock();
 888         while (mp->mnt_iterref) {
 889                 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
 890         }
 891         /* mount iterations drained */
 892         mp->mnt_iterref = -1;
 893         mount_list_unlock();
 894 }
 895 void
 896 mount_iterreset(mount_t mp)
 897 {
 898         mount_list_lock();
 899         if (mp->mnt_iterref == -1) {
 900                 mp->mnt_iterref = 0;
 901         }
 902         mount_list_unlock();
 903 }
 904
 905 /* always called with  mount lock held */
 906 int
 907 mount_refdrain(mount_t mp)
 908 {
 909         if (mp->mnt_lflag & MNT_LDRAIN) {
 910                 panic("already in drain");
 911         }
 912         mp->mnt_lflag |= MNT_LDRAIN;
 913
 914         while (mp->mnt_count) {
 915                 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
 916         }
 917
 918         if (mp->mnt_vnodelist.tqh_first != NULL) {
 919                 panic("mount_refdrain: dangling vnode");
 920         }
 921
 922         mp->mnt_lflag &= ~MNT_LDRAIN;
 923
 924         return 0;
 925 }
 926
 927 /* Tags the mount point as not supportine extended readdir for NFS exports */
 928 void
 929 mount_set_noreaddirext(mount_t mp)
 930 {
 931         mount_lock(mp);
 932         mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
 933         mount_unlock(mp);
 934 }
 935
 936 /*
 937  * Mark a mount point as busy. Used to synchronize access and to delay
 938  * unmounting.
 939  */
 940 int
 941 vfs_busy(mount_t mp, int flags)
 942 {
 943 restart:
 944         if (mp->mnt_lflag & MNT_LDEAD) {
 945                 return ENOENT;
 946         }
 947
 948         mount_lock(mp);
 949
 950         if (mp->mnt_lflag & MNT_LUNMOUNT) {
 951                 if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
 952                         mount_unlock(mp);
 953                         return ENOENT;
 954                 }
 955
 956                 /*
 957                  * Since all busy locks are shared except the exclusive
 958                  * lock granted when unmounting, the only place that a
 959                  * wakeup needs to be done is at the release of the
 960                  * exclusive lock at the end of dounmount.
 961                  */
 962                 mp->mnt_lflag |= MNT_LWAIT;
 963                 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
 964                 return ENOENT;
 965         }
 966
 967         mount_unlock(mp);
 968
 969         lck_rw_lock_shared(&mp->mnt_rwlock);
 970
 971         /*
 972          * Until we are granted the rwlock, it's possible for the mount point to
 973          * change state, so re-evaluate before granting the vfs_busy.
 974          */
 975         if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
 976                 lck_rw_done(&mp->mnt_rwlock);
 977                 goto restart;
 978         }
 979         return 0;
 980 }
 981
 982 /*
 983  * Free a busy filesystem.
 984  */
 985 void
 986 vfs_unbusy(mount_t mp)
 987 {
 988         lck_rw_done(&mp->mnt_rwlock);
 989 }
 990
 991
 992
 993 static void
 994 vfs_rootmountfailed(mount_t mp)
 995 {
 996         mount_list_lock();
 997         mp->mnt_vtable->vfc_refcount--;
 998         mount_list_unlock();
 999
1000         vfs_unbusy(mp);
1001
1002         mount_lock_destroy(mp);
1003
1004 #if CONFIG_MACF
1005         mac_mount_label_destroy(mp);
1006 #endif
1007
1008         FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1009 }
1010
1011 /*
1012  * Lookup a filesystem type, and if found allocate and initialize
1013  * a mount structure for it.
1014  *
1015  * Devname is usually updated by mount(8) after booting.
1016  */
1017 static mount_t
1018 vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
1019 {
1020         mount_t mp;
1021
1022         mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK);
1023         bzero((char *)mp, sizeof(struct mount));
1024
1025         /* Initialize the default IO constraints */
1026         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1027         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1028         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1029         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1030         mp->mnt_devblocksize = DEV_BSIZE;
1031         mp->mnt_alignmentmask = PAGE_MASK;
1032         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1033         mp->mnt_ioscale = 1;
1034         mp->mnt_ioflags = 0;
1035         mp->mnt_realrootvp = NULLVP;
1036         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1037         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1038         mp->mnt_devbsdunit = 0;
1039
1040         mount_lock_init(mp);
1041         (void)vfs_busy(mp, LK_NOWAIT);
1042
1043         TAILQ_INIT(&mp->mnt_vnodelist);
1044         TAILQ_INIT(&mp->mnt_workerqueue);
1045         TAILQ_INIT(&mp->mnt_newvnodes);
1046
1047         mp->mnt_vtable = vfsp;
1048         mp->mnt_op = vfsp->vfc_vfsops;
1049         mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
1050         mp->mnt_vnodecovered = NULLVP;
1051         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1052         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1053
1054         mount_list_lock();
1055         vfsp->vfc_refcount++;
1056         mount_list_unlock();
1057
1058         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1059         mp->mnt_vfsstat.f_mntonname[0] = '/';
1060         /* XXX const poisoning layering violation */
1061         (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);
1062
1063 #if CONFIG_MACF
1064         mac_mount_label_init(mp);
1065         mac_mount_label_associate(vfs_context_kernel(), mp);
1066 #endif
1067         return mp;
1068 }
1069
1070 errno_t
1071 vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
1072 {
1073         struct vfstable *vfsp;
1074
1075         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1076                 if (!strncmp(vfsp->vfc_name, fstypename,
1077                     sizeof(vfsp->vfc_name))) {
1078                         break;
1079                 }
1080         }
1081         if (vfsp == NULL) {
1082                 return ENODEV;
1083         }
1084
1085         *mpp = vfs_rootmountalloc_internal(vfsp, devname);
1086
1087         if (*mpp) {
1088                 return 0;
1089         }
1090
1091         return ENOMEM;
1092 }
1093
1094 #define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))
1095
1096 /*
1097  * Find an appropriate filesystem to use for the root. If a filesystem
1098  * has not been preselected, walk through the list of known filesystems
1099  * trying those that have mountroot routines, and try them until one
1100  * works or we have tried them all.
1101  */
1102 extern int (*mountroot)(void);
1103
     /*
      * Mount the root file system.
      *
      * If the 'mountroot' hook is set, its result is returned unchanged.
      * Otherwise a block-device vnode is created for 'rootdev' and each entry
      * of vfsconf that has a vfc_mountroot routine or VFC_VFSCANMOUNTROOT set
      * is tried in list order; the first one that mounts wins.
      *
      * Returns:     0               Root mounted
      *              ENODEV          No filesystem could mount the root
      *              other           Error from the mountroot hook or bdevvp()
      */
1104 int
1105 vfs_mountroot(void)
1106 {
1107 #if CONFIG_MACF
1108         struct vnode *vp;
1109 #endif
1110         struct vfstable *vfsp;
1111         vfs_context_t ctx = vfs_context_kernel();
1112         struct vfs_attr vfsattr;
1113         int     error;
1114         mount_t mp;
1115         vnode_t bdevvp_rootvp;
1116
1117         KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
1118         if (mountroot != NULL) {
1119                 /*
1120                  * used for netboot which follows a different set of rules
1121                  */
1122                 error = (*mountroot)();
1123
1124                 KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
1125                 return error;
1126         }
1127         if ((error = bdevvp(rootdev, &rootvp))) {
1128                 printf("vfs_mountroot: can't setup bdevvp\n");
1129
1130                 KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
1131                 return error;
1132         }
1133         /*
1134          * 4951998 - code we call in vfc_mountroot may replace rootvp
1135          * so keep a local copy for some house keeping.
1136          */
1137         bdevvp_rootvp = rootvp;
1138
1139         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
                     /* skip filesystems that have no way to mount a root */
1140                 if (vfsp->vfc_mountroot == NULL
1141                     && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
1142                         continue;
1143                 }
1144
                     /*
                      * The mount comes back already busied by
                      * vfs_rootmountalloc_internal(); it is unbusied below on
                      * success or inside vfs_rootmountfailed() on failure.
                      */
1145                 mp = vfs_rootmountalloc_internal(vfsp, "root_device");
1146                 mp->mnt_devvp = rootvp;
1147
1148                 if (vfsp->vfc_mountroot) {
1149                         error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
1150                 } else {
1151                         error = VFS_MOUNT(mp, rootvp, 0, ctx);
1152                 }
1153
1154                 if (!error) {
1155                         if (bdevvp_rootvp != rootvp) {
1156                                 /*
1157                                  * rootvp changed...
1158                                  *   bump the iocount and fix up mnt_devvp for the
1159                                  *   new rootvp (it will already have a usecount taken)...
1160                                  *   drop the iocount and the usecount on the original
1161                                  *   since we are no longer going to use it...
1162                                  */
1163                                 vnode_getwithref(rootvp);
1164                                 mp->mnt_devvp = rootvp;
1165
1166                                 vnode_rele(bdevvp_rootvp);
1167                                 vnode_put(bdevvp_rootvp);
1168                         }
1169                         mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;
1170
                             /* release the busy hold taken at allocation time */
1171                         vfs_unbusy(mp);
1172
1173                         mount_list_add(mp);
1174
1175                         /*
1176                          *   cache the IO attributes for the underlying physical media...
1177                          *   an error return indicates the underlying driver doesn't
1178                          *   support all the queries necessary... however, reasonable
1179                          *   defaults will have been set, so no reason to bail or care
1180                          */
1181                         vfs_init_io_attributes(rootvp, mp);
1182
1183                         if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
1184                                 root_is_CF_drive = TRUE;
1185                         }
1186
1187                         /*
1188                          * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
1189                          */
1190                         if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1191                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1192                         }
1193                         if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1194                                 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1195                         }
1196
1197 #if !CONFIG_EMBEDDED
                             /*
                              * Choose the value handed to vc_progress_setdiskspeed():
                              * 128 for a virtual device, 7 * 256 for an SSD,
                              * 256 otherwise.
                              */
1198                         uint32_t speed;
1199
1200                         if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
1201                                 speed = 128;
1202                         } else if (disk_conditioner_mount_is_ssd(mp)) {
1203                                 speed = 7 * 256;
1204                         } else {
1205                                 speed = 256;
1206                         }
1207                         vc_progress_setdiskspeed(speed);
1208 #endif
1209                         /*
1210                          * Probe root file system for additional features.
1211                          */
1212                         (void)VFS_START(mp, 0, ctx);
1213
                             /*
                              * Mirror volume capabilities into mnt_kern_flag; a
                              * capability counts only when both its 'capabilities'
                              * bit and its 'valid' bit are set.
                              */
1214                         VFSATTR_INIT(&vfsattr);
1215                         VFSATTR_WANTED(&vfsattr, f_capabilities);
1216                         if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1217                             VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1218                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1219                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1220                                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1221                                 }
1222 #if NAMEDSTREAMS
1223                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1224                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1225                                         mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1226                                 }
1227 #endif
1228                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1229                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1230                                         mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1231                                 }
1232
1233                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1234                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1235                                         mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1236                                 }
1237                         }
1238
1239                         /*
1240                          * get rid of iocount reference returned
1241                          * by bdevvp (or picked up by us on the substituted
1242                          * rootvp)... it (or we) will have also taken
1243                          * a usecount reference which we want to keep
1244                          */
1245                         vnode_put(rootvp);
1246
1247 #if CONFIG_MACF
                             /* only multilabel mounts need their root vnode labeled */
1248                         if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
1249                                 KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
1250                                 return 0;
1251                         }
1252
                             /*
                              * NOTE(review): both failure paths below call
                              * dounmount() and then jump to 'fail', where
                              * vfs_rootmountfailed() unbusies and frees 'mp' even
                              * though vfs_unbusy() and vnode_put(rootvp) have
                              * already run above.  Confirm 'mp' is still valid and
                              * busy after dounmount() before relying on this path.
                              */
1253                         error = VFS_ROOT(mp, &vp, ctx);
1254                         if (error) {
1255                                 printf("%s() VFS_ROOT() returned %d\n",
1256                                     __func__, error);
1257                                 dounmount(mp, MNT_FORCE, 0, ctx);
1258                                 goto fail;
1259                         }
1260                         error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
1261                         /*
1262                          * get rid of reference provided by VFS_ROOT
1263                          */
1264                         vnode_put(vp);
1265
1266                         if (error) {
1267                                 printf("%s() vnode_label() returned %d\n",
1268                                     __func__, error);
1269                                 dounmount(mp, MNT_FORCE, 0, ctx);
1270                                 goto fail;
1271                         }
1272 #endif
1273                         KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
1274                         return 0;
1275                 }
1276 #if CONFIG_MACF
1277 fail:
1278 #endif
                     /* this filesystem could not mount the root: discard mp and try the next */
1279                 vfs_rootmountfailed(mp);
1280
                     /* EINVAL is not reported */
1281                 if (error != EINVAL) {
1282                         printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
1283                 }
1284         }
             /*
              * Nothing mounted.  The trace point records the last error seen (or
              * ENODEV if there was none), but the return value is always ENODEV.
              */
1285         KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
1286         return ENODEV;
1287 }
1288
/*
 * Mount the data volume of an ROSV volume group.
 *
 * Only does work when CONFIG_ROSV_STARTUP is set; otherwise returns 0.
 * Takes an iocount on rootvnode for the duration of the call.  If the root
 * filesystem does not report VOL_CAP_FMT_VOL_GROUPS (both as a capability
 * and as valid), nothing is mounted and 0 is returned.
 *
 * Returns 0 on success or when no mount was needed, otherwise the error
 * from vnode_get() or kernel_mount().
 */
int
vfs_mount_rosv_data(void)
{
#if CONFIG_ROSV_STARTUP
	struct vfs_attr rootattr;
	int rosv_detected = 0;
	int err;

	err = vnode_get(rootvnode);
	if (err != 0) {
		/* root must be mounted first */
		printf("vnode_get(rootvnode) failed with error %d\n", err);
		return err;
	}

	printf("NOTE: Attempting ROSV mount\n");
	VFSATTR_INIT(&rootattr);
	VFSATTR_WANTED(&rootattr, f_capabilities);
	if (vfs_getattr(rootvnode->v_mount, &rootattr, vfs_context_kernel()) == 0 &&
	    VFSATTR_IS_SUPPORTED(&rootattr, f_capabilities) &&
	    (rootattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
	    (rootattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
		printf("NOTE: DETECTED ROSV CONFIG\n");
		rosv_detected = 1;
	}

	if (rosv_detected == 0) {
		/* configuration not supported: nothing to mount */
		vnode_put(rootvnode);
		return 0;
	}

	/* not const because of internal casting */
	char datapath[] = PLATFORM_DATA_VOLUME_MOUNT_POINT;

	/* Mount the data volume */
	printf("attempting kernel mount for data volume... \n");
	err = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
	    datapath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_DATAVOL), vfs_context_kernel());
	if (err != 0) {
		printf("Failed to mount data volume (%d)\n", err);
	}

	vnode_put(rootvnode);

	return err;

#else
	return 0;
#endif
}
1344
/*
 * Mount the VM volume of a container.
 *
 * Only does work when CONFIG_MOUNT_VM is set; otherwise returns 0.
 * Takes an iocount on rootvnode for the duration of the call and mounts
 * PLATFORM_VM_VOLUME_MOUNT_POINT using the root filesystem's type.
 *
 * Returns 0 on success, otherwise the error from vnode_get() or
 * kernel_mount().
 */
int
vfs_mount_vm(void)
{
#if CONFIG_MOUNT_VM
	int err;

	err = vnode_get(rootvnode);
	if (err != 0) {
		/* root must be mounted first */
		printf("vnode_get(rootvnode) failed with error %d\n", err);
		return err;
	}

	/* not const because of internal casting */
	char vmpath[] = PLATFORM_VM_VOLUME_MOUNT_POINT;

	/* Mount the VM volume */
	printf("attempting kernel mount for vm volume... \n");
	err = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename, NULLVP, NULLVP,
	    vmpath, (rootvnode->v_mount), 0, 0, (KERNEL_MOUNT_VMVOL), vfs_context_kernel());

	if (err == 0) {
		printf("mounted VM volume\n");
	} else {
		printf("Failed to mount vm volume (%d)\n", err);
	}

	vnode_put(rootvnode);
	return err;
#else
	return 0;
#endif
}
1380
1381 /*
1382  * Lookup a mount point by filesystem identifier.
1383  */
1384
1385 struct mount *
1386 vfs_getvfs(fsid_t *fsid)
1387 {
1388         return mount_list_lookupby_fsid(fsid, 0, 0);
1389 }
1390
1391 static struct mount *
1392 vfs_getvfs_locked(fsid_t *fsid)
1393 {
1394         return mount_list_lookupby_fsid(fsid, 1, 0);
1395 }
1396
1397 struct mount *
1398 vfs_getvfs_by_mntonname(char *path)
1399 {
1400         mount_t retmp = (mount_t)0;
1401         mount_t mp;
1402
1403         mount_list_lock();
1404         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1405                 if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
1406                     sizeof(mp->mnt_vfsstat.f_mntonname))) {
1407                         retmp = mp;
1408                         if (mount_iterref(retmp, 1)) {
1409                                 retmp = NULL;
1410                         }
1411                         goto out;
1412                 }
1413         }
1414 out:
1415         mount_list_unlock();
1416         return retmp;
1417 }
1418
1419 /* generation number for creation of new fsids */
1420 u_short mntid_gen = 0;
1421 /*
1422  * Get a new unique fsid
1423  */
1424 void
1425 vfs_getnewfsid(struct mount *mp)
1426 {
1427         fsid_t tfsid;
1428         int mtype;
1429
1430         mount_list_lock();
1431
1432         /* generate a new fsid */
1433         mtype = mp->mnt_vtable->vfc_typenum;
1434         if (++mntid_gen == 0) {
1435                 mntid_gen++;
1436         }
1437         tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1438         tfsid.val[1] = mtype;
1439
1440         while (vfs_getvfs_locked(&tfsid)) {
1441                 if (++mntid_gen == 0) {
1442                         mntid_gen++;
1443                 }
1444                 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1445         }
1446
1447         mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
1448         mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
1449         mount_list_unlock();
1450 }
1451
1452 /*
1453  * Routines having to do with the management of the vnode table.
1454  */
     /*
      * External declaration of a vnode-operations vector.
      * NOTE(review): presumably the operations used for dead vnodes, going by
      * the name -- confirm at the definition.
      */
1455 extern int(**dead_vnodeop_p)(void *);
     /*
      * Global vnode counters.
      * NOTE(review): meanings inferred from the names only (total, free, dead
      * and async-work vnodes) -- confirm against the code that updates them.
      */
1456 long numvnodes, freevnodes, deadvnodes, async_work_vnodes;
1457
1458
     /*
      * Event counters, all starting at zero.
      * NOTE(review): what increments each one is not shown here -- confirm.
      */
1459 int async_work_timed_out = 0;
1460 int async_work_handled = 0;
1461 int dead_vnode_wanted = 0;
1462 int dead_vnode_waited = 0;
1463
1464 /*
1465  * Move a vnode from one mount queue to another.
1466  */
1467 static void
1468 insmntque(vnode_t vp, mount_t mp)
1469 {
1470         mount_t lmp;
1471         /*
1472          * Delete from old mount point vnode list, if on one.
1473          */
1474         if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
1475                 if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
1476                         panic("insmntque: vp not in mount vnode list");
1477                 }
1478                 vp->v_lflag &= ~VNAMED_MOUNT;
1479
1480                 mount_lock_spin(lmp);
1481
1482                 mount_drop(lmp, 1);
1483
1484                 if (vp->v_mntvnodes.tqe_next == NULL) {
1485                         if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) {
1486                                 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
1487                         } else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) {
1488                                 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
1489                         } else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) {
1490                                 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
1491                         }
1492                 } else {
1493                         vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
1494                         *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
1495                 }
1496                 vp->v_mntvnodes.tqe_next = NULL;
1497                 vp->v_mntvnodes.tqe_prev = NULL;
1498                 mount_unlock(lmp);
1499                 return;
1500         }
1501
1502         /*
1503          * Insert into list of vnodes for the new mount point, if available.
1504          */
1505         if ((vp->v_mount = mp) != NULL) {
1506                 mount_lock_spin(mp);
1507                 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
1508                         panic("vp already in mount list");
1509                 }
1510                 if (mp->mnt_lflag & MNT_LITER) {
1511                         TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
1512                 } else {
1513                         TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
1514                 }
1515                 if (vp->v_lflag & VNAMED_MOUNT) {
1516                         panic("insmntque: vp already in mount vnode list");
1517                 }
1518                 vp->v_lflag |= VNAMED_MOUNT;
1519                 mount_ref(mp, 1);
1520                 mount_unlock(mp);
1521         }
1522 }
1523
1524
1525 /*
1526  * Create a vnode for a block device.
1527  * Used for root filesystem, argdev, and swap areas.
1528  * Also used for memory file system special devices.
1529  */
1530 int
1531 bdevvp(dev_t dev, vnode_t *vpp)
1532 {
1533         vnode_t nvp;
1534         int     error;
1535         struct vnode_fsparam vfsp;
1536         struct vfs_context context;
1537
1538         if (dev == NODEV) {
1539                 *vpp = NULLVP;
1540                 return ENODEV;
1541         }
1542
1543         context.vc_thread = current_thread();
1544         context.vc_ucred = FSCRED;
1545
1546         vfsp.vnfs_mp = (struct mount *)0;
1547         vfsp.vnfs_vtype = VBLK;
1548         vfsp.vnfs_str = "bdevvp";
1549         vfsp.vnfs_dvp = NULL;
1550         vfsp.vnfs_fsnode = NULL;
1551         vfsp.vnfs_cnp = NULL;
1552         vfsp.vnfs_vops = spec_vnodeop_p;
1553         vfsp.vnfs_rdev = dev;
1554         vfsp.vnfs_filesize = 0;
1555
1556         vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;
1557
1558         vfsp.vnfs_marksystem = 0;
1559         vfsp.vnfs_markroot = 0;
1560
1561         if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
1562                 *vpp = NULLVP;
1563                 return error;
1564         }
1565         vnode_lock_spin(nvp);
1566         nvp->v_flag |= VBDEVVP;
1567         nvp->v_tag = VT_NON;    /* set this to VT_NON so during aliasing it can be replaced */
1568         vnode_unlock(nvp);
1569         if ((error = vnode_ref(nvp))) {
1570                 panic("bdevvp failed: vnode_ref");
1571                 return error;
1572         }
1573         if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
1574                 panic("bdevvp failed: fsync");
1575                 return error;
1576         }
1577         if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
1578                 panic("bdevvp failed: invalidateblks");
1579                 return error;
1580         }
1581
1582 #if CONFIG_MACF
1583         /*
1584          * XXXMAC: We can't put a MAC check here, the system will
1585          * panic without this vnode.
1586          */
1587 #endif /* MAC */
1588
1589         if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
1590                 panic("bdevvp failed: open");
1591                 return error;
1592         }
1593         *vpp = nvp;
1594
1595         return 0;
1596 }
1597
1598 /*
1599  * Check to see if the new vnode represents a special device
1600  * for which we already have a vnode (either because of
1601  * bdevvp() or because of a different vnode representing
1602  * the same block device). If such an alias exists, deallocate
1603  * the existing contents and return the aliased vnode. The
1604  * caller is responsible for filling it with its new contents.
      *
      * Returns NULLVP after giving nvp a zeroed specinfo and linking it onto
      * the spec hash chain for nvp_rdev; this happens when no alias exists or
      * when the alias has a v_tag other than VT_NON (in which case both vnodes
      * are flagged SI_ALIASED).
      *
      * Returns the existing alias when its v_tag is VT_NON and it has VBDEVVP
      * or VDEVFLUSH set.  On that path the alias still carries the iocount
      * from vnode_getwithvid() and its vnode lock is still held, since
      * nothing releases either before the return.  A VT_NON alias without
      * those flags panics.
1605  */
1606 static vnode_t
1607 checkalias(struct vnode *nvp, dev_t nvp_rdev)
1608 {
1609         struct vnode *vp;
1610         struct vnode **vpp;
             /* allocated at most once; kept across the 'loop'/'found_alias' retries */
1611         struct specinfo *sin = NULL;
1612         int vid = 0;
1613
1614         vpp = &speclisth[SPECHASH(nvp_rdev)];
1615 loop:
             /* look for a vnode with the same device number and type */
1616         SPECHASH_LOCK();
1617
1618         for (vp = *vpp; vp; vp = vp->v_specnext) {
1619                 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1620                         vid = vp->v_id;
1621                         break;
1622                 }
1623         }
1624         SPECHASH_UNLOCK();
1625
1626         if (vp) {
             /* also entered by goto from the locked re-scan further down */
1627 found_alias:
                     /* the hash lock was dropped, so revalidate via the saved vid */
1628                 if (vnode_getwithvid(vp, vid)) {
1629                         goto loop;
1630                 }
1631                 /*
1632                  * Termination state is checked in vnode_getwithvid
1633                  */
1634                 vnode_lock(vp);
1635
1636                 /*
1637                  * Alias, but not in use, so flush it out.
1638                  */
1639                 if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
1640                         vnode_reclaim_internal(vp, 1, 1, 0);
1641                         vnode_put_locked(vp);
1642                         vnode_unlock(vp);
1643                         goto loop;
1644                 }
1645         }
1646         if (vp == NULL || vp->v_tag != VT_NON) {
1647                 if (sin == NULL) {
1648                         MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo),
1649                             M_SPECINFO, M_WAITOK);
1650                 }
1651
1652                 nvp->v_specinfo = sin;
1653                 bzero(nvp->v_specinfo, sizeof(struct specinfo));
1654                 nvp->v_rdev = nvp_rdev;
1655                 nvp->v_specflags = 0;
1656                 nvp->v_speclastr = -1;
1657                 nvp->v_specinfo->si_opencount = 0;
1658                 nvp->v_specinfo->si_initted = 0;
1659                 nvp->v_specinfo->si_throttleable = 0;
1660
1661                 SPECHASH_LOCK();
1662
1663                 /* We dropped the lock, someone could have added */
1664                 if (vp == NULLVP) {
1665                         for (vp = *vpp; vp; vp = vp->v_specnext) {
1666                                 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1667                                         vid = vp->v_id;
1668                                         SPECHASH_UNLOCK();
1669                                         goto found_alias;
1670                                 }
1671                         }
1672                 }
1673
                     /* push nvp onto the front of the hash chain */
1674                 nvp->v_hashchain = vpp;
1675                 nvp->v_specnext = *vpp;
1676                 *vpp = nvp;
1677
1678                 if (vp != NULLVP) {
                             /* an in-use alias exists: mark both, then release it */
1679                         nvp->v_specflags |= SI_ALIASED;
1680                         vp->v_specflags |= SI_ALIASED;
1681                         SPECHASH_UNLOCK();
1682                         vnode_put_locked(vp);
1683                         vnode_unlock(vp);
1684                 } else {
1685                         SPECHASH_UNLOCK();
1686                 }
1687
1688                 return NULLVP;
1689         }
1690
             /*
              * Reached only with a VT_NON alias.  'sin' is non-NULL here only if
              * an earlier pass allocated it and then jumped to found_alias.
              * NOTE(review): that earlier pass also stored 'sin' in
              * nvp->v_specinfo, which is not cleared when 'sin' is freed here --
              * confirm the caller does not use nvp->v_specinfo afterwards.
              */
1691         if (sin) {
1692                 FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO);
1693         }
1694
1695         if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) {
1696                 return vp;
1697         }
1698
1699         panic("checkalias with VT_NON vp that shouldn't: %p", vp);
1700
1701         return vp;
1702 }
1703
1704
1705 /*
1706  * Get a reference on a particular vnode and lock it if requested.
1707  * If the vnode was on the inactive list, remove it from the list.
1708  * If the vnode was on the free list, remove it from the list and
1709  * move it to inactive list as needed.
1710  * The vnode lock bit is set if the vnode is being eliminated in
1711  * vgone. The process is awakened when the transition is completed,
1712  * and an error returned to indicate that the vnode is no longer
1713  * usable (possibly having been changed to a new file system type).
1714  */
1715 int
1716 vget_internal(vnode_t vp, int vid, int vflags)
1717 {
1718         int error = 0;
1719
1720         vnode_lock_spin(vp);
1721
1722         if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) {
1723                 /*
1724                  * vnode to be returned only if it has writers opened
1725                  */
1726                 error = EINVAL;
1727         } else {
1728                 error = vnode_getiocount(vp, vid, vflags);
1729         }
1730
1731         vnode_unlock(vp);
1732
1733         return error;
1734 }
1735
1736 /*
1737  * Returns:     0                       Success
1738  *              ENOENT                  No such file or directory [terminating]
1739  */
1740 int
1741 vnode_ref(vnode_t vp)
1742 {
1743         return vnode_ref_ext(vp, 0, 0);
1744 }
1745
1746 /*
1747  * Returns:     0                       Success
1748  *              ENOENT                  No such file or directory [terminating]
1749  */
1750 int
1751 vnode_ref_ext(vnode_t vp, int fmode, int flags)
1752 {
1753         int     error = 0;
1754
1755         vnode_lock_spin(vp);
1756
1757         /*
1758          * once all the current call sites have been fixed to insure they have
1759          * taken an iocount, we can toughen this assert up and insist that the
1760          * iocount is non-zero... a non-zero usecount doesn't insure correctness
1761          */
1762         if (vp->v_iocount <= 0 && vp->v_usecount <= 0) {
1763                 panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
1764         }
1765
1766         /*
1767          * if you are the owner of drain/termination, can acquire usecount
1768          */
1769         if ((flags & VNODE_REF_FORCE) == 0) {
1770                 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
1771                         if (vp->v_owner != current_thread()) {
1772                                 error = ENOENT;
1773                                 goto out;
1774                         }
1775                 }
1776         }
1777         vp->v_usecount++;
1778
1779         if (fmode & FWRITE) {
1780                 if (++vp->v_writecount <= 0) {
1781                         panic("vnode_ref_ext: v_writecount");
1782                 }
1783         }
1784         if (fmode & O_EVTONLY) {
1785                 if (++vp->v_kusecount <= 0) {
1786                         panic("vnode_ref_ext: v_kusecount");
1787                 }
1788         }
1789         if (vp->v_flag & VRAGE) {
1790                 struct  uthread *ut;
1791
1792                 ut = get_bsdthread_info(current_thread());
1793
1794                 if (!(current_proc()->p_lflag & P_LRAGE_VNODES) &&
1795                     !(ut->uu_flag & UT_RAGE_VNODES)) {
1796                         /*
1797                          * a 'normal' process accessed this vnode
1798                          * so make sure its no longer marked
1799                          * for rapid aging...  also, make sure
1800                          * it gets removed from the rage list...
1801                          * when v_usecount drops back to 0, it
1802                          * will be put back on the real free list
1803                          */
1804                         vp->v_flag &= ~VRAGE;
1805                         vp->v_references = 0;
1806                         vnode_list_remove(vp);
1807                 }
1808         }
1809         if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
1810                 if (vp->v_ubcinfo) {
1811                         vnode_lock_convert(vp);
1812                         memory_object_mark_used(vp->v_ubcinfo->ui_control);
1813                 }
1814         }
1815 out:
1816         vnode_unlock(vp);
1817
1818         return error;
1819 }
1820
1821
1822 boolean_t
1823 vnode_on_reliable_media(vnode_t vp)
1824 {
1825         if (!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL)) {
1826                 return TRUE;
1827         }
1828         return FALSE;
1829 }
1830
1831 static void
1832 vnode_async_list_add(vnode_t vp)
1833 {
1834         vnode_list_lock();
1835
1836         if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
1837                 panic("vnode_async_list_add: %p is in wrong state", vp);
1838         }
1839
1840         TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
1841         vp->v_listflag |= VLIST_ASYNC_WORK;
1842
1843         async_work_vnodes++;
1844
1845         vnode_list_unlock();
1846
1847         wakeup(&vnode_async_work_list);
1848 }
1849
1850
1851 /*
1852  * put the vnode on appropriate free list.
1853  * called with vnode LOCKED
1854  */
1855 static void
1856 vnode_list_add(vnode_t vp)
1857 {
1858         boolean_t need_dead_wakeup = FALSE;
1859
1860 #if DIAGNOSTIC
1861         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1862 #endif
1863
1864 again:
1865
1866         /*
1867          * if it is already on a list or non zero references return
1868          */
1869         if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) {
1870                 return;
1871         }
1872
1873         /*
1874          * In vclean, we might have deferred ditching locked buffers
1875          * because something was still referencing them (indicated by
1876          * usecount).  We can ditch them now.
1877          */
1878         if (ISSET(vp->v_lflag, VL_DEAD)
1879             && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) {
1880                 ++vp->v_iocount;        // Probably not necessary, but harmless
1881 #ifdef JOE_DEBUG
1882                 record_vp(vp, 1);
1883 #endif
1884                 vnode_unlock(vp);
1885                 buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
1886                 vnode_lock(vp);
1887                 vnode_dropiocount(vp);
1888                 goto again;
1889         }
1890
1891         vnode_list_lock();
1892
1893         if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
1894                 /*
1895                  * add the new guy to the appropriate end of the RAGE list
1896                  */
1897                 if ((vp->v_flag & VAGE)) {
1898                         TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
1899                 } else {
1900                         TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
1901                 }
1902
1903                 vp->v_listflag |= VLIST_RAGE;
1904                 ragevnodes++;
1905
1906                 /*
1907                  * reset the timestamp for the last inserted vp on the RAGE
1908                  * queue to let new_vnode know that its not ok to start stealing
1909                  * from this list... as long as we're actively adding to this list
1910                  * we'll push out the vnodes we want to donate to the real free list
1911                  * once we stop pushing, we'll let some time elapse before we start
1912                  * stealing them in the new_vnode routine
1913                  */
1914                 microuptime(&rage_tv);
1915         } else {
1916                 /*
1917                  * if VL_DEAD, insert it at head of the dead list
1918                  * else insert at tail of LRU list or at head if VAGE is set
1919                  */
1920                 if ((vp->v_lflag & VL_DEAD)) {
1921                         TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
1922                         vp->v_listflag |= VLIST_DEAD;
1923                         deadvnodes++;
1924
1925                         if (dead_vnode_wanted) {
1926                                 dead_vnode_wanted--;
1927                                 need_dead_wakeup = TRUE;
1928                         }
1929                 } else if ((vp->v_flag & VAGE)) {
1930                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1931                         vp->v_flag &= ~VAGE;
1932                         freevnodes++;
1933                 } else {
1934                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1935                         freevnodes++;
1936                 }
1937         }
1938         vnode_list_unlock();
1939
1940         if (need_dead_wakeup == TRUE) {
1941                 wakeup_one((caddr_t)&dead_vnode_wanted);
1942         }
1943 }
1944
1945
1946 /*
1947  * remove the vnode from appropriate free list.
1948  * called with vnode LOCKED and
1949  * the list lock held
1950  */
1951 static void
1952 vnode_list_remove_locked(vnode_t vp)
1953 {
1954         if (VONLIST(vp)) {
1955                 /*
1956                  * the v_listflag field is
1957                  * protected by the vnode_list_lock
1958                  */
1959                 if (vp->v_listflag & VLIST_RAGE) {
1960                         VREMRAGE("vnode_list_remove", vp);
1961                 } else if (vp->v_listflag & VLIST_DEAD) {
1962                         VREMDEAD("vnode_list_remove", vp);
1963                 } else if (vp->v_listflag & VLIST_ASYNC_WORK) {
1964                         VREMASYNC_WORK("vnode_list_remove", vp);
1965                 } else {
1966                         VREMFREE("vnode_list_remove", vp);
1967                 }
1968         }
1969 }
1970
1971
1972 /*
1973  * remove the vnode from appropriate free list.
1974  * called with vnode LOCKED
1975  */
1976 static void
1977 vnode_list_remove(vnode_t vp)
1978 {
1979 #if DIAGNOSTIC
1980         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1981 #endif
1982         /*
1983          * we want to avoid taking the list lock
1984          * in the case where we're not on the free
1985          * list... this will be true for most
1986          * directories and any currently in use files
1987          *
1988          * we're guaranteed that we can't go from
1989          * the not-on-list state to the on-list
1990          * state since we hold the vnode lock...
1991          * all calls to vnode_list_add are done
1992          * under the vnode lock... so we can
1993          * check for that condition (the prevelant one)
1994          * without taking the list lock
1995          */
1996         if (VONLIST(vp)) {
1997                 vnode_list_lock();
1998                 /*
1999                  * however, we're not guaranteed that
2000                  * we won't go from the on-list state
2001                  * to the not-on-list state until we
2002                  * hold the vnode_list_lock... this
2003                  * is due to "new_vnode" removing vnodes
2004                  * from the free list uder the list_lock
2005                  * w/o the vnode lock... so we need to
2006                  * check again whether we're currently
2007                  * on the free list
2008                  */
2009                 vnode_list_remove_locked(vp);
2010
2011                 vnode_list_unlock();
2012         }
2013 }
2014
2015
2016 void
2017 vnode_rele(vnode_t vp)
2018 {
2019         vnode_rele_internal(vp, 0, 0, 0);
2020 }
2021
2022
2023 void
2024 vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
2025 {
2026         vnode_rele_internal(vp, fmode, dont_reenter, 0);
2027 }
2028
2029
/*
 * Drop one usecount on vp.  When fmode contains FWRITE and/or O_EVTONLY
 * the matching v_writecount / v_kusecount is dropped as well.
 *
 * Parameters:
 *      vp              vnode whose usecount is being released
 *      fmode           open-mode bits; only FWRITE and O_EVTONLY are examined
 *      dont_reenter    non-zero: do not call VNOP_INACTIVE from this release;
 *                      the vnode is flagged VL_NEEDINACTIVE instead (unless it
 *                      is terminating, dead or marked for termination) and is
 *                      queued on the async work list or on a free list
 *      locked          non-zero: the caller already holds the vnode lock and
 *                      still holds it on return; zero: the lock is taken
 *                      (spin) here and released before returning
 *
 * Panics if one of the counts goes negative or if v_kusecount ends up
 * larger than v_usecount.
 *
 * Note: even when 'locked' is non-zero, the vnode lock is dropped and
 * re-taken around the VNOP_INACTIVE call below.
 */
2030 void
2031 vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
2032 {
2033         if (!locked) {
2034                 vnode_lock_spin(vp);
2035         }
2036 #if DIAGNOSTIC
2037         else {
2038                 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
2039         }
2040 #endif
2041         if (--vp->v_usecount < 0) {
2042                 panic("vnode_rele_ext: vp %p usecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
2043         }
2044
2045         if (fmode & FWRITE) {
2046                 if (--vp->v_writecount < 0) {
2047                         panic("vnode_rele_ext: vp %p writecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
2048                 }
2049         }
2050         if (fmode & O_EVTONLY) {
2051                 if (--vp->v_kusecount < 0) {
2052                         panic("vnode_rele_ext: vp %p kusecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
2053                 }
2054         }
             /* v_kusecount must never exceed v_usecount */
2055         if (vp->v_kusecount > vp->v_usecount) {
2056                 panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d).  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
2057         }
2058
2059         if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
2060                 /*
2061                  * vnode is still busy... if we're the last
2062                  * usecount, mark for a future call to VNOP_INACTIVE
2063                  * when the iocount finally drops to 0
2064                  */
2065                 if (vp->v_usecount == 0) {
2066                         vp->v_lflag |= VL_NEEDINACTIVE;
2067                         vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
2068                 }
2069                 goto done;
2070         }
             /* both counts are zero from here on */
2071         vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
2072
2073         if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) {
2074                 /*
2075                  * vnode is being cleaned, or
2076                  * we've requested that we don't reenter
2077                  * the filesystem on this release...in
2078                  * the latter case, we'll mark the vnode aged
2079                  */
2080                 if (dont_reenter) {
2081                         if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) {
2082                                 vp->v_lflag |= VL_NEEDINACTIVE;
2083
                                     /*
                                      * vnodes on media that is not "reliable" (see
                                      * vnode_on_reliable_media) and dirty vnodes are
                                      * handed to the async work list instead of
                                      * going onto a free list
                                      */
2084                                 if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) {
2085                                         vnode_async_list_add(vp);
2086                                         goto done;
2087                                 }
2088                         }
2089                         vp->v_flag |= VAGE;
2090                 }
2091                 vnode_list_add(vp);
2092
2093                 goto done;
2094         }
2095         /*
2096          * at this point both the iocount and usecount
2097          * are zero
2098          * pick up an iocount so that we can call
2099          * VNOP_INACTIVE with the vnode lock unheld
2100          */
2101         vp->v_iocount++;
2102 #ifdef JOE_DEBUG
2103         record_vp(vp, 1);
2104 #endif
2105         vp->v_lflag &= ~VL_NEEDINACTIVE;
2106         vnode_unlock(vp);
2107
2108         VNOP_INACTIVE(vp, vfs_context_current());
2109
2110         vnode_lock_spin(vp);
2111         /*
2112          * because we dropped the vnode lock to call VNOP_INACTIVE
2113          * the state of the vnode may have changed... we may have
2114          * picked up an iocount, usecount or the MARKTERM may have
2115          * been set... we need to reevaluate the reference counts
2116          * to determine if we can call vnode_reclaim_internal at
2117          * this point... if the reference counts are up, we'll pick
2118          * up the MARKTERM state when they get subsequently dropped
2119          */
2120         if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
2121             ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
2122                 struct  uthread *ut;
2123
2124                 ut = get_bsdthread_info(current_thread());
2125
                     /*
                      * this thread is deferring reclaims: chain vp onto its
                      * uu_vreclaims list instead of reclaiming here.  Note
                      * that the iocount taken above is NOT dropped on this
                      * path (the jump skips vnode_dropiocount below).
                      * NOTE(review): presumably whoever drains uu_vreclaims
                      * drops it -- confirm against that code.
                      */
2126                 if (ut->uu_defer_reclaims) {
2127                         vp->v_defer_reclaimlist = ut->uu_vreclaims;
2128                         ut->uu_vreclaims = vp;
2129                         goto done;
2130                 }
2131                 vnode_lock_convert(vp);
2132                 vnode_reclaim_internal(vp, 1, 1, 0);
2133         }
             /* give back the iocount taken for VNOP_INACTIVE, then try to list the vnode */
2134         vnode_dropiocount(vp);
2135         vnode_list_add(vp);
2136 done:
             /*
              * no usecount left on a regular, non-system file that has ubc
              * info: report the memory object as unused; the second argument
              * is TRUE when the vnode is marked VRAGE
              */
2137         if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
2138                 if (vp->v_ubcinfo) {
2139                         vnode_lock_convert(vp);
2140                         memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
2141                 }
2142         }
             /* only release the lock if we were the ones who took it */
2143         if (!locked) {
2144                 vnode_unlock(vp);
2145         }
2146         return;
2147 }
2148
2149 /*
2150  * Remove any vnodes in the vnode table belonging to mount point mp.
2151  *
2152  * If MNT_NOFORCE is specified, there should not be any active ones,
2153  * return error if any are found (nb: this is a user error, not a
2154  * system error). If MNT_FORCE is specified, detach any active vnodes
2155  * that are found.
      *
      * Parameters:
      *      mp      mount whose vnodes are to be flushed
      *      skipvp  vnode that is left untouched
      *      flags   FORCECLOSE, SKIPSYSTEM, SKIPSWAP, SKIPROOT and WRITECLOSE
      *              are the bits examined here
      *
      * Returns:     0       nothing to iterate, or no busy vnode left behind
      *              EBUSY   FORCECLOSE is not set and busy vnodes were found
      *                      (either by the preflight check or by the scan)
      *
      * The mount iterate lock and the mount lock are taken here and are
      * both released on every return path.
2156  */
2157
2158 int
2159 vflush(struct mount *mp, struct vnode *skipvp, int flags)
2160 {
2161         struct vnode *vp;
2162         int busy = 0;           /* vnodes found busy in the current pass */
2163         int reclaimed = 0;      /* vnodes reclaimed (usecount path) in the current pass */
2164         int retval;
2165         unsigned int vid;
2166         bool first_try = true;  /* busy vnodes are only logged on the first pass */
2167
2168         /*
2169          * See comments in vnode_iterate() for the rationale for this lock
2170          */
2171         mount_iterate_lock(mp);
2172
2173         mount_lock(mp);
2174         vnode_iterate_setup(mp);
2175         /*
2176          * On regular unmounts(not forced) do a
2177          * quick check for vnodes to be in use. This
2178          * preserves the caching of vnodes. automounter
2179          * tries unmounting every so often to see whether
2180          * it is still busy or not.
2181          */
2182         if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
2183                 if (vnode_umount_preflight(mp, skipvp, flags)) {
2184                         vnode_iterate_clear(mp);
2185                         mount_unlock(mp);
2186                         mount_iterate_unlock(mp);
2187                         return EBUSY;
2188                 }
2189         }
2190 loop:
2191         /* If it returns 0 then there is nothing to do */
2192         retval = vnode_iterate_prepare(mp);
2193
2194         if (retval == 0) {
2195                 vnode_iterate_clear(mp);
2196                 mount_unlock(mp);
2197                 mount_iterate_unlock(mp);
2198                 return retval;
2199         }
2200
2201         /* iterate over all the vnodes */
2202         while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
                     /*
                      * take the next vnode off the worker queue and put it
                      * back at the tail of the mount's vnode list before
                      * looking at it (the mount lock is held here)
                      */
2203                 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
2204                 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
2205                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
2206
2207                 if ((vp->v_mount != mp) || (vp == skipvp)) {
2208                         continue;
2209                 }
                     /*
                      * remember v_id before dropping the mount lock; it is
                      * compared again once the vnode lock is held to notice
                      * a v_id change in between
                      */
2210                 vid = vp->v_id;
2211                 mount_unlock(mp);
2212
2213                 vnode_lock_spin(vp);
2214
2215                 // If vnode is already terminating, wait for it...
2216                 while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
2217                         vp->v_lflag |= VL_TERMWANT;
2218                         msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
2219                 }
2220
                     /* v_id changed or the vnode is already dead: nothing to do for it */
2221                 if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) {
2222                         vnode_unlock(vp);
2223                         mount_lock(mp);
2224                         continue;
2225                 }
2226
2227                 /*
2228                  * If requested, skip over vnodes marked VSYSTEM.
2229                  * Skip over all vnodes marked VNOFLUSH.
                      *
                      * NOTE(review): as coded, VNOFLUSH vnodes are only
                      * skipped when SKIPSYSTEM is set, not unconditionally.
2230                  */
2231                 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
2232                     (vp->v_flag & VNOFLUSH))) {
2233                         vnode_unlock(vp);
2234                         mount_lock(mp);
2235                         continue;
2236                 }
2237                 /*
2238                  * If requested, skip over vnodes marked VSWAP.
2239                  */
2240                 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
2241                         vnode_unlock(vp);
2242                         mount_lock(mp);
2243                         continue;
2244                 }
2245                 /*
2246                  * If requested, skip over vnodes marked VROOT.
2247                  */
2248                 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
2249                         vnode_unlock(vp);
2250                         mount_lock(mp);
2251                         continue;
2252                 }
2253                 /*
2254                  * If WRITECLOSE is set, only flush out regular file
2255                  * vnodes open for writing.
2256                  */
2257                 if ((flags & WRITECLOSE) &&
2258                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
2259                         vnode_unlock(vp);
2260                         mount_lock(mp);
2261                         continue;
2262                 }
2263                 /*
2264                  * If the real usecount is 0, all we need to do is clear
2265                  * out the vnode data structures and we are done.
                      * ("real" meaning v_usecount minus v_kusecount, i.e. not
                      * counting the references taken with O_EVTONLY)
2266                  */
2267                 if (((vp->v_usecount == 0) ||
2268                     ((vp->v_usecount - vp->v_kusecount) == 0))) {
2269                         vnode_lock_convert(vp);
2270                         vp->v_iocount++;        /* so that drain waits for other iocounts */
2271 #ifdef JOE_DEBUG
2272                         record_vp(vp, 1);
2273 #endif
2274                         vnode_reclaim_internal(vp, 1, 1, 0);
2275                         vnode_dropiocount(vp);
2276                         vnode_list_add(vp);
2277                         vnode_unlock(vp);
2278
2279                         reclaimed++;
2280                         mount_lock(mp);
2281                         continue;
2282                 }
2283                 /*
2284                  * If FORCECLOSE is set, forcibly close the vnode.
2285                  * For block or character devices, revert to an
2286                  * anonymous device. For all other files, just kill them.
2287                  */
2288                 if (flags & FORCECLOSE) {
2289                         vnode_lock_convert(vp);
2290
2291                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
2292                                 vp->v_iocount++;        /* so that drain waits for other iocounts */
2293 #ifdef JOE_DEBUG
2294                                 record_vp(vp, 1);
2295 #endif
2296                                 vnode_abort_advlocks(vp);
2297                                 vnode_reclaim_internal(vp, 1, 1, 0);
2298                                 vnode_dropiocount(vp);
2299                                 vnode_list_add(vp);
2300                                 vnode_unlock(vp);
2301                         } else {
                                     /*
                                      * device node: clean it, then undo the dead
                                      * marking and point it at the spec vnode ops
                                      */
2302                                 vclean(vp, 0);
2303                                 vp->v_lflag &= ~VL_DEAD;
2304                                 vp->v_op = spec_vnodeop_p;
2305                                 vp->v_flag |= VDEVFLUSH;
2306                                 vnode_unlock(vp);
2307                         }
2308                         mount_lock(mp);
2309                         continue;
2310                 }
2311
2312                 /* log vnodes blocking unforced unmounts */
2313                 if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) {
2314                         vprint("vflush - busy vnode", vp);
2315                 }
2316
2317                 vnode_unlock(vp);
2318                 mount_lock(mp);
2319                 busy++;
2320         }
2321
2322         /* At this point the worker queue is completed */
             /*
              * unforced flush that both hit busy vnodes and reclaimed some:
              * reset the counters and scan again
              */
2323         if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
2324                 busy = 0;
2325                 reclaimed = 0;
2326                 (void)vnode_iterate_reloadq(mp);
2327                 first_try = false;
2328                 /* returned with mount lock held */
2329                 goto loop;
2330         }
2331
2332         /* if new vnodes were created in between retry the reclaim */
             /* ...except when an unforced flush is going to fail with EBUSY anyway */
2333         if (vnode_iterate_reloadq(mp) != 0) {
2334                 if (!(busy && ((flags & FORCECLOSE) == 0))) {
2335                         first_try = false;
2336                         goto loop;
2337                 }
2338         }
2339         vnode_iterate_clear(mp);
2340         mount_unlock(mp);
2341         mount_iterate_unlock(mp);
2342
2343         if (busy && ((flags & FORCECLOSE) == 0)) {
2344                 return EBUSY;
2345         }
2346         return 0;
2347 }
2348
2349 long num_recycledvnodes = 0;       /* atomically incremented once per vclean() call */
2350 /*
2351  * Disassociate the underlying file system from a vnode.
2352  * The vnode lock is held on entry.
      *
      * The vnode lock is dropped while the file system is called and is
      * re-taken before the vnode is marked dead; it is held again on return.
      *
      * Parameters:
      *      vp      vnode to clean
      *      flags   DOCLOSE:   close the vnode if it has a usecount, and flush
      *                         and invalidate its buffers and ubc pages
      *              REVOKEALL: pass IO_REVOKE to VNOP_CLOSE
      *
      * On return the vnode points at dead_mountp / dead_vnodeop_p and has
      * VL_DEAD set.  Panics if VNOP_RECLAIM fails.
2353  */
2354 static void
2355 vclean(vnode_t vp, int flags)
2356 {
2357         vfs_context_t ctx = vfs_context_current();
2358         int active;
2359         int need_inactive;
2360         int already_terminating;
2361         int clflags = 0;
2362 #if NAMEDSTREAMS
2363         int is_namedstream;
2364 #endif
2365
2366         /*
2367          * Check to see if the vnode is in use.
2368          * If so we have to reference it before we clean it out
2369          * so that its count cannot fall to zero and generate a
2370          * race against ourselves to recycle it.
2371          */
2372         active = vp->v_usecount;
2373
2374         /*
2375          * just in case we missed sending a needed
2376          * VNOP_INACTIVE, we'll do it now
2377          */
2378         need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
2379
2380         vp->v_lflag &= ~VL_NEEDINACTIVE;
2381
2382         /*
2383          * Prevent the vnode from being recycled or
2384          * brought into use while we clean it out.
2385          */
             /* remember whether someone else had already set VL_TERMINATE */
2386         already_terminating = (vp->v_lflag & VL_TERMINATE);
2387
2388         vp->v_lflag |= VL_TERMINATE;
2389
2390 #if NAMEDSTREAMS
2391         is_namedstream = vnode_isnamedstream(vp);
2392 #endif
2393
             /* everything up to the vnode_lock() below runs without the vnode lock */
2394         vnode_unlock(vp);
2395
2396         OSAddAtomicLong(1, &num_recycledvnodes);
2397
             /* translate our flags into the ioflags handed to VNOP_CLOSE */
2398         if (flags & DOCLOSE) {
2399                 clflags |= IO_NDELAY;
2400         }
2401         if (flags & REVOKEALL) {
2402                 clflags |= IO_REVOKE;
2403         }
2404
2405         if (active && (flags & DOCLOSE)) {
2406                 VNOP_CLOSE(vp, clflags, ctx);
2407         }
2408
2409         /*
2410          * Clean out any buffers associated with the vnode.
2411          */
2412         if (flags & DOCLOSE) {
2413 #if CONFIG_NFS_CLIENT
2414                 if (vp->v_tag == VT_NFS) {
2415                         nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
2416                 } else
2417 #endif /* CONFIG_NFS_CLIENT */
2418                 {
2419                         VNOP_FSYNC(vp, MNT_WAIT, ctx);
2420
2421                         /*
2422                          * If the vnode is still in use (by the journal for
2423                          * example) we don't want to invalidate locked buffers
2424                          * here.  In that case, either the journal will tidy them
2425                          * up, or we will deal with it when the usecount is
2426                          * finally released in vnode_rele_internal.
2427                          */
2428                         buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
2429                 }
2430                 if (UBCINFOEXISTS(vp)) {
2431                         /*
2432                          * Clean the pages in VM.
2433                          */
2434                         (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
2435                 }
2436         }
2437         if (active || need_inactive) {
2438                 VNOP_INACTIVE(vp, ctx);
2439         }
2440
2441 #if NAMEDSTREAMS
2442         if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
2443                 vnode_t pvp = vp->v_parent;
2444
2445                 /* Delete the shadow stream file before we reclaim its vnode */
2446                 if (vnode_isshadow(vp)) {
2447                         vnode_relenamedstream(pvp, vp);
2448                 }
2449
2450                 /*
2451                  * No more streams associated with the parent.  We
2452                  * have a ref on it, so its identity is stable.
2453                  * If the parent is on an opaque volume, then we need to know
2454                  * whether it has associated named streams.
2455                  */
2456                 if (vfs_authopaque(pvp->v_mount)) {
2457                         vnode_lock_spin(pvp);
2458                         pvp->v_lflag &= ~VL_HASSTREAMS;
2459                         vnode_unlock(pvp);
2460                 }
2461         }
2462 #endif
2463
2464         /*
2465          * Destroy ubc named reference
2466          * cluster_release is done on this path
2467          * along with dropping the reference on the ucred
2468          * (and in the case of forced unmount of an mmap-ed file,
2469          * the ubc reference on the vnode is dropped here too).
2470          */
2471         ubc_destroy_named(vp);
2472
2473 #if CONFIG_TRIGGERS
2474         /*
2475          * cleanup trigger info from vnode (if any)
2476          */
2477         if (vp->v_resolve) {
2478                 vnode_resolver_detach(vp);
2479         }
2480 #endif
2481
2482         /*
2483          * Reclaim the vnode.
2484          */
2485         if (VNOP_RECLAIM(vp, ctx)) {
2486                 panic("vclean: cannot reclaim");
2487         }
2488
2489         // make sure the name & parent ptrs get cleaned out!
2490         vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE | VNODE_UPDATE_PURGEFIRMLINK);
2491
             /* re-take the vnode lock; it stays held through the return */
2492         vnode_lock(vp);
2493
2494         /*
2495          * Remove the vnode from any mount list it might be on.  It is not
2496          * safe to do this any earlier because unmount needs to wait for
2497          * any vnodes to terminate and it cannot do that if it cannot find
2498          * them.
2499          */
2500         insmntque(vp, (struct mount *)0);
2501
             /* from here on the vnode is a dead vnode with no fs-private data */
2502         vp->v_mount = dead_mountp;
2503         vp->v_op = dead_vnodeop_p;
2504         vp->v_tag = VT_NON;
2505         vp->v_data = NULL;
2506
2507         vp->v_lflag |= VL_DEAD;
2508         vp->v_flag &= ~VISDIRTY;
2509
             /*
              * only clear VL_TERMINATE (and wake VL_TERMWANT sleepers) if this
              * call was the one that set it
              */
2510         if (already_terminating == 0) {
2511                 vp->v_lflag &= ~VL_TERMINATE;
2512                 /*
2513                  * Done with purge, notify sleepers of the grim news.
2514                  */
2515                 if (vp->v_lflag & VL_TERMWANT) {
2516                         vp->v_lflag &= ~VL_TERMWANT;
2517                         wakeup(&vp->v_lflag);
2518                 }
2519         }
2520 }
2521
2522 /*
2523  * Eliminate all activity associated with  the requested vnode
2524  * and with all vnodes aliased to the requested vnode.
      *
      * Parameters:
      *      vp          vnode to revoke
      *      flags       only examined in DIAGNOSTIC builds, where a value
      *                  without REVOKEALL is a panic
      *      a_context   unused
      *
      * Returns:     0                       Success
      *              ENOENT                  vp is already being terminated
2525  */
2526 int
2527 #if DIAGNOSTIC
2528 vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
2529 #else
2530 vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
2531 #endif
2532 {
2533         struct vnode *vq;
2534         int vid;
2535
2536 #if DIAGNOSTIC
2537         if ((flags & REVOKEALL) == 0) {
2538                 panic("vnop_revoke");
2539         }
2540 #endif
2541
2542         if (vnode_isaliased(vp)) {
2543                 /*
2544                  * If a vgone (or vclean) is already in progress,
2545                  * return an immediate error
2546                  */
2547                 if (vp->v_lflag & VL_TERMINATE) {
2548                         return ENOENT;
2549                 }
2550
2551                 /*
2552                  * Ensure that vp will not be vgone'd while we
2553                  * are eliminating its aliases.
2554                  */
2555                 SPECHASH_LOCK();
2556                 while ((vp->v_specflags & SI_ALIASED)) {
                             /*
                              * Look for one alias of vp (same v_rdev and v_type,
                              * not vp itself) on vp's hash chain.  The hash lock
                              * is dropped to work on it, so after handling one
                              * alias we break out and the scan restarts from the
                              * head of the chain.
                              */
2557                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2558                                 if (vq->v_rdev != vp->v_rdev ||
2559                                     vq->v_type != vp->v_type || vp == vq) {
2560                                         continue;
2561                                 }
2562                                 vid = vq->v_id;
2563                                 SPECHASH_UNLOCK();
                                     /* could not get the alias with that vid: rescan */
2564                                 if (vnode_getwithvid(vq, vid)) {
2565                                         SPECHASH_LOCK();
2566                                         break;
2567                                 }
2568                                 vnode_lock(vq);
                                     /* reclaim the alias unless it is already terminating */
2569                                 if (!(vq->v_lflag & VL_TERMINATE)) {
2570                                         vnode_reclaim_internal(vq, 1, 1, 0);
2571                                 }
2572                                 vnode_put_locked(vq);
2573                                 vnode_unlock(vq);
2574                                 SPECHASH_LOCK();
2575                                 break;
2576                         }
2577                 }
2578                 SPECHASH_UNLOCK();
2579         }
             /* finally reclaim vp itself, unless it started terminating meanwhile */
2580         vnode_lock(vp);
2581         if (vp->v_lflag & VL_TERMINATE) {
2582                 vnode_unlock(vp);
2583                 return ENOENT;
2584         }
2585         vnode_reclaim_internal(vp, 1, 0, REVOKEALL);
2586         vnode_unlock(vp);
2587
2588         return 0;
2589 }
2590
2591 /*
2592  * Recycle an unused vnode to the front of the free list.
2593  * Release the passed interlock if the vnode will be recycled.
2594  */
2595 int
2596 vnode_recycle(struct vnode *vp)
2597 {
2598         vnode_lock_spin(vp);
2599
2600         if (vp->v_iocount || vp->v_usecount) {
2601                 vp->v_lflag |= VL_MARKTERM;
2602                 vnode_unlock(vp);
2603                 return 0;
2604         }
2605         vnode_lock_convert(vp);
2606         vnode_reclaim_internal(vp, 1, 0, 0);
2607
2608         vnode_unlock(vp);
2609
2610         return 1;
2611 }
2612
2613 static int
2614 vnode_reload(vnode_t vp)
2615 {
2616         vnode_lock_spin(vp);
2617
2618         if ((vp->v_iocount > 1) || vp->v_usecount) {
2619                 vnode_unlock(vp);
2620                 return 0;
2621         }
2622         if (vp->v_iocount <= 0) {
2623                 panic("vnode_reload with no iocount %d", vp->v_iocount);
2624         }
2625
2626         /* mark for release when iocount is dopped */
2627         vp->v_lflag |= VL_MARKTERM;
2628         vnode_unlock(vp);
2629
2630         return 1;
2631 }
2632
2633
/*
 * vgone: tear down vp.
 *
 * Runs vclean() with DOCLOSE added to the caller's flags.  Then, for a VBLK
 * or VCHR vnode that still has a specinfo, unlinks vp from its special
 * device hash chain, updates the SI_ALIASED bookkeeping and frees the
 * specinfo.
 */
2634 static void
2635 vgone(vnode_t vp, int flags)
2636 {
2637         struct vnode *vq;       /* chain cursor */
2638         struct vnode *vx;       /* first remaining alias of vp's device */
2639
2640         /*
2641          * Clean out the filesystem specific data.
2642          * vclean also takes care of removing the
2643          * vnode from any mount list it might be on
2644          */
2645         vclean(vp, flags | DOCLOSE);
2646
2647         /*
2648          * If special device, remove it from special device alias list
2649          * if it is on one.
2650          */
2651         if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
2652                 SPECHASH_LOCK();
                     /*
                      * Unlink vp from the singly linked chain: either it is the
                      * head, or its predecessor has to be found first.
                      */
2653                 if (*vp->v_hashchain == vp) {
2654                         *vp->v_hashchain = vp->v_specnext;
2655                 } else {
2656                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2657                                 if (vq->v_specnext != vp) {
2658                                         continue;
2659                                 }
2660                                 vq->v_specnext = vp->v_specnext;
2661                                 break;
2662                         }
                             /* the whole chain was walked without finding a predecessor */
2663                         if (vq == NULL) {
2664                                 panic("missing bdev");
2665                         }
2666                 }
2667                 if (vp->v_specflags & SI_ALIASED) {
                             /*
                              * vp is already off the chain.  Look for the vnodes that
                              * still refer to the same device: vx becomes the first
                              * match, and the loop is left early (vq != NULL) only
                              * when a second match exists.
                              */
2668                         vx = NULL;
2669                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2670                                 if (vq->v_rdev != vp->v_rdev ||
2671                                     vq->v_type != vp->v_type) {
2672                                         continue;
2673                                 }
2674                                 if (vx) {
2675                                         break;
2676                                 }
2677                                 vx = vq;
2678                         }
2679                         if (vx == NULL) {
2680                                 panic("missing alias");
2681                         }
                             /* exactly one vnode is left for this device: it is no longer aliased */
2682                         if (vq == NULL) {
2683                                 vx->v_specflags &= ~SI_ALIASED;
2684                         }
2685                         vp->v_specflags &= ~SI_ALIASED;
2686                 }
2687                 SPECHASH_UNLOCK();
                     /*
                      * Detach the specinfo from the vnode before freeing it; this
                      * happens after the hash lock has been released.
                      */
2688                 {
2689                         struct specinfo *tmp = vp->v_specinfo;
2690                         vp->v_specinfo = NULL;
2691                         FREE_ZONE(tmp, sizeof(struct specinfo), M_SPECINFO);
2692                 }
2693         }
2694 }
2695
2696 /*
2697  * Lookup a vnode by device number.
2698  */
2699 int
2700 check_mountedon(dev_t dev, enum vtype type, int  *errorp)
2701 {
2702         vnode_t vp;
2703         int rc = 0;
2704         int vid;
2705
2706 loop:
2707         SPECHASH_LOCK();
2708         for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
2709                 if (dev != vp->v_rdev || type != vp->v_type) {
2710                         continue;
2711                 }
2712                 vid = vp->v_id;
2713                 SPECHASH_UNLOCK();
2714                 if (vnode_getwithvid(vp, vid)) {
2715                         goto loop;
2716                 }
2717                 vnode_lock_spin(vp);
2718                 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
2719                         vnode_unlock(vp);
2720                         if ((*errorp = vfs_mountedon(vp)) != 0) {
2721                                 rc = 1;
2722                         }
2723                 } else {
2724                         vnode_unlock(vp);
2725                 }
2726                 vnode_put(vp);
2727                 return rc;
2728         }
2729         SPECHASH_UNLOCK();
2730         return 0;
2731 }
2732
2733 /*
2734  * Calculate the total number of references to a special device.
2735  */
/*
 * vcount: number of references to vp.
 *
 *  - vp is not a special device (vnode_isspec() is false):
 *        v_usecount minus v_kusecount.
 *  - special device that is not aliased:
 *        its si_opencount.
 *  - aliased special device:
 *        the sum of si_opencount over every vnode on vp's hash chain whose
 *        v_rdev and v_type match vp.  An alias other than vp that is found
 *        with no usecount and a single iocount is reclaimed, after which
 *        the whole computation restarts.
 */
2736 int
2737 vcount(vnode_t vp)
2738 {
2739         vnode_t vq, vnext;
2740         int count;
2741         int vid;
2742
2743         if (!vnode_isspec(vp)) {
2744                 return vp->v_usecount - vp->v_kusecount;
2745         }
2746
     /* restart point: the alias state is re-evaluated and count is reset below */
2747 loop:
2748         if (!vnode_isaliased(vp)) {
2749                 return vp->v_specinfo->si_opencount;
2750         }
2751         count = 0;
2752
2753         SPECHASH_LOCK();
2754         /*
2755          * Grab first vnode and its vid.
2756          */
2757         vq = *vp->v_hashchain;
2758         vid = vq ? vq->v_id : 0;
2759
2760         SPECHASH_UNLOCK();
2761
2762         while (vq) {
2763                 /*
2764                  * Attempt to get the vnode outside the SPECHASH lock.
2765                  */
2766                 if (vnode_getwithvid(vq, vid)) {
2767                         goto loop;
2768                 }
2769                 vnode_lock(vq);
2770
2771                 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
                             /*
                              * v_iocount == 1 is presumably just the reference taken
                              * by vnode_getwithvid() above - TODO confirm.
                              */
2772                         if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
2773                                 /*
2774                                  * Alias, but not in use, so flush it out.
2775                                  */
2776                                 vnode_reclaim_internal(vq, 1, 1, 0);
2777                                 vnode_put_locked(vq);
2778                                 vnode_unlock(vq);
2779                                 goto loop;
2780                         }
2781                         count += vq->v_specinfo->si_opencount;
2782                 }
2783                 vnode_unlock(vq);
2784
2785                 SPECHASH_LOCK();
2786                 /*
2787                  * must do this with the reference still held on 'vq'
2788                  * so that it can't be destroyed while we're poking
2789                  * through v_specnext
2790                  */
2791                 vnext = vq->v_specnext;
2792                 vid = vnext ? vnext->v_id : 0;
2793
2794                 SPECHASH_UNLOCK();
2795
                     /* the successor and its vid are recorded; vq can be released now */
2796                 vnode_put(vq);
2797
2798                 vq = vnext;
2799         }
2800
2801         return count;
2802 }
2803
2804 int     prtactive = 0;          /* 1 => print out reclaim of active vnodes */
2805
2806 /*
2807  * Print out a description of a vnode.
2808  */
2809 static const char *typename[] =
2810 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
2811
2812 void
2813 vprint(const char *label, struct vnode *vp)
2814 {
2815         char sbuf[64];
2816
2817         if (label != NULL) {
2818                 printf("%s: ", label);
2819         }
2820         printf("name %s type %s, usecount %d, writecount %d\n",
2821             vp->v_name, typename[vp->v_type],
2822             vp->v_usecount, vp->v_writecount);
2823         sbuf[0] = '\0';
2824         if (vp->v_flag & VROOT) {
2825                 strlcat(sbuf, "|VROOT", sizeof(sbuf));
2826         }
2827         if (vp->v_flag & VTEXT) {
2828                 strlcat(sbuf, "|VTEXT", sizeof(sbuf));
2829         }
2830         if (vp->v_flag & VSYSTEM) {
2831                 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
2832         }
2833         if (vp->v_flag & VNOFLUSH) {
2834                 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
2835         }
2836         if (vp->v_flag & VBWAIT) {
2837                 strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
2838         }
2839         if (vnode_isaliased(vp)) {
2840                 strlcat(sbuf, "|VALIASED", sizeof(sbuf));
2841         }
2842         if (sbuf[0] != '\0') {
2843                 printf("vnode flags (%s\n", &sbuf[1]);
2844         }
2845 }
2846
2847
2848 int
2849 vn_getpath(struct vnode *vp, char *pathbuf, int *len)
2850 {
2851         return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
2852 }
2853
2854 int
2855 vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len)
2856 {
2857         return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
2858 }
2859
2860 /*
2861  * vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
2862  * vnode.  It requires that there are IO counts on both the vnode and the directory vnode.
2863  *
2864  * vn_getpath_fsenter is called by MAC hooks to authorize operations for everything but
2865  * unlink, rmdir and rename. For these operations the MAC hook calls vn_getpath. This presents
2866  * problems where if the path can not be found from the name cache, those operations can
2867  * erroneously fail with EPERM even though the call should succeed. When removing or moving
2868  * file system objects with operations such as unlink or rename, those operations need to
2869  * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
2870  * MAC hook from these operations during forced unmount operations can lead to
2871  * deadlock. This happens when the operation starts and IO counts are taken on the containing
2872  * directories and targets. Before the MAC hook is called, a forced unmount from another
2873  * thread takes place and blocks on the ongoing operation's directory vnode in vdrain.
2874  * After that, the MAC hook gets called and calls vn_getpath_fsenter.  vn_getpath_fsenter
2875  * is called with the understanding that there is an IO count on the target. If in
2876  * build_path the directory vnode is no longer in the cache, then the parent object id is
2877  * obtained from the target via vnode_getattr and used to call VFS_VGET to get the parent
2878  * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
2879  * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block
2880  * depending on which version and how it calls the vnode_get family of interfaces.
2881  *
2882  * N.B.  A reasonable interface to use is vnode_getwithvid. This interface was modified to
2883  * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
2884  * cause issues, but there is no guarantee that all or any file systems are doing that.
2885  *
2886  * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
2887  * IO count on the directory vnode by calling build_path_with_parent.
2888  */
2889
2890 int
2891 vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len)
2892 {
2893         return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, 0, vfs_context_current());
2894 }
2895
2896 int
2897 vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int *len, int flags)
2898 {
2899         int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;
2900
2901         if (flags && (flags != VN_GETPATH_FSENTER)) {
2902                 if (flags & VN_GETPATH_NO_FIRMLINK) {
2903                         bpflags |= BUILDPATH_NO_FIRMLINK;;
2904                 }
2905                 if (flags & VN_GETPATH_VOLUME_RELATIVE) {
2906                         bpflags |= (BUILDPATH_VOLUME_RELATIVE | BUILDPATH_NO_FIRMLINK);
2907                 }
2908                 if (flags & VN_GETPATH_NO_PROCROOT) {
2909                         bpflags |= BUILDPATH_NO_PROCROOT;
2910                 }
2911         }
2912
2913         return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, bpflags, vfs_context_current());
2914 }
2915
2916 int
2917 vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len)
2918 {
2919         return vn_getpath_ext(vp, NULLVP, pathbuf, len, VN_GETPATH_NO_FIRMLINK);
2920 }
2921
2922 int
2923 vn_getpath_ext_with_mntlen(struct vnode *vp, struct vnode *dvp, char *pathbuf, size_t *len, size_t *mntlen, int flags)
2924 {
2925         int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;
2926         int local_len;
2927         int error;
2928
2929         if (*len > INT_MAX) {
2930                 return EINVAL;
2931         }
2932
2933         local_len = *len;
2934
2935         if (flags && (flags != VN_GETPATH_FSENTER)) {
2936                 if (flags & VN_GETPATH_NO_FIRMLINK) {
2937                         bpflags |= BUILDPATH_NO_FIRMLINK;;
2938                 }
2939                 if (flags & VN_GETPATH_VOLUME_RELATIVE) {
2940                         bpflags |= (BUILDPATH_VOLUME_RELATIVE | BUILDPATH_NO_FIRMLINK);
2941                 }
2942                 if (flags & VN_GETPATH_NO_PROCROOT) {
2943                         bpflags |= BUILDPATH_NO_PROCROOT;
2944                 }
2945         }
2946
2947         error = build_path_with_parent(vp, dvp, pathbuf, local_len, &local_len, mntlen, bpflags, vfs_context_current());
2948
2949         if (local_len >= 0 && local_len <= (int)*len) {
2950                 *len = (size_t)local_len;
2951         }
2952
2953         return error;
2954 }
2955
/*
 * vn_getcdhash: thin wrapper that forwards its arguments unchanged to
 * ubc_cs_getcdhash() and returns its result.
 */
int
vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
{
	int result = ubc_cs_getcdhash(vp, offset, cdhash);

	return result;
}
2961
2962
static char *extension_table = NULL;
static int   nexts;
static int   max_ext_width;

/*
 * extension_cmp: qsort() comparator ordering two NUL-terminated strings by
 * length, shortest first.
 *
 * Returns -1, 0 or 1.  The lengths are compared directly rather than
 * subtracted: strlen() yields an unsigned size_t, so a difference would wrap
 * for a < b and only become negative through an implementation-defined
 * narrowing conversion to int.
 */
static int
extension_cmp(const void *a, const void *b)
{
	const size_t len_a = strlen((const char *)a);
	const size_t len_b = strlen((const char *)b);

	return (len_a > len_b) - (len_a < len_b);
}
2972
2973
2974 //
2975 // This is the api LaunchServices uses to inform the kernel
2976 // the list of package extensions to ignore.
2977 //
2978 // Internally we keep the list sorted by the length of
2979 // the extension (shortest first, per extension_cmp).  We sort the
2980 // list of extensions so that we can speed up our searches
2981 // when comparing file names -- we only compare extensions
2982 // that could possibly fit into the file name, not all of
2983 // them (i.e. a short 8 character name can't have an 8
2984 // character extension).
2985 //
2986 extern lck_mtx_t *pkg_extensions_lck;
2987
2988 __private_extern__ int
2989 set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
2990 {
2991         char *new_exts, *old_exts;
2992         int error;
2993
2994         if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
2995                 return EINVAL;
2996         }
2997
2998
2999         // allocate one byte extra so we can guarantee null termination
3000         MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK);
3001         if (new_exts == NULL) {
3002                 return ENOMEM;
3003         }
3004
3005         error = copyin(data, new_exts, nentries * maxwidth);
3006         if (error) {
3007                 FREE(new_exts, M_TEMP);
3008                 return error;
3009         }
3010
3011         new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block
3012
3013         qsort(new_exts, nentries, maxwidth, extension_cmp);
3014
3015         lck_mtx_lock(pkg_extensions_lck);
3016
3017         old_exts        = extension_table;
3018         extension_table = new_exts;
3019         nexts           = nentries;
3020         max_ext_width   = maxwidth;
3021
3022         lck_mtx_unlock(pkg_extensions_lck);
3023
3024         if (old_exts) {
3025                 FREE(old_exts, M_TEMP);
3026         }
3027
3028         return 0;
3029 }
3030
3031
3032 int
3033 is_package_name(const char *name, int len)
3034 {
3035         int i, extlen;
3036         const char *ptr, *name_ext;
3037
3038         if (len <= 3) {
3039                 return 0;
3040         }
3041
3042         name_ext = NULL;
3043         for (ptr = name; *ptr != '\0'; ptr++) {
3044                 if (*ptr == '.') {
3045                         name_ext = ptr;
3046                 }
3047         }
3048
3049         // if there is no "." extension, it can't match
3050         if (name_ext == NULL) {
3051                 return 0;
3052         }
3053
3054         // advance over the "."
3055         name_ext++;
3056
3057         lck_mtx_lock(pkg_extensions_lck);
3058
3059         // now iterate over all the extensions to see if any match
3060         ptr = &extension_table[0];
3061         for (i = 0; i < nexts; i++, ptr += max_ext_width) {
3062                 extlen = strlen(ptr);
3063                 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
3064                         // aha, a match!
3065                         lck_mtx_unlock(pkg_extensions_lck);
3066                         return 1;
3067                 }
3068         }
3069
3070         lck_mtx_unlock(pkg_extensions_lck);
3071
3072         // if we get here, no extension matched
3073         return 0;
3074 }
3075
3076 int
3077 vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
3078 {
3079         char *ptr, *end;
3080         int comp = 0;
3081
3082         *component = -1;
3083         if (*path != '/') {
3084                 return EINVAL;
3085         }
3086
3087         end = path + 1;
3088         while (end < path + pathlen && *end != '\0') {
3089                 while (end < path + pathlen && *end == '/' && *end != '\0') {
3090                         end++;
3091                 }
3092
3093                 ptr = end;
3094
3095                 while (end < path + pathlen && *end != '/' && *end != '\0') {
3096                         end++;
3097                 }
3098
3099                 if (end > path + pathlen) {
3100                         // hmm, string wasn't null terminated
3101                         return EINVAL;
3102                 }
3103
3104                 *end = '\0';
3105                 if (is_package_name(ptr, end - ptr)) {
3106                         *component = comp;
3107                         break;
3108                 }
3109
3110                 end++;
3111                 comp++;
3112         }
3113
3114         return 0;
3115 }
3116
3117 /*
3118  * Determine if a name is inappropriate for a searchfs query.
3119  * This list consists of /System currently.
3120  */
3121
/*
 * vn_searchfs_inappropriate_name: report whether name is on the list of
 * names that searchfs should not descend into (currently only "System").
 *
 * Parameters:
 *   name  NUL-terminated name to test
 *   len   length of name as supplied by the caller
 *
 * Returns 1 only when len equals the length of a listed name and name
 * matches it exactly, terminator included; 0 otherwise.
 */
int
vn_searchfs_inappropriate_name(const char *name, int len)
{
	static const struct {
		const char *text;
		int         length;
	} rejected[] = {
		{ "System", 6 },
	};
	const int nrejected = (int)(sizeof(rejected) / sizeof(rejected[0]));
	int idx;

	for (idx = 0; idx < nrejected; idx++) {
		if (len != rejected[idx].length) {
			continue;
		}
		/* the +1 pulls the terminating NUL into the comparison */
		if (strncmp(name, rejected[idx].text, strlen(rejected[idx].text) + 1) == 0) {
			return 1;
		}
	}

	// no listed name matched
	return 0;
}
3138
3139 /*
3140  * Top level filesystem related information gathering.
3141  */
3142 extern unsigned int vfs_nummntops;
3143
3144 /*
3145  * The VFS_NUMMNTOPS shouldn't be at name[1] since it
3146  * is a VFS generic variable. Since we no longer support
3147  * VT_UFS, we reserve its value to support this sysctl node.
3148  *
3149  * It should have been:
3150  *    name[0]:  VFS_GENERIC
3151  *    name[1]:  VFS_NUMMNTOPS
3152  */
3153 SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
3154     CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
3155     &vfs_nummntops, 0, "");
3156
3157 int
3158 vfs_sysctl(int *name __unused, u_int namelen __unused,
3159     user_addr_t oldp __unused, size_t *oldlenp __unused,
3160     user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);
3161
3162 int
3163 vfs_sysctl(int *name __unused, u_int namelen __unused,
3164     user_addr_t oldp __unused, size_t *oldlenp __unused,
3165     user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
3166 {
3167         return EINVAL;
3168 }
3169
3170
3171 //
3172 // The following code disallows specific sysctl's that came through
3173 // the direct sysctl interface (vfs_sysctl_node) instead of the newer
3174 // sysctl_vfs_ctlbyfsid() interface.  We can not allow these selectors
3175 // through vfs_sysctl_node() because it passes the user's oldp pointer
3176 // directly to the file system which (for these selectors) casts it
3177 // back to a struct sysctl_req and then proceeds to use SYSCTL_IN()
3178 // which jumps through an arbitrary function pointer.  When called
3179 // through the sysctl_vfs_ctlbyfsid() interface this does not happen
3180 // and so it's safe.
3181 //
3182 // Unfortunately we have to pull in definitions from AFP and SMB and
3183 // perform explicit name checks on the file system to determine if
3184 // these selectors are being used.
3185 //
3186
3187 #define AFPFS_VFS_CTL_GETID            0x00020001
3188 #define AFPFS_VFS_CTL_NETCHANGE        0x00020002
3189 #define AFPFS_VFS_CTL_VOLCHANGE        0x00020003
3190
3191 #define SMBFS_SYSCTL_REMOUNT           1
3192 #define SMBFS_SYSCTL_REMOUNT_INFO      2
3193 #define SMBFS_SYSCTL_GET_SERVER_SHARE  3
3194
3195
3196 static int
3197 is_bad_sysctl_name(struct vfstable *vfsp, int selector_name)
3198 {
3199         switch (selector_name) {
3200         case VFS_CTL_QUERY:
3201         case VFS_CTL_TIMEO:
3202         case VFS_CTL_NOLOCKS:
3203         case VFS_CTL_NSTATUS:
3204         case VFS_CTL_SADDR:
3205         case VFS_CTL_DISC:
3206         case VFS_CTL_SERVERINFO:
3207                 return 1;
3208
3209         default:
3210                 break;
3211         }
3212
3213         // the more complicated check for some of SMB's special values
3214         if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
3215                 switch (selector_name) {
3216                 case SMBFS_SYSCTL_REMOUNT:
3217                 case SMBFS_SYSCTL_REMOUNT_INFO:
3218                 case SMBFS_SYSCTL_GET_SERVER_SHARE:
3219                         return 1;
3220                 }
3221         } else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
3222                 switch (selector_name) {
3223                 case AFPFS_VFS_CTL_GETID:
3224                 case AFPFS_VFS_CTL_NETCHANGE:
3225                 case AFPFS_VFS_CTL_VOLCHANGE:
3226                         return 1;
3227                 }
3228         }
3229
3230         //
3231         // If we get here we passed all the checks so the selector is ok
3232         //
3233         return 0;
3234 }
3235
3236
3237 int vfs_sysctl_node SYSCTL_HANDLER_ARGS
3238 {
3239         int *name, namelen;
3240         struct vfstable *vfsp;
3241         int error;
3242         int fstypenum;
3243
3244         fstypenum = oidp->oid_number;
3245         name = arg1;
3246         namelen = arg2;
3247
3248         /* all sysctl names at this level should have at least one name slot for the FS */
3249         if (namelen < 1) {
3250                 return EISDIR; /* overloaded */
3251         }
3252         mount_list_lock();
3253         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
3254                 if (vfsp->vfc_typenum == fstypenum) {
3255                         vfsp->vfc_refcount++;
3256                         break;
3257                 }
3258         }
3259         mount_list_unlock();
3260
3261         if (vfsp == NULL) {
3262                 return ENOTSUP;
3263         }
3264
3265         if (is_bad_sysctl_name(vfsp, name[0])) {
3266                 printf("vfs: bad selector 0x%.8x for old-style sysctl().  use the sysctl-by-fsid interface instead\n", name[0]);
3267                 return EPERM;
3268         }
3269
3270         error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, req->oldptr, &req->oldlen, req->newptr, req->newlen, vfs_context_current());
3271
3272         mount_list_lock();
3273         vfsp->vfc_refcount--;
3274         mount_list_unlock();
3275
3276         return error;
3277 }
3278
3279 /*
3280  * Check to see if a filesystem is mounted on a block device.
3281  */
3282 int
3283 vfs_mountedon(struct vnode *vp)
3284 {
3285         struct vnode *vq;
3286         int error = 0;
3287
3288         SPECHASH_LOCK();
3289         if (vp->v_specflags & SI_MOUNTEDON) {
3290                 error = EBUSY;
3291                 goto out;
3292         }
3293         if (vp->v_specflags & SI_ALIASED) {
3294                 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
3295                         if (vq->v_rdev != vp->v_rdev ||
3296                             vq->v_type != vp->v_type) {
3297                                 continue;
3298                         }
3299                         if (vq->v_specflags & SI_MOUNTEDON) {
3300                                 error = EBUSY;
3301                                 break;
3302                         }
3303                 }
3304         }
3305 out:
3306         SPECHASH_UNLOCK();
3307         return error;
3308 }
3309
/*
 * Counters filled in by unmount_callback() during one pass of
 * vfs_unmountall().
 */
struct unmount_info {
	/* number of unmount attempts that failed, whatever the error */
	int     u_errs;
	/* number of those failures whose error was EBUSY */
	int     u_busy;
};
3314
3315 static int
3316 unmount_callback(mount_t mp, void *arg)
3317 {
3318         int error;
3319         char *mntname;
3320         struct unmount_info *uip = arg;
3321
3322         mount_ref(mp, 0);
3323         mount_iterdrop(mp);     // avoid vfs_iterate deadlock in dounmount()
3324
3325         MALLOC_ZONE(mntname, void *, MAXPATHLEN, M_NAMEI, M_WAITOK);
3326         if (mntname) {
3327                 strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
3328         }
3329
3330         error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
3331         if (error) {
3332                 uip->u_errs++;
3333                 printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
3334                 if (error == EBUSY) {
3335                         uip->u_busy++;
3336                 }
3337         }
3338         if (mntname) {
3339                 FREE_ZONE(mntname, MAXPATHLEN, M_NAMEI);
3340         }
3341
3342         return VFS_RETURNED;
3343 }
3344
3345 /*
3346  * Unmount all filesystems. The list is traversed in reverse order
3347  * of mounting to avoid dependencies.
3348  * Busy mounts are retried.
3349  */
3350 __private_extern__ void
3351 vfs_unmountall(void)
3352 {
3353         int mounts, sec = 1;
3354         struct unmount_info ui;
3355
3356         vfs_unmountall_started = 1;
3357
3358 retry:
3359         ui.u_errs = ui.u_busy = 0;
3360         vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
3361         mounts = mount_getvfscnt();
3362         if (mounts == 0) {
3363                 return;
3364         }
3365
3366         if (ui.u_busy > 0) {            // Busy mounts - wait & retry
3367                 tsleep(&nummounts, PVFS, "busy mount", sec * hz);
3368                 sec *= 2;
3369                 if (sec <= 32) {
3370                         goto retry;
3371                 }
3372                 printf("Unmounting timed out\n");
3373         } else if (ui.u_errs < mounts) {
3374                 // If the vfs_iterate missed mounts in progress - wait a bit
3375                 tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
3376         }
3377 }
3378
3379 /*
3380  * This routine is called from vnode_pager_deallocate out of the VM
3381  * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
3382  * on a vnode that has a UBCINFO
3383  */
3384 __private_extern__ void
3385 vnode_pager_vrele(vnode_t vp)
3386 {
3387         struct ubc_info *uip;
3388
3389         vnode_lock_spin(vp);
3390
3391         vp->v_lflag &= ~VNAMED_UBC;
3392         if (vp->v_usecount != 0) {
3393                 /*
3394                  * At the eleventh hour, just before the ubcinfo is
3395                  * destroyed, ensure the ubc-specific v_usecount
3396                  * reference has gone.  We use v_usecount != 0 as a hint;
3397                  * ubc_unmap() does nothing if there's no mapping.
3398                  *
3399                  * This case is caused by coming here via forced unmount,
3400                  * versus the usual vm_object_deallocate() path.
3401                  * In the forced unmount case, ubc_destroy_named()
3402                  * releases the pager before memory_object_last_unmap()
3403                  * can be called.
3404                  */
3405                 vnode_unlock(vp);
3406                 ubc_unmap(vp);
3407                 vnode_lock_spin(vp);
3408         }
3409
3410         uip = vp->v_ubcinfo;
3411         vp->v_ubcinfo = UBC_INFO_NULL;
3412
3413         vnode_unlock(vp);
3414
3415         ubc_info_deallocate(uip);
3416 }
3417
3418
3419 #include <sys/disk.h>
3420
3421 u_int32_t rootunit = (u_int32_t)-1;
3422
3423 #if CONFIG_IOSCHED
3424 extern int lowpri_throttle_enabled;
3425 extern int iosched_enabled;
3426 #endif
3427
3428 errno_t
3429 vfs_init_io_attributes(vnode_t devvp, mount_t mp)
3430 {
3431         int     error;
3432         off_t   readblockcnt = 0;
3433         off_t   writeblockcnt = 0;
3434         off_t   readmaxcnt = 0;
3435         off_t   writemaxcnt = 0;
3436         off_t   readsegcnt = 0;
3437         off_t   writesegcnt = 0;
3438         off_t   readsegsize = 0;
3439         off_t   writesegsize = 0;
3440         off_t   alignment = 0;
3441         u_int32_t minsaturationbytecount = 0;
3442         u_int32_t ioqueue_depth = 0;
3443         u_int32_t blksize;
3444         u_int64_t temp;
3445         u_int32_t features;
3446         u_int64_t location = 0;
3447         vfs_context_t ctx = vfs_context_current();
3448         dk_corestorage_info_t cs_info;
3449         boolean_t cs_present = FALSE;;
3450         int isssd = 0;
3451         int isvirtual = 0;
3452
3453
3454         VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
3455         /*
3456          * as a reasonable approximation, only use the lowest bit of the mask
3457          * to generate a disk unit number
3458          */
3459         mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
3460
3461         if (devvp == rootvp) {
3462                 rootunit = mp->mnt_devbsdunit;
3463         }
3464
3465         if (mp->mnt_devbsdunit == rootunit) {
3466                 /*
3467                  * this mount point exists on the same device as the root
3468                  * partition, so it comes under the hard throttle control...
3469                  * this is true even for the root mount point itself
3470                  */
3471                 mp->mnt_kern_flag |= MNTK_ROOTDEV;
3472         }
3473         /*
3474          * force the spec device to re-cache
3475          * the underlying block size in case
3476          * the filesystem overrode the initial value
3477          */
3478         set_fsblocksize(devvp);
3479
3480
3481         if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
3482             (caddr_t)&blksize, 0, ctx))) {
3483                 return error;
3484         }
3485
3486         mp->mnt_devblocksize = blksize;
3487
3488         /*
3489          * set the maximum possible I/O size
3490          * this may get clipped to a smaller value
3491          * based on which constraints are being advertised
3492          * and if those advertised constraints result in a smaller
3493          * limit for a given I/O
3494          */
3495         mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
3496         mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;
3497
3498         if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
3499                 if (isvirtual) {
3500                         mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
3501                         mp->mnt_flag |= MNT_REMOVABLE;
3502                 }
3503         }
3504         if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
3505                 if (isssd) {
3506                         mp->mnt_kern_flag |= MNTK_SSD;
3507                 }
3508         }
3509         if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
3510             (caddr_t)&features, 0, ctx))) {
3511                 return error;
3512         }
3513
3514         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
3515             (caddr_t)&readblockcnt, 0, ctx))) {
3516                 return error;
3517         }
3518
3519         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
3520             (caddr_t)&writeblockcnt, 0, ctx))) {
3521                 return error;
3522         }
3523
3524         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
3525             (caddr_t)&readmaxcnt, 0, ctx))) {
3526                 return error;
3527         }
3528
3529         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
3530             (caddr_t)&writemaxcnt, 0, ctx))) {
3531                 return error;
3532         }
3533
3534         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
3535             (caddr_t)&readsegcnt, 0, ctx))) {
3536                 return error;
3537         }
3538
3539         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
3540             (caddr_t)&writesegcnt, 0, ctx))) {
3541                 return error;
3542         }
3543
3544         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
3545             (caddr_t)&readsegsize, 0, ctx))) {
3546                 return error;
3547         }
3548
3549         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
3550             (caddr_t)&writesegsize, 0, ctx))) {
3551                 return error;
3552         }
3553
3554         if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
3555             (caddr_t)&alignment, 0, ctx))) {
3556                 return error;
3557         }
3558
3559         if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
3560             (caddr_t)&ioqueue_depth, 0, ctx))) {
3561                 return error;
3562         }
3563
3564         if (readmaxcnt) {
3565                 mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
3566         }
3567
3568         if (readblockcnt) {
3569                 temp = readblockcnt * blksize;
3570                 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3571
3572                 if (temp < mp->mnt_maxreadcnt) {
3573                         mp->mnt_maxreadcnt = (u_int32_t)temp;
3574                 }
3575         }
3576
3577         if (writemaxcnt) {
3578                 mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
3579         }
3580
3581         if (writeblockcnt) {
3582                 temp = writeblockcnt * blksize;
3583                 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3584
3585                 if (temp < mp->mnt_maxwritecnt) {
3586                         mp->mnt_maxwritecnt = (u_int32_t)temp;
3587                 }
3588         }
3589
3590         if (readsegcnt) {
3591                 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
3592         } else {
3593                 temp = mp->mnt_maxreadcnt / PAGE_SIZE;
3594
3595                 if (temp > UINT16_MAX) {
3596                         temp = UINT16_MAX;
3597                 }
3598         }
3599         mp->mnt_segreadcnt = (u_int16_t)temp;
3600
3601         if (writesegcnt) {
3602                 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
3603         } else {
3604                 temp = mp->mnt_maxwritecnt / PAGE_SIZE;
3605
3606                 if (temp > UINT16_MAX) {
3607                         temp = UINT16_MAX;
3608                 }
3609         }
3610         mp->mnt_segwritecnt = (u_int16_t)temp;
3611
3612         if (readsegsize) {
3613                 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
3614         } else {
3615                 temp = mp->mnt_maxreadcnt;
3616         }
3617         mp->mnt_maxsegreadsize = (u_int32_t)temp;
3618
3619         if (writesegsize) {
3620                 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
3621         } else {
3622                 temp = mp->mnt_maxwritecnt;
3623         }
3624         mp->mnt_maxsegwritesize = (u_int32_t)temp;
3625
3626         if (alignment) {
3627                 temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
3628         } else {
3629                 temp = 0;
3630         }
3631         mp->mnt_alignmentmask = temp;
3632
3633
3634         if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) {
3635                 temp = ioqueue_depth;
3636         } else {
3637                 temp = MNT_DEFAULT_IOQUEUE_DEPTH;
3638         }
3639
3640         mp->mnt_ioqueue_depth = temp;
3641         mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
3642
3643         if (mp->mnt_ioscale > 1) {
3644                 printf("ioqueue_depth = %d,   ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
3645         }
3646
3647         if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
3648                 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
3649         }
3650
3651         if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
3652                 mp->mnt_minsaturationbytecount = minsaturationbytecount;
3653         } else {
3654                 mp->mnt_minsaturationbytecount = 0;
3655         }
3656
3657         if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
3658                 cs_present = TRUE;
3659         }
3660
3661         if (features & DK_FEATURE_UNMAP) {
3662                 mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED;
3663
3664                 if (cs_present == TRUE) {
3665                         mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
3666                 }
3667         }
3668         if (cs_present == TRUE) {
3669                 /*
3670                  * for now we'll use the following test as a proxy for
3671                  * the underlying drive being FUSION in nature
3672                  */
3673                 if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) {
3674                         mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
3675                 }
3676         } else {
3677                 /* Check for APFS Fusion */
3678                 dk_apfs_flavour_t flavour;
3679                 if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
3680                     (flavour == DK_APFS_FUSION)) {
3681                         mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
3682                 }
3683         }
3684
3685         if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) {
3686                 if (location & DK_LOCATION_EXTERNAL) {
3687                         mp->mnt_ioflags |= MNT_IOFLAGS_PERIPHERAL_DRIVE;
3688                         mp->mnt_flag |= MNT_REMOVABLE;
3689                 }
3690         }
3691
3692 #if CONFIG_IOSCHED
3693         if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
3694                 mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED;
3695                 throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
3696         }
3697 #endif /* CONFIG_IOSCHED */
3698         return error;
3699 }
3700
3701 static struct klist fs_klist;   /* knotes attached by filt_fsattach(); walked by KNOTE() in vfs_event_signal() */
3702 lck_grp_t *fs_klist_lck_grp;    /* lock group for fs_klist_lock; allocated in vfs_event_init() */
3703 lck_mtx_t *fs_klist_lock;       /* mutex held around every fs_klist attach, detach and KNOTE */
3704
3705 void
3706 vfs_event_init(void)
3707 {
3708         klist_init(&fs_klist);
3709         fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
3710         fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
3711 }
3712
3713 void
3714 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
3715 {
3716         if (event == VQ_DEAD || event == VQ_NOTRESP) {
3717                 struct mount *mp = vfs_getvfs(fsid);
3718                 if (mp) {
3719                         mount_lock_spin(mp);
3720                         if (data) {
3721                                 mp->mnt_kern_flag &= ~MNT_LNOTRESP;     // Now responding
3722                         } else {
3723                                 mp->mnt_kern_flag |= MNT_LNOTRESP;      // Not responding
3724                         }
3725                         mount_unlock(mp);
3726                 }
3727         }
3728
3729         lck_mtx_lock(fs_klist_lock);
3730         KNOTE(&fs_klist, event);
3731         lck_mtx_unlock(fs_klist_lock);
3732 }
3733
/*
 * Number of mounted filesystems, as reported by mount_getvfscnt().
 */
static int
sysctl_vfs_getvfscnt(void)
{
	int mounted = mount_getvfscnt();

	return mounted;
}
3742
3743
3744 static int
3745 mount_getvfscnt(void)
3746 {
3747         int ret;
3748
3749         mount_list_lock();
3750         ret = nummounts;
3751         mount_list_unlock();
3752         return ret;
3753 }
3754
3755
3756
3757 static int
3758 mount_fillfsids(fsid_t *fsidlst, int count)
3759 {
3760         struct mount *mp;
3761         int actual = 0;
3762
3763         actual = 0;
3764         mount_list_lock();
3765         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3766                 if (actual <= count) {
3767                         fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
3768                         actual++;
3769                 }
3770         }
3771         mount_list_unlock();
3772         return actual;
3773 }
3774
3775 /*
3776  * fill in the array of fsid_t's up to a max of 'count', the actual
3777  * number filled in will be set in '*actual'.  If there are more fsid_t's
3778  * than room in fsidlst then ENOMEM will be returned and '*actual' will
3779  * have the actual count.
3780  * having *actual filled out even in the error case is depended upon.
3781  */
3782 static int
3783 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
3784 {
3785         struct mount *mp;
3786
3787         *actual = 0;
3788         mount_list_lock();
3789         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3790                 (*actual)++;
3791                 if (*actual <= count) {
3792                         fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
3793                 }
3794         }
3795         mount_list_unlock();
3796         return *actual <= count ? 0 : ENOMEM;
3797 }
3798
3799 static int
3800 sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
3801     __unused int arg2, struct sysctl_req *req)
3802 {
3803         int actual, error;
3804         size_t space;
3805         fsid_t *fsidlst;
3806
3807         /* This is a readonly node. */
3808         if (req->newptr != USER_ADDR_NULL) {
3809                 return EPERM;
3810         }
3811
3812         /* they are querying us so just return the space required. */
3813         if (req->oldptr == USER_ADDR_NULL) {
3814                 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3815                 return 0;
3816         }
3817 again:
3818         /*
3819          * Retrieve an accurate count of the amount of space required to copy
3820          * out all the fsids in the system.
3821          */
3822         space = req->oldlen;
3823         req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3824
3825         /* they didn't give us enough space. */
3826         if (space < req->oldlen) {
3827                 return ENOMEM;
3828         }
3829
3830         MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
3831         if (fsidlst == NULL) {
3832                 return ENOMEM;
3833         }
3834
3835         error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3836             &actual);
3837         /*
3838          * If we get back ENOMEM, then another mount has been added while we
3839          * slept in malloc above.  If this is the case then try again.
3840          */
3841         if (error == ENOMEM) {
3842                 FREE(fsidlst, M_TEMP);
3843                 req->oldlen = space;
3844                 goto again;
3845         }
3846         if (error == 0) {
3847                 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3848         }
3849         FREE(fsidlst, M_TEMP);
3850         return error;
3851 }
3852
3853 /*
3854  * Do a sysctl by fsid.
3855  */
3856 static int
3857 sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
3858     struct sysctl_req *req)
3859 {
3860         union union_vfsidctl vc;
3861         struct mount *mp;
3862         struct vfsstatfs *sp;
3863         int *name, flags, namelen;
3864         int error = 0, gotref = 0;
3865         vfs_context_t ctx = vfs_context_current();
3866         proc_t p = req->p;      /* XXX req->p != current_proc()? */
3867         boolean_t is_64_bit;
3868
3869         name = arg1;
3870         namelen = arg2;
3871         is_64_bit = proc_is64bit(p);
3872
3873         error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
3874         if (error) {
3875                 goto out;
3876         }
3877         if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
3878                 error = EINVAL;
3879                 goto out;
3880         }
3881         mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
3882         if (mp == NULL) {
3883                 error = ENOENT;
3884                 goto out;
3885         }
3886         gotref = 1;
3887         /* reset so that the fs specific code can fetch it. */
3888         req->newidx = 0;
3889         /*
3890          * Note if this is a VFS_CTL then we pass the actual sysctl req
3891          * in for "oldp" so that the lower layer can DTRT and use the
3892          * SYSCTL_IN/OUT routines.
3893          */
3894         if (mp->mnt_op->vfs_sysctl != NULL) {
3895                 if (is_64_bit) {
3896                         if (vfs_64bitready(mp)) {
3897                                 error = mp->mnt_op->vfs_sysctl(name, namelen,
3898                                     CAST_USER_ADDR_T(req),
3899                                     NULL, USER_ADDR_NULL, 0,
3900                                     ctx);
3901                         } else {
3902                                 error = ENOTSUP;
3903                         }
3904                 } else {
3905                         error = mp->mnt_op->vfs_sysctl(name, namelen,
3906                             CAST_USER_ADDR_T(req),
3907                             NULL, USER_ADDR_NULL, 0,
3908                             ctx);
3909                 }
3910                 if (error != ENOTSUP) {
3911                         goto out;
3912                 }
3913         }
3914         switch (name[0]) {
3915         case VFS_CTL_UMOUNT:
3916                 req->newidx = 0;
3917                 if (is_64_bit) {
3918                         req->newptr = vc.vc64.vc_ptr;
3919                         req->newlen = (size_t)vc.vc64.vc_len;
3920                 } else {
3921                         req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3922                         req->newlen = vc.vc32.vc_len;
3923                 }
3924                 error = SYSCTL_IN(req, &flags, sizeof(flags));
3925                 if (error) {
3926                         break;
3927                 }
3928
3929                 mount_ref(mp, 0);
3930                 mount_iterdrop(mp);
3931                 gotref = 0;
3932                 /* safedounmount consumes a ref */
3933                 error = safedounmount(mp, flags, ctx);
3934                 break;
3935         case VFS_CTL_STATFS:
3936 #if CONFIG_MACF
3937                 error = mac_mount_check_stat(ctx, mp);
3938                 if (error != 0) {
3939                         break;
3940                 }
3941 #endif
3942                 req->newidx = 0;
3943                 if (is_64_bit) {
3944                         req->newptr = vc.vc64.vc_ptr;
3945                         req->newlen = (size_t)vc.vc64.vc_len;
3946                 } else {
3947                         req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3948                         req->newlen = vc.vc32.vc_len;
3949                 }
3950                 error = SYSCTL_IN(req, &flags, sizeof(flags));
3951                 if (error) {
3952                         break;
3953                 }
3954                 sp = &mp->mnt_vfsstat;
3955                 if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) &&
3956                     (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) {
3957                         goto out;
3958                 }
3959                 if (is_64_bit) {
3960                         struct user64_statfs sfs;
3961                         bzero(&sfs, sizeof(sfs));
3962                         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3963                         sfs.f_type = mp->mnt_vtable->vfc_typenum;
3964                         sfs.f_bsize = (user64_long_t)sp->f_bsize;
3965                         sfs.f_iosize = (user64_long_t)sp->f_iosize;
3966                         sfs.f_blocks = (user64_long_t)sp->f_blocks;
3967                         sfs.f_bfree = (user64_long_t)sp->f_bfree;
3968                         sfs.f_bavail = (user64_long_t)sp->f_bavail;
3969                         sfs.f_files = (user64_long_t)sp->f_files;
3970                         sfs.f_ffree = (user64_long_t)sp->f_ffree;
3971                         sfs.f_fsid = sp->f_fsid;
3972                         sfs.f_owner = sp->f_owner;
3973 #ifdef CONFIG_NFS_CLIENT
3974                         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3975                                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
3976                         } else
3977 #endif /* CONFIG_NFS_CLIENT */
3978                         {
3979                                 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3980                         }
3981                         strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3982                         strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3983
3984                         error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3985                 } else {
3986                         struct user32_statfs sfs;
3987                         bzero(&sfs, sizeof(sfs));
3988                         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3989                         sfs.f_type = mp->mnt_vtable->vfc_typenum;
3990
3991                         /*
3992                          * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
3993                          * have to fudge the numbers here in that case.   We inflate the blocksize in order
3994                          * to reflect the filesystem size as best we can.
3995                          */
3996                         if (sp->f_blocks > INT_MAX) {
3997                                 int             shift;
3998
3999                                 /*
4000                                  * Work out how far we have to shift the block count down to make it fit.
4001                                  * Note that it's possible to have to shift so far that the resulting
4002                                  * blocksize would be unreportably large.  At that point, we will clip
4003                                  * any values that don't fit.
4004                                  *
4005                                  * For safety's sake, we also ensure that f_iosize is never reported as
4006                                  * being smaller than f_bsize.
4007                                  */
4008                                 for (shift = 0; shift < 32; shift++) {
4009                                         if ((sp->f_blocks >> shift) <= INT_MAX) {
4010                                                 break;
4011                                         }
4012                                         if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) {
4013                                                 break;
4014                                         }
4015                                 }
4016 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
4017                                 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
4018                                 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
4019                                 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
4020 #undef __SHIFT_OR_CLIP
4021                                 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift);
4022                                 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
4023                         } else {
4024                                 sfs.f_bsize = (user32_long_t)sp->f_bsize;
4025                                 sfs.f_iosize = (user32_long_t)sp->f_iosize;
4026                                 sfs.f_blocks = (user32_long_t)sp->f_blocks;
4027                                 sfs.f_bfree = (user32_long_t)sp->f_bfree;
4028                                 sfs.f_bavail = (user32_long_t)sp->f_bavail;
4029                         }
4030                         sfs.f_files = (user32_long_t)sp->f_files;
4031                         sfs.f_ffree = (user32_long_t)sp->f_ffree;
4032                         sfs.f_fsid = sp->f_fsid;
4033                         sfs.f_owner = sp->f_owner;
4034
4035 #ifdef CONFIG_NFS_CLIENT
4036                         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
4037                                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
4038                         } else
4039 #endif /* CONFIG_NFS_CLIENT */
4040                         {
4041                                 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
4042                         }
4043                         strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
4044                         strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
4045
4046                         error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
4047                 }
4048                 break;
4049         default:
4050                 error = ENOTSUP;
4051                 goto out;
4052         }
4053 out:
4054         if (gotref != 0) {
4055                 mount_iterdrop(mp);
4056         }
4057         return error;
4058 }
4059
4060 static int      filt_fsattach(struct knote *kn, struct kevent_qos_s *kev);
4061 static void     filt_fsdetach(struct knote *kn);
4062 static int      filt_fsevent(struct knote *kn, long hint);
4063 static int      filt_fstouch(struct knote *kn, struct kevent_qos_s *kev);
4064 static int      filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev);
4065 SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {      /* filter operations for knotes kept on fs_klist */
4066         .f_attach = filt_fsattach,      /* adds the knote to fs_klist */
4067         .f_detach = filt_fsdetach,      /* removes the knote from fs_klist */
4068         .f_event = filt_fsevent,        /* folds a posted event hint into kn_fflags */
4069         .f_touch = filt_fstouch,        /* replaces the knote's event mask (kn_sfflags) */
4070         .f_process = filt_fsprocess,    /* fills in the kevent when kn_fflags is non-zero */
4071 };
4072
4073 static int
4074 filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev)
4075 {
4076         kn->kn_flags |= EV_CLEAR; /* automatic */
4077         kn->kn_sdata = 0;         /* incoming data is ignored */
4078
4079         lck_mtx_lock(fs_klist_lock);
4080         KNOTE_ATTACH(&fs_klist, kn);
4081         lck_mtx_unlock(fs_klist_lock);
4082
4083         /*
4084          * filter only sees future events,
4085          * so it can't be fired already.
4086          */
4087         return 0;
4088 }
4089
4090 static void
4091 filt_fsdetach(struct knote *kn)
4092 {
4093         lck_mtx_lock(fs_klist_lock);
4094         KNOTE_DETACH(&fs_klist, kn);
4095         lck_mtx_unlock(fs_klist_lock);
4096 }
4097
4098 static int
4099 filt_fsevent(struct knote *kn, long hint)
4100 {
4101         /*
4102          * Backwards compatibility:
4103          * Other filters would do nothing if kn->kn_sfflags == 0
4104          */
4105
4106         if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
4107                 kn->kn_fflags |= hint;
4108         }
4109
4110         return kn->kn_fflags != 0;
4111 }
4112
4113 static int
4114 filt_fstouch(struct knote *kn, struct kevent_qos_s *kev)
4115 {
4116         int res;
4117
4118         lck_mtx_lock(fs_klist_lock);
4119
4120         kn->kn_sfflags = kev->fflags;
4121
4122         /*
4123          * the above filter function sets bits even if nobody is looking for them.
4124          * Just preserve those bits even in the new mask is more selective
4125          * than before.
4126          *
4127          * For compatibility with previous implementations, we leave kn_fflags
4128          * as they were before.
4129          */
4130         //if (kn->kn_sfflags)
4131         //      kn->kn_fflags &= kn->kn_sfflags;
4132         res = (kn->kn_fflags != 0);
4133
4134         lck_mtx_unlock(fs_klist_lock);
4135
4136         return res;
4137 }
4138
4139 static int
4140 filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev)
4141 {
4142         int res = 0;
4143
4144         lck_mtx_lock(fs_klist_lock);
4145         if (kn->kn_fflags) {
4146                 knote_fill_kevent(kn, kev, 0);
4147                 res = 1;
4148         }
4149         lck_mtx_unlock(fs_klist_lock);
4150         return res;
4151 }
4152
4153 static int
4154 sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
4155     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
4156 {
4157         int out, error;
4158         pid_t pid;
4159         proc_t p;
4160
4161         /* We need a pid. */
4162         if (req->newptr == USER_ADDR_NULL) {
4163                 return EINVAL;
4164         }
4165
4166         error = SYSCTL_IN(req, &pid, sizeof(pid));
4167         if (error) {
4168                 return error;
4169         }
4170
4171         p = proc_find(pid < 0 ? -pid : pid);
4172         if (p == NULL) {
4173                 return ESRCH;
4174         }
4175
4176         /*
4177          * Fetching the value is ok, but we only fetch if the old
4178          * pointer is given.
4179          */
4180         if (req->oldptr != USER_ADDR_NULL) {
4181                 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
4182                 proc_rele(p);
4183                 error = SYSCTL_OUT(req, &out, sizeof(out));
4184                 return error;
4185         }
4186
4187         /* cansignal offers us enough security. */
4188         if (p != req->p && proc_suser(req->p) != 0) {
4189                 proc_rele(p);
4190                 return EPERM;
4191         }
4192
4193         if (pid < 0) {
4194                 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
4195         } else {
4196                 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
4197         }
4198         proc_rele(p);
4199
4200         return 0;
4201 }
4202
4203 static int
4204 sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
4205 {
4206         int *name, namelen;
4207         struct vfstable *vfsp;
4208         struct vfsconf vfsc = {};
4209
4210         (void)oidp;
4211         name = arg1;
4212         namelen = arg2;
4213
4214         if (namelen < 1) {
4215                 return EISDIR;
4216         } else if (namelen > 1) {
4217                 return ENOTDIR;
4218         }
4219
4220         mount_list_lock();
4221         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
4222                 if (vfsp->vfc_typenum == name[0]) {
4223                         break;
4224                 }
4225         }
4226
4227         if (vfsp == NULL) {
4228                 mount_list_unlock();
4229                 return ENOTSUP;
4230         }
4231
4232         vfsc.vfc_reserved1 = 0;
4233         bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
4234         vfsc.vfc_typenum = vfsp->vfc_typenum;
4235         vfsc.vfc_refcount = vfsp->vfc_refcount;
4236         vfsc.vfc_flags = vfsp->vfc_flags;
4237         vfsc.vfc_reserved2 = 0;
4238         vfsc.vfc_reserved3 = 0;
4239
4240         mount_list_unlock();
4241         return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf));
4242 }
4243
4244 /* the vfs.generic. branch; every node below hangs off it */
4245 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
4246 /* retrieve a list of mounted filesystem fsid_t (read-only) */
4247 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
4248     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
4249     NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
4250 /* perform operations on a filesystem selected by fsid_t */
4251 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
4252     sysctl_vfs_ctlbyfsid, "ctlbyfsid");
4253 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
4254     NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");    /* registered with CTLFLAG_ANYBODY; the handler does its own permission check before changing the flag */
4255 SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
4256     CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
4257     &maxvfstypenum, 0, "");
4258 SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
4259 SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
4260     CTLFLAG_RD | CTLFLAG_LOCKED,
4261     sysctl_vfs_generic_conf, "");       /* handler expects exactly one further name component: the filesystem type number */
4262 #if DEVELOPMENT || DEBUG
4263 SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes,
4264     CTLTYPE_INT | CTLFLAG_RW,
4265     &print_busy_vnodes, 0,
4266     "VFS log busy vnodes blocking unmount");   /* only present on DEVELOPMENT or DEBUG builds */
4267 #endif
4268
4269 /* Indicate that the root file system unmounted cleanly; exported read-only as vfs.generic.root_unmounted_cleanly */
4270 static int vfs_root_unmounted_cleanly = 0;      /* 0 until vfs_set_root_unmounted_cleanly() is called */
4271 SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
4272
4273 void
4274 vfs_set_root_unmounted_cleanly(void)
4275 {
4276         vfs_root_unmounted_cleanly = 1;        /* one-way: nothing in this part of the file resets it */
4277 }
4278
4279 /*
4280  * Print vnode state.
4281  */
4282 void
4283 vn_print_state(struct vnode *vp, const char *fmt, ...)
4284 {
4285         va_list ap;
4286         char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
4287         char fs_name[MFSNAMELEN];
4288
4289         va_start(ap, fmt);
4290         vprintf(fmt, ap);
4291         va_end(ap);
4292         printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
4293         printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
4294         /* Counts .. */
4295         printf("    iocount %d, usecount %d, kusecount %d references %d\n",
4296             vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
4297         printf("    writecount %d, numoutput %d\n", vp->v_writecount,
4298             vp->v_numoutput);
4299         /* Flags */
4300         printf("    flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
4301             vp->v_lflag, vp->v_listflag);
4302
4303         if (vp->v_mount == NULL || vp->v_mount == dead_mountp) {
4304                 strlcpy(fs_name, "deadfs", MFSNAMELEN);
4305         } else {
4306                 vfs_name(vp->v_mount, fs_name);
4307         }
4308
4309         printf("    v_data 0x%0llx %s\n",
4310             (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0),
4311             perm_str);
4312         printf("    v_mount 0x%0llx %s vfs_name %s\n",
4313             (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0),
4314             perm_str, fs_name);
4315 }
4316
/* running count of vnodes recycled by new_vnode(); bumped there with OSAddAtomicLong() */
4317 long num_reusedvnodes = 0;
4318
4319
4320 static vnode_t
4321 process_vp(vnode_t vp, int want_vp, int *deferred)
4322 {
4323         unsigned int  vpid;
4324
4325         *deferred = 0;
4326
4327         vpid = vp->v_id;
4328
4329         vnode_list_remove_locked(vp);
4330
4331         vnode_list_unlock();
4332
4333         vnode_lock_spin(vp);
4334
4335         /*
4336          * We could wait for the vnode_lock after removing the vp from the freelist
4337          * and the vid is bumped only at the very end of reclaim. So it is  possible
4338          * that we are looking at a vnode that is being terminated. If so skip it.
4339          */
4340         if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
4341             VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
4342                 /*
4343                  * we lost the race between dropping the list lock
4344                  * and picking up the vnode_lock... someone else
4345                  * used this vnode and it is now in a new state
4346                  */
4347                 vnode_unlock(vp);
4348
4349                 return NULLVP;
4350         }
4351         if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) {
4352                 /*
4353                  * we did a vnode_rele_ext that asked for
4354                  * us not to reenter the filesystem during
4355                  * the release even though VL_NEEDINACTIVE was
4356                  * set... we'll do it here by doing a
4357                  * vnode_get/vnode_put
4358                  *
4359                  * pick up an iocount so that we can call
4360                  * vnode_put and drive the VNOP_INACTIVE...
4361                  * vnode_put will either leave us off
4362                  * the freelist if a new ref comes in,
4363                  * or put us back on the end of the freelist
4364                  * or recycle us if we were marked for termination...
4365                  * so we'll just go grab a new candidate
4366                  */
4367                 vp->v_iocount++;
4368 #ifdef JOE_DEBUG
4369                 record_vp(vp, 1);
4370 #endif
4371                 vnode_put_locked(vp);
4372                 vnode_unlock(vp);
4373
4374                 return NULLVP;
4375         }
4376         /*
4377          * Checks for anyone racing us for recycle
4378          */
4379         if (vp->v_type != VBAD) {
4380                 if (want_vp && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
4381                         vnode_async_list_add(vp);
4382                         vnode_unlock(vp);
4383
4384                         *deferred = 1;
4385
4386                         return NULLVP;
4387                 }
4388                 if (vp->v_lflag & VL_DEAD) {
4389                         panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
4390                 }
4391
4392                 vnode_lock_convert(vp);
4393                 (void)vnode_reclaim_internal(vp, 1, want_vp, 0);
4394
4395                 if (want_vp) {
4396                         if ((VONLIST(vp))) {
4397                                 panic("new_vnode(%p): vp on list", vp);
4398                         }
4399                         if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
4400                             (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
4401                                 panic("new_vnode(%p): free vnode still referenced", vp);
4402                         }
4403                         if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
4404                                 panic("new_vnode(%p): vnode seems to be on mount list", vp);
4405                         }
4406                         if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
4407                                 panic("new_vnode(%p): vnode still hooked into the name cache", vp);
4408                         }
4409                 } else {
4410                         vnode_unlock(vp);
4411                         vp = NULLVP;
4412                 }
4413         }
4414         return vp;
4415 }
4416
4417 __attribute__((noreturn))
4418 static void
4419 async_work_continue(void)
4420 {
4421         struct async_work_lst *q;
4422         int     deferred;
4423         vnode_t vp;
4424
4425         q = &vnode_async_work_list;
4426
4427         for (;;) {
4428                 vnode_list_lock();
4429
4430                 if (TAILQ_EMPTY(q)) {
4431                         assert_wait(q, (THREAD_UNINT));
4432
4433                         vnode_list_unlock();
4434
4435                         thread_block((thread_continue_t)async_work_continue);
4436
4437                         continue;
4438                 }
4439                 async_work_handled++;
4440
4441                 vp = TAILQ_FIRST(q);
4442
4443                 vp = process_vp(vp, 0, &deferred);
4444
4445                 if (vp != NULLVP) {
4446                         panic("found VBAD vp (%p) on async queue", vp);
4447                 }
4448         }
4449 }
4450
4451
4452 static int
4453 new_vnode(vnode_t *vpp)
4454 {
4455         vnode_t vp;
4456         uint32_t retries = 0, max_retries = 100;                /* retry incase of tablefull */
4457         int force_alloc = 0, walk_count = 0;
4458         boolean_t need_reliable_vp = FALSE;
4459         int deferred;
4460         struct timeval initial_tv;
4461         struct timeval current_tv;
4462         proc_t  curproc = current_proc();
4463
4464         initial_tv.tv_sec = 0;
4465 retry:
4466         vp = NULLVP;
4467
4468         vnode_list_lock();
4469
4470         if (need_reliable_vp == TRUE) {
4471                 async_work_timed_out++;
4472         }
4473
4474         if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) {
4475                 struct timespec ts;
4476
4477                 if (!TAILQ_EMPTY(&vnode_dead_list)) {
4478                         /*
4479                          * Can always reuse a dead one
4480                          */
4481                         vp = TAILQ_FIRST(&vnode_dead_list);
4482                         goto steal_this_vp;
4483                 }
4484                 /*
4485                  * no dead vnodes available... if we're under
4486                  * the limit, we'll create a new vnode
4487                  */
4488                 numvnodes++;
4489                 vnode_list_unlock();
4490
4491                 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK);
4492                 bzero((char *)vp, sizeof(*vp));
4493                 VLISTNONE(vp);          /* avoid double queue removal */
4494                 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
4495
4496                 TAILQ_INIT(&vp->v_ncchildren);
4497
4498                 klist_init(&vp->v_knotes);
4499                 nanouptime(&ts);
4500                 vp->v_id = ts.tv_nsec;
4501                 vp->v_flag = VSTANDARD;
4502
4503 #if CONFIG_MACF
4504                 if (mac_vnode_label_init_needed(vp)) {
4505                         mac_vnode_label_init(vp);
4506                 }
4507 #endif /* MAC */
4508
4509                 vp->v_iocount = 1;
4510                 goto done;
4511         }
4512         microuptime(&current_tv);
4513
4514 #define MAX_WALK_COUNT 1000
4515
4516         if (!TAILQ_EMPTY(&vnode_rage_list) &&
4517             (ragevnodes >= rage_limit ||
4518             (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
4519                 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
4520                         if (!(vp->v_listflag & VLIST_RAGE)) {
4521                                 panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
4522                         }
4523
4524                         // if we're a dependency-capable process, skip vnodes that can
4525                         // cause recycling deadlocks. (i.e. this process is diskimages
4526                         // helper and the vnode is in a disk image).  Querying the
4527                         // mnt_kern_flag for the mount's virtual device status
4528                         // is safer than checking the mnt_dependent_process, which
4529                         // may not be updated if there are multiple devnode layers
4530                         // in between the disk image and the final consumer.
4531
4532                         if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
4533                             (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4534                                 /*
4535                                  * if need_reliable_vp == TRUE, then we've already sent one or more
4536                                  * non-reliable vnodes to the async thread for processing and timed
4537                                  * out waiting for a dead vnode to show up.  Use the MAX_WALK_COUNT
4538                                  * mechanism to first scan for a reliable vnode before forcing
4539                                  * a new vnode to be created
4540                                  */
4541                                 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
4542                                         break;
4543                                 }
4544                         }
4545
4546                         // don't iterate more than MAX_WALK_COUNT vnodes to
4547                         // avoid keeping the vnode list lock held for too long.
4548
4549                         if (walk_count++ > MAX_WALK_COUNT) {
4550                                 vp = NULL;
4551                                 break;
4552                         }
4553                 }
4554         }
4555
4556         if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
4557                 /*
4558                  * Pick the first vp for possible reuse
4559                  */
4560                 walk_count = 0;
4561                 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
4562                         // if we're a dependency-capable process, skip vnodes that can
4563                         // cause recycling deadlocks. (i.e. this process is diskimages
4564                         // helper and the vnode is in a disk image).  Querying the
4565                         // mnt_kern_flag for the mount's virtual device status
4566                         // is safer than checking the mnt_dependent_process, which
4567                         // may not be updated if there are multiple devnode layers
4568                         // in between the disk image and the final consumer.
4569
4570                         if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
4571                             (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4572                                 /*
4573                                  * if need_reliable_vp == TRUE, then we've already sent one or more
4574                                  * non-reliable vnodes to the async thread for processing and timed
4575                                  * out waiting for a dead vnode to show up.  Use the MAX_WALK_COUNT
4576                                  * mechanism to first scan for a reliable vnode before forcing
4577                                  * a new vnode to be created
4578                                  */
4579                                 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
4580                                         break;
4581                                 }
4582                         }
4583
4584                         // don't iterate more than MAX_WALK_COUNT vnodes to
4585                         // avoid keeping the vnode list lock held for too long.
4586
4587                         if (walk_count++ > MAX_WALK_COUNT) {
4588                                 vp = NULL;
4589                                 break;
4590                         }
4591                 }
4592         }
4593
4594         //
4595         // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
4596         // then we're trying to create a vnode on behalf of a
4597         // process like diskimages-helper that has file systems
4598         // mounted on top of itself (and thus we can't reclaim
4599         // vnodes in the file systems on top of us).  if we can't
4600         // find a vnode to reclaim then we'll just have to force
4601         // the allocation.
4602         //
4603         if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
4604                 force_alloc = 1;
4605                 vnode_list_unlock();
4606                 goto retry;
4607         }
4608
4609         if (vp == NULL) {
4610                 /*
4611                  * we've reached the system imposed maximum number of vnodes
4612                  * but there isn't a single one available
4613                  * wait a bit and then retry... if we can't get a vnode
4614                  * after our target number of retries, than log a complaint
4615                  */
4616                 if (++retries <= max_retries) {
4617                         vnode_list_unlock();
4618                         delay_for_interval(1, 1000 * 1000);
4619                         goto retry;
4620                 }
4621
4622                 vnode_list_unlock();
4623                 tablefull("vnode");
4624                 log(LOG_EMERG, "%d desired, %d numvnodes, "
4625                     "%d free, %d dead, %d async, %d rage\n",
4626                     desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);
4627 #if CONFIG_JETSAM
4628
4629 #if DEVELOPMENT || DEBUG
4630                 if (bootarg_no_vnode_jetsam) {
4631                         panic("vnode table is full\n");
4632                 }
4633 #endif /* DEVELOPMENT || DEBUG */
4634
4635                 /*
4636                  * Running out of vnodes tends to make a system unusable. Start killing
4637                  * processes that jetsam knows are killable.
4638                  */
4639                 if (memorystatus_kill_on_vnode_limit() == FALSE) {
4640                         /*
4641                          * If jetsam can't find any more processes to kill and there
4642                          * still aren't any free vnodes, panic. Hopefully we'll get a
4643                          * panic log to tell us why we ran out.
4644                          */
4645                         panic("vnode table is full\n");
4646                 }
4647
4648                 /*
4649                  * Now that we've killed someone, wait a bit and continue looking
4650                  * (with fewer retries before trying another kill).
4651                  */
4652                 delay_for_interval(3, 1000 * 1000);
4653                 retries = 0;
4654                 max_retries = 10;
4655                 goto retry;
4656 #endif
4657
4658                 *vpp = NULL;
4659                 return ENFILE;
4660         }
4661 steal_this_vp:
4662         if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) {
4663                 if (deferred) {
4664                         int     elapsed_msecs;
4665                         struct timeval elapsed_tv;
4666
4667                         if (initial_tv.tv_sec == 0) {
4668                                 microuptime(&initial_tv);
4669                         }
4670
4671                         vnode_list_lock();
4672
4673                         dead_vnode_waited++;
4674                         dead_vnode_wanted++;
4675
4676                         /*
4677                          * note that we're only going to explicitly wait 10ms
4678                          * for a dead vnode to become available, since even if one
4679                          * isn't available, a reliable vnode might now be available
4680                          * at the head of the VRAGE or free lists... if so, we
4681                          * can satisfy the new_vnode request with less latency then waiting
4682                          * for the full 100ms duration we're ultimately willing to tolerate
4683                          */
4684                         assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
4685
4686                         vnode_list_unlock();
4687
4688                         thread_block(THREAD_CONTINUE_NULL);
4689
4690                         microuptime(&elapsed_tv);
4691
4692                         timevalsub(&elapsed_tv, &initial_tv);
4693                         elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000;
4694
4695                         if (elapsed_msecs >= 100) {
4696                                 /*
4697                                  * we've waited long enough... 100ms is
4698                                  * somewhat arbitrary for this case, but the
4699                                  * normal worst case latency used for UI
4700                                  * interaction is 100ms, so I've chosen to
4701                                  * go with that.
4702                                  *
4703                                  * setting need_reliable_vp to TRUE
4704                                  * forces us to find a reliable vnode
4705                                  * that we can process synchronously, or
4706                                  * to create a new one if the scan for
4707                                  * a reliable one hits the scan limit
4708                                  */
4709                                 need_reliable_vp = TRUE;
4710                         }
4711                 }
4712                 goto retry;
4713         }
4714         OSAddAtomicLong(1, &num_reusedvnodes);
4715
4716
4717 #if CONFIG_MACF
4718         /*
4719          * We should never see VL_LABELWAIT or VL_LABEL here.
4720          * as those operations hold a reference.
4721          */
4722         assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
4723         assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
4724         if (vp->v_lflag & VL_LABELED || vp->v_label != NULL) {
4725                 vnode_lock_convert(vp);
4726                 mac_vnode_label_recycle(vp);
4727         } else if (mac_vnode_label_init_needed(vp)) {
4728                 vnode_lock_convert(vp);
4729                 mac_vnode_label_init(vp);
4730         }
4731
4732 #endif /* MAC */
4733
4734         vp->v_iocount = 1;
4735         vp->v_lflag = 0;
4736         vp->v_writecount = 0;
4737         vp->v_references = 0;
4738         vp->v_iterblkflags = 0;
4739         vp->v_flag = VSTANDARD;
4740         /* vbad vnodes can point to dead_mountp */
4741         vp->v_mount = NULL;
4742         vp->v_defer_reclaimlist = (vnode_t)0;
4743
4744         vnode_unlock(vp);
4745
4746 done:
4747         *vpp = vp;
4748
4749         return 0;
4750 }
4751
/* Acquire the vnode's v_lock mutex via lck_mtx_lock(). */
4752 void
4753 vnode_lock(vnode_t vp)
4754 {
4755         lck_mtx_lock(&vp->v_lock);
4756 }
4757
/*
 * Acquire the vnode's v_lock mutex via lck_mtx_lock_spin().
 * Released with vnode_unlock() like a regular vnode_lock().
 */
4758 void
4759 vnode_lock_spin(vnode_t vp)
4760 {
4761         lck_mtx_lock_spin(&vp->v_lock);
4762 }
4763
/* Release the vnode's v_lock mutex taken by vnode_lock() or vnode_lock_spin(). */
4764 void
4765 vnode_unlock(vnode_t vp)
4766 {
4767         lck_mtx_unlock(&vp->v_lock);
4768 }
4769
4770
4771
/*
 * vnode_get: take an iocount on vp.
 *
 * Wraps vnode_get_locked() in a vnode_lock_spin()/vnode_unlock() pair
 * and returns its result: 0 on success, or ENOENT when the vnode has
 * no iocount and is being terminated or is dead.
 */
4772 int
4773 vnode_get(struct vnode *vp)
4774 {
4775         int retval;
4776
4777         vnode_lock_spin(vp);
4778         retval = vnode_get_locked(vp);
4779         vnode_unlock(vp);
4780
4781         return retval;
4782 }
4783
4784 int
4785 vnode_get_locked(struct vnode *vp)
4786 {
4787 #if DIAGNOSTIC
4788         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4789 #endif
4790         if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
4791                 return ENOENT;
4792         }
4793
4794         if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) {
4795                 panic("v_iocount overflow");
4796         }
4797
4798 #ifdef JOE_DEBUG
4799         record_vp(vp, 1);
4800 #endif
4801         return 0;
4802 }
4803
4804 /*
4805  * vnode_getwithvid() cuts in line in front of a vnode drain (that is,
4806  * while the vnode is draining, but at no point after that) to prevent
4807  * deadlocks when getting vnodes from filesystem hashes while holding
4808  * resources that may prevent other iocounts from being released.
4809  */
/*
 * Implemented as vget_internal() with
 * VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO; vid is the vnode id the
 * caller expects vp to still have.  Returns vget_internal()'s result.
 */
4810 int
4811 vnode_getwithvid(vnode_t vp, uint32_t vid)
4812 {
4813         return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
4814 }
4815
4816 /*
4817  * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode
4818  * drain; it exists for use in the VFS name cache, where we really do want to block behind
4819  * vnode drain to prevent holding off an unmount.
4820  */
/*
 * Implemented as vget_internal() with VNODE_NODEAD | VNODE_WITHID, i.e.
 * the same flags as vnode_getwithvid() minus VNODE_DRAINO.
 */
4821 int
4822 vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
4823 {
4824         return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
4825 }
4826
/*
 * vnode_getwithref: get an iocount through vget_internal() with no vid
 * check and no flags set.  Returns vget_internal()'s result.
 */
4827 int
4828 vnode_getwithref(vnode_t vp)
4829 {
4830         return vget_internal(vp, 0, 0);
4831 }
4832
4833
/*
 * vnode_getalways: get an iocount through vget_internal() with
 * VNODE_ALWAYS and no vid check.  In vnode_getiocount() that flag makes
 * the acquisition proceed without sleeping even when the vnode is
 * suspended, draining or terminating.
 */
4834 __private_extern__ int
4835 vnode_getalways(vnode_t vp)
4836 {
4837         return vget_internal(vp, 0, VNODE_ALWAYS);
4838 }
4839
/*
 * vnode_put: drop an iocount on vp.
 *
 * Wraps vnode_put_locked() in a vnode_lock_spin()/vnode_unlock() pair
 * and returns its result.
 */
4840 int
4841 vnode_put(vnode_t vp)
4842 {
4843         int retval;
4844
4845         vnode_lock_spin(vp);
4846         retval = vnode_put_locked(vp);
4847         vnode_unlock(vp);
4848
4849         return retval;
4850 }
4851
/*
 * vn_set_dead: turn vp into a dead vnode.
 *
 * Detaches it from its mount and private data, points its operations
 * vector at dead_vnodeop_p, resets the tag to VT_NON, sets the type to
 * VBAD and raises VL_DEAD in v_lflag.  No locking is done here.
 * NOTE(review): presumably the caller holds the vnode lock -- confirm.
 */
4852 static inline void
4853 vn_set_dead(vnode_t vp)
4854 {
4855         vp->v_mount = NULL;
4856         vp->v_op = dead_vnodeop_p;
4857         vp->v_tag = VT_NON;
4858         vp->v_data = NULL;
4859         vp->v_type = VBAD;
4860         vp->v_lflag |= VL_DEAD;
4861 }
4862
4863 int
4864 vnode_put_locked(vnode_t vp)
4865 {
4866         vfs_context_t ctx = vfs_context_current();      /* hoist outside loop */
4867
4868 #if DIAGNOSTIC
4869         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4870 #endif
4871 retry:
4872         if (vp->v_iocount < 1) {
4873                 panic("vnode_put(%p): iocount < 1", vp);
4874         }
4875
4876         if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
4877                 vnode_dropiocount(vp);
4878                 return 0;
4879         }
4880         if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
4881                 vp->v_lflag &= ~VL_NEEDINACTIVE;
4882                 vnode_unlock(vp);
4883
4884                 VNOP_INACTIVE(vp, ctx);
4885
4886                 vnode_lock_spin(vp);
4887                 /*
4888                  * because we had to drop the vnode lock before calling
4889                  * VNOP_INACTIVE, the state of this vnode may have changed...
4890                  * we may pick up both VL_MARTERM and either
4891                  * an iocount or a usecount while in the VNOP_INACTIVE call
4892                  * we don't want to call vnode_reclaim_internal on a vnode
4893                  * that has active references on it... so loop back around
4894                  * and reevaluate the state
4895                  */
4896                 goto retry;
4897         }
4898         vp->v_lflag &= ~VL_NEEDINACTIVE;
4899
4900         if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
4901                 vnode_lock_convert(vp);
4902                 vnode_reclaim_internal(vp, 1, 1, 0);
4903         }
4904         vnode_dropiocount(vp);
4905         vnode_list_add(vp);
4906
4907         return 0;
4908 }
4909
4910 /* is vnode_t in use by others?  */
/*
 * Calls vnode_isinuse_locked() with locked == 0, so the vnode lock is
 * taken and released inside that call.  refcnt is the number of
 * references the caller does not want counted as "in use".
 */
4911 int
4912 vnode_isinuse(vnode_t vp, int refcnt)
4913 {
4914         return vnode_isinuse_locked(vp, refcnt, 0);
4915 }
4916
/* Return the vnode's current v_usecount; read without taking the vnode lock. */
4917 int
4918 vnode_usecount(vnode_t vp)
4919 {
4920         return vp->v_usecount;
4921 }
4922
/* Return the vnode's current v_iocount; read without taking the vnode lock. */
4923 int
4924 vnode_iocount(vnode_t vp)
4925 {
4926         return vp->v_iocount;
4927 }
4928
4929 static int
4930 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
4931 {
4932         int retval = 0;
4933
4934         if (!locked) {
4935                 vnode_lock_spin(vp);
4936         }
4937         if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
4938                 retval = 1;
4939                 goto out;
4940         }
4941         if (vp->v_type == VREG) {
4942                 retval = ubc_isinuse_locked(vp, refcnt, 1);
4943         }
4944
4945 out:
4946         if (!locked) {
4947                 vnode_unlock(vp);
4948         }
4949         return retval;
4950 }
4951
4952
4953 /* resume vnode_t */
4954 errno_t
4955 vnode_resume(vnode_t vp)
4956 {
4957         if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
4958                 vnode_lock_spin(vp);
4959                 vp->v_lflag &= ~VL_SUSPENDED;
4960                 vp->v_owner = NULL;
4961                 vnode_unlock(vp);
4962
4963                 wakeup(&vp->v_iocount);
4964         }
4965         return 0;
4966 }
4967
4968 /* suspend vnode_t
4969  * Please do not use on more than one vnode at a time as it may
4970  * cause deadlocks.
4971  * xxx should we explicity prevent this from happening?
4972  */
4973
4974 errno_t
4975 vnode_suspend(vnode_t vp)
4976 {
4977         if (vp->v_lflag & VL_SUSPENDED) {
4978                 return EBUSY;
4979         }
4980
4981         vnode_lock_spin(vp);
4982
4983         /*
4984          * xxx is this sufficient to check if a vnode_drain is
4985          * progress?
4986          */
4987
4988         if (vp->v_owner == NULL) {
4989                 vp->v_lflag |= VL_SUSPENDED;
4990                 vp->v_owner = current_thread();
4991         }
4992         vnode_unlock(vp);
4993
4994         return 0;
4995 }
4996
4997 /*
4998  * Release any blocked locking requests on the vnode.
4999  * Used for forced-unmounts.
5000  *
5001  * XXX  What about network filesystems?
5002  */
5003 static void
5004 vnode_abort_advlocks(vnode_t vp)
5005 {
5006         if (vp->v_flag & VLOCKLOCAL) {
5007                 lf_abort_advlocks(vp);
5008         }
5009 }
5010
5011
5012 static errno_t
5013 vnode_drain(vnode_t vp)
5014 {
5015         if (vp->v_lflag & VL_DRAIN) {
5016                 panic("vnode_drain: recursive drain");
5017                 return ENOENT;
5018         }
5019         vp->v_lflag |= VL_DRAIN;
5020         vp->v_owner = current_thread();
5021
5022         while (vp->v_iocount > 1) {
5023                 if (bootarg_no_vnode_drain) {
5024                         struct timespec ts = {.tv_sec = 10, .tv_nsec = 0};
5025                         int error;
5026
5027                         if (vfs_unmountall_started) {
5028                                 ts.tv_sec = 1;
5029                         }
5030
5031                         error = msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain_with_timeout", &ts);
5032
5033                         /* Try to deal with leaked iocounts under bootarg and shutting down */
5034                         if (vp->v_iocount > 1 && error == EWOULDBLOCK &&
5035                             ts.tv_sec == 1 && vp->v_numoutput == 0) {
5036                                 vp->v_iocount = 1;
5037                                 break;
5038                         }
5039                 } else {
5040                         msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
5041                 }
5042         }
5043
5044         vp->v_lflag &= ~VL_DRAIN;
5045
5046         return 0;
5047 }
5048
5049
5050 /*
5051  * if the number of recent references via vnode_getwithvid or vnode_getwithref
5052  * reaches this threshold, then 'UN-AGE' the vnode by removing it from
5053  * the LRU list if it's currently on it... once the iocount and usecount both drop
5054  * to 0, it will get put back on the end of the list, effectively making it younger
5055  * this allows us to keep actively referenced vnodes in the list without having
5056  * to constantly remove and add to the list each time a vnode w/o a usecount is
5057  * referenced which costs us taking and dropping a global lock twice.
5058  * However, if the vnode is marked DIRTY, we want to pull it out much earlier
5059  */
5060 #define UNAGE_THRESHHOLD        25
5061 #define UNAGE_DIRTYTHRESHHOLD    6
5062
5063 errno_t
5064 vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
5065 {
5066         int nodead = vflags & VNODE_NODEAD;
5067         int nosusp = vflags & VNODE_NOSUSPEND;
5068         int always = vflags & VNODE_ALWAYS;
5069         int beatdrain = vflags & VNODE_DRAINO;
5070         int withvid = vflags & VNODE_WITHID;
5071
5072         for (;;) {
5073                 int sleepflg = 0;
5074
5075                 /*
5076                  * if it is a dead vnode with deadfs
5077                  */
5078                 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
5079                         return ENOENT;
5080                 }
5081                 /*
5082                  * will return VL_DEAD ones
5083                  */
5084                 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) {
5085                         break;
5086                 }
5087                 /*
5088                  * if suspended vnodes are to be failed
5089                  */
5090                 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
5091                         return ENOENT;
5092                 }
5093                 /*
5094                  * if you are the owner of drain/suspend/termination , can acquire iocount
5095                  * check for VL_TERMINATE; it does not set owner
5096                  */
5097                 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
5098                     (vp->v_owner == current_thread())) {
5099                         break;
5100                 }
5101
5102                 if (always != 0) {
5103                         break;
5104                 }
5105
5106                 /*
5107                  * If this vnode is getting drained, there are some cases where
5108                  * we can't block or, in case of tty vnodes, want to be
5109                  * interruptible.
5110                  */
5111                 if (vp->v_lflag & VL_DRAIN) {
5112                         /*
5113                          * In some situations, we want to get an iocount
5114                          * even if the vnode is draining to prevent deadlock,
5115                          * e.g. if we're in the filesystem, potentially holding
5116                          * resources that could prevent other iocounts from
5117                          * being released.
5118                          */
5119                         if (beatdrain) {
5120                                 break;
5121                         }
5122                         /*
5123                          * Don't block if the vnode's mount point is unmounting as
5124                          * we may be the thread the unmount is itself waiting on
5125                          * Only callers who pass in vids (at this point, we've already
5126                          * handled nosusp and nodead) are expecting error returns
5127                          * from this function, so we can only return errors for
5128                          * those. ENODEV is intended to inform callers that the call
5129                          * failed because an unmount is in progress.
5130                          */
5131                         if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
5132                                 return ENODEV;
5133                         }
5134
5135                         if (vnode_istty(vp)) {
5136                                 sleepflg = PCATCH;
5137                         }
5138                 }
5139
5140                 vnode_lock_convert(vp);
5141
5142                 if (vp->v_lflag & VL_TERMINATE) {
5143                         int error;
5144
5145                         vp->v_lflag |= VL_TERMWANT;
5146
5147                         error = msleep(&vp->v_lflag, &vp->v_lock,
5148                             (PVFS | sleepflg), "vnode getiocount", NULL);
5149                         if (error) {
5150                                 return error;
5151                         }
5152                 } else {
5153                         msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
5154                 }
5155         }
5156         if (withvid && vid != vp->v_id) {
5157                 return ENOENT;
5158         }
5159         if (++vp->v_references >= UNAGE_THRESHHOLD ||
5160             (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD)) {
5161                 vp->v_references = 0;
5162                 vnode_list_remove(vp);
5163         }
5164         vp->v_iocount++;
5165 #ifdef JOE_DEBUG
5166         record_vp(vp, 1);
5167 #endif
5168         return 0;
5169 }
5170
5171 static void
5172 vnode_dropiocount(vnode_t vp)
5173 {
5174         if (vp->v_iocount < 1) {
5175                 panic("vnode_dropiocount(%p): v_iocount < 1", vp);
5176         }
5177
5178         vp->v_iocount--;
5179 #ifdef JOE_DEBUG
5180         record_vp(vp, -1);
5181 #endif
5182         if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
5183                 wakeup(&vp->v_iocount);
5184         }
5185 }
5186
5187
/*
 * Reclaim vp through vnode_reclaim_internal() with every optional
 * behaviour switched off.
 */
void
vnode_reclaim(struct vnode * vp)
{
        const int locked = 0;   /* vnode lock is taken by the callee */
        const int reuse = 0;    /* callee calls vnode_list_add() when done */
        const int flags = 0;    /* no REVOKEALL or other flags */

        vnode_reclaim_internal(vp, locked, reuse, flags);
}
5193
/*
 * vnode_reclaim_internal
 *
 * Terminate vp: drain it, clean it with vgone(), bump its v_id and mark it
 * VBAD.
 *
 *   vp      vnode to reclaim
 *   locked  non-zero when the caller already holds the vnode lock; when
 *           zero the lock is taken on entry and released on return
 *   reuse   when zero, vnode_list_add() is called on vp before returning
 *   flags   handed to vgone(); REVOKEALL additionally causes a TIOCREVOKE
 *           ioctl on a tty vnode that has a usecount and more than one
 *           iocount
 *
 * Panics if VL_TERMINATE is already set on entry, or if the vnode still
 * has v_data, pending output, ubc info, a parent or a name after cleaning.
 */
5194 __private_extern__
5195 void
5196 vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
5197 {
5198         int isfifo = 0;
5199
5200         if (!locked) {
5201                 vnode_lock(vp);
5202         }
5203
5204         if (vp->v_lflag & VL_TERMINATE) {
5205                 panic("vnode reclaim in progress");
5206         }
5207         vp->v_lflag |= VL_TERMINATE;
5208
5209         vn_clearunionwait(vp, 1);
5210
5211         if (vnode_istty(vp) && (flags & REVOKEALL) && vp->v_usecount &&
5212             (vp->v_iocount > 1)) {
                     /* the vnode lock is dropped around the ioctl and retaken afterwards */
5213                 vnode_unlock(vp);
5214                 VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel());
5215                 vnode_lock(vp);
5216         }
5217
5218         vnode_drain(vp);
5219
             /* sample the type before vgone(); it decides whether v_fifoinfo is freed below */
5220         isfifo = (vp->v_type == VFIFO);
5221
5222         if (vp->v_type != VBAD) {
5223                 vgone(vp, flags);               /* clean and reclaim the vnode */
5224         }
5225         /*
5226          * give the vnode a new identity so that vnode_getwithvid will fail
5227          * on any stale cache accesses...
5228          * grab the list_lock so that if we're in "new_vnode"
5229          * behind the list_lock trying to steal this vnode, the v_id is stable...
5230          * once new_vnode drops the list_lock, it will block trying to take
5231          * the vnode lock until we release it... at that point it will evaluate
5232          * whether the v_id has changed
5233          * also need to make sure that the vnode isn't on a list where "new_vnode"
5234          * can find it after the v_id has been bumped until we are completely done
5235          * with the vnode (i.e. putting it back on a list has to be the very last
5236          * thing we do to this vnode... many of the callers of vnode_reclaim_internal
5237          * are holding an io_count on the vnode... they need to drop the io_count
5238          * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
5239          * they are completely done with the vnode
5240          */
5241         vnode_list_lock();
5242
5243         vnode_list_remove_locked(vp);
5244         vp->v_id++;
5245
5246         vnode_list_unlock();
5247
5248         if (isfifo) {
5249                 struct fifoinfo * fip;
5250
                     /* detach the fifo state from the vnode before freeing it */
5251                 fip = vp->v_fifoinfo;
5252                 vp->v_fifoinfo = NULL;
5253                 FREE(fip, M_TEMP);
5254         }
5255         vp->v_type = VBAD;
5256
             /* sanity checks: everything below must already have been torn down */
5257         if (vp->v_data) {
5258                 panic("vnode_reclaim_internal: cleaned vnode isn't");
5259         }
5260         if (vp->v_numoutput) {
5261                 panic("vnode_reclaim_internal: clean vnode has pending I/O's");
5262         }
5263         if (UBCINFOEXISTS(vp)) {
5264                 panic("vnode_reclaim_internal: ubcinfo not cleaned");
5265         }
5266         if (vp->v_parent) {
5267                 panic("vnode_reclaim_internal: vparent not removed");
5268         }
5269         if (vp->v_name) {
5270                 panic("vnode_reclaim_internal: vname not removed");
5271         }
5272
5273         vp->v_socket = NULL;
5274
             /* termination is over: clear the flag set on entry and the owner */
5275         vp->v_lflag &= ~VL_TERMINATE;
5276         vp->v_owner = NULL;
5277
5278         KNOTE(&vp->v_knotes, NOTE_REVOKE);
5279
5280         /* Make sure that when we reuse the vnode, no knotes left over */
5281         klist_init(&vp->v_knotes);
5282
             /* wake threads that set VL_TERMWANT and went to sleep on &vp->v_lflag */
5283         if (vp->v_lflag & VL_TERMWANT) {
5284                 vp->v_lflag &= ~VL_TERMWANT;
5285                 wakeup(&vp->v_lflag);
5286         }
5287         if (!reuse) {
5288                 /*
5289                  * make sure we get on the
5290                  * dead list if appropriate
5291                  */
5292                 vnode_list_add(vp);
5293         }
5294         if (!locked) {
5295                 vnode_unlock(vp);
5296         }
5297 }
5298
5299 static int
5300 vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
5301     int init_vnode)
5302 {
5303         int error;
5304         int insert = 1;
5305         int existing_vnode;
5306         vnode_t vp;
5307         vnode_t nvp;
5308         vnode_t dvp;
5309         struct  uthread *ut;
5310         struct componentname *cnp;
5311         struct vnode_fsparam *param = (struct vnode_fsparam *)data;
5312 #if CONFIG_TRIGGERS
5313         struct vnode_trigger_param *tinfo = NULL;
5314 #endif
5315         if (*vpp) {
5316                 vp = *vpp;
5317                 *vpp = NULLVP;
5318                 existing_vnode = 1;
5319         } else {
5320                 existing_vnode = 0;
5321         }
5322
5323         if (init_vnode) {
5324                 /* Do quick sanity check on the parameters. */
5325                 if ((param == NULL) || (param->vnfs_vtype == VBAD)) {
5326                         error = EINVAL;
5327                         goto error_out;
5328                 }
5329
5330 #if CONFIG_TRIGGERS
5331                 if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
5332                         tinfo = (struct vnode_trigger_param *)data;
5333
5334                         /* Validate trigger vnode input */
5335                         if ((param->vnfs_vtype != VDIR) ||
5336                             (tinfo->vnt_resolve_func == NULL) ||
5337                             (tinfo->vnt_flags & ~VNT_VALID_MASK)) {
5338                                 error = EINVAL;
5339                                 goto error_out;
5340                         }
5341                         /* Fall through a normal create (params will be the same) */
5342                         flavor = VNCREATE_FLAVOR;
5343                         size = VCREATESIZE;
5344                 }
5345 #endif
5346                 if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) {
5347                         error = EINVAL;
5348                         goto error_out;
5349                 }
5350         }
5351
5352         if (!existing_vnode) {
5353                 if ((error = new_vnode(&vp))) {
5354                         return error;
5355                 }
5356                 if (!init_vnode) {
5357                         /* Make it so that it can be released by a vnode_put) */
5358                         vn_set_dead(vp);
5359                         *vpp = vp;
5360                         return 0;
5361                 }
5362         } else {
5363                 /*
5364                  * A vnode obtained by vnode_create_empty has been passed to
5365                  * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
5366                  * this point, it is set back on any error.
5367                  *
5368                  * N.B. vnode locking - We make the same assumptions as the
5369                  * "unsplit" vnode_create did - i.e. it is safe to update the
5370                  * vnode's fields without the vnode lock. This vnode has been
5371                  * out and about with the filesystem and hopefully nothing
5372                  * was done to the vnode between the vnode_create_empty and
5373                  * now when it has come in through vnode_initialize.
5374                  */
5375                 vp->v_lflag &= ~VL_DEAD;
5376         }
5377
5378         dvp = param->vnfs_dvp;
5379         cnp = param->vnfs_cnp;
5380
5381         vp->v_op = param->vnfs_vops;
5382         vp->v_type = param->vnfs_vtype;
5383         vp->v_data = param->vnfs_fsnode;
5384
5385         if (param->vnfs_markroot) {
5386                 vp->v_flag |= VROOT;
5387         }
5388         if (param->vnfs_marksystem) {
5389                 vp->v_flag |= VSYSTEM;
5390         }
5391         if (vp->v_type == VREG) {
5392                 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
5393                 if (error) {
5394 #ifdef JOE_DEBUG
5395                         record_vp(vp, 1);
5396 #endif
5397                         vn_set_dead(vp);
5398
5399                         vnode_put(vp);
5400                         return error;
5401                 }
5402                 if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) {
5403                         memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
5404                 }
5405         }
5406 #ifdef JOE_DEBUG
5407         record_vp(vp, 1);
5408 #endif
5409
5410 #if CONFIG_FIRMLINKS
5411         vp->v_fmlink = NULLVP;
5412 #endif
5413         vp->v_flag &= ~VFMLINKTARGET;
5414
5415 #if CONFIG_TRIGGERS
5416         /*
5417          * For trigger vnodes, attach trigger info to vnode
5418          */
5419         if ((vp->v_type == VDIR) && (tinfo != NULL)) {
5420                 /*
5421                  * Note: has a side effect of incrementing trigger count on the
5422                  * mount if successful, which we would need to undo on a
5423                  * subsequent failure.
5424                  */
5425 #ifdef JOE_DEBUG
5426                 record_vp(vp, -1);
5427 #endif
5428                 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
5429                 if (error) {
5430                         printf("vnode_create: vnode_resolver_create() err %d\n", error);
5431                         vn_set_dead(vp);
5432 #ifdef JOE_DEBUG
5433                         record_vp(vp, 1);
5434 #endif
5435                         vnode_put(vp);
5436                         return error;
5437                 }
5438         }
5439 #endif
5440         if (vp->v_type == VCHR || vp->v_type == VBLK) {
5441                 vp->v_tag = VT_DEVFS;           /* callers will reset if needed (bdevvp) */
5442
5443                 if ((nvp = checkalias(vp, param->vnfs_rdev))) {
5444                         /*
5445                          * if checkalias returns a vnode, it will be locked
5446                          *
5447                          * first get rid of the unneeded vnode we acquired
5448                          */
5449                         vp->v_data = NULL;
5450                         vp->v_op = spec_vnodeop_p;
5451                         vp->v_type = VBAD;
5452                         vp->v_lflag = VL_DEAD;
5453                         vp->v_data = NULL;
5454                         vp->v_tag = VT_NON;
5455                         vnode_put(vp);
5456
5457                         /*
5458                          * switch to aliased vnode and finish
5459                          * preparing it
5460                          */
5461                         vp = nvp;
5462
5463                         vclean(vp, 0);
5464                         vp->v_op = param->vnfs_vops;
5465                         vp->v_type = param->vnfs_vtype;
5466                         vp->v_data = param->vnfs_fsnode;
5467                         vp->v_lflag = 0;
5468                         vp->v_mount = NULL;
5469                         insmntque(vp, param->vnfs_mp);
5470                         insert = 0;
5471                         vnode_unlock(vp);
5472                 }
5473
5474                 if (VCHR == vp->v_type) {
5475                         u_int maj = major(vp->v_rdev);
5476
5477                         if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) {
5478                                 vp->v_flag |= VISTTY;
5479                         }
5480                 }
5481         }
5482
5483         if (vp->v_type == VFIFO) {
5484                 struct fifoinfo *fip;
5485
5486                 MALLOC(fip, struct fifoinfo *,
5487                     sizeof(*fip), M_TEMP, M_WAITOK);
5488                 bzero(fip, sizeof(struct fifoinfo));
5489                 vp->v_fifoinfo = fip;
5490         }
5491         /* The file systems must pass the address of the location where
5492          * they store the vnode pointer. When we add the vnode into the mount
5493          * list and name cache they become discoverable. So the file system node
5494          * must have the connection to vnode setup by then
5495          */
5496         *vpp = vp;
5497
5498         /* Add fs named reference. */
5499         if (param->vnfs_flags & VNFS_ADDFSREF) {
5500                 vp->v_lflag |= VNAMED_FSHASH;
5501         }
5502         if (param->vnfs_mp) {
5503                 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) {
5504                         vp->v_flag |= VLOCKLOCAL;
5505                 }
5506                 if (insert) {
5507                         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5508                                 panic("insmntque: vp on the free list\n");
5509                         }
5510
5511                         /*
5512                          * enter in mount vnode list
5513                          */
5514                         insmntque(vp, param->vnfs_mp);
5515                 }
5516         }
5517         if (dvp && vnode_ref(dvp) == 0) {
5518                 vp->v_parent = dvp;
5519         }
5520         if (cnp) {
5521                 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
5522                         /*
5523                          * enter into name cache
5524                          * we've got the info to enter it into the name cache now
5525                          * cache_enter_create will pick up an extra reference on
5526                          * the name entered into the string cache
5527                          */
5528                         vp->v_name = cache_enter_create(dvp, vp, cnp);
5529                 } else {
5530                         vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
5531                 }
5532
5533                 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) {
5534                         vp->v_flag |= VISUNION;
5535                 }
5536         }
5537         if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
5538                 /*
5539                  * this vnode is being created as cacheable in the name cache
5540                  * this allows us to re-enter it in the cache
5541                  */
5542                 vp->v_flag |= VNCACHEABLE;
5543         }
5544         ut = get_bsdthread_info(current_thread());
5545
5546         if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
5547             (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
5548                 /*
5549                  * process has indicated that it wants any
5550                  * vnodes created on its behalf to be rapidly
5551                  * aged to reduce the impact on the cached set
5552                  * of vnodes
5553                  *
5554                  * if UT_KERN_RAGE_VNODES is set, then the
5555                  * kernel internally wants vnodes to be rapidly
5556                  * aged, even if the process hasn't requested
5557                  * this
5558                  */
5559                 vp->v_flag |= VRAGE;
5560         }
5561
5562 #if CONFIG_SECLUDED_MEMORY
5563         switch (secluded_for_filecache) {
5564         case 0:
5565                 /*
5566                  * secluded_for_filecache == 0:
5567                  * + no file contents in secluded pool
5568                  */
5569                 break;
5570         case 1:
5571                 /*
5572                  * secluded_for_filecache == 1:
5573                  * + no files from /
5574                  * + files from /Applications/ are OK
5575                  * + files from /Applications/Camera are not OK
5576                  * + no files that are open for write
5577                  */
5578                 if (vnode_vtype(vp) == VREG &&
5579                     vnode_mount(vp) != NULL &&
5580                     (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
5581                         /* not from root filesystem: eligible for secluded pages */
5582                         memory_object_mark_eligible_for_secluded(
5583                                 ubc_getobject(vp, UBC_FLAGS_NONE),
5584                                 TRUE);
5585                 }
5586                 break;
5587         case 2:
5588                 /*
5589                  * secluded_for_filecache == 2:
5590                  * + all read-only files OK, except:
5591                  *      + dyld_shared_cache_arm64*
5592                  *      + Camera
5593                  *      + mediaserverd
5594                  */
5595                 if (vnode_vtype(vp) == VREG) {
5596                         memory_object_mark_eligible_for_secluded(
5597                                 ubc_getobject(vp, UBC_FLAGS_NONE),
5598                                 TRUE);
5599                 }
5600                 break;
5601         default:
5602                 break;
5603         }
5604 #endif /* CONFIG_SECLUDED_MEMORY */
5605
5606         return 0;
5607
5608 error_out:
5609         if (existing_vnode) {
5610                 vnode_put(vp);
5611         }
5612         return error;
5613 }
5614
5615 /* USAGE:
5616  * The following api creates a vnode and associates all the parameter specified in vnode_fsparam
5617  * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
5618  * is obsoleted by this.
5619  */
5620 int
5621 vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5622 {
5623         *vpp = NULLVP;
5624         return vnode_create_internal(flavor, size, data, vpp, 1);
5625 }
5626
5627 int
5628 vnode_create_empty(vnode_t *vpp)
5629 {
5630         *vpp = NULLVP;
5631         return vnode_create_internal(VNCREATE_FLAVOR, VCREATESIZE, NULL,
5632                    vpp, 0);
5633 }
5634
/*
 * vnode_initialize
 *
 * Initialize the vnode already stored in *vpp by passing it to
 * vnode_create_internal() with initialization requested.  flavor, size and
 * data are forwarded unchanged.
 *
 * Panics if *vpp is NULLVP.  On DEVELOPMENT or DEBUG kernels the vnode is
 * additionally asserted to have exactly one iocount, no usecount, VL_DEAD
 * set and no v_data.
 *
 * Returns whatever vnode_create_internal() returns.
 */
5635 int
5636 vnode_initialize(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5637 {
5638         if (*vpp == NULLVP) {
5639                 panic("NULL vnode passed to vnode_initialize");
5640         }
5641 #if DEVELOPMENT || DEBUG
5642         /*
5643          * We lock to check that vnode is fit for unlocked use in
5644          * vnode_create_internal.
5645          */
5646         vnode_lock_spin(*vpp);
5647         VNASSERT(((*vpp)->v_iocount == 1), *vpp,
5648             ("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
5649         VNASSERT(((*vpp)->v_usecount == 0), *vpp,
5650             ("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
5651         VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
5652             ("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
5653             (*vpp)->v_lflag));
5654         VNASSERT(((*vpp)->v_data == NULL), *vpp,
5655             ("vnode_initialize : v_data not NULL"));
5656         vnode_unlock(*vpp);
5657 #endif
5658         return vnode_create_internal(flavor, size, data, vpp, 1);
5659 }
5660
5661 int
5662 vnode_addfsref(vnode_t vp)
5663 {
5664         vnode_lock_spin(vp);
5665         if (vp->v_lflag & VNAMED_FSHASH) {
5666                 panic("add_fsref: vp already has named reference");
5667         }
5668         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5669                 panic("addfsref: vp on the free list\n");
5670         }
5671         vp->v_lflag |= VNAMED_FSHASH;
5672         vnode_unlock(vp);
5673         return 0;
5674 }
5675 int
5676 vnode_removefsref(vnode_t vp)
5677 {
5678         vnode_lock_spin(vp);
5679         if ((vp->v_lflag & VNAMED_FSHASH) == 0) {
5680                 panic("remove_fsref: no named reference");
5681         }
5682         vp->v_lflag &= ~VNAMED_FSHASH;
5683         vnode_unlock(vp);
5684         return 0;
5685 }
5686
5687
5688 int
5689 vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
5690 {
5691         mount_t mp;
5692         int ret = 0;
5693         fsid_t * fsid_list;
5694         int count, actualcount, i;
5695         void * allocmem;
5696         int indx_start, indx_stop, indx_incr;
5697         int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
5698         int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT);
5699
5700         count = mount_getvfscnt();
5701         count += 10;
5702
5703         fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
5704         allocmem = (void *)fsid_list;
5705
5706         actualcount = mount_fillfsids(fsid_list, count);
5707
5708         /*
5709          * Establish the iteration direction
5710          * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
5711          */
5712         if (flags & VFS_ITERATE_TAIL_FIRST) {
5713                 indx_start = actualcount - 1;
5714                 indx_stop = -1;
5715                 indx_incr = -1;
5716         } else { /* Head first by default */
5717                 indx_start = 0;
5718                 indx_stop = actualcount;
5719                 indx_incr = 1;
5720         }
5721
5722         for (i = indx_start; i != indx_stop; i += indx_incr) {
5723                 /* obtain the mount point with iteration reference */
5724                 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
5725
5726                 if (mp == (struct mount *)0) {
5727                         continue;
5728                 }
5729                 mount_lock(mp);
5730                 if ((mp->mnt_lflag & MNT_LDEAD) ||
5731                     (!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) {
5732                         mount_unlock(mp);
5733                         mount_iterdrop(mp);
5734                         continue;
5735                 }
5736                 mount_unlock(mp);
5737
5738                 /* iterate over all the vnodes */
5739                 ret = callout(mp, arg);
5740
5741                 /*
5742                  * Drop the iterref here if the callback didn't do it.
5743                  * Note: If cb_dropref is set the mp may no longer exist.
5744                  */
5745                 if (!cb_dropref) {
5746                         mount_iterdrop(mp);
5747                 }
5748
5749                 switch (ret) {
5750                 case VFS_RETURNED:
5751                 case VFS_RETURNED_DONE:
5752                         if (ret == VFS_RETURNED_DONE) {
5753                                 ret = 0;
5754                                 goto out;
5755                         }
5756                         break;
5757
5758                 case VFS_CLAIMED_DONE:
5759                         ret = 0;
5760                         goto out;
5761                 case VFS_CLAIMED:
5762                 default:
5763                         break;
5764                 }
5765                 ret = 0;
5766         }
5767
5768 out:
5769         kfree(allocmem, (count * sizeof(fsid_t)));
5770         return ret;
5771 }
5772
5773 /*
5774  * Update the vfsstatfs structure in the mountpoint.
5775  * MAC: Parameter eventtype added, indicating whether the event that
5776  * triggered this update came from user space, via a system call
5777  * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
5778  */
5779 int
5780 vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
5781 {
5782         struct vfs_attr va;
5783         int             error;
5784
5785         /*
5786          * Request the attributes we want to propagate into
5787          * the per-mount vfsstat structure.
5788          */
5789         VFSATTR_INIT(&va);
5790         VFSATTR_WANTED(&va, f_iosize);
5791         VFSATTR_WANTED(&va, f_blocks);
5792         VFSATTR_WANTED(&va, f_bfree);
5793         VFSATTR_WANTED(&va, f_bavail);
5794         VFSATTR_WANTED(&va, f_bused);
5795         VFSATTR_WANTED(&va, f_files);
5796         VFSATTR_WANTED(&va, f_ffree);
5797         VFSATTR_WANTED(&va, f_bsize);
5798         VFSATTR_WANTED(&va, f_fssubtype);
5799
5800         if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
5801                 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
5802                 return error;
5803         }
5804 #if CONFIG_MACF
5805         if (eventtype == VFS_USER_EVENT) {
5806                 error = mac_mount_check_getattr(ctx, mp, &va);
5807                 if (error != 0) {
5808                         return error;
5809                 }
5810         }
5811 #endif
5812         /*
5813          * Unpack into the per-mount structure.
5814          *
5815          * We only overwrite these fields, which are likely to change:
5816          *      f_blocks
5817          *      f_bfree
5818          *      f_bavail
5819          *      f_bused
5820          *      f_files
5821          *      f_ffree
5822          *
5823          * And these which are not, but which the FS has no other way
5824          * of providing to us:
5825          *      f_bsize
5826          *      f_iosize
5827          *      f_fssubtype
5828          *
5829          */
5830         if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
5831                 /* 4822056 - protect against malformed server mount */
5832                 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
5833         } else {
5834                 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
5835         }
5836         if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
5837                 mp->mnt_vfsstat.f_iosize = va.f_iosize;
5838         } else {
5839                 mp->mnt_vfsstat.f_iosize = 1024 * 1024;         /* 1MB sensible I/O size */
5840         }
5841         if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) {
5842                 mp->mnt_vfsstat.f_blocks = va.f_blocks;
5843         }
5844         if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
5845                 mp->mnt_vfsstat.f_bfree = va.f_bfree;
5846         }
5847         if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) {
5848                 mp->mnt_vfsstat.f_bavail = va.f_bavail;
5849         }
5850         if (VFSATTR_IS_SUPPORTED(&va, f_bused)) {
5851                 mp->mnt_vfsstat.f_bused = va.f_bused;
5852         }
5853         if (VFSATTR_IS_SUPPORTED(&va, f_files)) {
5854                 mp->mnt_vfsstat.f_files = va.f_files;
5855         }
5856         if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) {
5857                 mp->mnt_vfsstat.f_ffree = va.f_ffree;
5858         }
5859
5860         /* this is unlikely to change, but has to be queried for */
5861         if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) {
5862                 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
5863         }
5864
5865         return 0;
5866 }
5867
5868 int
5869 mount_list_add(mount_t mp)
5870 {
5871         int res;
5872
5873         mount_list_lock();
5874         if (system_inshutdown != 0) {
5875                 res = -1;
5876         } else {
5877                 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
5878                 nummounts++;
5879                 res = 0;
5880         }
5881         mount_list_unlock();
5882
5883         return res;
5884 }
5885
5886 void
5887 mount_list_remove(mount_t mp)
5888 {
5889         mount_list_lock();
5890         TAILQ_REMOVE(&mountlist, mp, mnt_list);
5891         nummounts--;
5892         mp->mnt_list.tqe_next = NULL;
5893         mp->mnt_list.tqe_prev = NULL;
5894         mount_list_unlock();
5895 }
5896
5897 mount_t
5898 mount_lookupby_volfsid(int volfs_id, int withref)
5899 {
5900         mount_t cur_mount = (mount_t)0;
5901         mount_t mp;
5902
5903         mount_list_lock();
5904         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
5905                 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
5906                     (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
5907                     (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
5908                         cur_mount = mp;
5909                         if (withref) {
5910                                 if (mount_iterref(cur_mount, 1)) {
5911                                         cur_mount = (mount_t)0;
5912                                         mount_list_unlock();
5913                                         goto out;
5914                                 }
5915                         }
5916                         break;
5917                 }
5918         }
5919         mount_list_unlock();
5920         if (withref && (cur_mount != (mount_t)0)) {
5921                 mp = cur_mount;
5922                 if (vfs_busy(mp, LK_NOWAIT) != 0) {
5923                         cur_mount = (mount_t)0;
5924                 }
5925                 mount_iterdrop(mp);
5926         }
5927 out:
5928         return cur_mount;
5929 }
5930
5931 mount_t
5932 mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
5933 {
5934         mount_t retmp = (mount_t)0;
5935         mount_t mp;
5936
5937         if (!locked) {
5938                 mount_list_lock();
5939         }
5940         TAILQ_FOREACH(mp, &mountlist, mnt_list)
5941         if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
5942             mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
5943                 retmp = mp;
5944                 if (withref) {
5945                         if (mount_iterref(retmp, 1)) {
5946                                 retmp = (mount_t)0;
5947                         }
5948                 }
5949                 goto out;
5950         }
5951 out:
5952         if (!locked) {
5953                 mount_list_unlock();
5954         }
5955         return retmp;
5956 }
5957
5958 errno_t
5959 vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx,
5960     vnode_t start_dvp)
5961 {
5962         struct nameidata nd;
5963         int error;
5964         u_int32_t ndflags = 0;
5965
5966         if (ctx == NULL) {
5967                 return EINVAL;
5968         }
5969
5970         if (flags & VNODE_LOOKUP_NOFOLLOW) {
5971                 ndflags = NOFOLLOW;
5972         } else {
5973                 ndflags = FOLLOW;
5974         }
5975
5976         if (flags & VNODE_LOOKUP_NOCROSSMOUNT) {
5977                 ndflags |= NOCROSSMOUNT;
5978         }
5979
5980         if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
5981                 ndflags |= CN_NBMOUNTLOOK;
5982         }
5983
5984         /* XXX AUDITVNPATH1 needed ? */
5985         NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
5986             CAST_USER_ADDR_T(path), ctx);
5987
5988         if (start_dvp && (path[0] != '/')) {
5989                 nd.ni_dvp = start_dvp;
5990                 nd.ni_cnd.cn_flags |= USEDVP;
5991                 /* Don't take proc lock vnode_lookupat with a startdir specified */
5992                 nd.ni_flag |=  NAMEI_NOPROCLOCK;
5993         }
5994
5995         if ((error = namei(&nd))) {
5996                 return error;
5997         }
5998
5999         nd.ni_cnd.cn_flags &= ~USEDVP;
6000
6001         *vpp = nd.ni_vp;
6002         nameidone(&nd);
6003
6004         return 0;
6005 }
6006
6007 errno_t
6008 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
6009 {
6010         return vnode_lookupat(path, flags, vpp, ctx, NULLVP);
6011 }
6012
6013 errno_t
6014 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
6015 {
6016         struct nameidata nd;
6017         int error;
6018         u_int32_t ndflags = 0;
6019         int lflags = flags;
6020
6021         if (ctx == NULL) {              /* XXX technically an error */
6022                 ctx = vfs_context_current();
6023         }
6024
6025         if (fmode & O_NOFOLLOW) {
6026                 lflags |= VNODE_LOOKUP_NOFOLLOW;
6027         }
6028
6029         if (lflags & VNODE_LOOKUP_NOFOLLOW) {
6030                 ndflags = NOFOLLOW;
6031         } else {
6032                 ndflags = FOLLOW;
6033         }
6034
6035         if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) {
6036                 ndflags |= NOCROSSMOUNT;
6037         }
6038
6039         if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
6040                 ndflags |= CN_NBMOUNTLOOK;
6041         }
6042
6043         /* XXX AUDITVNPATH1 needed ? */
6044         NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
6045             CAST_USER_ADDR_T(path), ctx);
6046
6047         if ((error = vn_open(&nd, fmode, cmode))) {
6048                 *vpp = NULL;
6049         } else {
6050                 *vpp = nd.ni_vp;
6051         }
6052
6053         return error;
6054 }
6055
6056 errno_t
6057 vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
6058 {
6059         int error;
6060
6061         if (ctx == NULL) {
6062                 ctx = vfs_context_current();
6063         }
6064
6065         error = vn_close(vp, flags, ctx);
6066         vnode_put(vp);
6067         return error;
6068 }
6069
6070 errno_t
6071 vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
6072 {
6073         struct vnode_attr       va;
6074         int                     error;
6075
6076         VATTR_INIT(&va);
6077         VATTR_WANTED(&va, va_modify_time);
6078         error = vnode_getattr(vp, &va, ctx);
6079         if (!error) {
6080                 *mtime = va.va_modify_time;
6081         }
6082         return error;
6083 }
6084
6085 errno_t
6086 vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
6087 {
6088         struct vnode_attr       va;
6089         int                     error;
6090
6091         VATTR_INIT(&va);
6092         VATTR_WANTED(&va, va_flags);
6093         error = vnode_getattr(vp, &va, ctx);
6094         if (!error) {
6095                 *flags = va.va_flags;
6096         }
6097         return error;
6098 }
6099
6100 /*
6101  * Returns:     0                       Success
6102  *      vnode_getattr:???
6103  */
6104 errno_t
6105 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
6106 {
6107         struct vnode_attr       va;
6108         int                     error;
6109
6110         VATTR_INIT(&va);
6111         VATTR_WANTED(&va, va_data_size);
6112         error = vnode_getattr(vp, &va, ctx);
6113         if (!error) {
6114                 *sizep = va.va_data_size;
6115         }
6116         return error;
6117 }
6118
6119 errno_t
6120 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
6121 {
6122         struct vnode_attr       va;
6123
6124         VATTR_INIT(&va);
6125         VATTR_SET(&va, va_data_size, size);
6126         va.va_vaflags = ioflag & 0xffff;
6127         return vnode_setattr(vp, &va, ctx);
6128 }
6129
6130 int
6131 vnode_setdirty(vnode_t vp)
6132 {
6133         vnode_lock_spin(vp);
6134         vp->v_flag |= VISDIRTY;
6135         vnode_unlock(vp);
6136         return 0;
6137 }
6138
6139 int
6140 vnode_cleardirty(vnode_t vp)
6141 {
6142         vnode_lock_spin(vp);
6143         vp->v_flag &= ~VISDIRTY;
6144         vnode_unlock(vp);
6145         return 0;
6146 }
6147
6148 int
6149 vnode_isdirty(vnode_t vp)
6150 {
6151         int dirty;
6152
6153         vnode_lock_spin(vp);
6154         dirty = (vp->v_flag & VISDIRTY) ? 1 : 0;
6155         vnode_unlock(vp);
6156
6157         return dirty;
6158 }
6159
6160 static int
6161 vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
6162 {
6163         /* Only use compound VNOP for compound operation */
6164         if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
6165                 *vpp = NULLVP;
6166                 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
6167         } else {
6168                 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
6169         }
6170 }
6171
6172 /*
6173  * Create a filesystem object of arbitrary type with arbitrary attributes in
6174  * the spevied directory with the specified name.
6175  *
6176  * Parameters:  dvp                     Pointer to the vnode of the directory
6177  *                                      in which to create the object.
6178  *              vpp                     Pointer to the area into which to
6179  *                                      return the vnode of the created object.
6180  *              cnp                     Component name pointer from the namei
6181  *                                      data structure, containing the name to
6182  *                                      use for the create object.
6183  *              vap                     Pointer to the vnode_attr structure
6184  *                                      describing the object to be created,
6185  *                                      including the type of object.
6186  *              flags                   VN_* flags controlling ACL inheritance
6187  *                                      and whether or not authorization is to
6188  *                                      be required for the operation.
6189  *
6190  * Returns:     0                       Success
6191  *              !0                      errno value
6192  *
6193  * Implicit:    *vpp                    Contains the vnode of the object that
6194  *                                      was created, if successful.
6195  *              *cnp                    May be modified by the underlying VFS.
6196  *              *vap                    May be modified by the underlying VFS.
6197  *                                      modified by either ACL inheritance or
6198  *
6199  *
6200  *                                      be modified, even if the operation is
6201  *
6202  *
6203  * Notes:       The kauth_filesec_t in 'vap', if any, is in host byte order.
6204  *
6205  *              Modification of '*cnp' and '*vap' by the underlying VFS is
6206  *              strongly discouraged.
6207  *
6208  * XXX:         This function is a 'vn_*' function; it belongs in vfs_vnops.c
6209  *
6210  * XXX:         We should enummerate the possible errno values here, and where
6211  *              in the code they originated.
6212  */
6213 errno_t
6214 vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
6215 {
6216         errno_t error, old_error;
6217         vnode_t vp = (vnode_t)0;
6218         boolean_t batched;
6219         struct componentname *cnp;
6220         uint32_t defaulted;
6221
6222         cnp = &ndp->ni_cnd;
6223         error = 0;
6224         batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
6225
6226         KAUTH_DEBUG("%p    CREATE - '%s'", dvp, cnp->cn_nameptr);
6227
6228         if (flags & VN_CREATE_NOINHERIT) {
6229                 vap->va_vaflags |= VA_NOINHERIT;
6230         }
6231         if (flags & VN_CREATE_NOAUTH) {
6232                 vap->va_vaflags |= VA_NOAUTH;
6233         }
6234         /*
6235          * Handle ACL inheritance, initialize vap.
6236          */
6237         error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
6238         if (error) {
6239                 return error;
6240         }
6241
6242         if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
6243                 panic("Open parameters, but not a regular file.");
6244         }
6245         if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
6246                 panic("Mode for open, but not trying to open...");
6247         }
6248
6249
6250         /*
6251          * Create the requested node.
6252          */
6253         switch (vap->va_type) {
6254         case VREG:
6255                 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
6256                 break;
6257         case VDIR:
6258                 error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
6259                 break;
6260         case VSOCK:
6261         case VFIFO:
6262         case VBLK:
6263         case VCHR:
6264                 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
6265                 break;
6266         default:
6267                 panic("vnode_create: unknown vtype %d", vap->va_type);
6268         }
6269         if (error != 0) {
6270                 KAUTH_DEBUG("%p    CREATE - error %d returned by filesystem", dvp, error);
6271                 goto out;
6272         }
6273
6274         vp = *vpp;
6275         old_error = error;
6276
6277         /*
6278          * If some of the requested attributes weren't handled by the VNOP,
6279          * use our fallback code.
6280          */
6281         if ((error == 0) && !VATTR_ALL_SUPPORTED(vap) && *vpp) {
6282                 KAUTH_DEBUG("     CREATE - doing fallback with ACL %p", vap->va_acl);
6283                 error = vnode_setattr_fallback(*vpp, vap, ctx);
6284         }
6285
6286 #if CONFIG_MACF
6287         if ((error == 0) && !(flags & VN_CREATE_NOLABEL)) {
6288                 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
6289         }
6290 #endif
6291
6292         if ((error != 0) && (vp != (vnode_t)0)) {
6293                 /* If we've done a compound open, close */
6294                 if (batched && (old_error == 0) && (vap->va_type == VREG)) {
6295                         VNOP_CLOSE(vp, fmode, ctx);
6296                 }
6297
6298                 /* Need to provide notifications if a create succeeded */
6299                 if (!batched) {
6300                         *vpp = (vnode_t) 0;
6301                         vnode_put(vp);
6302                         vp = NULLVP;
6303                 }
6304         }
6305
6306         /*
6307          * For creation VNOPs, this is the equivalent of
6308          * lookup_handle_found_vnode.
6309          */
6310         if (kdebug_enable && *vpp) {
6311                 kdebug_lookup(*vpp, cnp);
6312         }
6313
6314 out:
6315         vn_attribute_cleanup(vap, defaulted);
6316
6317         return error;
6318 }
6319
6320 static kauth_scope_t    vnode_scope;
6321 static int      vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
6322     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
6323 static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
6324     vnode_t vp, vnode_t dvp, int *errorp);
6325
6326 typedef struct _vnode_authorize_context {
6327         vnode_t         vp;
6328         struct vnode_attr *vap;
6329         vnode_t         dvp;
6330         struct vnode_attr *dvap;
6331         vfs_context_t   ctx;
6332         int             flags;
6333         int             flags_valid;
6334 #define _VAC_IS_OWNER           (1<<0)
6335 #define _VAC_IN_GROUP           (1<<1)
6336 #define _VAC_IS_DIR_OWNER       (1<<2)
6337 #define _VAC_IN_DIR_GROUP       (1<<3)
6338 #define _VAC_NO_VNODE_POINTERS  (1<<4)
6339 } *vauth_ctx;
6340
6341 void
6342 vnode_authorize_init(void)
6343 {
6344         vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
6345 }
6346
/*
 * Bits of the "defaulted fields" mask that vn_attribute_prepare() hands back
 * through its defaulted_fieldsp argument and that vn_attribute_cleanup()
 * consumes: for each bit set, cleanup clears the active bit of the
 * corresponding vnode_attr field (va_uid, va_gid, va_mode).
 */
#define VATTR_PREPARE_DEFAULTED_UID     0x1
#define VATTR_PREPARE_DEFAULTED_GID     0x2
#define VATTR_PREPARE_DEFAULTED_MODE    0x4
6350
6351 int
6352 vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
6353 {
6354         kauth_acl_t nacl = NULL, oacl = NULL;
6355         int error;
6356
6357         /*
6358          * Handle ACL inheritance.
6359          */
6360         if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
6361                 /* save the original filesec */
6362                 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6363                         oacl = vap->va_acl;
6364                 }
6365
6366                 vap->va_acl = NULL;
6367                 if ((error = kauth_acl_inherit(dvp,
6368                     oacl,
6369                     &nacl,
6370                     vap->va_type == VDIR,
6371                     ctx)) != 0) {
6372                         KAUTH_DEBUG("%p    CREATE - error %d processing inheritance", dvp, error);
6373                         return error;
6374                 }
6375
6376                 /*
6377                  * If the generated ACL is NULL, then we can save ourselves some effort
6378                  * by clearing the active bit.
6379                  */
6380                 if (nacl == NULL) {
6381                         VATTR_CLEAR_ACTIVE(vap, va_acl);
6382                 } else {
6383                         vap->va_base_acl = oacl;
6384                         VATTR_SET(vap, va_acl, nacl);
6385                 }
6386         }
6387
6388         error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
6389         if (error) {
6390                 vn_attribute_cleanup(vap, *defaulted_fieldsp);
6391         }
6392
6393         return error;
6394 }
6395
6396 void
6397 vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
6398 {
6399         /*
6400          * If the caller supplied a filesec in vap, it has been replaced
6401          * now by the post-inheritance copy.  We need to put the original back
6402          * and free the inherited product.
6403          */
6404         kauth_acl_t nacl, oacl;
6405
6406         if (VATTR_IS_ACTIVE(vap, va_acl)) {
6407                 nacl = vap->va_acl;
6408                 oacl = vap->va_base_acl;
6409
6410                 if (oacl) {
6411                         VATTR_SET(vap, va_acl, oacl);
6412                         vap->va_base_acl = NULL;
6413                 } else {
6414                         VATTR_CLEAR_ACTIVE(vap, va_acl);
6415                 }
6416
6417                 if (nacl != NULL) {
6418                         kauth_acl_free(nacl);
6419                 }
6420         }
6421
6422         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
6423                 VATTR_CLEAR_ACTIVE(vap, va_mode);
6424         }
6425         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
6426                 VATTR_CLEAR_ACTIVE(vap, va_gid);
6427         }
6428         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
6429                 VATTR_CLEAR_ACTIVE(vap, va_uid);
6430         }
6431
6432         return;
6433 }
6434
6435 int
6436 vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved)
6437 {
6438 #if !CONFIG_MACF
6439 #pragma unused(cnp)
6440 #endif
6441         int error = 0;
6442
6443         /*
6444          * Normally, unlinking of directories is not supported.
6445          * However, some file systems may have limited support.
6446          */
6447         if ((vp->v_type == VDIR) &&
6448             !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
6449                 return EPERM; /* POSIX */
6450         }
6451
6452         /* authorize the delete operation */
6453 #if CONFIG_MACF
6454         if (!error) {
6455                 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
6456         }
6457 #endif /* MAC */
6458         if (!error) {
6459                 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6460         }
6461
6462         return error;
6463 }
6464
6465 int
6466 vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved)
6467 {
6468         /* Open of existing case */
6469         kauth_action_t action;
6470         int error = 0;
6471         if (cnp->cn_ndp == NULL) {
6472                 panic("NULL ndp");
6473         }
6474         if (reserved != NULL) {
6475                 panic("reserved not NULL.");
6476         }
6477
6478 #if CONFIG_MACF
6479         /* XXX may do duplicate work here, but ignore that for now (idempotent) */
6480         if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
6481                 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
6482                 if (error) {
6483                         return error;
6484                 }
6485         }
6486 #endif
6487
6488         if ((fmode & O_DIRECTORY) && vp->v_type != VDIR) {
6489                 return ENOTDIR;
6490         }
6491
6492         if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
6493                 return EOPNOTSUPP;    /* Operation not supported on socket */
6494         }
6495
6496         if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
6497                 return ELOOP;         /* O_NOFOLLOW was specified and the target is a symbolic link */
6498         }
6499
6500         /* disallow write operations on directories */
6501         if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
6502                 return EISDIR;
6503         }
6504
6505         if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
6506                 if (vp->v_type != VDIR) {
6507                         return ENOTDIR;
6508                 }
6509         }
6510
6511 #if CONFIG_MACF
6512         /* If a file being opened is a shadow file containing
6513          * namedstream data, ignore the macf checks because it
6514          * is a kernel internal file and access should always
6515          * be allowed.
6516          */
6517         if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
6518                 error = mac_vnode_check_open(ctx, vp, fmode);
6519                 if (error) {
6520                         return error;
6521                 }
6522         }
6523 #endif
6524
6525         /* compute action to be authorized */
6526         action = 0;
6527         if (fmode & FREAD) {
6528                 action |= KAUTH_VNODE_READ_DATA;
6529         }
6530         if (fmode & (FWRITE | O_TRUNC)) {
6531                 /*
6532                  * If we are writing, appending, and not truncating,
6533                  * indicate that we are appending so that if the
6534                  * UF_APPEND or SF_APPEND bits are set, we do not deny
6535                  * the open.
6536                  */
6537                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
6538                         action |= KAUTH_VNODE_APPEND_DATA;
6539                 } else {
6540                         action |= KAUTH_VNODE_WRITE_DATA;
6541                 }
6542         }
6543         error = vnode_authorize(vp, NULL, action, ctx);
6544 #if NAMEDSTREAMS
6545         if (error == EACCES) {
6546                 /*
6547                  * Shadow files may exist on-disk with a different UID/GID
6548                  * than that of the current context.  Verify that this file
6549                  * is really a shadow file.  If it was created successfully
6550                  * then it should be authorized.
6551                  */
6552                 if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) {
6553                         error = vnode_verifynamedstream(vp);
6554                 }
6555         }
6556 #endif
6557
6558         return error;
6559 }
6560
6561 int
6562 vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6563 {
6564 #if !CONFIG_MACF
6565 #pragma unused(vap)
6566 #endif
6567         /* Creation case */
6568         int error;
6569
6570         if (cnp->cn_ndp == NULL) {
6571                 panic("NULL cn_ndp");
6572         }
6573         if (reserved != NULL) {
6574                 panic("reserved not NULL.");
6575         }
6576
6577         /* Only validate path for creation if we didn't do a complete lookup */
6578         if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
6579                 error = lookup_validate_creation_path(cnp->cn_ndp);
6580                 if (error) {
6581                         return error;
6582                 }
6583         }
6584
6585 #if CONFIG_MACF
6586         error = mac_vnode_check_create(ctx, dvp, cnp, vap);
6587         if (error) {
6588                 return error;
6589         }
6590 #endif /* CONFIG_MACF */
6591
6592         return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
6593 }
6594
6595 int
6596 vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6597     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6598     vfs_context_t ctx, void *reserved)
6599 {
6600         return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
6601 }
6602
6603 int
6604 vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6605     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6606     vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6607 {
6608         return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
6609 }
6610
6611 int
6612 vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path,
6613     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path,
6614     vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6615 {
6616         int error = 0;
6617         int moving = 0;
6618         bool swap = flags & VFS_RENAME_SWAP;
6619
6620         if (reserved != NULL) {
6621                 panic("Passed something other than NULL as reserved field!");
6622         }
6623
6624         /*
6625          * Avoid renaming "." and "..".
6626          *
6627          * XXX No need to check for this in the FS.  We should always have the leaves
6628          * in VFS in this case.
6629          */
6630         if (fvp->v_type == VDIR &&
6631             ((fdvp == fvp) ||
6632             (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
6633             ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) {
6634                 error = EINVAL;
6635                 goto out;
6636         }
6637
6638         if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
6639                 error = lookup_validate_creation_path(tcnp->cn_ndp);
6640                 if (error) {
6641                         goto out;
6642                 }
6643         }
6644
6645         /***** <MACF> *****/
6646 #if CONFIG_MACF
6647         error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
6648         if (error) {
6649                 goto out;
6650         }
6651         if (swap) {
6652                 error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
6653                 if (error) {
6654                         goto out;
6655                 }
6656         }
6657 #endif
6658         /***** </MACF> *****/
6659
6660         /***** <MiscChecks> *****/
6661         if (tvp != NULL) {
6662                 if (!swap) {
6663                         if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
6664                                 error = ENOTDIR;
6665                                 goto out;
6666                         } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
6667                                 error = EISDIR;
6668                                 goto out;
6669                         }
6670                 }
6671         } else if (swap) {
6672                 /*
6673                  * Caller should have already checked this and returned
6674                  * ENOENT.  If we send back ENOENT here, caller will retry
6675                  * which isn't what we want so we send back EINVAL here
6676                  * instead.
6677                  */
6678                 error = EINVAL;
6679                 goto out;
6680         }
6681
6682         if (fvp == tdvp) {
6683                 error = EINVAL;
6684                 goto out;
6685         }
6686
6687         /*
6688          * The following edge case is caught here:
6689          * (to cannot be a descendent of from)
6690          *
6691          *       o fdvp
6692          *      /
6693          *     /
6694          *    o fvp
6695          *     \
6696          *      \
6697          *       o tdvp
6698          *      /
6699          *     /
6700          *    o tvp
6701          */
6702         if (tdvp->v_parent == fvp) {
6703                 error = EINVAL;
6704                 goto out;
6705         }
6706
6707         if (swap && fdvp->v_parent == tvp) {
6708                 error = EINVAL;
6709                 goto out;
6710         }
6711         /***** </MiscChecks> *****/
6712
6713         /***** <Kauth> *****/
6714
6715         /*
6716          * As part of the Kauth step, we call out to allow 3rd-party
6717          * fileop notification of "about to rename".  This is needed
6718          * in the event that 3rd-parties need to know that the DELETE
6719          * authorization is actually part of a rename.  It's important
6720          * that we guarantee that the DELETE call-out will always be
6721          * made if the WILL_RENAME call-out is made.  Another fileop
6722          * call-out will be performed once the operation is completed.
6723          * We can ignore the result of kauth_authorize_fileop().
6724          *
6725          * N.B. We are passing the vnode and *both* paths to each
6726          * call; kauth_authorize_fileop() extracts the "from" path
6727          * when posting a KAUTH_FILEOP_WILL_RENAME notification.
6728          * As such, we only post these notifications if all of the
6729          * information we need is provided.
6730          */
6731
6732         if (swap) {
6733                 kauth_action_t f = 0, t = 0;
6734
6735                 /*
6736                  * Directories changing parents need ...ADD_SUBDIR...  to
6737                  * permit changing ".."
6738                  */
6739                 if (fdvp != tdvp) {
6740                         if (vnode_isdir(fvp)) {
6741                                 f = KAUTH_VNODE_ADD_SUBDIRECTORY;
6742                         }
6743                         if (vnode_isdir(tvp)) {
6744                                 t = KAUTH_VNODE_ADD_SUBDIRECTORY;
6745                         }
6746                 }
6747                 if (to_path != NULL) {
6748                         kauth_authorize_fileop(vfs_context_ucred(ctx),
6749                             KAUTH_FILEOP_WILL_RENAME,
6750                             (uintptr_t)fvp,
6751                             (uintptr_t)to_path);
6752                 }
6753                 error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
6754                 if (error) {
6755                         goto out;
6756                 }
6757                 if (from_path != NULL) {
6758                         kauth_authorize_fileop(vfs_context_ucred(ctx),
6759                             KAUTH_FILEOP_WILL_RENAME,
6760                             (uintptr_t)tvp,
6761                             (uintptr_t)from_path);
6762                 }
6763                 error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
6764                 if (error) {
6765                         goto out;
6766                 }
6767                 f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6768                 t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6769                 if (fdvp == tdvp) {
6770                         error = vnode_authorize(fdvp, NULL, f | t, ctx);
6771                 } else {
6772                         error = vnode_authorize(fdvp, NULL, t, ctx);
6773                         if (error) {
6774                                 goto out;
6775                         }
6776                         error = vnode_authorize(tdvp, NULL, f, ctx);
6777                 }
6778                 if (error) {
6779                         goto out;
6780                 }
6781         } else {
6782                 error = 0;
6783                 if ((tvp != NULL) && vnode_isdir(tvp)) {
6784                         if (tvp != fdvp) {
6785                                 moving = 1;
6786                         }
6787                 } else if (tdvp != fdvp) {
6788                         moving = 1;
6789                 }
6790
6791                 /*
6792                  * must have delete rights to remove the old name even in
6793                  * the simple case of fdvp == tdvp.
6794                  *
6795                  * If fvp is a directory, and we are changing it's parent,
6796                  * then we also need rights to rewrite its ".." entry as well.
6797                  */
6798                 if (to_path != NULL) {
6799                         kauth_authorize_fileop(vfs_context_ucred(ctx),
6800                             KAUTH_FILEOP_WILL_RENAME,
6801                             (uintptr_t)fvp,
6802                             (uintptr_t)to_path);
6803                 }
6804                 if (vnode_isdir(fvp)) {
6805                         if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6806                                 goto out;
6807                         }
6808                 } else {
6809                         if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
6810                                 goto out;
6811                         }
6812                 }
6813                 if (moving) {
6814                         /* moving into tdvp or tvp, must have rights to add */
6815                         if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
6816                             NULL,
6817                             vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
6818                             ctx)) != 0) {
6819                                 goto out;
6820                         }
6821                 } else {
6822                         /* node staying in same directory, must be allowed to add new name */
6823                         if ((error = vnode_authorize(fdvp, NULL,
6824                             vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
6825                                 goto out;
6826                         }
6827                 }
6828                 /* overwriting tvp */
6829                 if ((tvp != NULL) && !vnode_isdir(tvp) &&
6830                     ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
6831                         goto out;
6832                 }
6833         }
6834
6835         /***** </Kauth> *****/
6836
6837         /* XXX more checks? */
6838 out:
6839         return error;
6840 }
6841
6842 int
6843 vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6844 {
6845 #if !CONFIG_MACF
6846 #pragma unused(vap)
6847 #endif
6848         int error;
6849
6850         if (reserved != NULL) {
6851                 panic("reserved not NULL in vn_authorize_mkdir()");
6852         }
6853
6854         /* XXX A hack for now, to make shadow files work */
6855         if (cnp->cn_ndp == NULL) {
6856                 return 0;
6857         }
6858
6859         if (vnode_compound_mkdir_available(dvp)) {
6860                 error = lookup_validate_creation_path(cnp->cn_ndp);
6861                 if (error) {
6862                         goto out;
6863                 }
6864         }
6865
6866 #if CONFIG_MACF
6867         error = mac_vnode_check_create(ctx,
6868             dvp, cnp, vap);
6869         if (error) {
6870                 goto out;
6871         }
6872 #endif
6873
6874         /* authorize addition of a directory to the parent */
6875         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6876                 goto out;
6877         }
6878
6879 out:
6880         return error;
6881 }
6882
6883 int
6884 vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved)
6885 {
6886 #if CONFIG_MACF
6887         int error;
6888 #else
6889 #pragma unused(cnp)
6890 #endif
6891         if (reserved != NULL) {
6892                 panic("Non-NULL reserved argument to vn_authorize_rmdir()");
6893         }
6894
6895         if (vp->v_type != VDIR) {
6896                 /*
6897                  * rmdir only deals with directories
6898                  */
6899                 return ENOTDIR;
6900         }
6901
6902         if (dvp == vp) {
6903                 /*
6904                  * No rmdir "." please.
6905                  */
6906                 return EINVAL;
6907         }
6908
6909 #if CONFIG_MACF
6910         error = mac_vnode_check_unlink(ctx, dvp,
6911             vp, cnp);
6912         if (error) {
6913                 return error;
6914         }
6915 #endif
6916
6917         return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6918 }
6919
6920 /*
6921  * Authorizer for directory cloning. This does not use vnodes but instead
6922  * uses prefilled vnode attributes from the filesystem.
6923  *
6924  * The same function is called to set up the attributes required, perform the
6925  * authorization and cleanup (if required)
6926  */
6927 int
6928 vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
6929     struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
6930     dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
6931     __unused void *reserved)
6932 {
6933         int error;
6934         int is_suser = vfs_context_issuser(ctx);
6935
6936         if (vattr_op == OP_VATTR_SETUP) {
6937                 VATTR_INIT(vap);
6938
6939                 /*
6940                  * When ACL inheritence is implemented, both vap->va_acl and
6941                  * dvap->va_acl will be required (even as superuser).
6942                  */
6943                 VATTR_WANTED(vap, va_type);
6944                 VATTR_WANTED(vap, va_mode);
6945                 VATTR_WANTED(vap, va_flags);
6946                 VATTR_WANTED(vap, va_uid);
6947                 VATTR_WANTED(vap, va_gid);
6948                 if (dvap) {
6949                         VATTR_INIT(dvap);
6950                         VATTR_WANTED(dvap, va_flags);
6951                 }
6952
6953                 if (!is_suser) {
6954                         /*
6955                          * If not superuser, we have to evaluate ACLs and
6956                          * need the target directory gid to set the initial
6957                          * gid of the new object.
6958                          */
6959                         VATTR_WANTED(vap, va_acl);
6960                         if (dvap) {
6961                                 VATTR_WANTED(dvap, va_gid);
6962                         }
6963                 } else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6964                         VATTR_WANTED(dvap, va_gid);
6965                 }
6966                 return 0;
6967         } else if (vattr_op == OP_VATTR_CLEANUP) {
6968                 return 0; /* Nothing to do for now */
6969         }
6970
6971         /* dvap isn't used for authorization */
6972         error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
6973
6974         if (error) {
6975                 return error;
6976         }
6977
6978         /*
6979          * vn_attribute_prepare should be able to accept attributes as well as
6980          * vnodes but for now we do this inline.
6981          */
6982         if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6983                 /*
6984                  * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
6985                  * owner is set, that owner takes ownership of all new files.
6986                  */
6987                 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6988                     (mp->mnt_fsowner != KAUTH_UID_NONE)) {
6989                         VATTR_SET(vap, va_uid, mp->mnt_fsowner);
6990                 } else {
6991                         /* default owner is current user */
6992                         VATTR_SET(vap, va_uid,
6993                             kauth_cred_getuid(vfs_context_ucred(ctx)));
6994                 }
6995
6996                 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6997                     (mp->mnt_fsgroup != KAUTH_GID_NONE)) {
6998                         VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
6999                 } else {
7000                         /*
7001                          * default group comes from parent object,
7002                          * fallback to current user
7003                          */
7004                         if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
7005                                 VATTR_SET(vap, va_gid, dvap->va_gid);
7006                         } else {
7007                                 VATTR_SET(vap, va_gid,
7008                                     kauth_cred_getgid(vfs_context_ucred(ctx)));
7009                         }
7010                 }
7011         }
7012
7013         /* Inherit SF_RESTRICTED bit from destination directory only */
7014         if (VATTR_IS_ACTIVE(vap, va_flags)) {
7015                 VATTR_SET(vap, va_flags,
7016                     ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */
7017                 if (VATTR_IS_ACTIVE(dvap, va_flags)) {
7018                         VATTR_SET(vap, va_flags,
7019                             vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
7020                 }
7021         } else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
7022                 VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
7023         }
7024
7025         return 0;
7026 }
7027
7028
7029 /*
7030  * Authorize an operation on a vnode.
7031  *
7032  * This is KPI, but here because it needs vnode_scope.
7033  *
7034  * Returns:     0                       Success
7035  *      kauth_authorize_action:EPERM    ...
7036  *      xlate => EACCES                 Permission denied
7037  *      kauth_authorize_action:0        Success
7038  *      kauth_authorize_action:         Depends on callback return; this is
7039  *                                      usually only vnode_authorize_callback(),
7040  *                                      but may include other listerners, if any
7041  *                                      exist.
7042  *              EROFS
7043  *              EACCES
7044  *              EPERM
7045  *              ???
7046  */
7047 int
7048 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
7049 {
7050         int     error, result;
7051
7052         /*
7053          * We can't authorize against a dead vnode; allow all operations through so that
7054          * the correct error can be returned.
7055          */
7056         if (vp->v_type == VBAD) {
7057                 return 0;
7058         }
7059
7060         error = 0;
7061         result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
7062             (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
7063         if (result == EPERM) {          /* traditional behaviour */
7064                 result = EACCES;
7065         }
7066         /* did the lower layers give a better error return? */
7067         if ((result != 0) && (error != 0)) {
7068                 return error;
7069         }
7070         return result;
7071 }
7072
7073 /*
7074  * Test for vnode immutability.
7075  *
7076  * The 'append' flag is set when the authorization request is constrained
7077  * to operations which only request the right to append to a file.
7078  *
7079  * The 'ignore' flag is set when an operation modifying the immutability flags
7080  * is being authorized.  We check the system securelevel to determine which
7081  * immutability flags we can ignore.
7082  */
7083 static int
7084 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
7085 {
7086         int     mask;
7087
7088         /* start with all bits precluding the operation */
7089         mask = IMMUTABLE | APPEND;
7090
7091         /* if appending only, remove the append-only bits */
7092         if (append) {
7093                 mask &= ~APPEND;
7094         }
7095
7096         /* ignore only set when authorizing flags changes */
7097         if (ignore) {
7098                 if (securelevel <= 0) {
7099                         /* in insecure state, flags do not inhibit changes */
7100                         mask = 0;
7101                 } else {
7102                         /* in secure state, user flags don't inhibit */
7103                         mask &= ~(UF_IMMUTABLE | UF_APPEND);
7104                 }
7105         }
7106         KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
7107         if ((vap->va_flags & mask) != 0) {
7108                 return EPERM;
7109         }
7110         return 0;
7111 }
7112
7113 static int
7114 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
7115 {
7116         int result;
7117
7118         /* default assumption is not-owner */
7119         result = 0;
7120
7121         /*
7122          * If the filesystem has given us a UID, we treat this as authoritative.
7123          */
7124         if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
7125                 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
7126         }
7127         /* we could test the owner UUID here if we had a policy for it */
7128
7129         return result;
7130 }
7131
7132 /*
7133  * vauth_node_group
7134  *
7135  * Description: Ask if a cred is a member of the group owning the vnode object
7136  *
7137  * Parameters:          vap             vnode attribute
7138  *                              vap->va_gid     group owner of vnode object
7139  *                      cred            credential to check
7140  *                      ismember        pointer to where to put the answer
7141  *                      idontknow       Return this if we can't get an answer
7142  *
7143  * Returns:             0               Success
7144  *                      idontknow       Can't get information
7145  *      kauth_cred_ismember_gid:?       Error from kauth subsystem
7146  *      kauth_cred_ismember_gid:?       Error from kauth subsystem
7147  */
7148 static int
7149 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow)
7150 {
7151         int     error;
7152         int     result;
7153
7154         error = 0;
7155         result = 0;
7156
7157         /*
7158          * The caller is expected to have asked the filesystem for a group
7159          * at some point prior to calling this function.  The answer may
7160          * have been that there is no group ownership supported for the
7161          * vnode object, in which case we return
7162          */
7163         if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
7164                 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
7165                 /*
7166                  * Credentials which are opted into external group membership
7167                  * resolution which are not known to the external resolver
7168                  * will result in an ENOENT error.  We translate this into
7169                  * the appropriate 'idontknow' response for our caller.
7170                  *
7171                  * XXX We do not make a distinction here between an ENOENT
7172                  * XXX arising from a response from the external resolver,
7173                  * XXX and an ENOENT which is internally generated.  This is
7174                  * XXX a deficiency of the published kauth_cred_ismember_gid()
7175                  * XXX KPI which can not be overcome without new KPI.  For
7176                  * XXX all currently known cases, however, this wil result
7177                  * XXX in correct behaviour.
7178                  */
7179                 if (error == ENOENT) {
7180                         error = idontknow;
7181                 }
7182         }
7183         /*
7184          * XXX We could test the group UUID here if we had a policy for it,
7185          * XXX but this is problematic from the perspective of synchronizing
7186          * XXX group UUID and POSIX GID ownership of a file and keeping the
7187          * XXX values coherent over time.  The problem is that the local
7188          * XXX system will vend transient group UUIDs for unknown POSIX GID
7189          * XXX values, and these are not persistent, whereas storage of values
7190          * XXX is persistent.  One potential solution to this is a local
7191          * XXX (persistent) replica of remote directory entries and vended
7192          * XXX local ids in a local directory server (think in terms of a
7193          * XXX caching DNS server).
7194          */
7195
7196         if (!error) {
7197                 *ismember = result;
7198         }
7199         return error;
7200 }
7201
7202 static int
7203 vauth_file_owner(vauth_ctx vcp)
7204 {
7205         int result;
7206
7207         if (vcp->flags_valid & _VAC_IS_OWNER) {
7208                 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
7209         } else {
7210                 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
7211
7212                 /* cache our result */
7213                 vcp->flags_valid |= _VAC_IS_OWNER;
7214                 if (result) {
7215                         vcp->flags |= _VAC_IS_OWNER;
7216                 } else {
7217                         vcp->flags &= ~_VAC_IS_OWNER;
7218                 }
7219         }
7220         return result;
7221 }
7222
7223
7224 /*
7225  * vauth_file_ingroup
7226  *
7227  * Description: Ask if a user is a member of the group owning the directory
7228  *
7229  * Parameters:          vcp             The vnode authorization context that
7230  *                                      contains the user and directory info
7231  *                              vcp->flags_valid        Valid flags
7232  *                              vcp->flags              Flags values
7233  *                              vcp->vap                File vnode attributes
7234  *                              vcp->ctx                VFS Context (for user)
7235  *                      ismember        pointer to where to put the answer
7236  *                      idontknow       Return this if we can't get an answer
7237  *
7238  * Returns:             0               Success
7239  *              vauth_node_group:?      Error from vauth_node_group()
7240  *
7241  * Implicit returns:    *ismember       0       The user is not a group member
7242  *                                      1       The user is a group member
7243  */
7244 static int
7245 vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
7246 {
7247         int     error;
7248
7249         /* Check for a cached answer first, to avoid the check if possible */
7250         if (vcp->flags_valid & _VAC_IN_GROUP) {
7251                 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
7252                 error = 0;
7253         } else {
7254                 /* Otherwise, go look for it */
7255                 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
7256
7257                 if (!error) {
7258                         /* cache our result */
7259                         vcp->flags_valid |= _VAC_IN_GROUP;
7260                         if (*ismember) {
7261                                 vcp->flags |= _VAC_IN_GROUP;
7262                         } else {
7263                                 vcp->flags &= ~_VAC_IN_GROUP;
7264                         }
7265                 }
7266         }
7267         return error;
7268 }
7269
7270 static int
7271 vauth_dir_owner(vauth_ctx vcp)
7272 {
7273         int result;
7274
7275         if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
7276                 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
7277         } else {
7278                 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
7279
7280                 /* cache our result */
7281                 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
7282                 if (result) {
7283                         vcp->flags |= _VAC_IS_DIR_OWNER;
7284                 } else {
7285                         vcp->flags &= ~_VAC_IS_DIR_OWNER;
7286                 }
7287         }
7288         return result;
7289 }
7290
7291 /*
7292  * vauth_dir_ingroup
7293  *
7294  * Description: Ask if a user is a member of the group owning the directory
7295  *
7296  * Parameters:          vcp             The vnode authorization context that
7297  *                                      contains the user and directory info
7298  *                              vcp->flags_valid        Valid flags
7299  *                              vcp->flags              Flags values
7300  *                              vcp->dvap               Dir vnode attributes
7301  *                              vcp->ctx                VFS Context (for user)
7302  *                      ismember        pointer to where to put the answer
7303  *                      idontknow       Return this if we can't get an answer
7304  *
7305  * Returns:             0               Success
7306  *              vauth_node_group:?      Error from vauth_node_group()
7307  *
7308  * Implicit returns:    *ismember       0       The user is not a group member
7309  *                                      1       The user is a group member
7310  */
7311 static int
7312 vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
7313 {
7314         int     error;
7315
7316         /* Check for a cached answer first, to avoid the check if possible */
7317         if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
7318                 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
7319                 error = 0;
7320         } else {
7321                 /* Otherwise, go look for it */
7322                 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
7323
7324                 if (!error) {
7325                         /* cache our result */
7326                         vcp->flags_valid |= _VAC_IN_DIR_GROUP;
7327                         if (*ismember) {
7328                                 vcp->flags |= _VAC_IN_DIR_GROUP;
7329                         } else {
7330                                 vcp->flags &= ~_VAC_IN_DIR_GROUP;
7331                         }
7332                 }
7333         }
7334         return error;
7335 }
7336
7337 /*
7338  * Test the posix permissions in (vap) to determine whether (credential)
7339  * may perform (action)
7340  */
7341 static int
7342 vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
7343 {
7344         struct vnode_attr *vap;
7345         int needed, error, owner_ok, group_ok, world_ok, ismember;
7346 #ifdef KAUTH_DEBUG_ENABLE
7347         const char *where = "uninitialized";
7348 # define _SETWHERE(c)   where = c;
7349 #else
7350 # define _SETWHERE(c)
7351 #endif
7352
7353         /* checking file or directory? */
7354         if (on_dir) {
7355                 vap = vcp->dvap;
7356         } else {
7357                 vap = vcp->vap;
7358         }
7359
7360         error = 0;
7361
7362         /*
7363          * We want to do as little work here as possible.  So first we check
7364          * which sets of permissions grant us the access we need, and avoid checking
7365          * whether specific permissions grant access when more generic ones would.
7366          */
7367
7368         /* owner permissions */
7369         needed = 0;
7370         if (action & VREAD) {
7371                 needed |= S_IRUSR;
7372         }
7373         if (action & VWRITE) {
7374                 needed |= S_IWUSR;
7375         }
7376         if (action & VEXEC) {
7377                 needed |= S_IXUSR;
7378         }
7379         owner_ok = (needed & vap->va_mode) == needed;
7380
7381         /* group permissions */
7382         needed = 0;
7383         if (action & VREAD) {
7384                 needed |= S_IRGRP;
7385         }
7386         if (action & VWRITE) {
7387                 needed |= S_IWGRP;
7388         }
7389         if (action & VEXEC) {
7390                 needed |= S_IXGRP;
7391         }
7392         group_ok = (needed & vap->va_mode) == needed;
7393
7394         /* world permissions */
7395         needed = 0;
7396         if (action & VREAD) {
7397                 needed |= S_IROTH;
7398         }
7399         if (action & VWRITE) {
7400                 needed |= S_IWOTH;
7401         }
7402         if (action & VEXEC) {
7403                 needed |= S_IXOTH;
7404         }
7405         world_ok = (needed & vap->va_mode) == needed;
7406
7407         /* If granted/denied by all three, we're done */
7408         if (owner_ok && group_ok && world_ok) {
7409                 _SETWHERE("all");
7410                 goto out;
7411         }
7412         if (!owner_ok && !group_ok && !world_ok) {
7413                 _SETWHERE("all");
7414                 error = EACCES;
7415                 goto out;
7416         }
7417
7418         /* Check ownership (relatively cheap) */
7419         if ((on_dir && vauth_dir_owner(vcp)) ||
7420             (!on_dir && vauth_file_owner(vcp))) {
7421                 _SETWHERE("user");
7422                 if (!owner_ok) {
7423                         error = EACCES;
7424                 }
7425                 goto out;
7426         }
7427
7428         /* Not owner; if group and world both grant it we're done */
7429         if (group_ok && world_ok) {
7430                 _SETWHERE("group/world");
7431                 goto out;
7432         }
7433         if (!group_ok && !world_ok) {
7434                 _SETWHERE("group/world");
7435                 error = EACCES;
7436                 goto out;
7437         }
7438
7439         /* Check group membership (most expensive) */
7440         ismember = 0;   /* Default to allow, if the target has no group owner */
7441
7442         /*
7443          * In the case we can't get an answer about the user from the call to
7444          * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
7445          * the side of caution, rather than simply granting access, or we will
7446          * fail to correctly implement exclusion groups, so we set the third
7447          * parameter on the basis of the state of 'group_ok'.
7448          */
7449         if (on_dir) {
7450                 error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
7451         } else {
7452                 error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
7453         }
7454         if (error) {
7455                 if (!group_ok) {
7456                         ismember = 1;
7457                 }
7458                 error = 0;
7459         }
7460         if (ismember) {
7461                 _SETWHERE("group");
7462                 if (!group_ok) {
7463                         error = EACCES;
7464                 }
7465                 goto out;
7466         }
7467
7468         /* Not owner, not in group, use world result */
7469         _SETWHERE("world");
7470         if (!world_ok) {
7471                 error = EACCES;
7472         }
7473
7474         /* FALLTHROUGH */
7475
7476 out:
7477         KAUTH_DEBUG("%p    %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
7478             vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
7479             (action & VREAD)  ? "r" : "-",
7480             (action & VWRITE) ? "w" : "-",
7481             (action & VEXEC)  ? "x" : "-",
7482             needed,
7483             (vap->va_mode & S_IRUSR) ? "r" : "-",
7484             (vap->va_mode & S_IWUSR) ? "w" : "-",
7485             (vap->va_mode & S_IXUSR) ? "x" : "-",
7486             (vap->va_mode & S_IRGRP) ? "r" : "-",
7487             (vap->va_mode & S_IWGRP) ? "w" : "-",
7488             (vap->va_mode & S_IXGRP) ? "x" : "-",
7489             (vap->va_mode & S_IROTH) ? "r" : "-",
7490             (vap->va_mode & S_IWOTH) ? "w" : "-",
7491             (vap->va_mode & S_IXOTH) ? "x" : "-",
7492             kauth_cred_getuid(vcp->ctx->vc_ucred),
7493             on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
7494             on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
7495         return error;
7496 }
7497
7498 /*
7499  * Authorize the deletion of the node vp from the directory dvp.
7500  *
7501  * We assume that:
7502  * - Neither the node nor the directory are immutable.
7503  * - The user is not the superuser.
7504  *
7505  * The precedence of factors for authorizing or denying delete for a credential
7506  *
7507  * 1) Explicit ACE on the node. (allow or deny DELETE)
7508  * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
7509  *
7510  *    If there are conflicting ACEs on the node and the directory, the node
7511  *    ACE wins.
7512  *
7513  * 3) Sticky bit on the directory.
7514  *    Deletion is not permitted if the directory is sticky and the caller is
7515  *    not owner of the node or directory. The sticky bit rules are like a deny
7516  *    delete ACE except lower in priority than ACL's either allowing or denying
7517  *    delete.
7518  *
 * 4) POSIX permissions on the directory.
7520  *
7521  * As an optimization, we cache whether or not delete child is permitted
7522  * on directories. This enables us to skip directory ACL and POSIX checks
7523  * as we already have the result from those checks. However, we always check the
7524  * node ACL and, if the directory has the sticky bit set, we always check its
7525  * ACL (even for a directory with an authorized delete child). Furthermore,
7526  * caching the delete child authorization is independent of the sticky bit
7527  * being set as it is only applicable in determining whether the node can be
7528  * deleted or not.
7529  */
7530 static int
7531 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
7532 {
7533         struct vnode_attr       *vap = vcp->vap;
7534         struct vnode_attr       *dvap = vcp->dvap;
7535         kauth_cred_t            cred = vcp->ctx->vc_ucred;
7536         struct kauth_acl_eval   eval;
7537         int                     error, ismember;
7538
7539         /* Check the ACL on the node first */
7540         if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7541                 eval.ae_requested = KAUTH_VNODE_DELETE;
7542                 eval.ae_acl = &vap->va_acl->acl_ace[0];
7543                 eval.ae_count = vap->va_acl->acl_entrycount;
7544                 eval.ae_options = 0;
7545                 if (vauth_file_owner(vcp)) {
7546                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7547                 }
7548                 /*
7549                  * We use ENOENT as a marker to indicate we could not get
7550                  * information in order to delay evaluation until after we
7551                  * have the ACL evaluation answer.  Previously, we would
7552                  * always deny the operation at this point.
7553                  */
7554                 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7555                         return error;
7556                 }
7557                 if (error == ENOENT) {
7558                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7559                 } else if (ismember) {
7560                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7561                 }
7562                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7563                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7564                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7565                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7566
7567                 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
7568                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
7569                         return error;
7570                 }
7571
7572                 switch (eval.ae_result) {
7573                 case KAUTH_RESULT_DENY:
7574                         KAUTH_DEBUG("%p    DENIED - denied by ACL", vcp->vp);
7575                         return EACCES;
7576                 case KAUTH_RESULT_ALLOW:
7577                         KAUTH_DEBUG("%p    ALLOWED - granted by ACL", vcp->vp);
7578                         return 0;
7579                 case KAUTH_RESULT_DEFER:
7580                 default:
7581                         /* Defer to directory */
7582                         KAUTH_DEBUG("%p    DEFERRED - by file ACL", vcp->vp);
7583                         break;
7584                 }
7585         }
7586
7587         /*
7588          * Without a sticky bit, a previously authorized delete child is
7589          * sufficient to authorize this delete.
7590          *
7591          * If the sticky bit is set, a directory ACL which allows delete child
7592          * overrides a (potential) sticky bit deny. The authorized delete child
7593          * cannot tell us if it was authorized because of an explicit delete
7594          * child allow ACE or because of POSIX permisions so we have to check
7595          * the directory ACL everytime if the directory has a sticky bit.
7596          */
7597         if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
7598                 KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
7599                 return 0;
7600         }
7601
7602         /* check the ACL on the directory */
7603         if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
7604                 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
7605                 eval.ae_acl = &dvap->va_acl->acl_ace[0];
7606                 eval.ae_count = dvap->va_acl->acl_entrycount;
7607                 eval.ae_options = 0;
7608                 if (vauth_dir_owner(vcp)) {
7609                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7610                 }
7611                 /*
7612                  * We use ENOENT as a marker to indicate we could not get
7613                  * information in order to delay evaluation until after we
7614                  * have the ACL evaluation answer.  Previously, we would
7615                  * always deny the operation at this point.
7616                  */
7617                 if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7618                         return error;
7619                 }
7620                 if (error == ENOENT) {
7621                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7622                 } else if (ismember) {
7623                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7624                 }
7625                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7626                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7627                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7628                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7629
7630                 /*
7631                  * If there is no entry, we are going to defer to other
7632                  * authorization mechanisms.
7633                  */
7634                 error = kauth_acl_evaluate(cred, &eval);
7635
7636                 if (error != 0) {
7637                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
7638                         return error;
7639                 }
7640                 switch (eval.ae_result) {
7641                 case KAUTH_RESULT_DENY:
7642                         KAUTH_DEBUG("%p    DENIED - denied by directory ACL", vcp->vp);
7643                         return EACCES;
7644                 case KAUTH_RESULT_ALLOW:
7645                         KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL", vcp->vp);
7646                         if (!cached_delete_child && vcp->dvp) {
7647                                 vnode_cache_authorized_action(vcp->dvp,
7648                                     vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
7649                         }
7650                         return 0;
7651                 case KAUTH_RESULT_DEFER:
7652                 default:
7653                         /* Deferred by directory ACL */
7654                         KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
7655                         break;
7656                 }
7657         }
7658
7659         /*
7660          * From this point, we can't explicitly allow and if we reach the end
7661          * of the function without a denial, then the delete is authorized.
7662          */
7663         if (!cached_delete_child) {
7664                 if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
7665                         KAUTH_DEBUG("%p    DENIED - denied by posix permisssions", vcp->vp);
7666                         return EACCES;
7667                 }
7668                 /*
7669                  * Cache the authorized action on the vnode if allowed by the
7670                  * directory ACL or POSIX permissions. It is correct to cache
7671                  * this action even if sticky bit would deny deleting the node.
7672                  */
7673                 if (vcp->dvp) {
7674                         vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
7675                             KAUTH_VNODE_DELETE_CHILD);
7676                 }
7677         }
7678
7679         /* enforce sticky bit behaviour */
7680         if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
7681                 KAUTH_DEBUG("%p    DENIED - sticky bit rules (user %d  file %d  dir %d)",
7682                     vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
7683                 return EACCES;
7684         }
7685
7686         /* not denied, must be OK */
7687         return 0;
7688 }
7689
7690
7691 /*
7692  * Authorize an operation based on the node's attributes.
7693  */
7694 static int
7695 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
7696 {
7697         struct vnode_attr       *vap = vcp->vap;
7698         kauth_cred_t            cred = vcp->ctx->vc_ucred;
7699         struct kauth_acl_eval   eval;
7700         int                     error, ismember;
7701         mode_t                  posix_action;
7702
7703         /*
7704          * If we are the file owner, we automatically have some rights.
7705          *
7706          * Do we need to expand this to support group ownership?
7707          */
7708         if (vauth_file_owner(vcp)) {
7709                 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
7710         }
7711
7712         /*
7713          * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
7714          * mask the latter.  If TAKE_OWNERSHIP is requested the caller is about to
7715          * change ownership to themselves, and WRITE_SECURITY is implicitly
7716          * granted to the owner.  We need to do this because at this point
7717          * WRITE_SECURITY may not be granted as the caller is not currently
7718          * the owner.
7719          */
7720         if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
7721             (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) {
7722                 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
7723         }
7724
7725         if (acl_rights == 0) {
7726                 KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
7727                 return 0;
7728         }
7729
7730         /* if we have an ACL, evaluate it */
7731         if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7732                 eval.ae_requested = acl_rights;
7733                 eval.ae_acl = &vap->va_acl->acl_ace[0];
7734                 eval.ae_count = vap->va_acl->acl_entrycount;
7735                 eval.ae_options = 0;
7736                 if (vauth_file_owner(vcp)) {
7737                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7738                 }
7739                 /*
7740                  * We use ENOENT as a marker to indicate we could not get
7741                  * information in order to delay evaluation until after we
7742                  * have the ACL evaluation answer.  Previously, we would
7743                  * always deny the operation at this point.
7744                  */
7745                 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7746                         return error;
7747                 }
7748                 if (error == ENOENT) {
7749                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7750                 } else if (ismember) {
7751                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7752                 }
7753                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7754                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7755                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7756                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7757
7758                 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
7759                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
7760                         return error;
7761                 }
7762
7763                 switch (eval.ae_result) {
7764                 case KAUTH_RESULT_DENY:
7765                         KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
7766                         return EACCES;         /* deny, deny, counter-allege */
7767                 case KAUTH_RESULT_ALLOW:
7768                         KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
7769                         return 0;
7770                 case KAUTH_RESULT_DEFER:
7771                 default:
7772                         /* Effectively the same as !delete_child_denied */
7773                         KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
7774                         break;
7775                 }
7776
7777                 *found_deny = eval.ae_found_deny;
7778
7779                 /* fall through and evaluate residual rights */
7780         } else {
7781                 /* no ACL, everything is residual */
7782                 eval.ae_residual = acl_rights;
7783         }
7784
7785         /*
7786          * Grant residual rights that have been pre-authorized.
7787          */
7788         eval.ae_residual &= ~preauth_rights;
7789
7790         /*
7791          * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
7792          */
7793         if (vauth_file_owner(vcp)) {
7794                 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
7795         }
7796
7797         if (eval.ae_residual == 0) {
7798                 KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
7799                 return 0;
7800         }
7801
7802         /*
7803          * Bail if we have residual rights that can't be granted by posix permissions,
7804          * or aren't presumed granted at this point.
7805          *
7806          * XXX these can be collapsed for performance
7807          */
7808         if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
7809                 KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
7810                 return EACCES;
7811         }
7812         if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
7813                 KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
7814                 return EACCES;
7815         }
7816
7817 #if DIAGNOSTIC
7818         if (eval.ae_residual & KAUTH_VNODE_DELETE) {
7819                 panic("vnode_authorize: can't be checking delete permission here");
7820         }
7821 #endif
7822
7823         /*
7824          * Compute the fallback posix permissions that will satisfy the remaining
7825          * rights.
7826          */
7827         posix_action = 0;
7828         if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
7829             KAUTH_VNODE_LIST_DIRECTORY |
7830             KAUTH_VNODE_READ_EXTATTRIBUTES)) {
7831                 posix_action |= VREAD;
7832         }
7833         if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
7834             KAUTH_VNODE_ADD_FILE |
7835             KAUTH_VNODE_ADD_SUBDIRECTORY |
7836             KAUTH_VNODE_DELETE_CHILD |
7837             KAUTH_VNODE_WRITE_ATTRIBUTES |
7838             KAUTH_VNODE_WRITE_EXTATTRIBUTES)) {
7839                 posix_action |= VWRITE;
7840         }
7841         if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
7842             KAUTH_VNODE_SEARCH)) {
7843                 posix_action |= VEXEC;
7844         }
7845
7846         if (posix_action != 0) {
7847                 return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
7848         } else {
7849                 KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
7850                     vcp->vp,
7851                     (eval.ae_residual & KAUTH_VNODE_READ_DATA)
7852                     ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
7853                     (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
7854                     ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
7855                     (eval.ae_residual & KAUTH_VNODE_EXECUTE)
7856                     ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
7857                     (eval.ae_residual & KAUTH_VNODE_DELETE)
7858                     ? " DELETE" : "",
7859                     (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
7860                     ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
7861                     (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
7862                     ? " DELETE_CHILD" : "",
7863                     (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
7864                     ? " READ_ATTRIBUTES" : "",
7865                     (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
7866                     ? " WRITE_ATTRIBUTES" : "",
7867                     (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
7868                     ? " READ_EXTATTRIBUTES" : "",
7869                     (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
7870                     ? " WRITE_EXTATTRIBUTES" : "",
7871                     (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
7872                     ? " READ_SECURITY" : "",
7873                     (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
7874                     ? " WRITE_SECURITY" : "",
7875                     (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
7876                     ? " CHECKIMMUTABLE" : "",
7877                     (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
7878                     ? " CHANGE_OWNER" : "");
7879         }
7880
7881         /*
7882          * Lack of required Posix permissions implies no reason to deny access.
7883          */
7884         return 0;
7885 }
7886
7887 /*
7888  * Check for file immutability.
7889  */
7890 static int
7891 vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore)
7892 {
7893         int error;
7894         int append;
7895
7896         /*
7897          * Perform immutability checks for operations that change data.
7898          *
7899          * Sockets, fifos and devices require special handling.
7900          */
7901         switch (vap->va_type) {
7902         case VSOCK:
7903         case VFIFO:
7904         case VBLK:
7905         case VCHR:
7906                 /*
7907                  * Writing to these nodes does not change the filesystem data,
7908                  * so forget that it's being tried.
7909                  */
7910                 rights &= ~KAUTH_VNODE_WRITE_DATA;
7911                 break;
7912         default:
7913                 break;
7914         }
7915
7916         error = 0;
7917         if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
7918                 /* check per-filesystem options if possible */
7919                 if (mp != NULL) {
7920                         /* check for no-EA filesystems */
7921                         if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
7922                             (vfs_flags(mp) & MNT_NOUSERXATTR)) {
7923                                 KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vap);
7924                                 error = EACCES;  /* User attributes disabled */
7925                                 goto out;
7926                         }
7927                 }
7928
7929                 /*
7930                  * check for file immutability. first, check if the requested rights are
7931                  * allowable for a UF_APPEND file.
7932                  */
7933                 append = 0;
7934                 if (vap->va_type == VDIR) {
7935                         if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
7936                                 append = 1;
7937                         }
7938                 } else {
7939                         if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
7940                                 append = 1;
7941                         }
7942                 }
7943                 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
7944                         KAUTH_DEBUG("%p    DENIED - file is immutable", vap);
7945                         goto out;
7946                 }
7947         }
7948 out:
7949         return error;
7950 }
7951
7952 /*
7953  * Handle authorization actions for filesystems that advertise that the
7954  * server will be enforcing.
7955  *
7956  * Returns:     0                       Authorization should be handled locally
7957  *              1                       Authorization was handled by the FS
7958  *
7959  * Note:        Imputed returns will only occur if the authorization request
7960  *              was handled by the FS.
7961  *
7962  * Imputed:     *resultp, modified      Return code from FS when the request is
7963  *                                      handled by the FS.
7964  *              VNOP_ACCESS:???
7965  *              VNOP_OPEN:???
7966  */
7967 static int
7968 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
7969 {
7970         int     error;
7971
7972         /*
7973          * If the vp is a device node, socket or FIFO it actually represents a local
7974          * endpoint, so we need to handle it locally.
7975          */
7976         switch (vp->v_type) {
7977         case VBLK:
7978         case VCHR:
7979         case VSOCK:
7980         case VFIFO:
7981                 return 0;
7982         default:
7983                 break;
7984         }
7985
7986         /*
7987          * In the advisory request case, if the filesystem doesn't think it's reliable
7988          * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
7989          */
7990         if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
7991                 return 0;
7992         }
7993
7994         /*
7995          * Let the filesystem have a say in the matter.  It's OK for it to not implemnent
7996          * VNOP_ACCESS, as most will authorise inline with the actual request.
7997          */
7998         if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
7999                 *resultp = error;
8000                 KAUTH_DEBUG("%p    DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
8001                 return 1;
8002         }
8003
8004         /*
8005          * Typically opaque filesystems do authorisation in-line, but exec is a special case.  In
8006          * order to be reasonably sure that exec will be permitted, we try a bit harder here.
8007          */
8008         if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
8009                 /* try a VNOP_OPEN for readonly access */
8010                 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
8011                         *resultp = error;
8012                         KAUTH_DEBUG("%p    DENIED - EXECUTE denied because file could not be opened readonly", vp);
8013                         return 1;
8014                 }
8015                 VNOP_CLOSE(vp, FREAD, ctx);
8016         }
8017
8018         /*
8019          * We don't have any reason to believe that the request has to be denied at this point,
8020          * so go ahead and allow it.
8021          */
8022         *resultp = 0;
8023         KAUTH_DEBUG("%p    ALLOWED - bypassing access check for non-local filesystem", vp);
8024         return 1;
8025 }
8026
8027
8028
8029
8030 /*
8031  * Returns:     KAUTH_RESULT_ALLOW
8032  *              KAUTH_RESULT_DENY
8033  *
8034  * Imputed:     *arg3, modified         Error code in the deny case
8035  *              EROFS                   Read-only file system
8036  *              EACCES                  Permission denied
8037  *              EPERM                   Operation not permitted [no execute]
8038  *      vnode_getattr:ENOMEM            Not enough space [only if has filesec]
8039  *      vnode_getattr:???
8040  *      vnode_authorize_opaque:*arg2    ???
8041  *      vnode_authorize_checkimmutable:???
8042  *      vnode_authorize_delete:???
8043  *      vnode_authorize_simple:???
8044  */
8045
8046
8047 static int
8048 vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
8049     kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
8050     uintptr_t arg3)
8051 {
8052         vfs_context_t   ctx;
8053         vnode_t         cvp = NULLVP;
8054         vnode_t         vp, dvp;
8055         int             result = KAUTH_RESULT_DENY;
8056         int             parent_iocount = 0;
8057         int             parent_action; /* In case we need to use namedstream's data fork for cached rights*/
8058
8059         ctx = (vfs_context_t)arg0;
8060         vp = (vnode_t)arg1;
8061         dvp = (vnode_t)arg2;
8062
8063         /*
8064          * if there are 2 vnodes passed in, we don't know at
8065          * this point which rights to look at based on the
8066          * combined action being passed in... defer until later...
8067          * otherwise check the kauth 'rights' cache hung
8068          * off of the vnode we're interested in... if we've already
8069          * been granted the right we're currently interested in,
8070          * we can just return success... otherwise we'll go through
8071          * the process of authorizing the requested right(s)... if that
8072          * succeeds, we'll add the right(s) to the cache.
8073          * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
8074          */
8075         if (dvp && vp) {
8076                 goto defer;
8077         }
8078         if (dvp) {
8079                 cvp = dvp;
8080         } else {
8081                 /*
8082                  * For named streams on local-authorization volumes, rights are cached on the parent;
8083                  * authorization is determined by looking at the parent's properties anyway, so storing
8084                  * on the parent means that we don't recompute for the named stream and that if
8085                  * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
8086                  * stream to flush its cache separately.  If we miss in the cache, then we authorize
8087                  * as if there were no cached rights (passing the named stream vnode and desired rights to
8088                  * vnode_authorize_callback_int()).
8089                  *
8090                  * On an opaquely authorized volume, we don't know the relationship between the
8091                  * data fork's properties and the rights granted on a stream.  Thus, named stream vnodes
8092                  * on such a volume are authorized directly (rather than using the parent) and have their
8093                  * own caches.  When a named stream vnode is created, we mark the parent as having a named
8094                  * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
8095                  * find the stream and flush its cache.
8096                  */
8097                 if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
8098                         cvp = vnode_getparent(vp);
8099                         if (cvp != NULLVP) {
8100                                 parent_iocount = 1;
8101                         } else {
8102                                 cvp = NULL;
8103                                 goto defer; /* If we can't use the parent, take the slow path */
8104                         }
8105
8106                         /* Have to translate some actions */
8107                         parent_action = action;
8108                         if (parent_action & KAUTH_VNODE_READ_DATA) {
8109                                 parent_action &= ~KAUTH_VNODE_READ_DATA;
8110                                 parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
8111                         }
8112                         if (parent_action & KAUTH_VNODE_WRITE_DATA) {
8113                                 parent_action &= ~KAUTH_VNODE_WRITE_DATA;
8114                                 parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
8115                         }
8116                 } else {
8117                         cvp = vp;
8118                 }
8119         }
8120
8121         if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
8122                 result = KAUTH_RESULT_ALLOW;
8123                 goto out;
8124         }
8125 defer:
8126         result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
8127
8128         if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
8129                 KAUTH_DEBUG("%p - caching action = %x", cvp, action);
8130                 vnode_cache_authorized_action(cvp, ctx, action);
8131         }
8132
8133 out:
8134         if (parent_iocount) {
8135                 vnode_put(cvp);
8136         }
8137
8138         return result;
8139 }
8140
8141 static int
8142 vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
8143     kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
8144     int noimmutable, int parent_authorized_for_delete_child)
8145 {
8146         int result;
8147
8148         /*
8149          * Check for immutability.
8150          *
8151          * In the deletion case, parent directory immutability vetoes specific
8152          * file rights.
8153          */
8154         if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
8155             noimmutable)) != 0) {
8156                 goto out;
8157         }
8158
8159         if ((rights & KAUTH_VNODE_DELETE) &&
8160             !parent_authorized_for_delete_child) {
8161                 result = vnode_authorize_checkimmutable(mp, vcp->dvap,
8162                     KAUTH_VNODE_DELETE_CHILD, 0);
8163                 if (result) {
8164                         goto out;
8165                 }
8166         }
8167
8168         /*
8169          * Clear rights that have been authorized by reaching this point, bail if nothing left to
8170          * check.
8171          */
8172         rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
8173         if (rights == 0) {
8174                 goto out;
8175         }
8176
8177         /*
8178          * If we're not the superuser, authorize based on file properties;
8179          * note that even if parent_authorized_for_delete_child is TRUE, we
8180          * need to check on the node itself.
8181          */
8182         if (!is_suser) {
8183                 /* process delete rights */
8184                 if ((rights & KAUTH_VNODE_DELETE) &&
8185                     ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
8186                         goto out;
8187                 }
8188
8189                 /* process remaining rights */
8190                 if ((rights & ~KAUTH_VNODE_DELETE) &&
8191                     (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
8192                         goto out;
8193                 }
8194         } else {
8195                 /*
8196                  * Execute is only granted to root if one of the x bits is set.  This check only
8197                  * makes sense if the posix mode bits are actually supported.
8198                  */
8199                 if ((rights & KAUTH_VNODE_EXECUTE) &&
8200                     (vcp->vap->va_type == VREG) &&
8201                     VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
8202                     !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
8203                         result = EPERM;
8204                         KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vcp, vcp->vap->va_mode);
8205                         goto out;
8206                 }
8207
8208                 /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
8209                 *found_deny = TRUE;
8210
8211                 KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vcp);
8212         }
8213 out:
8214         return result;
8215 }
8216
8217 static int
8218 vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
8219     vnode_t vp, vnode_t dvp, int *errorp)
8220 {
8221         struct _vnode_authorize_context auth_context;
8222         vauth_ctx               vcp;
8223         kauth_cred_t            cred;
8224         kauth_ace_rights_t      rights;
8225         struct vnode_attr       va, dva;
8226         int                     result;
8227         int                     noimmutable;
8228         boolean_t               parent_authorized_for_delete_child = FALSE;
8229         boolean_t               found_deny = FALSE;
8230         boolean_t               parent_ref = FALSE;
8231         boolean_t               is_suser = FALSE;
8232
8233         vcp = &auth_context;
8234         vcp->ctx = ctx;
8235         vcp->vp = vp;
8236         vcp->dvp = dvp;
8237         /*
8238          * Note that we authorize against the context, not the passed cred
8239          * (the same thing anyway)
8240          */
8241         cred = ctx->vc_ucred;
8242
8243         VATTR_INIT(&va);
8244         vcp->vap = &va;
8245         VATTR_INIT(&dva);
8246         vcp->dvap = &dva;
8247
8248         vcp->flags = vcp->flags_valid = 0;
8249
8250 #if DIAGNOSTIC
8251         if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) {
8252                 panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
8253         }
8254 #endif
8255
8256         KAUTH_DEBUG("%p  AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
8257             vp, vfs_context_proc(ctx)->p_comm,
8258             (action & KAUTH_VNODE_ACCESS)               ? "access" : "auth",
8259             (action & KAUTH_VNODE_READ_DATA)            ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
8260             (action & KAUTH_VNODE_WRITE_DATA)           ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
8261             (action & KAUTH_VNODE_EXECUTE)              ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
8262             (action & KAUTH_VNODE_DELETE)               ? " DELETE" : "",
8263             (action & KAUTH_VNODE_APPEND_DATA)          ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
8264             (action & KAUTH_VNODE_DELETE_CHILD)         ? " DELETE_CHILD" : "",
8265             (action & KAUTH_VNODE_READ_ATTRIBUTES)      ? " READ_ATTRIBUTES" : "",
8266             (action & KAUTH_VNODE_WRITE_ATTRIBUTES)     ? " WRITE_ATTRIBUTES" : "",
8267             (action & KAUTH_VNODE_READ_EXTATTRIBUTES)   ? " READ_EXTATTRIBUTES" : "",
8268             (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES)  ? " WRITE_EXTATTRIBUTES" : "",
8269             (action & KAUTH_VNODE_READ_SECURITY)        ? " READ_SECURITY" : "",
8270             (action & KAUTH_VNODE_WRITE_SECURITY)       ? " WRITE_SECURITY" : "",
8271             (action & KAUTH_VNODE_CHANGE_OWNER)         ? " CHANGE_OWNER" : "",
8272             (action & KAUTH_VNODE_NOIMMUTABLE)          ? " (noimmutable)" : "",
8273             vnode_isdir(vp) ? "directory" : "file",
8274             vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
8275
8276         /*
8277          * Extract the control bits from the action, everything else is
8278          * requested rights.
8279          */
8280         noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
8281         rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
8282
8283         if (rights & KAUTH_VNODE_DELETE) {
8284 #if DIAGNOSTIC
8285                 if (dvp == NULL) {
8286                         panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
8287                 }
8288 #endif
8289                 /*
8290                  * check to see if we've already authorized the parent
8291                  * directory for deletion of its children... if so, we
8292                  * can skip a whole bunch of work... we will still have to
8293                  * authorize that this specific child can be removed
8294                  */
8295                 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
8296                         parent_authorized_for_delete_child = TRUE;
8297                 }
8298         } else {
8299                 vcp->dvp = NULLVP;
8300                 vcp->dvap = NULL;
8301         }
8302
8303         /*
8304          * Check for read-only filesystems.
8305          */
8306         if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
8307             (vp->v_mount->mnt_flag & MNT_RDONLY) &&
8308             ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
8309             (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
8310             (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
8311                 result = EROFS;
8312                 goto out;
8313         }
8314
8315         /*
8316          * Check for noexec filesystems.
8317          */
8318         if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
8319                 result = EACCES;
8320                 goto out;
8321         }
8322
8323         /*
8324          * Handle cases related to filesystems with non-local enforcement.
8325          * This call can return 0, in which case we will fall through to perform a
8326          * check based on VNOP_GETATTR data.  Otherwise it returns 1 and sets
8327          * an appropriate result, at which point we can return immediately.
8328          */
8329         if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) {
8330                 goto out;
8331         }
8332
8333         /*
8334          * If the vnode is a namedstream (extended attribute) data vnode (eg.
8335          * a resource fork), *_DATA becomes *_EXTATTRIBUTES.
8336          */
8337         if (vnode_isnamedstream(vp)) {
8338                 if (rights & KAUTH_VNODE_READ_DATA) {
8339                         rights &= ~KAUTH_VNODE_READ_DATA;
8340                         rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
8341                 }
8342                 if (rights & KAUTH_VNODE_WRITE_DATA) {
8343                         rights &= ~KAUTH_VNODE_WRITE_DATA;
8344                         rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
8345                 }
8346
8347                 /*
8348                  * Point 'vp' to the namedstream's parent for ACL checking
8349                  */
8350                 if ((vp->v_parent != NULL) &&
8351                     (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
8352                         parent_ref = TRUE;
8353                         vcp->vp = vp = vp->v_parent;
8354                 }
8355         }
8356
8357         if (vfs_context_issuser(ctx)) {
8358                 /*
8359                  * if we're not asking for execute permissions or modifications,
8360                  * then we're done, this action is authorized.
8361                  */
8362                 if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
8363                         goto success;
8364                 }
8365
8366                 is_suser = TRUE;
8367         }
8368
8369         /*
8370          * Get vnode attributes and extended security information for the vnode
8371          * and directory if required.
8372          *
8373          * If we're root we only want mode bits and flags for checking
8374          * execute and immutability.
8375          */
8376         VATTR_WANTED(&va, va_mode);
8377         VATTR_WANTED(&va, va_flags);
8378         if (!is_suser) {
8379                 VATTR_WANTED(&va, va_uid);
8380                 VATTR_WANTED(&va, va_gid);
8381                 VATTR_WANTED(&va, va_acl);
8382         }
8383         if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
8384                 KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
8385                 goto out;
8386         }
8387         VATTR_WANTED(&va, va_type);
8388         VATTR_RETURN(&va, va_type, vnode_vtype(vp));
8389
8390         if (vcp->dvp) {
8391                 VATTR_WANTED(&dva, va_mode);
8392                 VATTR_WANTED(&dva, va_flags);
8393                 if (!is_suser) {
8394                         VATTR_WANTED(&dva, va_uid);
8395                         VATTR_WANTED(&dva, va_gid);
8396                         VATTR_WANTED(&dva, va_acl);
8397                 }
8398                 if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
8399                         KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
8400                         goto out;
8401                 }
8402                 VATTR_WANTED(&dva, va_type);
8403                 VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
8404         }
8405
8406         result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
8407             &found_deny, noimmutable, parent_authorized_for_delete_child);
8408 out:
8409         if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
8410                 kauth_acl_free(va.va_acl);
8411         }
8412         if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) {
8413                 kauth_acl_free(dva.va_acl);
8414         }
8415
8416         if (result) {
8417                 if (parent_ref) {
8418                         vnode_put(vp);
8419                 }
8420                 *errorp = result;
8421                 KAUTH_DEBUG("%p    DENIED - auth denied", vp);
8422                 return KAUTH_RESULT_DENY;
8423         }
8424         if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
8425                 /*
8426                  * if we were successfully granted the right to search this directory
8427                  * and there were NO ACL DENYs for search and the posix permissions also don't
8428                  * deny execute, we can synthesize a global right that allows anyone to
8429                  * traverse this directory during a pathname lookup without having to
8430                  * match the credential associated with this cache of rights.
8431                  *
8432                  * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
8433                  * only if we actually check ACLs which we don't for root. As
8434                  * a workaround, the lookup fast path checks for root.
8435                  */
8436                 if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
8437                     ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
8438                     (S_IXUSR | S_IXGRP | S_IXOTH))) {
8439                         vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
8440                 }
8441         }
8442 success:
8443         if (parent_ref) {
8444                 vnode_put(vp);
8445         }
8446
8447         /*
8448          * Note that this implies that we will allow requests for no rights, as well as
8449          * for rights that we do not recognise.  There should be none of these.
8450          */
8451         KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
8452         return KAUTH_RESULT_ALLOW;
8453 }
8454
/*
 * Prepare the vnode_attr structures that an attribute-based authorization
 * check will need, by marking the required fields as wanted.
 *
 * Parameters:
 *   vap    - attributes of the object; reset with VATTR_INIT and always
 *            asked for va_type, va_mode and va_flags
 *   dvap   - optional directory attributes; reset with VATTR_INIT when
 *            supplied, and asked for the same fields only when action
 *            includes KAUTH_VNODE_DELETE
 *   action - KAUTH_VNODE_* action bits; only KAUTH_VNODE_DELETE is
 *            examined here
 *   ctx    - context used to decide whether the caller is the superuser
 *
 * When the context is not the superuser, va_uid, va_gid and va_acl are
 * requested as well (for dvap only in the delete case).
 *
 * Returns 0 on success, or EINVAL if KAUTH_VNODE_DELETE is requested and
 * dvap is NULL.  Note that vap has already been reset and marked by the
 * time EINVAL is returned.
 */
8455 int
8456 vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap,
8457     kauth_action_t action, vfs_context_t ctx)
8458 {
             /* fields needed for every caller, superuser included */
8459         VATTR_INIT(vap);
8460         VATTR_WANTED(vap, va_type);
8461         VATTR_WANTED(vap, va_mode);
8462         VATTR_WANTED(vap, va_flags);
8463         if (dvap) {
8464                 VATTR_INIT(dvap);
8465                 if (action & KAUTH_VNODE_DELETE) {
8466                         VATTR_WANTED(dvap, va_type);
8467                         VATTR_WANTED(dvap, va_mode);
8468                         VATTR_WANTED(dvap, va_flags);
8469                 }
8470         } else if (action & KAUTH_VNODE_DELETE) {
                     /* delete requested but no directory attributes supplied */
8471                 return EINVAL;
8472         }
8473
             /* ownership and ACL data are only requested for non-superuser callers */
8474         if (!vfs_context_issuser(ctx)) {
8475                 VATTR_WANTED(vap, va_uid);
8476                 VATTR_WANTED(vap, va_gid);
8477                 VATTR_WANTED(vap, va_acl);
8478                 if (dvap && (action & KAUTH_VNODE_DELETE)) {
8479                         VATTR_WANTED(dvap, va_uid);
8480                         VATTR_WANTED(dvap, va_gid);
8481                         VATTR_WANTED(dvap, va_acl);
8482                 }
8483         }
8484
8485         return 0;
8486 }
8487
/*
 * Authorize an action using caller-supplied attributes instead of vnodes.
 *
 * Parameters:
 *   vap    - attributes of the object being checked
 *   dvap   - directory attributes; stored in the authorization context
 *            unchanged
 *   mp     - mount to test for read-only, noexec and extended security;
 *            may be NULL, in which case those mount tests are skipped
 *   action - KAUTH_VNODE_* rights plus the KAUTH_VNODE_ACCESS and
 *            KAUTH_VNODE_NOIMMUTABLE bits, which are masked out of the
 *            rights before evaluation
 *   ctx    - context whose credential is evaluated
 *
 * Returns 0 when authorized, otherwise an errno:
 *   EROFS  - write-class right requested on a read-only mount
 *   EACCES - execute of a regular file on a noexec mount, or EPERM from
 *            vnode_attr_authorize_internal() (remapped below)
 *   other  - any other result of vnode_attr_authorize_internal()
 *
 * Panics if the caller is not the superuser and vap lacks va_uid or
 * va_gid, or lacks va_acl while mp supports extended security.
 */
8488 int
8489 vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp,
8490     kauth_action_t action, vfs_context_t ctx)
8491 {
8492         struct _vnode_authorize_context auth_context;
8493         vauth_ctx vcp;
8494         kauth_ace_rights_t rights;
8495         int noimmutable;
             /* passed by address to the internal routine; not read here */
8496         boolean_t found_deny;
8497         boolean_t is_suser = FALSE;
8498         int result = 0;
8499
             /*
              * Build an authorization context with no vnodes in it; only the
              * attribute structures are provided.
              */
8500         vcp = &auth_context;
8501         vcp->ctx = ctx;
8502         vcp->vp = NULLVP;
8503         vcp->vap = vap;
8504         vcp->dvp = NULLVP;
8505         vcp->dvap = dvap;
8506         vcp->flags = vcp->flags_valid = 0;
8507
             /* split the control bits from the requested rights */
8508         noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
8509         rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
8510
8511         /*
8512          * Check for read-only filesystems.
              * This runs before the superuser test below, so it applies to
              * every caller.  Skipped when mp is NULL.
8513          */
8514         if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
8515             mp && (mp->mnt_flag & MNT_RDONLY) &&
8516             ((vap->va_type == VREG) || (vap->va_type == VDIR) ||
8517             (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) ||
8518             (rights & KAUTH_VNODE_DELETE_CHILD))) {
8519                 result = EROFS;
8520                 goto out;
8521         }
8522
8523         /*
8524          * Check for noexec filesystems.
              * Only regular files are affected.  Skipped when mp is NULL.
8525          */
8526         if ((rights & KAUTH_VNODE_EXECUTE) &&
8527             (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
8528                 result = EACCES;
8529                 goto out;
8530         }
8531
8532         if (vfs_context_issuser(ctx)) {
8533                 /*
8534                  * if we're not asking for execute permissions or modifications,
8535                  * then we're done, this action is authorized.
                      * (result is still 0 at this point)
8536                  */
8537                 if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
8538                         goto out;
8539                 }
8540                 is_suser = TRUE;
8541         } else {
                     /*
                      * Non-superuser evaluation requires owner and group, plus
                      * the ACL when the mount supports extended security; a
                      * caller that did not provide them triggers a panic.
                      */
8542                 if (!VATTR_IS_SUPPORTED(vap, va_uid) ||
8543                     !VATTR_IS_SUPPORTED(vap, va_gid) ||
8544                     (mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
8545                         panic("vnode attrs not complete for vnode_attr_authorize\n");
8546                 }
8547         }
8548
8549         result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
8550             &found_deny, noimmutable, FALSE);
8551
             /* this entry point reports EPERM from the internal check as EACCES */
8552         if (result == EPERM) {
8553                 result = EACCES;
8554         }
8555 out:
8556         return result;
8557 }
8558
8559
/*
 * Non-static wrapper around vnode_authattr_new_internal().
 *
 * Forwards dvp, vap, noauth and ctx unchanged and passes NULL for
 * defaulted_fieldsp, so no report of defaulted fields is requested.
 * Returns the result of the internal routine unchanged.
 */
8560 int
8561 vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
8562 {
8563         return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
8564 }
8565
8566 /*
8567  * Check that the attribute information in vap can be legally applied to
8568  * a new file by the context, filling in defaults for fields the caller
      * left unset.
      *
      * Parameters:
      *   dvp               - parent directory vnode; supplies the mount and the
      *                       va_gid and va_flags consulted below
      *   vap               - attributes for the new object; modified in place
      *                       (owner, group, flags, mode and create time may be
      *                       defaulted, and inherited flags may be OR-ed in)
      *   noauth            - non-zero: treat the request as privileged (work
      *                       done for the kernel) regardless of the credential
      *   defaulted_fieldsp - optional; when non-NULL it is zeroed on entry and
      *                       on exit holds the VATTR_PREPARE_DEFAULTED_UID,
      *                       _GID and _MODE bits for the fields defaulted here
      *   ctx               - context supplying the credential and the umask
      *
      * Returns 0 on success, EINVAL for attributes that cannot be applied to a
      * new object, EPERM when the credential may not apply the requested
      * flags, mode bits, owner or group, or an error passed through from
      * vnode_getattr() or the kauth credential queries.
      *
      * Everything after the out label (flag inheritance and the
      * defaulted-fields report) also runs when an error is being returned.
8569  */
8570 static int
8571 vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
8572 {
8573         int             error;
8574         int             has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
8575         uint32_t        inherit_flags;
8576         kauth_cred_t    cred;
8577         guid_t          changer;
8578         mount_t         dmp;
8579         struct vnode_attr dva;
8580
8581         error = 0;
8582
8583         if (defaulted_fieldsp) {
8584                 *defaulted_fieldsp = 0;
8585         }
8586
8587         defaulted_owner = defaulted_group = defaulted_mode = 0;
8588
8589         inherit_flags = 0;
8590
8591         /*
8592          * Require that the filesystem support extended security to apply any.
8593          */
8594         if (!vfs_extendedsecurity(dvp->v_mount) &&
8595             (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
8596                 error = EINVAL;
8597                 goto out;
8598         }
8599
8600         /*
8601          * Default some fields.
8602          */
8603         dmp = dvp->v_mount;
8604
8605         /*
8606          * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
8607          * owner takes ownership of all new files.
              * This overrides any va_uid the caller supplied and counts as defaulted.
8608          */
8609         if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
8610                 VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
8611                 defaulted_owner = 1;
8612         } else {
8613                 if (!VATTR_IS_ACTIVE(vap, va_uid)) {
8614                         /* default owner is current user */
8615                         VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
8616                         defaulted_owner = 1;
8617                 }
8618         }
8619
8620         /*
8621          * We need the dvp's va_flags and *may* need the gid of the directory,
8622          * we ask for both here.
8623          */
8624         VATTR_INIT(&dva);
8625         VATTR_WANTED(&dva, va_gid);
8626         VATTR_WANTED(&dva, va_flags);
8627         if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
8628                 goto out;
8629         }
8630
8631         /*
8632          * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
8633          * group takes ownership of all new files.
              * This overrides any va_gid the caller supplied and counts as defaulted.
8634          */
8635         if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
8636                 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
8637                 defaulted_group = 1;
8638         } else {
8639                 if (!VATTR_IS_ACTIVE(vap, va_gid)) {
8640                         /* default group comes from parent object, fallback to current user */
8641                         if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
8642                                 VATTR_SET(vap, va_gid, dva.va_gid);
8643                         } else {
8644                                 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
8645                         }
8646                         defaulted_group = 1;
8647                 }
8648         }
8649
             /* make sure va_flags is active so the flag checks below always run */
8650         if (!VATTR_IS_ACTIVE(vap, va_flags)) {
8651                 VATTR_SET(vap, va_flags, 0);
8652         }
8653
8654         /* Determine whether UF_DATAVAULT and/or SF_RESTRICTED should be
8655          * inherited from the parent directory.  The bits are only recorded
              * here; they are applied to vap at the out label. */
8656         if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
8657                 inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED);
8658         }
8659
8660         /* default mode is everything, masked with current umask */
8661         if (!VATTR_IS_ACTIVE(vap, va_mode)) {
8662                 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
8663                 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
8664                 defaulted_mode = 1;
8665         }
8666         /* set timestamps to now */
8667         if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
8668                 nanotime(&vap->va_create_time);
8669                 VATTR_SET_ACTIVE(vap, va_create_time);
8670         }
8671
8672         /*
8673          * Check for attempts to set nonsensical fields.
              * Anything active outside VNODE_ATTR_NEWOBJ is rejected.
8674          */
8675         if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
8676                 error = EINVAL;
8677                 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
8678                     vap->va_active & ~VNODE_ATTR_NEWOBJ);
8679                 goto out;
8680         }
8681
8682         /*
8683          * Quickly check for the applicability of any enforcement here.
8684          * Tests below maintain the integrity of the local security model.
              * For an auth-opaque filesystem the defaulting above is all that
              * is done; error is still 0 when this branch is taken.
8685          */
8686         if (vfs_authopaque(dvp->v_mount)) {
8687                 goto out;
8688         }
8689
8690         /*
8691          * We need to know if the caller is the superuser, or if the work is
8692          * otherwise already authorised.
8693          */
8694         cred = vfs_context_ucred(ctx);
8695         if (noauth) {
8696                 /* doing work for the kernel */
8697                 has_priv_suser = 1;
8698         } else {
8699                 has_priv_suser = vfs_context_issuser(ctx);
8700         }
8701
8702
             /*
              * Flag validation: SF_SYNTHETIC is always stripped; a privileged
              * caller may set UF_SETTABLE and SF_SETTABLE bits, anyone else
              * only UF_SETTABLE bits.
              */
8703         if (VATTR_IS_ACTIVE(vap, va_flags)) {
8704                 vap->va_flags &= ~SF_SYNTHETIC;
8705                 if (has_priv_suser) {
8706                         if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
8707                                 error = EPERM;
8708                                 KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
8709                                 goto out;
8710                         }
8711                 } else {
8712                         if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
8713                                 error = EPERM;
8714                                 KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
8715                                 goto out;
8716                         }
8717                 }
8718         }
8719
8720         /* if not superuser, validate legality of new-item attributes */
8721         if (!has_priv_suser) {
                     /* a defaulted mode is not subject to the setgid and setuid checks */
8722                 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
8723                         /* setgid? */
                             /* va_gid is populated by now, either supplied or defaulted above */
8724                         if (vap->va_mode & S_ISGID) {
8725                                 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8726                                         KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8727                                         goto out;
8728                                 }
8729                                 if (!ismember) {
8730                                         KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
8731                                         error = EPERM;
8732                                         goto out;
8733                                 }
8734                         }
8735
8736                         /* setuid? */
8737                         if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
8738                                 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8739                                 error = EPERM;
8740                                 goto out;
8741                         }
8742                 }
                     /* an explicitly supplied owner must be the caller */
8743                 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
8744                         KAUTH_DEBUG("  DENIED - cannot create new item owned by %d", vap->va_uid);
8745                         error = EPERM;
8746                         goto out;
8747                 }
                     /* an explicitly supplied group must be one the caller belongs to */
8748                 if (!defaulted_group) {
8749                         if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8750                                 KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
8751                                 goto out;
8752                         }
8753                         if (!ismember) {
8754                                 KAUTH_DEBUG("  DENIED - cannot create new item with group %d - not a member", vap->va_gid);
8755                                 error = EPERM;
8756                                 goto out;
8757                         }
8758                 }
8759
8760                 /* initialising owner/group UUID */
8761                 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8762                         if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
8763                                 KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
8764                                 /* XXX ENOENT here - no GUID - should perhaps become EPERM */
8765                                 goto out;
8766                         }
8767                         if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
8768                                 KAUTH_DEBUG("  ERROR - cannot create item with supplied owner UUID - not us");
8769                                 error = EPERM;
8770                                 goto out;
8771                         }
8772                 }
8773                 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8774                         if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
8775                                 KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
8776                                 goto out;
8777                         }
8778                         if (!ismember) {
8779                                 KAUTH_DEBUG("  ERROR - cannot create item with supplied group UUID - not a member");
8780                                 error = EPERM;
8781                                 goto out;
8782                         }
8783                 }
8784         }
8785 out:
8786         if (inherit_flags) {
8787                 /* Apply the inherited UF_DATAVAULT / SF_RESTRICTED bits to the
8788                  * file if its parent directory had them.  This is done at the
8789                  * end so that root is not required if these flags are only set
                      * due to inheritance.  Note this also runs on error paths that
                      * were reached after inherit_flags was computed. */
8790                 VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags));
8791         }
             /* report which fields were defaulted, if the caller asked */
8792         if (defaulted_fieldsp) {
8793                 if (defaulted_mode) {
8794                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
8795                 }
8796                 if (defaulted_group) {
8797                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID;
8798                 }
8799                 if (defaulted_owner) {
8800                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID;
8801                 }
8802         }
8803         return error;
8804 }
8805
8806 /*
8807  * Check that the attribute information in vap can be legally written by the
8808  * context.
8809  *
8810  * Call this when you're not sure about the vnode_attr; either its contents
8811  * have come from an unknown source, or when they are variable.
8812  *
8813  * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
8814  * must be authorized to be permitted to write the vattr.
8815  */
8816 int
8817 vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
8818 {
8819         struct vnode_attr ova;
8820         kauth_action_t  required_action;
8821         int             error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
8822         guid_t          changer;
8823         gid_t           group;
8824         uid_t           owner;
8825         mode_t          newmode;
8826         kauth_cred_t    cred;
8827         uint32_t        fdelta;
8828
8829         VATTR_INIT(&ova);
8830         required_action = 0;
8831         error = 0;
8832
8833         /*
8834          * Quickly check for enforcement applicability.
8835          */
8836         if (vfs_authopaque(vp->v_mount)) {
8837                 goto out;
8838         }
8839
8840         /*
8841          * Check for attempts to set nonsensical fields.
8842          */
8843         if (vap->va_active & VNODE_ATTR_RDONLY) {
8844                 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
8845                 error = EINVAL;
8846                 goto out;
8847         }
8848
8849         /*
8850          * We need to know if the caller is the superuser.
8851          */
8852         cred = vfs_context_ucred(ctx);
8853         has_priv_suser = kauth_cred_issuser(cred);
8854
8855         /*
8856          * If any of the following are changing, we need information from the old file:
8857          * va_uid
8858          * va_gid
8859          * va_mode
8860          * va_uuuid
8861          * va_guuid
8862          */
8863         if (VATTR_IS_ACTIVE(vap, va_uid) ||
8864             VATTR_IS_ACTIVE(vap, va_gid) ||
8865             VATTR_IS_ACTIVE(vap, va_mode) ||
8866             VATTR_IS_ACTIVE(vap, va_uuuid) ||
8867             VATTR_IS_ACTIVE(vap, va_guuid)) {
8868                 VATTR_WANTED(&ova, va_mode);
8869                 VATTR_WANTED(&ova, va_uid);
8870                 VATTR_WANTED(&ova, va_gid);
8871                 VATTR_WANTED(&ova, va_uuuid);
8872                 VATTR_WANTED(&ova, va_guuid);
8873                 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
8874         }
8875
8876         /*
8877          * If timestamps are being changed, we need to know who the file is owned
8878          * by.
8879          */
8880         if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8881             VATTR_IS_ACTIVE(vap, va_change_time) ||
8882             VATTR_IS_ACTIVE(vap, va_modify_time) ||
8883             VATTR_IS_ACTIVE(vap, va_access_time) ||
8884             VATTR_IS_ACTIVE(vap, va_backup_time) ||
8885             VATTR_IS_ACTIVE(vap, va_addedtime)) {
8886                 VATTR_WANTED(&ova, va_uid);
8887 #if 0   /* enable this when we support UUIDs as official owners */
8888                 VATTR_WANTED(&ova, va_uuuid);
8889 #endif
8890                 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
8891         }
8892
8893         /*
8894          * If flags are being changed, we need the old flags.
8895          */
8896         if (VATTR_IS_ACTIVE(vap, va_flags)) {
8897                 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
8898                 VATTR_WANTED(&ova, va_flags);
8899         }
8900
8901         /*
8902          * If ACLs are being changed, we need the old ACLs.
8903          */
8904         if (VATTR_IS_ACTIVE(vap, va_acl)) {
8905                 KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
8906                 VATTR_WANTED(&ova, va_acl);
8907         }
8908
8909         /*
8910          * If the size is being set, make sure it's not a directory.
8911          */
8912         if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8913                 /* size is only meaningful on regular files, don't permit otherwise */
8914                 if (!vnode_isreg(vp)) {
8915                         KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
8916                         error = vnode_isdir(vp) ? EISDIR : EINVAL;
8917                         goto out;
8918                 }
8919         }
8920
8921         /*
8922          * Get old data.
8923          */
8924         KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
8925         if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
8926                 KAUTH_DEBUG("  ERROR - got %d trying to get attributes", error);
8927                 goto out;
8928         }
8929
8930         /*
8931          * Size changes require write access to the file data.
8932          */
8933         if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8934                 /* if we can't get the size, or it's different, we need write access */
8935                 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
8936                 required_action |= KAUTH_VNODE_WRITE_DATA;
8937         }
8938
8939         /*
8940          * Changing timestamps?
8941          *
8942          * Note that we are only called to authorize user-requested time changes;
8943          * side-effect time changes are not authorized.  Authorisation is only
8944          * required for existing files.
8945          *
8946          * Non-owners are not permitted to change the time on an existing
8947          * file to anything other than the current time.
8948          */
8949         if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8950             VATTR_IS_ACTIVE(vap, va_change_time) ||
8951             VATTR_IS_ACTIVE(vap, va_modify_time) ||
8952             VATTR_IS_ACTIVE(vap, va_access_time) ||
8953             VATTR_IS_ACTIVE(vap, va_backup_time) ||
8954             VATTR_IS_ACTIVE(vap, va_addedtime)) {
8955                 /*
8956                  * The owner and root may set any timestamps they like,
8957                  * provided that the file is not immutable.  The owner still needs
8958                  * WRITE_ATTRIBUTES (implied by ownership but still deniable).
8959                  */
8960                 if (has_priv_suser || vauth_node_owner(&ova, cred)) {
8961                         KAUTH_DEBUG("ATTR - root or owner changing timestamps");
8962                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
8963                 } else {
8964                         /* just setting the current time? */
8965                         if (vap->va_vaflags & VA_UTIMES_NULL) {
8966                                 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
8967                                 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
8968                         } else {
8969                                 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
8970                                 error = EACCES;
8971                                 goto out;
8972                         }
8973                 }
8974         }
8975
8976         /*
8977          * Changing file mode?
8978          */
8979         if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
8980                 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
8981
8982                 /*
8983                  * Mode changes always have the same basic auth requirements.
8984                  */
8985                 if (has_priv_suser) {
8986                         KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
8987                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
8988                 } else {
8989                         /* need WRITE_SECURITY */
8990                         KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
8991                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
8992                 }
8993
8994                 /*
8995                  * Can't set the setgid bit if you're not in the group and not root.  Have to have
8996                  * existing group information in the case we're not setting it right now.
8997                  */
8998                 if (vap->va_mode & S_ISGID) {
8999                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
9000                         if (!has_priv_suser) {
9001                                 if (VATTR_IS_ACTIVE(vap, va_gid)) {
9002                                         group = vap->va_gid;
9003                                 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
9004                                         group = ova.va_gid;
9005                                 } else {
9006                                         KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
9007                                         error = EINVAL;
9008                                         goto out;
9009                                 }
9010                                 /*
9011                                  * This might be too restrictive; WRITE_SECURITY might be implied by
9012                                  * membership in this case, rather than being an additional requirement.
9013                                  */
9014                                 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
9015                                         KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
9016                                         goto out;
9017                                 }
9018                                 if (!ismember) {
9019                                         KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", group);
9020                                         error = EPERM;
9021                                         goto out;
9022                                 }
9023                         }
9024                 }
9025
9026                 /*
9027                  * Can't set the setuid bit unless you're root or the file's owner.
9028                  */
9029                 if (vap->va_mode & S_ISUID) {
9030                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
9031                         if (!has_priv_suser) {
9032                                 if (VATTR_IS_ACTIVE(vap, va_uid)) {
9033                                         owner = vap->va_uid;
9034                                 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
9035                                         owner = ova.va_uid;
9036                                 } else {
9037                                         KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
9038                                         error = EINVAL;
9039                                         goto out;
9040                                 }
9041                                 if (owner != kauth_cred_getuid(cred)) {
9042                                         /*
9043                                          * We could allow this if WRITE_SECURITY is permitted, perhaps.
9044                                          */
9045                                         KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
9046                                         error = EPERM;
9047                                         goto out;
9048                                 }
9049                         }
9050                 }
9051         }
9052
9053         /*
9054          * Validate/mask flags changes.  This checks that only the flags in
9055          * the UF_SETTABLE mask are being set, and preserves the flags in
9056          * the SF_SETTABLE case.
9057          *
9058          * Since flags changes may be made in conjunction with other changes,
9059          * we will ask the auth code to ignore immutability in the case that
9060          * the SF_* flags are not set and we are only manipulating the file flags.
9061          *
9062          */
9063         if (VATTR_IS_ACTIVE(vap, va_flags)) {
9064                 /* compute changing flags bits */
9065                 vap->va_flags &= ~SF_SYNTHETIC;
9066                 ova.va_flags &= ~SF_SYNTHETIC;
9067                 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
9068                         fdelta = vap->va_flags ^ ova.va_flags;
9069                 } else {
9070                         fdelta = vap->va_flags;
9071                 }
9072
9073                 if (fdelta != 0) {
9074                         KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
9075                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
9076
9077                         /* check that changing bits are legal */
9078                         if (has_priv_suser) {
9079                                 /*
9080                                  * The immutability check will prevent us from clearing the SF_*
9081                                  * flags unless the system securelevel permits it, so just check
9082                                  * for legal flags here.
9083                                  */
9084                                 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
9085                                         error = EPERM;
9086                                         KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
9087                                         goto out;
9088                                 }
9089                         } else {
9090                                 if (fdelta & ~UF_SETTABLE) {
9091                                         error = EPERM;
9092                                         KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
9093                                         goto out;
9094                                 }
9095                         }
9096                         /*
9097                          * If the caller has the ability to manipulate file flags,
9098                          * security is not reduced by ignoring them for this operation.
9099                          *
9100                          * A more complete test here would consider the 'after' states of the flags
9101                          * to determine whether it would permit the operation, but this becomes
9102                          * very complex.
9103                          *
9104                          * Ignoring immutability is conditional on securelevel; this does not bypass
9105                          * the SF_* flags if securelevel > 0.
9106                          */
9107                         required_action |= KAUTH_VNODE_NOIMMUTABLE;
9108                 }
9109         }
9110
9111         /*
9112          * Validate ownership information.
9113          */
9114         chowner = 0;
9115         chgroup = 0;
9116         clear_suid = 0;
9117         clear_sgid = 0;
9118
9119         /*
9120          * uid changing
9121          * Note that if the filesystem didn't give us a UID, we expect that it doesn't
9122          * support them in general, and will ignore it if/when we try to set it.
9123          * We might want to clear the uid out of vap completely here.
9124          */
9125         if (VATTR_IS_ACTIVE(vap, va_uid)) {
9126                 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
9127                         if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
9128                                 KAUTH_DEBUG("  DENIED - non-superuser cannot change ownershipt to a third party");
9129                                 error = EPERM;
9130                                 goto out;
9131                         }
9132                         chowner = 1;
9133                 }
9134                 clear_suid = 1;
9135         }
9136
9137         /*
9138          * gid changing
9139          * Note that if the filesystem didn't give us a GID, we expect that it doesn't
9140          * support them in general, and will ignore it if/when we try to set it.
9141          * We might want to clear the gid out of vap completely here.
9142          */
9143         if (VATTR_IS_ACTIVE(vap, va_gid)) {
9144                 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
9145                         if (!has_priv_suser) {
9146                                 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
9147                                         KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
9148                                         goto out;
9149                                 }
9150                                 if (!ismember) {
9151                                         KAUTH_DEBUG("  DENIED - group change from %d to %d but not a member of target group",
9152                                             ova.va_gid, vap->va_gid);
9153                                         error = EPERM;
9154                                         goto out;
9155                                 }
9156                         }
9157                         chgroup = 1;
9158                 }
9159                 clear_sgid = 1;
9160         }
9161
9162         /*
9163          * Owner UUID being set or changed.
9164          */
9165         if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
9166                 /* if the owner UUID is not actually changing ... */
9167                 if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
9168                         if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) {
9169                                 goto no_uuuid_change;
9170                         }
9171
9172                         /*
9173                          * If the current owner UUID is a null GUID, check
9174                          * it against the UUID corresponding to the owner UID.
9175                          */
9176                         if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
9177                             VATTR_IS_SUPPORTED(&ova, va_uid)) {
9178                                 guid_t uid_guid;
9179
9180                                 if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
9181                                     kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
9182                                         goto no_uuuid_change;
9183                                 }
9184                         }
9185                 }
9186
9187                 /*
9188                  * The owner UUID cannot be set by a non-superuser to anything other than
9189                  * their own or a null GUID (to "unset" the owner UUID).
9190                  * Note that file systems must be prepared to handle the
9191                  * null UUID case in a manner appropriate for that file
9192                  * system.
9193                  */
9194                 if (!has_priv_suser) {
9195                         if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
9196                                 KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
9197                                 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
9198                                 goto out;
9199                         }
9200                         if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
9201                             !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
9202                                 KAUTH_DEBUG("  ERROR - cannot set supplied owner UUID - not us / null");
9203                                 error = EPERM;
9204                                 goto out;
9205                         }
9206                 }
9207                 chowner = 1;
9208                 clear_suid = 1;
9209         }
9210 no_uuuid_change:
9211         /*
9212          * Group UUID being set or changed.
9213          */
9214         if (VATTR_IS_ACTIVE(vap, va_guuid)) {
9215                 /* if the group UUID is not actually changing ... */
9216                 if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
9217                         if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) {
9218                                 goto no_guuid_change;
9219                         }
9220
9221                         /*
9222                          * If the current group UUID is a null UUID, check
9223                          * it against the UUID corresponding to the group GID.
9224                          */
9225                         if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
9226                             VATTR_IS_SUPPORTED(&ova, va_gid)) {
9227                                 guid_t gid_guid;
9228
9229                                 if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
9230                                     kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
9231                                         goto no_guuid_change;
9232                                 }
9233                         }
9234                 }
9235
9236                 /*
9237                  * The group UUID cannot be set by a non-superuser to anything other than
9238                  * one of which they are a member or a null GUID (to "unset"
9239                  * the group UUID).
9240                  * Note that file systems must be prepared to handle the
9241                  * null UUID case in a manner appropriate for that file
9242                  * system.
9243                  */
9244                 if (!has_priv_suser) {
9245                         if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) {
9246                                 ismember = 1;
9247                         } else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
9248                                 KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
9249                                 goto out;
9250                         }
9251                         if (!ismember) {
9252                                 KAUTH_DEBUG("  ERROR - cannot set supplied group UUID - not a member / null");
9253                                 error = EPERM;
9254                                 goto out;
9255                         }
9256                 }
9257                 chgroup = 1;
9258         }
9259 no_guuid_change:
9260
9261         /*
9262          * Compute authorisation for group/ownership changes.
9263          */
9264         if (chowner || chgroup || clear_suid || clear_sgid) {
9265                 if (has_priv_suser) {
9266                         KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
9267                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
9268                 } else {
9269                         if (chowner) {
9270                                 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
9271                                 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
9272                         }
9273                         if (chgroup && !chowner) {
9274                                 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
9275                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9276                         }
9277                 }
9278
9279                 /*
9280                  * clear set-uid and set-gid bits. POSIX only requires this for
9281                  * non-privileged processes but we do it even for root.
9282                  */
9283                 if (VATTR_IS_ACTIVE(vap, va_mode)) {
9284                         newmode = vap->va_mode;
9285                 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
9286                         newmode = ova.va_mode;
9287                 } else {
9288                         KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
9289                         newmode = 0;
9290                 }
9291
9292                 /* chown always clears setuid/gid bits. An exception is made for
9293                  * setattrlist which can set both at the same time: <uid, gid, mode> on a file:
9294                  * setattrlist is allowed to set the new mode on the file and change (chown)
9295                  * uid/gid.
9296                  */
9297                 if (newmode & (S_ISUID | S_ISGID)) {
9298                         if (!VATTR_IS_ACTIVE(vap, va_mode)) {
9299                                 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
9300                                     newmode, newmode & ~(S_ISUID | S_ISGID));
9301                                 newmode &= ~(S_ISUID | S_ISGID);
9302                         }
9303                         VATTR_SET(vap, va_mode, newmode);
9304                 }
9305         }
9306
9307         /*
9308          * Authorise changes in the ACL.
9309          */
9310         if (VATTR_IS_ACTIVE(vap, va_acl)) {
9311                 /* no existing ACL */
9312                 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
9313                         /* adding an ACL */
9314                         if (vap->va_acl != NULL) {
9315                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9316                                 KAUTH_DEBUG("CHMOD - adding ACL");
9317                         }
9318
9319                         /* removing an existing ACL */
9320                 } else if (vap->va_acl == NULL) {
9321                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
9322                         KAUTH_DEBUG("CHMOD - removing ACL");
9323
9324                         /* updating an existing ACL */
9325                 } else {
9326                         if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
9327                                 /* entry count changed, must be different */
9328                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9329                                 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
9330                         } else if (vap->va_acl->acl_entrycount > 0) {
9331                                 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
9332                                 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
9333                                     sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
9334                                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
9335                                         KAUTH_DEBUG("CHMOD - changing ACL entries");
9336                                 }
9337                         }
9338                 }
9339         }
9340
9341         /*
9342          * Other attributes that require authorisation.
9343          */
9344         if (VATTR_IS_ACTIVE(vap, va_encoding)) {
9345                 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
9346         }
9347
9348 out:
9349         if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) {
9350                 kauth_acl_free(ova.va_acl);
9351         }
9352         if (error == 0) {
9353                 *actionp = required_action;
9354         }
9355         return error;
9356 }
9357
9358 static int
9359 setlocklocal_callback(struct vnode *vp, __unused void *cargs)
9360 {
9361         vnode_lock_spin(vp);
9362         vp->v_flag |= VLOCKLOCAL;
9363         vnode_unlock(vp);
9364
9365         return VNODE_RETURNED;
9366 }
9367
9368 void
9369 vfs_setlocklocal(mount_t mp)
9370 {
9371         mount_lock_spin(mp);
9372         mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
9373         mount_unlock(mp);
9374
9375         /*
9376          * The number of active vnodes is expected to be
9377          * very small when vfs_setlocklocal is invoked.
9378          */
9379         vnode_iterate(mp, 0, setlocklocal_callback, NULL);
9380 }
9381
9382 void
9383 vfs_setcompoundopen(mount_t mp)
9384 {
9385         mount_lock_spin(mp);
9386         mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN;
9387         mount_unlock(mp);
9388 }
9389
9390 void
9391 vnode_setswapmount(vnode_t vp)
9392 {
9393         mount_lock(vp->v_mount);
9394         vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT;
9395         mount_unlock(vp->v_mount);
9396 }
9397
9398
9399 int64_t
9400 vnode_getswappin_avail(vnode_t vp)
9401 {
9402         int64_t max_swappin_avail = 0;
9403
9404         mount_lock(vp->v_mount);
9405         if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) {
9406                 max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
9407         }
9408         mount_unlock(vp->v_mount);
9409
9410         return max_swappin_avail;
9411 }
9412
9413
9414 void
9415 vn_setunionwait(vnode_t vp)
9416 {
9417         vnode_lock_spin(vp);
9418         vp->v_flag |= VISUNION;
9419         vnode_unlock(vp);
9420 }
9421
9422
9423 void
9424 vn_checkunionwait(vnode_t vp)
9425 {
9426         vnode_lock_spin(vp);
9427         while ((vp->v_flag & VISUNION) == VISUNION) {
9428                 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
9429         }
9430         vnode_unlock(vp);
9431 }
9432
9433 void
9434 vn_clearunionwait(vnode_t vp, int locked)
9435 {
9436         if (!locked) {
9437                 vnode_lock_spin(vp);
9438         }
9439         if ((vp->v_flag & VISUNION) == VISUNION) {
9440                 vp->v_flag &= ~VISUNION;
9441                 wakeup((caddr_t)&vp->v_flag);
9442         }
9443         if (!locked) {
9444                 vnode_unlock(vp);
9445         }
9446 }
9447
9448 int
9449 vnode_materialize_dataless_file(vnode_t vp, uint64_t op_type)
9450 {
9451         int error;
9452
9453         /* Swap files are special; ignore them */
9454         if (vnode_isswap(vp)) {
9455                 return 0;
9456         }
9457
9458         error = resolve_nspace_item(vp,
9459             op_type | NAMESPACE_HANDLER_NSPACE_EVENT);
9460
9461         /*
9462          * The file resolver owns the logic about what error to return
9463          * to the caller.  We only need to handle a couple of special
9464          * cases here:
9465          */
9466         if (error == EJUSTRETURN) {
9467                 /*
9468                  * The requesting process is allowed to interact with
9469                  * dataless objects.  Make a couple of sanity-checks
9470                  * here to ensure the action makes sense.
9471                  */
9472                 switch (op_type) {
9473                 case NAMESPACE_HANDLER_WRITE_OP:
9474                 case NAMESPACE_HANDLER_TRUNCATE_OP:
9475                 case NAMESPACE_HANDLER_RENAME_OP:
9476                         /*
9477                          * This handles the case of the resolver itself
9478                          * writing data to the file (or throwing it
9479                          * away).
9480                          */
9481                         error = 0;
9482                         break;
9483                 case NAMESPACE_HANDLER_READ_OP:
9484                         /*
9485                          * This handles the case of the resolver needing
9486                          * to look up inside of a dataless directory while
9487                          * it's in the process of materializing it (for
9488                          * example, creating files or directories).
9489                          */
9490                         error = (vnode_vtype(vp) == VDIR) ? 0 : EBADF;
9491                         break;
9492                 default:
9493                         error = EBADF;
9494                         break;
9495                 }
9496         }
9497
9498         return error;
9499 }
9500
9501 /*
9502  * Removes orphaned apple double files during a rmdir
9503  * Works by:
9504  * 1. vnode_suspend().
9505  * 2. Call VNOP_READDIR() till the end of directory is reached.
9506  * 3. Check if the directory entries returned are regular files with name starting with "._".  If not, return ENOTEMPTY.
9507  * 4. Continue (2) and (3) till end of directory is reached.
9508  * 5. If all the entries in the directory were files with "._" name, delete all the files.
9509  * 6. vnode_resume()
9510  * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
9511  */
9512
9513 errno_t
9514 rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag)
9515 {
9516 #define UIO_BUFF_SIZE 2048
9517         uio_t auio = NULL;
9518         int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
9519         int open_flag = 0, full_erase_flag = 0;
9520         char uio_buf[UIO_SIZEOF(1)];
9521         char *rbuf = NULL;
9522         void *dir_pos;
9523         void *dir_end;
9524         struct dirent *dp;
9525         errno_t error;
9526
9527         error = vnode_suspend(vp);
9528
9529         /*
9530          * restart_flag is set so that the calling rmdir sleeps and resets
9531          */
9532         if (error == EBUSY) {
9533                 *restart_flag = 1;
9534         }
9535         if (error != 0) {
9536                 return error;
9537         }
9538
9539         /*
9540          * Prevent dataless fault materialization while we have
9541          * a suspended vnode.
9542          */
9543         uthread_t ut = get_bsdthread_info(current_thread());
9544         bool saved_nodatalessfaults =
9545             (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false;
9546         ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
9547
9548         /*
9549          * set up UIO
9550          */
9551         MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
9552         if (rbuf) {
9553                 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
9554                     &uio_buf[0], sizeof(uio_buf));
9555         }
9556         if (!rbuf || !auio) {
9557                 error = ENOMEM;
9558                 goto outsc;
9559         }
9560
9561         uio_setoffset(auio, 0);
9562
9563         eofflag = 0;
9564
9565         if ((error = VNOP_OPEN(vp, FREAD, ctx))) {
9566                 goto outsc;
9567         } else {
9568                 open_flag = 1;
9569         }
9570
9571         /*
9572          * First pass checks if all files are appleDouble files.
9573          */
9574
9575         do {
9576                 siz = UIO_BUFF_SIZE;
9577                 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9578                 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9579
9580                 if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) {
9581                         goto outsc;
9582                 }
9583
9584                 if (uio_resid(auio) != 0) {
9585                         siz -= uio_resid(auio);
9586                 }
9587
9588                 /*
9589                  * Iterate through directory
9590                  */
9591                 dir_pos = (void*) rbuf;
9592                 dir_end = (void*) (rbuf + siz);
9593                 dp = (struct dirent*) (dir_pos);
9594
9595                 if (dir_pos == dir_end) {
9596                         eofflag = 1;
9597                 }
9598
9599                 while (dir_pos < dir_end) {
9600                         /*
9601                          * Check for . and .. as well as directories
9602                          */
9603                         if (dp->d_ino != 0 &&
9604                             !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
9605                             (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
9606                                 /*
9607                                  * Check for irregular files and ._ files
9608                                  * If there is a ._._ file abort the op
9609                                  */
9610                                 if (dp->d_namlen < 2 ||
9611                                     strncmp(dp->d_name, "._", 2) ||
9612                                     (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
9613                                         error = ENOTEMPTY;
9614                                         goto outsc;
9615                                 }
9616                         }
9617                         dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
9618                         dp = (struct dirent*)dir_pos;
9619                 }
9620
9621                 /*
9622                  * workaround for HFS/NFS setting eofflag before end of file
9623                  */
9624                 if (vp->v_tag == VT_HFS && nentries > 2) {
9625                         eofflag = 0;
9626                 }
9627
9628                 if (vp->v_tag == VT_NFS) {
9629                         if (eofflag && !full_erase_flag) {
9630                                 full_erase_flag = 1;
9631                                 eofflag = 0;
9632                                 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9633                         } else if (!eofflag && full_erase_flag) {
9634                                 full_erase_flag = 0;
9635                         }
9636                 }
9637         } while (!eofflag);
9638         /*
9639          * If we've made it here all the files in the dir are ._ files.
9640          * We can delete the files even though the node is suspended
9641          * because we are the owner of the file.
9642          */
9643
9644         uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9645         eofflag = 0;
9646         full_erase_flag = 0;
9647
9648         do {
9649                 siz = UIO_BUFF_SIZE;
9650                 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9651                 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9652
9653                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);
9654
9655                 if (error != 0) {
9656                         goto outsc;
9657                 }
9658
9659                 if (uio_resid(auio) != 0) {
9660                         siz -= uio_resid(auio);
9661                 }
9662
9663                 /*
9664                  * Iterate through directory
9665                  */
9666                 dir_pos = (void*) rbuf;
9667                 dir_end = (void*) (rbuf + siz);
9668                 dp = (struct dirent*) dir_pos;
9669
9670                 if (dir_pos == dir_end) {
9671                         eofflag = 1;
9672                 }
9673
9674                 while (dir_pos < dir_end) {
9675                         /*
9676                          * Check for . and .. as well as directories
9677                          */
9678                         if (dp->d_ino != 0 &&
9679                             !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
9680                             (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
9681                             ) {
9682                                 error = unlink1(ctx, vp,
9683                                     CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
9684                                     VNODE_REMOVE_SKIP_NAMESPACE_EVENT |
9685                                     VNODE_REMOVE_NO_AUDIT_PATH);
9686
9687                                 if (error && error != ENOENT) {
9688                                         goto outsc;
9689                                 }
9690                         }
9691                         dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
9692                         dp = (struct dirent*)dir_pos;
9693                 }
9694
9695                 /*
9696                  * workaround for HFS/NFS setting eofflag before end of file
9697                  */
9698                 if (vp->v_tag == VT_HFS && nentries > 2) {
9699                         eofflag = 0;
9700                 }
9701
9702                 if (vp->v_tag == VT_NFS) {
9703                         if (eofflag && !full_erase_flag) {
9704                                 full_erase_flag = 1;
9705                                 eofflag = 0;
9706                                 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9707                         } else if (!eofflag && full_erase_flag) {
9708                                 full_erase_flag = 0;
9709                         }
9710                 }
9711         } while (!eofflag);
9712
9713
9714         error = 0;
9715
9716 outsc:
9717         if (open_flag) {
9718                 VNOP_CLOSE(vp, FREAD, ctx);
9719         }
9720
9721         if (auio) {
9722                 uio_free(auio);
9723         }
9724         FREE(rbuf, M_TEMP);
9725
9726         if (saved_nodatalessfaults == false) {
9727                 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
9728         }
9729
9730         vnode_resume(vp);
9731
9732         return error;
9733 }
9734
9735
9736 void
9737 lock_vnode_and_post(vnode_t vp, int kevent_num)
9738 {
9739         /* Only take the lock if there's something there! */
9740         if (vp->v_knotes.slh_first != NULL) {
9741                 vnode_lock(vp);
9742                 KNOTE(&vp->v_knotes, kevent_num);
9743                 vnode_unlock(vp);
9744         }
9745 }
9746
/* Forward declaration; the definition follows in either its full or its stub form. */
void panic_print_vnodes(void);
9748
9749 /* define PANIC_PRINTS_VNODES only if investigation is required. */
9750 #ifdef PANIC_PRINTS_VNODES
9751
9752 static const char *
9753 __vtype(uint16_t vtype)
9754 {
9755         switch (vtype) {
9756         case VREG:
9757                 return "R";
9758         case VDIR:
9759                 return "D";
9760         case VBLK:
9761                 return "B";
9762         case VCHR:
9763                 return "C";
9764         case VLNK:
9765                 return "L";
9766         case VSOCK:
9767                 return "S";
9768         case VFIFO:
9769                 return "F";
9770         case VBAD:
9771                 return "x";
9772         case VSTR:
9773                 return "T";
9774         case VCPLX:
9775                 return "X";
9776         default:
9777                 return "?";
9778         }
9779 }
9780
9781 /*
9782  * build a path from the bottom up
9783  * NOTE: called from the panic path - no alloc'ing of memory and no locks!
9784  */
9785 static char *
9786 __vpath(vnode_t vp, char *str, int len, int depth)
9787 {
9788         int vnm_len;
9789         const char *src;
9790         char *dst;
9791
9792         if (len <= 0) {
9793                 return str;
9794         }
9795         /* str + len is the start of the string we created */
9796         if (!vp->v_name) {
9797                 return str + len;
9798         }
9799
9800         /* follow mount vnodes to get the full path */
9801         if ((vp->v_flag & VROOT)) {
9802                 if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
9803                         return __vpath(vp->v_mount->mnt_vnodecovered,
9804                                    str, len, depth + 1);
9805                 }
9806                 return str + len;
9807         }
9808
9809         src = vp->v_name;
9810         vnm_len = strlen(src);
9811         if (vnm_len > len) {
9812                 /* truncate the name to fit in the string */
9813                 src += (vnm_len - len);
9814                 vnm_len = len;
9815         }
9816
9817         /* start from the back and copy just characters (no NULLs) */
9818
9819         /* this will chop off leaf path (file) names */
9820         if (depth > 0) {
9821                 dst = str + len - vnm_len;
9822                 memcpy(dst, src, vnm_len);
9823                 len -= vnm_len;
9824         } else {
9825                 dst = str + len;
9826         }
9827
9828         if (vp->v_parent && len > 1) {
9829                 /* follow parents up the chain */
9830                 len--;
9831                 *(dst - 1) = '/';
9832                 return __vpath(vp->v_parent, str, len, depth + 1);
9833         }
9834
9835         return dst;
9836 }
9837
9838 #define SANE_VNODE_PRINT_LIMIT 5000
9839 void
9840 panic_print_vnodes(void)
9841 {
9842         mount_t mnt;
9843         vnode_t vp;
9844         int nvnodes = 0;
9845         const char *type;
9846         char *nm;
9847         char vname[257];
9848
9849         paniclog_append_noflush("\n***** VNODES *****\n"
9850             "TYPE UREF ICNT PATH\n");
9851
9852         /* NULL-terminate the path name */
9853         vname[sizeof(vname) - 1] = '\0';
9854
9855         /*
9856          * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
9857          */
9858         TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
9859                 if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
9860                         paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
9861                             &mountlist, mnt);
9862                         break;
9863                 }
9864
9865                 TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
9866                         if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
9867                                 paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
9868                                     &mnt->mnt_vnodelist, vp);
9869                                 break;
9870                         }
9871
9872                         if (++nvnodes > SANE_VNODE_PRINT_LIMIT) {
9873                                 return;
9874                         }
9875                         type = __vtype(vp->v_type);
9876                         nm = __vpath(vp, vname, sizeof(vname) - 1, 0);
9877                         paniclog_append_noflush("%s %0d %0d %s\n",
9878                             type, vp->v_usecount, vp->v_iocount, nm);
9879                 }
9880         }
9881 }
9882
9883 #else /* !PANIC_PRINTS_VNODES */
/* Stub used when PANIC_PRINTS_VNODES is not defined: prints nothing. */
void
panic_print_vnodes(void)
{
}
9889 #endif
9890
9891
9892 #ifdef JOE_DEBUG
9893 static void
9894 record_vp(vnode_t vp, int count)
9895 {
9896         struct uthread *ut;
9897
9898 #if CONFIG_TRIGGERS
9899         if (vp->v_resolve) {
9900                 return;
9901         }
9902 #endif
9903         if ((vp->v_flag & VSYSTEM)) {
9904                 return;
9905         }
9906
9907         ut = get_bsdthread_info(current_thread());
9908         ut->uu_iocount += count;
9909
9910         if (count == 1) {
9911                 if (ut->uu_vpindex < 32) {
9912                         OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);
9913
9914                         ut->uu_vps[ut->uu_vpindex] = vp;
9915                         ut->uu_vpindex++;
9916                 }
9917         }
9918 }
9919 #endif
9920
9921
9922 #if CONFIG_TRIGGERS
9923
/* Set TRIG_DEBUG to 1 to compile in the TRIG_LOG() tracing defined below. */
#define TRIG_DEBUG 0

#if TRIG_DEBUG
/* Print the calling function's name, then the printf-style message. */
#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
#else
/* Tracing disabled: TRIG_LOG() expands to nothing. */
#define TRIG_LOG(...)
#endif
9931
9932 /*
9933  * Resolver result functions
9934  */
9935
9936 resolver_result_t
9937 vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
9938 {
9939         /*
9940          * |<---   32   --->|<---  28  --->|<- 4 ->|
9941          *      sequence        auxiliary    status
9942          */
9943         return (((uint64_t)seq) << 32) |
9944                (((uint64_t)(aux & 0x0fffffff)) << 4) |
9945                (uint64_t)(stat & 0x0000000F);
9946 }
9947
9948 enum resolver_status
9949 vfs_resolver_status(resolver_result_t result)
9950 {
9951         /* lower 4 bits is status */
9952         return result & 0x0000000F;
9953 }
9954
9955 uint32_t
9956 vfs_resolver_sequence(resolver_result_t result)
9957 {
9958         /* upper 32 bits is sequence */
9959         return (uint32_t)(result >> 32);
9960 }
9961
9962 int
9963 vfs_resolver_auxiliary(resolver_result_t result)
9964 {
9965         /* 28 bits of auxiliary */
9966         return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
9967 }
9968
9969 /*
9970  * SPI
9971  * Call in for resolvers to update vnode trigger state
9972  */
9973 int
9974 vnode_trigger_update(vnode_t vp, resolver_result_t result)
9975 {
9976         vnode_resolve_t rp;
9977         uint32_t seq;
9978         enum resolver_status stat;
9979
9980         if (vp->v_resolve == NULL) {
9981                 return EINVAL;
9982         }
9983
9984         stat = vfs_resolver_status(result);
9985         seq = vfs_resolver_sequence(result);
9986
9987         if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
9988                 return EINVAL;
9989         }
9990
9991         rp = vp->v_resolve;
9992         lck_mtx_lock(&rp->vr_lock);
9993
9994         if (seq > rp->vr_lastseq) {
9995                 if (stat == RESOLVER_RESOLVED) {
9996                         rp->vr_flags |= VNT_RESOLVED;
9997                 } else {
9998                         rp->vr_flags &= ~VNT_RESOLVED;
9999                 }
10000
10001                 rp->vr_lastseq = seq;
10002         }
10003
10004         lck_mtx_unlock(&rp->vr_lock);
10005
10006         return 0;
10007 }
10008
10009 static int
10010 vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
10011 {
10012         int error;
10013
10014         vnode_lock_spin(vp);
10015         if (vp->v_resolve != NULL) {
10016                 vnode_unlock(vp);
10017                 return EINVAL;
10018         } else {
10019                 vp->v_resolve = rp;
10020         }
10021         vnode_unlock(vp);
10022
10023         if (ref) {
10024                 error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
10025                 if (error != 0) {
10026                         panic("VNODE_REF_FORCE didn't help...");
10027                 }
10028         }
10029
10030         return 0;
10031 }
10032
10033 /*
10034  * VFS internal interfaces for vnode triggers
10035  *
10036  * vnode must already have an io count on entry
10037  * v_resolve is stable when io count is non-zero
10038  */
10039 static int
10040 vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
10041 {
10042         vnode_resolve_t rp;
10043         int result;
10044         char byte;
10045
10046 #if 1
10047         /* minimum pointer test (debugging) */
10048         if (tinfo->vnt_data) {
10049                 byte = *((char *)tinfo->vnt_data);
10050         }
10051 #endif
10052         MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK);
10053         if (rp == NULL) {
10054                 return ENOMEM;
10055         }
10056
10057         lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);
10058
10059         rp->vr_resolve_func = tinfo->vnt_resolve_func;
10060         rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
10061         rp->vr_rearm_func = tinfo->vnt_rearm_func;
10062         rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
10063         rp->vr_data = tinfo->vnt_data;
10064         rp->vr_lastseq = 0;
10065         rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
10066         if (external) {
10067                 rp->vr_flags |= VNT_EXTERNAL;
10068         }
10069
10070         result = vnode_resolver_attach(vp, rp, external);
10071         if (result != 0) {
10072                 goto out;
10073         }
10074
10075         if (mp) {
10076                 OSAddAtomic(1, &mp->mnt_numtriggers);
10077         }
10078
10079         return result;
10080
10081 out:
10082         FREE(rp, M_TEMP);
10083         return result;
10084 }
10085
10086 static void
10087 vnode_resolver_release(vnode_resolve_t rp)
10088 {
10089         /*
10090          * Give them a chance to free any private data
10091          */
10092         if (rp->vr_data && rp->vr_reclaim_func) {
10093                 rp->vr_reclaim_func(NULLVP, rp->vr_data);
10094         }
10095
10096         lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
10097         FREE(rp, M_TEMP);
10098 }
10099
10100 /* Called after the vnode has been drained */
10101 static void
10102 vnode_resolver_detach(vnode_t vp)
10103 {
10104         vnode_resolve_t rp;
10105         mount_t mp;
10106
10107         mp = vnode_mount(vp);
10108
10109         vnode_lock(vp);
10110         rp = vp->v_resolve;
10111         vp->v_resolve = NULL;
10112         vnode_unlock(vp);
10113
10114         if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
10115                 vnode_rele_ext(vp, O_EVTONLY, 1);
10116         }
10117
10118         vnode_resolver_release(rp);
10119
10120         /* Keep count of active trigger vnodes per mount */
10121         OSAddAtomic(-1, &mp->mnt_numtriggers);
10122 }
10123
10124 __private_extern__
10125 void
10126 vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
10127 {
10128         vnode_resolve_t rp;
10129         resolver_result_t result;
10130         enum resolver_status status;
10131         uint32_t seq;
10132
10133         if ((vp->v_resolve == NULL) ||
10134             (vp->v_resolve->vr_rearm_func == NULL) ||
10135             (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
10136                 return;
10137         }
10138
10139         rp = vp->v_resolve;
10140         lck_mtx_lock(&rp->vr_lock);
10141
10142         /*
10143          * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
10144          */
10145         if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
10146                 lck_mtx_unlock(&rp->vr_lock);
10147                 return;
10148         }
10149
10150         /* Check if this vnode is already armed */
10151         if ((rp->vr_flags & VNT_RESOLVED) == 0) {
10152                 lck_mtx_unlock(&rp->vr_lock);
10153                 return;
10154         }
10155
10156         lck_mtx_unlock(&rp->vr_lock);
10157
10158         result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
10159         status = vfs_resolver_status(result);
10160         seq = vfs_resolver_sequence(result);
10161
10162         lck_mtx_lock(&rp->vr_lock);
10163         if (seq > rp->vr_lastseq) {
10164                 if (status == RESOLVER_UNRESOLVED) {
10165                         rp->vr_flags &= ~VNT_RESOLVED;
10166                 }
10167                 rp->vr_lastseq = seq;
10168         }
10169         lck_mtx_unlock(&rp->vr_lock);
10170 }
10171
10172 __private_extern__
10173 int
10174 vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
10175 {
10176         vnode_resolve_t rp;
10177         enum path_operation op;
10178         resolver_result_t result;
10179         enum resolver_status status;
10180         uint32_t seq;
10181
10182         /* Only trigger on topmost vnodes */
10183         if ((vp->v_resolve == NULL) ||
10184             (vp->v_resolve->vr_resolve_func == NULL) ||
10185             (vp->v_mountedhere != NULL)) {
10186                 return 0;
10187         }
10188
10189         rp = vp->v_resolve;
10190         lck_mtx_lock(&rp->vr_lock);
10191
10192         /* Check if this vnode is already resolved */
10193         if (rp->vr_flags & VNT_RESOLVED) {
10194                 lck_mtx_unlock(&rp->vr_lock);
10195                 return 0;
10196         }
10197
10198         lck_mtx_unlock(&rp->vr_lock);
10199
10200 #if CONFIG_MACF
10201         if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) {
10202                 /*
10203                  * VNT_KERN_RESOLVE indicates this trigger has no parameters
10204                  * at the discression of the accessing process other than
10205                  * the act of access. All other triggers must be checked
10206                  */
10207                 int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
10208                 if (rv != 0) {
10209                         return rv;
10210                 }
10211         }
10212 #endif
10213
10214         /*
10215          * XXX
10216          * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
10217          * is there anyway to know this???
10218          * there can also be other legitimate lookups in parallel
10219          *
10220          * XXX - should we call this on a separate thread with a timeout?
10221          *
10222          * XXX - should we use ISLASTCN to pick the op value???  Perhaps only leafs should
10223          * get the richer set and non-leafs should get generic OP_LOOKUP?  TBD
10224          */
10225         op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;
10226
10227         result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
10228         status = vfs_resolver_status(result);
10229         seq = vfs_resolver_sequence(result);
10230
10231         lck_mtx_lock(&rp->vr_lock);
10232         if (seq > rp->vr_lastseq) {
10233                 if (status == RESOLVER_RESOLVED) {
10234                         rp->vr_flags |= VNT_RESOLVED;
10235                 }
10236                 rp->vr_lastseq = seq;
10237         }
10238         lck_mtx_unlock(&rp->vr_lock);
10239
10240         /* On resolver errors, propagate the error back up */
10241         return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
10242 }
10243
10244 static int
10245 vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
10246 {
10247         vnode_resolve_t rp;
10248         resolver_result_t result;
10249         enum resolver_status status;
10250         uint32_t seq;
10251
10252         if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) {
10253                 return 0;
10254         }
10255
10256         rp = vp->v_resolve;
10257         lck_mtx_lock(&rp->vr_lock);
10258
10259         /* Check if this vnode is already resolved */
10260         if ((rp->vr_flags & VNT_RESOLVED) == 0) {
10261                 printf("vnode_trigger_unresolve: not currently resolved\n");
10262                 lck_mtx_unlock(&rp->vr_lock);
10263                 return 0;
10264         }
10265
10266         rp->vr_flags |= VNT_VFS_UNMOUNTED;
10267
10268         lck_mtx_unlock(&rp->vr_lock);
10269
10270         /*
10271          * XXX
10272          * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
10273          * there can also be other legitimate lookups in parallel
10274          *
10275          * XXX - should we call this on a separate thread with a timeout?
10276          */
10277
10278         result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
10279         status = vfs_resolver_status(result);
10280         seq = vfs_resolver_sequence(result);
10281
10282         lck_mtx_lock(&rp->vr_lock);
10283         if (seq > rp->vr_lastseq) {
10284                 if (status == RESOLVER_UNRESOLVED) {
10285                         rp->vr_flags &= ~VNT_RESOLVED;
10286                 }
10287                 rp->vr_lastseq = seq;
10288         }
10289         rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
10290         lck_mtx_unlock(&rp->vr_lock);
10291
10292         /* On resolver errors, propagate the error back up */
10293         return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
10294 }
10295
10296 static int
10297 triggerisdescendant(mount_t mp, mount_t rmp)
10298 {
10299         int match = FALSE;
10300
10301         /*
10302          * walk up vnode covered chain looking for a match
10303          */
10304         name_cache_lock_shared();
10305
10306         while (1) {
10307                 vnode_t vp;
10308
10309                 /* did we encounter "/" ? */
10310                 if (mp->mnt_flag & MNT_ROOTFS) {
10311                         break;
10312                 }
10313
10314                 vp = mp->mnt_vnodecovered;
10315                 if (vp == NULLVP) {
10316                         break;
10317                 }
10318
10319                 mp = vp->v_mount;
10320                 if (mp == rmp) {
10321                         match = TRUE;
10322                         break;
10323                 }
10324         }
10325
10326         name_cache_unlock();
10327
10328         return match;
10329 }
10330
/*
 * State shared between vfs_nested_trigger_unmounts() and
 * trigger_unmount_callback() while the mount list is iterated.
 */
struct trigger_unmount_info {
        vfs_context_t   ctx;            /* context handed to vnode_trigger_unresolve() */
        mount_t         top_mp;         /* mount whose nested triggers are being unmounted */
        vnode_t         trigger_vp;     /* covered vnode saved for later unresolve, or NULLVP */
        mount_t         trigger_mp;     /* mount seen on trigger_vp when it was saved */
        uint32_t        trigger_vid;    /* v_id of trigger_vp when saved; used with vnode_getwithvid() */
        int             flags;          /* flags handed to vnode_trigger_unresolve() */
};
10339
10340 static int
10341 trigger_unmount_callback(mount_t mp, void * arg)
10342 {
10343         struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
10344         boolean_t mountedtrigger = FALSE;
10345
10346         /*
10347          * When we encounter the top level mount we're done
10348          */
10349         if (mp == infop->top_mp) {
10350                 return VFS_RETURNED_DONE;
10351         }
10352
10353         if ((mp->mnt_vnodecovered == NULL) ||
10354             (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
10355                 return VFS_RETURNED;
10356         }
10357
10358         if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
10359             (mp->mnt_vnodecovered->v_resolve != NULL) &&
10360             (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
10361                 mountedtrigger = TRUE;
10362         }
10363         vnode_put(mp->mnt_vnodecovered);
10364
10365         /*
10366          * When we encounter a mounted trigger, check if its under the top level mount
10367          */
10368         if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
10369                 return VFS_RETURNED;
10370         }
10371
10372         /*
10373          * Process any pending nested mount (now that its not referenced)
10374          */
10375         if ((infop->trigger_vp != NULLVP) &&
10376             (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
10377                 vnode_t vp = infop->trigger_vp;
10378                 int error;
10379
10380                 infop->trigger_vp = NULLVP;
10381
10382                 if (mp == vp->v_mountedhere) {
10383                         vnode_put(vp);
10384                         printf("trigger_unmount_callback: unexpected match '%s'\n",
10385                             mp->mnt_vfsstat.f_mntonname);
10386                         return VFS_RETURNED;
10387                 }
10388                 if (infop->trigger_mp != vp->v_mountedhere) {
10389                         vnode_put(vp);
10390                         printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
10391                             infop->trigger_mp, vp->v_mountedhere);
10392                         goto savenext;
10393                 }
10394
10395                 error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
10396                 vnode_put(vp);
10397                 if (error) {
10398                         printf("unresolving: '%s', err %d\n",
10399                             vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
10400                             "???", error);
10401                         return VFS_RETURNED_DONE; /* stop iteration on errors */
10402                 }
10403         }
10404 savenext:
10405         /*
10406          * We can't call resolver here since we hold a mount iter
10407          * ref on mp so save its covered vp for later processing
10408          */
10409         infop->trigger_vp = mp->mnt_vnodecovered;
10410         if ((infop->trigger_vp != NULLVP) &&
10411             (vnode_getwithref(infop->trigger_vp) == 0)) {
10412                 if (infop->trigger_vp->v_mountedhere == mp) {
10413                         infop->trigger_vid = infop->trigger_vp->v_id;
10414                         infop->trigger_mp = mp;
10415                 }
10416                 vnode_put(infop->trigger_vp);
10417         }
10418
10419         return VFS_RETURNED;
10420 }
10421
10422 /*
10423  * Attempt to unmount any trigger mounts nested underneath a mount.
10424  * This is a best effort attempt and no retries are performed here.
10425  *
10426  * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull)
10427  */
10428 __private_extern__
10429 void
10430 vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
10431 {
10432         struct trigger_unmount_info info;
10433
10434         /* Must have trigger vnodes */
10435         if (mp->mnt_numtriggers == 0) {
10436                 return;
10437         }
10438         /* Avoid recursive requests (by checking covered vnode) */
10439         if ((mp->mnt_vnodecovered != NULL) &&
10440             (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
10441                 boolean_t recursive = FALSE;
10442
10443                 if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
10444                     (mp->mnt_vnodecovered->v_resolve != NULL) &&
10445                     (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
10446                         recursive = TRUE;
10447                 }
10448                 vnode_put(mp->mnt_vnodecovered);
10449                 if (recursive) {
10450                         return;
10451                 }
10452         }
10453
10454         /*
10455          * Attempt to unmount any nested trigger mounts (best effort)
10456          */
10457         info.ctx = ctx;
10458         info.top_mp = mp;
10459         info.trigger_vp = NULLVP;
10460         info.trigger_vid = 0;
10461         info.trigger_mp = NULL;
10462         info.flags = flags;
10463
10464         (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
10465
10466         /*
10467          * Process remaining nested mount (now that its not referenced)
10468          */
10469         if ((info.trigger_vp != NULLVP) &&
10470             (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
10471                 vnode_t vp = info.trigger_vp;
10472
10473                 if (info.trigger_mp == vp->v_mountedhere) {
10474                         (void) vnode_trigger_unresolve(vp, flags, ctx);
10475                 }
10476                 vnode_put(vp);
10477         }
10478 }
10479
10480 int
10481 vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx)
10482 {
10483         struct nameidata nd;
10484         int res;
10485         vnode_t rvp, vp;
10486         struct vnode_trigger_param vtp;
10487
10488         /*
10489          * Must be called for trigger callback, wherein rwlock is held
10490          */
10491         lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
10492
10493         TRIG_LOG("Adding trigger at %s\n", relpath);
10494         TRIG_LOG("Trying VFS_ROOT\n");
10495
10496         /*
10497          * We do a lookup starting at the root of the mountpoint, unwilling
10498          * to cross into other mountpoints.
10499          */
10500         res = VFS_ROOT(mp, &rvp, ctx);
10501         if (res != 0) {
10502                 goto out;
10503         }
10504
10505         TRIG_LOG("Trying namei\n");
10506
10507         NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE,
10508             CAST_USER_ADDR_T(relpath), ctx);
10509         nd.ni_dvp = rvp;
10510         res = namei(&nd);
10511         if (res != 0) {
10512                 vnode_put(rvp);
10513                 goto out;
10514         }
10515
10516         vp = nd.ni_vp;
10517         nameidone(&nd);
10518         vnode_put(rvp);
10519
10520         TRIG_LOG("Trying vnode_resolver_create()\n");
10521
10522         /*
10523          * Set up blob.  vnode_create() takes a larger structure
10524          * with creation info, and we needed something different
10525          * for this case.  One needs to win, or we need to munge both;
10526          * vnode_create() wins.
10527          */
10528         bzero(&vtp, sizeof(vtp));
10529         vtp.vnt_resolve_func = vtip->vti_resolve_func;
10530         vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
10531         vtp.vnt_rearm_func = vtip->vti_rearm_func;
10532         vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
10533         vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
10534         vtp.vnt_data = vtip->vti_data;
10535         vtp.vnt_flags = vtip->vti_flags;
10536
10537         res = vnode_resolver_create(mp, vp, &vtp, TRUE);
10538         vnode_put(vp);
10539 out:
10540         TRIG_LOG("Returning %d\n", res);
10541         return res;
10542 }
10543
10544 #endif /* CONFIG_TRIGGERS */
10545
10546 vm_offset_t
10547 kdebug_vnode(vnode_t vp)
10548 {
10549         return VM_KERNEL_ADDRPERM(vp);
10550 }
10551
/*
 * Tunable consulted by vnode_should_flush_after_write(); off (0) by default
 * and exported read-write as the kern.flush_cache_on_write sysctl.
 */
static int flush_cache_on_write = 0;
SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
    CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0,
    "always flush the drive cache on writes to uncached files");
10556
10557 int
10558 vnode_should_flush_after_write(vnode_t vp, int ioflag)
10559 {
10560         return flush_cache_on_write
10561                && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp));
10562 }
10563
10564 /*
10565  * sysctl for use by disk I/O tracing tools to get the list of existing
10566  * vnodes' paths
10567  */
10568
/* Scratch state passed to vnode_trace_path_callback() for each vnode visited. */
struct vnode_trace_paths_context {
        uint64_t count;         /* paths emitted since the callback last yielded */
        long path[MAXPATHLEN / sizeof(long) + 1];  /* + 1 in case sizeof (long) does not divide MAXPATHLEN */
};
10573
10574 static int
10575 vnode_trace_path_callback(struct vnode *vp, void *arg)
10576 {
10577         int len, rv;
10578         struct vnode_trace_paths_context *ctx;
10579
10580         ctx = arg;
10581
10582         len = sizeof(ctx->path);
10583         rv = vn_getpath(vp, (char *)ctx->path, &len);
10584         /* vn_getpath() NUL-terminates, and len includes the NUL */
10585
10586         if (!rv) {
10587                 kdebug_vfs_lookup(ctx->path, len, vp,
10588                     KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);
10589
10590                 if (++(ctx->count) == 1000) {
10591                         thread_yield_to_preemption();
10592                         ctx->count = 0;
10593                 }
10594         }
10595
10596         return VNODE_RETURNED;
10597 }
10598
10599 static int
10600 vfs_trace_paths_callback(mount_t mp, void *arg)
10601 {
10602         if (mp->mnt_flag & MNT_LOCAL) {
10603                 vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
10604         }
10605
10606         return VFS_RETURNED;
10607 }
10608
10609 static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS {
10610         struct vnode_trace_paths_context ctx;
10611
10612         (void)oidp;
10613         (void)arg1;
10614         (void)arg2;
10615         (void)req;
10616
10617         if (!kauth_cred_issuser(kauth_cred_get())) {
10618                 return EPERM;
10619         }
10620
10621         if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) {
10622                 return EINVAL;
10623         }
10624
10625         bzero(&ctx, sizeof(struct vnode_trace_paths_context));
10626
10627         vfs_iterate(0, vfs_trace_paths_callback, &ctx);
10628
10629         return 0;
10630 }
10631
/* Register sysctl_vfs_trace_paths() as the read-only, masked trace_paths node under _vfs_generic. */
SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");