bsd/vfs/vfs_subr.c

   1 /*
   2  * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  67  */
  68 /*
  69  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  70  * support for mandatory and extensible security protections.  This notice
  71  * is included in support of clause 2.2 (b) of the Apple Public License,
  72  * Version 2.0.
  73  */
  74
  75 /*
  76  * External virtual filesystem routines
  77  */
  78
  79 #include <sys/param.h>
  80 #include <sys/systm.h>
  81 #include <sys/proc_internal.h>
  82 #include <sys/kauth.h>
  83 #include <sys/mount_internal.h>
  84 #include <sys/time.h>
  85 #include <sys/lock.h>
  86 #include <sys/vnode.h>
  87 #include <sys/vnode_internal.h>
  88 #include <sys/stat.h>
  89 #include <sys/namei.h>
  90 #include <sys/ucred.h>
  91 #include <sys/buf_internal.h>
  92 #include <sys/errno.h>
  93 #include <sys/malloc.h>
  94 #include <sys/uio_internal.h>
  95 #include <sys/uio.h>
  96 #include <sys/domain.h>
  97 #include <sys/mbuf.h>
  98 #include <sys/syslog.h>
  99 #include <sys/ubc_internal.h>
 100 #include <sys/vm.h>
 101 #include <sys/sysctl.h>
 102 #include <sys/filedesc.h>
 103 #include <sys/event.h>
 104 #include <sys/kdebug.h>
 105 #include <sys/kauth.h>
 106 #include <sys/user.h>
 107 #include <sys/systm.h>
 108 #include <sys/kern_memorystatus.h>
 109 #include <sys/lockf.h>
 110 #include <miscfs/fifofs/fifo.h>
 111
 112 #include <string.h>
 113 #include <machine/machine_routines.h>
 114
 115 #include <kern/assert.h>
 116 #include <mach/kern_return.h>
 117 #include <kern/thread.h>
 118 #include <kern/sched_prim.h>
 119
 120 #include <miscfs/specfs/specdev.h>
 121
 122 #include <mach/mach_types.h>
 123 #include <mach/memory_object_types.h>
 124 #include <mach/memory_object_control.h>
 125
 126 #include <kern/kalloc.h>        /* kalloc()/kfree() */
 127 #include <kern/clock.h>         /* delay_for_interval() */
 128 #include <libkern/OSAtomic.h>   /* OSAddAtomic() */
 129 #if !CONFIG_EMBEDDED
 130 #include <console/video_console.h>
 131 #endif
 132
 133 #ifdef JOE_DEBUG
 134 #include <libkern/OSDebug.h>
 135 #endif
 136
 137 #include <vm/vm_protos.h>       /* vnode_pager_vrele() */
 138
 139 #if CONFIG_MACF
 140 #include <security/mac_framework.h>
 141 #endif
 142
 143 #include <vfs/vfs_disk_conditioner.h>
 144 #include <libkern/section_keywords.h>
 145
 146 extern lck_grp_t *vnode_lck_grp;
 147 extern lck_attr_t *vnode_lck_attr;
 148
 149 #if CONFIG_TRIGGERS
 150 extern lck_grp_t *trigger_vnode_lck_grp;
 151 extern lck_attr_t *trigger_vnode_lck_attr;
 152 #endif
 153
 154 extern lck_mtx_t * mnt_list_mtx_lock;
 155
 156 enum vtype iftovt_tab[16] = {
 157         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 158         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 159 };
 160 int     vttoif_tab[9] = {
 161         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 162         S_IFSOCK, S_IFIFO, S_IFMT,
 163 };
 164
 165
 166 /* XXX These should be in a BSD accessible Mach header, but aren't. */
 167 extern void             memory_object_mark_used(
 168         memory_object_control_t         control);
 169
 170 extern void             memory_object_mark_unused(
 171         memory_object_control_t         control,
 172         boolean_t                       rage);
 173
 174 extern void             memory_object_mark_io_tracking(
 175         memory_object_control_t         control);
 176
 177 /* XXX next protptype should be from <nfs/nfs.h> */
 178 extern int       nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
 179
 180 extern int paniclog_append_noflush(const char *format, ...);
 181
 182 /* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
 183 __private_extern__ void qsort(
 184         void * array,
 185         size_t nmembers,
 186         size_t member_size,
 187         int (*)(const void *, const void *));
 188
 189 __private_extern__ void vntblinit(void);
 190 __private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
 191     enum uio_seg, int);
 192
 193 extern int system_inshutdown;
 194
 195 static void vnode_list_add(vnode_t);
 196 static void vnode_async_list_add(vnode_t);
 197 static void vnode_list_remove(vnode_t);
 198 static void vnode_list_remove_locked(vnode_t);
 199
 200 static void vnode_abort_advlocks(vnode_t);
 201 static errno_t vnode_drain(vnode_t);
 202 static void vgone(vnode_t, int flags);
 203 static void vclean(vnode_t vp, int flag);
 204 static void vnode_reclaim_internal(vnode_t, int, int, int);
 205
 206 static void vnode_dropiocount(vnode_t);
 207
 208 static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
 209 static int  vnode_reload(vnode_t);
 210 static int  vnode_isinuse_locked(vnode_t, int, int);
 211
 212 static int unmount_callback(mount_t, __unused void *);
 213
 214 static void insmntque(vnode_t vp, mount_t mp);
 215 static int mount_getvfscnt(void);
 216 static int mount_fillfsids(fsid_t *, int );
 217 static void vnode_iterate_setup(mount_t);
 218 int vnode_umount_preflight(mount_t, vnode_t, int);
 219 static int vnode_iterate_prepare(mount_t);
 220 static int vnode_iterate_reloadq(mount_t);
 221 static void vnode_iterate_clear(mount_t);
 222 static mount_t vfs_getvfs_locked(fsid_t *);
 223 static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
 224     struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
 225 static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
 226
 227 errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 228
 229 #ifdef JOE_DEBUG
 230 static void record_vp(vnode_t vp, int count);
 231 #endif
 232
 233 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
 234 extern int bootarg_no_vnode_jetsam;    /* from bsd_init.c default value is 0 */
 235 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
 236
 237 boolean_t root_is_CF_drive = FALSE;
 238
 239 #if CONFIG_TRIGGERS
 240 static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
 241 static void vnode_resolver_detach(vnode_t);
 242 #endif
 243
 244 TAILQ_HEAD(freelst, vnode) vnode_free_list;     /* vnode free list */
 245 TAILQ_HEAD(deadlst, vnode) vnode_dead_list;     /* vnode dead list */
 246 TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;
 247
 248
 249 TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
 250 struct timeval rage_tv;
 251 int     rage_limit = 0;
 252 int     ragevnodes = 0;
 253
 254 #define RAGE_LIMIT_MIN  100
 255 #define RAGE_TIME_LIMIT 5
 256
 257 struct mntlist mountlist;                       /* mounted filesystem list */
 258 static int nummounts = 0;
 259
 260 #if DIAGNOSTIC
 261 #define VLISTCHECK(fun, vp, list)       \
 262         if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
 263                 panic("%s: %s vnode not on %slist", (fun), (list), (list));
 264 #else
 265 #define VLISTCHECK(fun, vp, list)
 266 #endif /* DIAGNOSTIC */
 267
 268 #define VLISTNONE(vp)   \
 269         do {    \
 270                 (vp)->v_freelist.tqe_next = (struct vnode *)0;  \
 271                 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;   \
 272         } while(0)
 273
 274 #define VONLIST(vp)     \
 275         ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
 276
 277 /* remove a vnode from free vnode list */
 278 #define VREMFREE(fun, vp)       \
 279         do {    \
 280                 VLISTCHECK((fun), (vp), "free");        \
 281                 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);       \
 282                 VLISTNONE((vp));        \
 283                 freevnodes--;   \
 284         } while(0)
 285
 286
 287 /* remove a vnode from dead vnode list */
 288 #define VREMDEAD(fun, vp)       \
 289         do {    \
 290                 VLISTCHECK((fun), (vp), "dead");        \
 291                 TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);       \
 292                 VLISTNONE((vp));        \
 293                 vp->v_listflag &= ~VLIST_DEAD;  \
 294                 deadvnodes--;   \
 295         } while(0)
 296
 297
 298 /* remove a vnode from async work vnode list */
 299 #define VREMASYNC_WORK(fun, vp) \
 300         do {    \
 301                 VLISTCHECK((fun), (vp), "async_work");  \
 302                 TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
 303                 VLISTNONE((vp));        \
 304                 vp->v_listflag &= ~VLIST_ASYNC_WORK;    \
 305                 async_work_vnodes--;    \
 306         } while(0)
 307
 308
 309 /* remove a vnode from rage vnode list */
 310 #define VREMRAGE(fun, vp)       \
 311         do {    \
 312                 if ( !(vp->v_listflag & VLIST_RAGE))                    \
 313                         panic("VREMRAGE: vp not on rage list");         \
 314                 VLISTCHECK((fun), (vp), "rage");                        \
 315                 TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);       \
 316                 VLISTNONE((vp));                \
 317                 vp->v_listflag &= ~VLIST_RAGE;  \
 318                 ragevnodes--;                   \
 319         } while(0)
 320
 321 static void async_work_continue(void);
 322
 323 /*
 324  * Initialize the vnode management data structures.
 325  */
 326 __private_extern__ void
 327 vntblinit(void)
 328 {
 329         thread_t        thread = THREAD_NULL;
 330
 331         TAILQ_INIT(&vnode_free_list);
 332         TAILQ_INIT(&vnode_rage_list);
 333         TAILQ_INIT(&vnode_dead_list);
 334         TAILQ_INIT(&vnode_async_work_list);
 335         TAILQ_INIT(&mountlist);
 336
 337         microuptime(&rage_tv);
 338         rage_limit = desiredvnodes / 100;
 339
 340         if (rage_limit < RAGE_LIMIT_MIN) {
 341                 rage_limit = RAGE_LIMIT_MIN;
 342         }
 343
 344         /*
 345          * create worker threads
 346          */
 347         kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
 348         thread_deallocate(thread);
 349 }
 350
 351 /* the timeout is in 10 msecs */
 352 int
 353 vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
 354 {
 355         int error = 0;
 356         struct timespec ts;
 357
 358         KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
 359
 360         if (vp->v_numoutput > output_target) {
 361                 slpflag |= PDROP;
 362
 363                 vnode_lock_spin(vp);
 364
 365                 while ((vp->v_numoutput > output_target) && error == 0) {
 366                         if (output_target) {
 367                                 vp->v_flag |= VTHROTTLED;
 368                         } else {
 369                                 vp->v_flag |= VBWAIT;
 370                         }
 371
 372                         ts.tv_sec = (slptimeout / 100);
 373                         ts.tv_nsec = (slptimeout % 1000)  * 10 * NSEC_PER_USEC * 1000;
 374                         error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
 375
 376                         vnode_lock_spin(vp);
 377                 }
 378                 vnode_unlock(vp);
 379         }
 380         KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
 381
 382         return error;
 383 }
 384
 385
 386 void
 387 vnode_startwrite(vnode_t vp)
 388 {
 389         OSAddAtomic(1, &vp->v_numoutput);
 390 }
 391
 392
 393 void
 394 vnode_writedone(vnode_t vp)
 395 {
 396         if (vp) {
 397                 int need_wakeup = 0;
 398
 399                 OSAddAtomic(-1, &vp->v_numoutput);
 400
 401                 vnode_lock_spin(vp);
 402
 403                 if (vp->v_numoutput < 0) {
 404                         panic("vnode_writedone: numoutput < 0");
 405                 }
 406
 407                 if ((vp->v_flag & VTHROTTLED)) {
 408                         vp->v_flag &= ~VTHROTTLED;
 409                         need_wakeup = 1;
 410                 }
 411                 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
 412                         vp->v_flag &= ~VBWAIT;
 413                         need_wakeup = 1;
 414                 }
 415                 vnode_unlock(vp);
 416
 417                 if (need_wakeup) {
 418                         wakeup((caddr_t)&vp->v_numoutput);
 419                 }
 420         }
 421 }
 422
 423
 424
 425 int
 426 vnode_hasdirtyblks(vnode_t vp)
 427 {
 428         struct cl_writebehind *wbp;
 429
 430         /*
 431          * Not taking the buf_mtxp as there is little
 432          * point doing it. Even if the lock is taken the
 433          * state can change right after that. If their
 434          * needs to be a synchronization, it must be driven
 435          * by the caller
 436          */
 437         if (vp->v_dirtyblkhd.lh_first) {
 438                 return 1;
 439         }
 440
 441         if (!UBCINFOEXISTS(vp)) {
 442                 return 0;
 443         }
 444
 445         wbp = vp->v_ubcinfo->cl_wbehind;
 446
 447         if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
 448                 return 1;
 449         }
 450
 451         return 0;
 452 }
 453
 454 int
 455 vnode_hascleanblks(vnode_t vp)
 456 {
 457         /*
 458          * Not taking the buf_mtxp as there is little
 459          * point doing it. Even if the lock is taken the
 460          * state can change right after that. If their
 461          * needs to be a synchronization, it must be driven
 462          * by the caller
 463          */
 464         if (vp->v_cleanblkhd.lh_first) {
 465                 return 1;
 466         }
 467         return 0;
 468 }
 469
 470 void
 471 vnode_iterate_setup(mount_t mp)
 472 {
 473         mp->mnt_lflag |= MNT_LITER;
 474 }
 475
 476 int
 477 vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
 478 {
 479         vnode_t vp;
 480
 481         TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 482                 if (vp->v_type == VDIR) {
 483                         continue;
 484                 }
 485                 if (vp == skipvp) {
 486                         continue;
 487                 }
 488                 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
 489                         continue;
 490                 }
 491                 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
 492                         continue;
 493                 }
 494                 if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
 495                         continue;
 496                 }
 497
 498                 /* Look for busy vnode */
 499                 if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
 500                         return 1;
 501                 } else if (vp->v_iocount > 0) {
 502                         /* Busy if iocount is > 0 for more than 3 seconds */
 503                         tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
 504                         if (vp->v_iocount > 0) {
 505                                 return 1;
 506                         }
 507                         continue;
 508                 }
 509         }
 510
 511         return 0;
 512 }
 513
 514 /*
 515  * This routine prepares iteration by moving all the vnodes to worker queue
 516  * called with mount lock held
 517  */
 518 int
 519 vnode_iterate_prepare(mount_t mp)
 520 {
 521         vnode_t vp;
 522
 523         if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
 524                 /* nothing to do */
 525                 return 0;
 526         }
 527
 528         vp = TAILQ_FIRST(&mp->mnt_vnodelist);
 529         vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
 530         mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
 531         mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
 532
 533         TAILQ_INIT(&mp->mnt_vnodelist);
 534         if (mp->mnt_newvnodes.tqh_first != NULL) {
 535                 panic("vnode_iterate_prepare: newvnode when entering vnode");
 536         }
 537         TAILQ_INIT(&mp->mnt_newvnodes);
 538
 539         return 1;
 540 }
 541
 542
 543 /* called with mount lock held */
 544 int
 545 vnode_iterate_reloadq(mount_t mp)
 546 {
 547         int moved = 0;
 548
 549         /* add the remaining entries in workerq to the end of mount vnode list */
 550         if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
 551                 struct vnode * mvp;
 552                 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
 553
 554                 /* Joining the workerque entities to mount vnode list */
 555                 if (mvp) {
 556                         mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
 557                 } else {
 558                         mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
 559                 }
 560                 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
 561                 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
 562                 TAILQ_INIT(&mp->mnt_workerqueue);
 563         }
 564
 565         /* add the newvnodes to the head of mount vnode list */
 566         if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
 567                 struct vnode * nlvp;
 568                 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
 569
 570                 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
 571                 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
 572                 if (mp->mnt_vnodelist.tqh_first) {
 573                         mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
 574                 } else {
 575                         mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
 576                 }
 577                 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
 578                 TAILQ_INIT(&mp->mnt_newvnodes);
 579                 moved = 1;
 580         }
 581
 582         return moved;
 583 }
 584
 585
 586 void
 587 vnode_iterate_clear(mount_t mp)
 588 {
 589         mp->mnt_lflag &= ~MNT_LITER;
 590 }
 591
 592 #if !CONFIG_EMBEDDED
 593
 594 #include <i386/panic_hooks.h>
 595
 596 struct vnode_iterate_panic_hook {
 597         panic_hook_t hook;
 598         mount_t mp;
 599         struct vnode *vp;
 600 };
 601
 602 static void
 603 vnode_iterate_panic_hook(panic_hook_t *hook_)
 604 {
 605         struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
 606         panic_phys_range_t range;
 607         uint64_t phys;
 608
 609         if (panic_phys_range_before(hook->mp, &phys, &range)) {
 610                 paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
 611                     hook->mp, phys, range.type, range.phys_start,
 612                     range.phys_start + range.len);
 613         } else {
 614                 paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
 615         }
 616
 617         if (panic_phys_range_before(hook->vp, &phys, &range)) {
 618                 paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
 619                     hook->vp, phys, range.type, range.phys_start,
 620                     range.phys_start + range.len);
 621         } else {
 622                 paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
 623         }
 624         panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
 625 }
 626 #endif //CONFIG_EMBEDDED
 627
 628 int
 629 vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
 630     void *arg)
 631 {
 632         struct vnode *vp;
 633         int vid, retval;
 634         int ret = 0;
 635
 636         /*
 637          * The mount iterate mutex is held for the duration of the iteration.
 638          * This can be done by a state flag on the mount structure but we can
 639          * run into priority inversion issues sometimes.
 640          * Using a mutex allows us to benefit from the priority donation
 641          * mechanisms in the kernel for locks. This mutex should never be
 642          * acquired in spin mode and it should be acquired before attempting to
 643          * acquire the mount lock.
 644          */
 645         mount_iterate_lock(mp);
 646
 647         mount_lock(mp);
 648
 649         vnode_iterate_setup(mp);
 650
 651         /* If it returns 0 then there is nothing to do */
 652         retval = vnode_iterate_prepare(mp);
 653
 654         if (retval == 0) {
 655                 vnode_iterate_clear(mp);
 656                 mount_unlock(mp);
 657                 mount_iterate_unlock(mp);
 658                 return ret;
 659         }
 660
 661 #if !CONFIG_EMBEDDED
 662         struct vnode_iterate_panic_hook hook;
 663         hook.mp = mp;
 664         hook.vp = NULL;
 665         panic_hook(&hook.hook, vnode_iterate_panic_hook);
 666 #endif
 667         /* iterate over all the vnodes */
 668         while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
 669                 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
 670 #if !CONFIG_EMBEDDED
 671                 hook.vp = vp;
 672 #endif
 673                 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
 674                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
 675                 vid = vp->v_id;
 676                 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
 677                         continue;
 678                 }
 679                 mount_unlock(mp);
 680
 681                 if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
 682                         mount_lock(mp);
 683                         continue;
 684                 }
 685                 if (flags & VNODE_RELOAD) {
 686                         /*
 687                          * we're reloading the filesystem
 688                          * cast out any inactive vnodes...
 689                          */
 690                         if (vnode_reload(vp)) {
 691                                 /* vnode will be recycled on the refcount drop */
 692                                 vnode_put(vp);
 693                                 mount_lock(mp);
 694                                 continue;
 695                         }
 696                 }
 697
 698                 retval = callout(vp, arg);
 699
 700                 switch (retval) {
 701                 case VNODE_RETURNED:
 702                 case VNODE_RETURNED_DONE:
 703                         vnode_put(vp);
 704                         if (retval == VNODE_RETURNED_DONE) {
 705                                 mount_lock(mp);
 706                                 ret = 0;
 707                                 goto out;
 708                         }
 709                         break;
 710
 711                 case VNODE_CLAIMED_DONE:
 712                         mount_lock(mp);
 713                         ret = 0;
 714                         goto out;
 715                 case VNODE_CLAIMED:
 716                 default:
 717                         break;
 718                 }
 719                 mount_lock(mp);
 720         }
 721
 722 out:
 723 #if !CONFIG_EMBEDDED
 724         panic_unhook(&hook.hook);
 725 #endif
 726         (void)vnode_iterate_reloadq(mp);
 727         vnode_iterate_clear(mp);
 728         mount_unlock(mp);
 729         mount_iterate_unlock(mp);
 730         return ret;
 731 }
 732
 733 void
 734 mount_lock_renames(mount_t mp)
 735 {
 736         lck_mtx_lock(&mp->mnt_renamelock);
 737 }
 738
 739 void
 740 mount_unlock_renames(mount_t mp)
 741 {
 742         lck_mtx_unlock(&mp->mnt_renamelock);
 743 }
 744
 745 void
 746 mount_iterate_lock(mount_t mp)
 747 {
 748         lck_mtx_lock(&mp->mnt_iter_lock);
 749 }
 750
 751 void
 752 mount_iterate_unlock(mount_t mp)
 753 {
 754         lck_mtx_unlock(&mp->mnt_iter_lock);
 755 }
 756
 757 void
 758 mount_lock(mount_t mp)
 759 {
 760         lck_mtx_lock(&mp->mnt_mlock);
 761 }
 762
 763 void
 764 mount_lock_spin(mount_t mp)
 765 {
 766         lck_mtx_lock_spin(&mp->mnt_mlock);
 767 }
 768
 769 void
 770 mount_unlock(mount_t mp)
 771 {
 772         lck_mtx_unlock(&mp->mnt_mlock);
 773 }
 774
 775
 776 void
 777 mount_ref(mount_t mp, int locked)
 778 {
 779         if (!locked) {
 780                 mount_lock_spin(mp);
 781         }
 782
 783         mp->mnt_count++;
 784
 785         if (!locked) {
 786                 mount_unlock(mp);
 787         }
 788 }
 789
 790
 791 void
 792 mount_drop(mount_t mp, int locked)
 793 {
 794         if (!locked) {
 795                 mount_lock_spin(mp);
 796         }
 797
 798         mp->mnt_count--;
 799
 800         if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
 801                 wakeup(&mp->mnt_lflag);
 802         }
 803
 804         if (!locked) {
 805                 mount_unlock(mp);
 806         }
 807 }
 808
 809
 810 int
 811 mount_iterref(mount_t mp, int locked)
 812 {
 813         int retval = 0;
 814
 815         if (!locked) {
 816                 mount_list_lock();
 817         }
 818         if (mp->mnt_iterref < 0) {
 819                 retval = 1;
 820         } else {
 821                 mp->mnt_iterref++;
 822         }
 823         if (!locked) {
 824                 mount_list_unlock();
 825         }
 826         return retval;
 827 }
 828
 829 int
 830 mount_isdrained(mount_t mp, int locked)
 831 {
 832         int retval;
 833
 834         if (!locked) {
 835                 mount_list_lock();
 836         }
 837         if (mp->mnt_iterref < 0) {
 838                 retval = 1;
 839         } else {
 840                 retval = 0;
 841         }
 842         if (!locked) {
 843                 mount_list_unlock();
 844         }
 845         return retval;
 846 }
 847
 848 void
 849 mount_iterdrop(mount_t mp)
 850 {
 851         mount_list_lock();
 852         mp->mnt_iterref--;
 853         wakeup(&mp->mnt_iterref);
 854         mount_list_unlock();
 855 }
 856
 857 void
 858 mount_iterdrain(mount_t mp)
 859 {
 860         mount_list_lock();
 861         while (mp->mnt_iterref) {
 862                 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
 863         }
 864         /* mount iterations drained */
 865         mp->mnt_iterref = -1;
 866         mount_list_unlock();
 867 }
 868 void
 869 mount_iterreset(mount_t mp)
 870 {
 871         mount_list_lock();
 872         if (mp->mnt_iterref == -1) {
 873                 mp->mnt_iterref = 0;
 874         }
 875         mount_list_unlock();
 876 }
 877
 878 /* always called with  mount lock held */
 879 int
 880 mount_refdrain(mount_t mp)
 881 {
 882         if (mp->mnt_lflag & MNT_LDRAIN) {
 883                 panic("already in drain");
 884         }
 885         mp->mnt_lflag |= MNT_LDRAIN;
 886
 887         while (mp->mnt_count) {
 888                 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
 889         }
 890
 891         if (mp->mnt_vnodelist.tqh_first != NULL) {
 892                 panic("mount_refdrain: dangling vnode");
 893         }
 894
 895         mp->mnt_lflag &= ~MNT_LDRAIN;
 896
 897         return 0;
 898 }
 899
 900 /* Tags the mount point as not supportine extended readdir for NFS exports */
 901 void
 902 mount_set_noreaddirext(mount_t mp)
 903 {
 904         mount_lock(mp);
 905         mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
 906         mount_unlock(mp);
 907 }
 908
 909 /*
 910  * Mark a mount point as busy. Used to synchronize access and to delay
 911  * unmounting.
 912  */
 913 int
 914 vfs_busy(mount_t mp, int flags)
 915 {
 916 restart:
 917         if (mp->mnt_lflag & MNT_LDEAD) {
 918                 return ENOENT;
 919         }
 920
 921         mount_lock(mp);
 922
 923         if (mp->mnt_lflag & MNT_LUNMOUNT) {
 924                 if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
 925                         mount_unlock(mp);
 926                         return ENOENT;
 927                 }
 928
 929                 /*
 930                  * Since all busy locks are shared except the exclusive
 931                  * lock granted when unmounting, the only place that a
 932                  * wakeup needs to be done is at the release of the
 933                  * exclusive lock at the end of dounmount.
 934                  */
 935                 mp->mnt_lflag |= MNT_LWAIT;
 936                 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
 937                 return ENOENT;
 938         }
 939
 940         mount_unlock(mp);
 941
 942         lck_rw_lock_shared(&mp->mnt_rwlock);
 943
 944         /*
 945          * Until we are granted the rwlock, it's possible for the mount point to
 946          * change state, so re-evaluate before granting the vfs_busy.
 947          */
 948         if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
 949                 lck_rw_done(&mp->mnt_rwlock);
 950                 goto restart;
 951         }
 952         return 0;
 953 }
 954
 955 /*
 956  * Free a busy filesystem.
 957  */
 958 void
 959 vfs_unbusy(mount_t mp)
 960 {
 961         lck_rw_done(&mp->mnt_rwlock);
 962 }
 963
 964
 965
 966 static void
 967 vfs_rootmountfailed(mount_t mp)
 968 {
 969         mount_list_lock();
 970         mp->mnt_vtable->vfc_refcount--;
 971         mount_list_unlock();
 972
 973         vfs_unbusy(mp);
 974
 975         mount_lock_destroy(mp);
 976
 977 #if CONFIG_MACF
 978         mac_mount_label_destroy(mp);
 979 #endif
 980
 981         FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
 982 }
 983
 984 /*
 985  * Lookup a filesystem type, and if found allocate and initialize
 986  * a mount structure for it.
 987  *
 988  * Devname is usually updated by mount(8) after booting.
 989  */
 990 static mount_t
 991 vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
 992 {
 993         mount_t mp;
 994
 995         mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK);
 996         bzero((char *)mp, sizeof(struct mount));
 997
 998         /* Initialize the default IO constraints */
 999         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1000         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1001         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1002         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1003         mp->mnt_devblocksize = DEV_BSIZE;
1004         mp->mnt_alignmentmask = PAGE_MASK;
1005         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1006         mp->mnt_ioscale = 1;
1007         mp->mnt_ioflags = 0;
1008         mp->mnt_realrootvp = NULLVP;
1009         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1010         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1011         mp->mnt_devbsdunit = 0;
1012
1013         mount_lock_init(mp);
1014         (void)vfs_busy(mp, LK_NOWAIT);
1015
1016         TAILQ_INIT(&mp->mnt_vnodelist);
1017         TAILQ_INIT(&mp->mnt_workerqueue);
1018         TAILQ_INIT(&mp->mnt_newvnodes);
1019
1020         mp->mnt_vtable = vfsp;
1021         mp->mnt_op = vfsp->vfc_vfsops;
1022         mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
1023         mp->mnt_vnodecovered = NULLVP;
1024         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1025         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1026
1027         mount_list_lock();
1028         vfsp->vfc_refcount++;
1029         mount_list_unlock();
1030
1031         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1032         mp->mnt_vfsstat.f_mntonname[0] = '/';
1033         /* XXX const poisoning layering violation */
1034         (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);
1035
1036 #if CONFIG_MACF
1037         mac_mount_label_init(mp);
1038         mac_mount_label_associate(vfs_context_kernel(), mp);
1039 #endif
1040         return mp;
1041 }
1042
1043 errno_t
1044 vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
1045 {
1046         struct vfstable *vfsp;
1047
1048         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1049                 if (!strncmp(vfsp->vfc_name, fstypename,
1050                     sizeof(vfsp->vfc_name))) {
1051                         break;
1052                 }
1053         }
1054         if (vfsp == NULL) {
1055                 return ENODEV;
1056         }
1057
1058         *mpp = vfs_rootmountalloc_internal(vfsp, devname);
1059
1060         if (*mpp) {
1061                 return 0;
1062         }
1063
1064         return ENOMEM;
1065 }
1066
1067 #define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))
1068
1069 /*
1070  * Find an appropriate filesystem to use for the root. If a filesystem
1071  * has not been preselected, walk through the list of known filesystems
1072  * trying those that have mountroot routines, and try them until one
1073  * works or we have tried them all.
1074  */
1075 extern int (*mountroot)(void);
1076
1077 int
1078 vfs_mountroot(void)
1079 {
1080 #if CONFIG_MACF
1081         struct vnode *vp;
1082 #endif
1083         struct vfstable *vfsp;
1084         vfs_context_t ctx = vfs_context_kernel();
1085         struct vfs_attr vfsattr;
1086         int     error;
1087         mount_t mp;
1088         vnode_t bdevvp_rootvp;
1089
1090         KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
1091         if (mountroot != NULL) {
1092                 /*
1093                  * used for netboot which follows a different set of rules
1094                  */
1095                 error = (*mountroot)();
1096
1097                 KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
1098                 return error;
1099         }
1100         if ((error = bdevvp(rootdev, &rootvp))) {
1101                 printf("vfs_mountroot: can't setup bdevvp\n");
1102
1103                 KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
1104                 return error;
1105         }
1106         /*
1107          * 4951998 - code we call in vfc_mountroot may replace rootvp
1108          * so keep a local copy for some house keeping.
1109          */
1110         bdevvp_rootvp = rootvp;
1111
1112         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1113                 if (vfsp->vfc_mountroot == NULL
1114                     && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
1115                         continue;
1116                 }
1117
1118                 mp = vfs_rootmountalloc_internal(vfsp, "root_device");
1119                 mp->mnt_devvp = rootvp;
1120
1121                 if (vfsp->vfc_mountroot) {
1122                         error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
1123                 } else {
1124                         error = VFS_MOUNT(mp, rootvp, 0, ctx);
1125                 }
1126
1127                 if (!error) {
1128                         if (bdevvp_rootvp != rootvp) {
1129                                 /*
1130                                  * rootvp changed...
1131                                  *   bump the iocount and fix up mnt_devvp for the
1132                                  *   new rootvp (it will already have a usecount taken)...
1133                                  *   drop the iocount and the usecount on the orignal
1134                                  *   since we are no longer going to use it...
1135                                  */
1136                                 vnode_getwithref(rootvp);
1137                                 mp->mnt_devvp = rootvp;
1138
1139                                 vnode_rele(bdevvp_rootvp);
1140                                 vnode_put(bdevvp_rootvp);
1141                         }
1142                         mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;
1143
1144                         vfs_unbusy(mp);
1145
1146                         mount_list_add(mp);
1147
1148                         /*
1149                          *   cache the IO attributes for the underlying physical media...
1150                          *   an error return indicates the underlying driver doesn't
1151                          *   support all the queries necessary... however, reasonable
1152                          *   defaults will have been set, so no reason to bail or care
1153                          */
1154                         vfs_init_io_attributes(rootvp, mp);
1155
1156                         if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
1157                                 root_is_CF_drive = TRUE;
1158                         }
1159
1160                         /*
1161                          * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
1162                          */
1163                         if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1164                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1165                         }
1166                         if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1167                                 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1168                         }
1169
1170 #if !CONFIG_EMBEDDED
1171                         uint32_t speed;
1172
1173                         if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
1174                                 speed = 128;
1175                         } else if (disk_conditioner_mount_is_ssd(mp)) {
1176                                 speed = 7 * 256;
1177                         } else {
1178                                 speed = 256;
1179                         }
1180                         vc_progress_setdiskspeed(speed);
1181 #endif
1182                         /*
1183                          * Probe root file system for additional features.
1184                          */
1185                         (void)VFS_START(mp, 0, ctx);
1186
1187                         VFSATTR_INIT(&vfsattr);
1188                         VFSATTR_WANTED(&vfsattr, f_capabilities);
1189                         if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1190                             VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1191                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1192                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1193                                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1194                                 }
1195 #if NAMEDSTREAMS
1196                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1197                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1198                                         mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1199                                 }
1200 #endif
1201                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1202                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1203                                         mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1204                                 }
1205
1206                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1207                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1208                                         mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1209                                 }
1210                         }
1211
1212                         /*
1213                          * get rid of iocount reference returned
1214                          * by bdevvp (or picked up by us on the substitued
1215                          * rootvp)... it (or we) will have also taken
1216                          * a usecount reference which we want to keep
1217                          */
1218                         vnode_put(rootvp);
1219
1220 #if CONFIG_MACF
1221                         if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
1222                                 KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
1223                                 return 0;
1224                         }
1225
1226                         error = VFS_ROOT(mp, &vp, ctx);
1227                         if (error) {
1228                                 printf("%s() VFS_ROOT() returned %d\n",
1229                                     __func__, error);
1230                                 dounmount(mp, MNT_FORCE, 0, ctx);
1231                                 goto fail;
1232                         }
1233                         error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
1234                         /*
1235                          * get rid of reference provided by VFS_ROOT
1236                          */
1237                         vnode_put(vp);
1238
1239                         if (error) {
1240                                 printf("%s() vnode_label() returned %d\n",
1241                                     __func__, error);
1242                                 dounmount(mp, MNT_FORCE, 0, ctx);
1243                                 goto fail;
1244                         }
1245 #endif
1246                         KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
1247                         return 0;
1248                 }
1249 #if CONFIG_MACF
1250 fail:
1251 #endif
1252                 vfs_rootmountfailed(mp);
1253
1254                 if (error != EINVAL) {
1255                         printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
1256                 }
1257         }
1258         KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
1259         return ENODEV;
1260 }
1261
1262 /*
1263  * Lookup a mount point by filesystem identifier.
1264  */
1265
1266 struct mount *
1267 vfs_getvfs(fsid_t *fsid)
1268 {
1269         return mount_list_lookupby_fsid(fsid, 0, 0);
1270 }
1271
1272 static struct mount *
1273 vfs_getvfs_locked(fsid_t *fsid)
1274 {
1275         return mount_list_lookupby_fsid(fsid, 1, 0);
1276 }
1277
1278 struct mount *
1279 vfs_getvfs_by_mntonname(char *path)
1280 {
1281         mount_t retmp = (mount_t)0;
1282         mount_t mp;
1283
1284         mount_list_lock();
1285         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1286                 if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
1287                     sizeof(mp->mnt_vfsstat.f_mntonname))) {
1288                         retmp = mp;
1289                         if (mount_iterref(retmp, 1)) {
1290                                 retmp = NULL;
1291                         }
1292                         goto out;
1293                 }
1294         }
1295 out:
1296         mount_list_unlock();
1297         return retmp;
1298 }
1299
1300 /* generation number for creation of new fsids */
1301 u_short mntid_gen = 0;
1302 /*
1303  * Get a new unique fsid
1304  */
1305 void
1306 vfs_getnewfsid(struct mount *mp)
1307 {
1308         fsid_t tfsid;
1309         int mtype;
1310
1311         mount_list_lock();
1312
1313         /* generate a new fsid */
1314         mtype = mp->mnt_vtable->vfc_typenum;
1315         if (++mntid_gen == 0) {
1316                 mntid_gen++;
1317         }
1318         tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1319         tfsid.val[1] = mtype;
1320
1321         while (vfs_getvfs_locked(&tfsid)) {
1322                 if (++mntid_gen == 0) {
1323                         mntid_gen++;
1324                 }
1325                 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1326         }
1327
1328         mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
1329         mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
1330         mount_list_unlock();
1331 }
1332
1333 /*
1334  * Routines having to do with the management of the vnode table.
1335  */
1336 extern int(**dead_vnodeop_p)(void *);
1337 long numvnodes, freevnodes, deadvnodes, async_work_vnodes;
1338
1339
1340 int async_work_timed_out = 0;
1341 int async_work_handled = 0;
1342 int dead_vnode_wanted = 0;
1343 int dead_vnode_waited = 0;
1344
1345 /*
1346  * Move a vnode from one mount queue to another.
1347  */
1348 static void
1349 insmntque(vnode_t vp, mount_t mp)
1350 {
1351         mount_t lmp;
1352         /*
1353          * Delete from old mount point vnode list, if on one.
1354          */
1355         if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
1356                 if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
1357                         panic("insmntque: vp not in mount vnode list");
1358                 }
1359                 vp->v_lflag &= ~VNAMED_MOUNT;
1360
1361                 mount_lock_spin(lmp);
1362
1363                 mount_drop(lmp, 1);
1364
1365                 if (vp->v_mntvnodes.tqe_next == NULL) {
1366                         if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) {
1367                                 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
1368                         } else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) {
1369                                 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
1370                         } else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) {
1371                                 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
1372                         }
1373                 } else {
1374                         vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
1375                         *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
1376                 }
1377                 vp->v_mntvnodes.tqe_next = NULL;
1378                 vp->v_mntvnodes.tqe_prev = NULL;
1379                 mount_unlock(lmp);
1380                 return;
1381         }
1382
1383         /*
1384          * Insert into list of vnodes for the new mount point, if available.
1385          */
1386         if ((vp->v_mount = mp) != NULL) {
1387                 mount_lock_spin(mp);
1388                 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
1389                         panic("vp already in mount list");
1390                 }
1391                 if (mp->mnt_lflag & MNT_LITER) {
1392                         TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
1393                 } else {
1394                         TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
1395                 }
1396                 if (vp->v_lflag & VNAMED_MOUNT) {
1397                         panic("insmntque: vp already in mount vnode list");
1398                 }
1399                 vp->v_lflag |= VNAMED_MOUNT;
1400                 mount_ref(mp, 1);
1401                 mount_unlock(mp);
1402         }
1403 }
1404
1405
1406 /*
1407  * Create a vnode for a block device.
1408  * Used for root filesystem, argdev, and swap areas.
1409  * Also used for memory file system special devices.
1410  */
1411 int
1412 bdevvp(dev_t dev, vnode_t *vpp)
1413 {
1414         vnode_t nvp;
1415         int     error;
1416         struct vnode_fsparam vfsp;
1417         struct vfs_context context;
1418
1419         if (dev == NODEV) {
1420                 *vpp = NULLVP;
1421                 return ENODEV;
1422         }
1423
1424         context.vc_thread = current_thread();
1425         context.vc_ucred = FSCRED;
1426
1427         vfsp.vnfs_mp = (struct mount *)0;
1428         vfsp.vnfs_vtype = VBLK;
1429         vfsp.vnfs_str = "bdevvp";
1430         vfsp.vnfs_dvp = NULL;
1431         vfsp.vnfs_fsnode = NULL;
1432         vfsp.vnfs_cnp = NULL;
1433         vfsp.vnfs_vops = spec_vnodeop_p;
1434         vfsp.vnfs_rdev = dev;
1435         vfsp.vnfs_filesize = 0;
1436
1437         vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;
1438
1439         vfsp.vnfs_marksystem = 0;
1440         vfsp.vnfs_markroot = 0;
1441
1442         if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
1443                 *vpp = NULLVP;
1444                 return error;
1445         }
1446         vnode_lock_spin(nvp);
1447         nvp->v_flag |= VBDEVVP;
1448         nvp->v_tag = VT_NON;    /* set this to VT_NON so during aliasing it can be replaced */
1449         vnode_unlock(nvp);
1450         if ((error = vnode_ref(nvp))) {
1451                 panic("bdevvp failed: vnode_ref");
1452                 return error;
1453         }
1454         if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
1455                 panic("bdevvp failed: fsync");
1456                 return error;
1457         }
1458         if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
1459                 panic("bdevvp failed: invalidateblks");
1460                 return error;
1461         }
1462
1463 #if CONFIG_MACF
1464         /*
1465          * XXXMAC: We can't put a MAC check here, the system will
1466          * panic without this vnode.
1467          */
1468 #endif /* MAC */
1469
1470         if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
1471                 panic("bdevvp failed: open");
1472                 return error;
1473         }
1474         *vpp = nvp;
1475
1476         return 0;
1477 }
1478
1479 /*
1480  * Check to see if the new vnode represents a special device
1481  * for which we already have a vnode (either because of
1482  * bdevvp() or because of a different vnode representing
1483  * the same block device). If such an alias exists, deallocate
1484  * the existing contents and return the aliased vnode. The
1485  * caller is responsible for filling it with its new contents.
1486  */
1487 static vnode_t
1488 checkalias(struct vnode *nvp, dev_t nvp_rdev)
1489 {
1490         struct vnode *vp;
1491         struct vnode **vpp;
1492         struct specinfo *sin = NULL;
1493         int vid = 0;
1494
1495         vpp = &speclisth[SPECHASH(nvp_rdev)];
1496 loop:
1497         SPECHASH_LOCK();
1498
1499         for (vp = *vpp; vp; vp = vp->v_specnext) {
1500                 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1501                         vid = vp->v_id;
1502                         break;
1503                 }
1504         }
1505         SPECHASH_UNLOCK();
1506
1507         if (vp) {
1508 found_alias:
1509                 if (vnode_getwithvid(vp, vid)) {
1510                         goto loop;
1511                 }
1512                 /*
1513                  * Termination state is checked in vnode_getwithvid
1514                  */
1515                 vnode_lock(vp);
1516
1517                 /*
1518                  * Alias, but not in use, so flush it out.
1519                  */
1520                 if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
1521                         vnode_reclaim_internal(vp, 1, 1, 0);
1522                         vnode_put_locked(vp);
1523                         vnode_unlock(vp);
1524                         goto loop;
1525                 }
1526         }
1527         if (vp == NULL || vp->v_tag != VT_NON) {
1528                 if (sin == NULL) {
1529                         MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo),
1530                             M_SPECINFO, M_WAITOK);
1531                 }
1532
1533                 nvp->v_specinfo = sin;
1534                 bzero(nvp->v_specinfo, sizeof(struct specinfo));
1535                 nvp->v_rdev = nvp_rdev;
1536                 nvp->v_specflags = 0;
1537                 nvp->v_speclastr = -1;
1538                 nvp->v_specinfo->si_opencount = 0;
1539                 nvp->v_specinfo->si_initted = 0;
1540                 nvp->v_specinfo->si_throttleable = 0;
1541
1542                 SPECHASH_LOCK();
1543
1544                 /* We dropped the lock, someone could have added */
1545                 if (vp == NULLVP) {
1546                         for (vp = *vpp; vp; vp = vp->v_specnext) {
1547                                 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1548                                         vid = vp->v_id;
1549                                         SPECHASH_UNLOCK();
1550                                         goto found_alias;
1551                                 }
1552                         }
1553                 }
1554
1555                 nvp->v_hashchain = vpp;
1556                 nvp->v_specnext = *vpp;
1557                 *vpp = nvp;
1558
1559                 if (vp != NULLVP) {
1560                         nvp->v_specflags |= SI_ALIASED;
1561                         vp->v_specflags |= SI_ALIASED;
1562                         SPECHASH_UNLOCK();
1563                         vnode_put_locked(vp);
1564                         vnode_unlock(vp);
1565                 } else {
1566                         SPECHASH_UNLOCK();
1567                 }
1568
1569                 return NULLVP;
1570         }
1571
1572         if (sin) {
1573                 FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO);
1574         }
1575
1576         if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) {
1577                 return vp;
1578         }
1579
1580         panic("checkalias with VT_NON vp that shouldn't: %p", vp);
1581
1582         return vp;
1583 }
1584
1585
1586 /*
1587  * Get a reference on a particular vnode and lock it if requested.
1588  * If the vnode was on the inactive list, remove it from the list.
1589  * If the vnode was on the free list, remove it from the list and
1590  * move it to inactive list as needed.
1591  * The vnode lock bit is set if the vnode is being eliminated in
1592  * vgone. The process is awakened when the transition is completed,
1593  * and an error returned to indicate that the vnode is no longer
1594  * usable (possibly having been changed to a new file system type).
1595  */
1596 int
1597 vget_internal(vnode_t vp, int vid, int vflags)
1598 {
1599         int error = 0;
1600
1601         vnode_lock_spin(vp);
1602
1603         if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) {
1604                 /*
1605                  * vnode to be returned only if it has writers opened
1606                  */
1607                 error = EINVAL;
1608         } else {
1609                 error = vnode_getiocount(vp, vid, vflags);
1610         }
1611
1612         vnode_unlock(vp);
1613
1614         return error;
1615 }
1616
1617 /*
1618  * Returns:     0                       Success
1619  *              ENOENT                  No such file or directory [terminating]
1620  */
1621 int
1622 vnode_ref(vnode_t vp)
1623 {
1624         return vnode_ref_ext(vp, 0, 0);
1625 }
1626
1627 /*
1628  * Returns:     0                       Success
1629  *              ENOENT                  No such file or directory [terminating]
1630  */
1631 int
1632 vnode_ref_ext(vnode_t vp, int fmode, int flags)
1633 {
1634         int     error = 0;
1635
1636         vnode_lock_spin(vp);
1637
1638         /*
1639          * once all the current call sites have been fixed to insure they have
1640          * taken an iocount, we can toughen this assert up and insist that the
1641          * iocount is non-zero... a non-zero usecount doesn't insure correctness
1642          */
1643         if (vp->v_iocount <= 0 && vp->v_usecount <= 0) {
1644                 panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
1645         }
1646
1647         /*
1648          * if you are the owner of drain/termination, can acquire usecount
1649          */
1650         if ((flags & VNODE_REF_FORCE) == 0) {
1651                 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
1652                         if (vp->v_owner != current_thread()) {
1653                                 error = ENOENT;
1654                                 goto out;
1655                         }
1656                 }
1657         }
1658         vp->v_usecount++;
1659
1660         if (fmode & FWRITE) {
1661                 if (++vp->v_writecount <= 0) {
1662                         panic("vnode_ref_ext: v_writecount");
1663                 }
1664         }
1665         if (fmode & O_EVTONLY) {
1666                 if (++vp->v_kusecount <= 0) {
1667                         panic("vnode_ref_ext: v_kusecount");
1668                 }
1669         }
1670         if (vp->v_flag & VRAGE) {
1671                 struct  uthread *ut;
1672
1673                 ut = get_bsdthread_info(current_thread());
1674
1675                 if (!(current_proc()->p_lflag & P_LRAGE_VNODES) &&
1676                     !(ut->uu_flag & UT_RAGE_VNODES)) {
1677                         /*
1678                          * a 'normal' process accessed this vnode
1679                          * so make sure its no longer marked
1680                          * for rapid aging...  also, make sure
1681                          * it gets removed from the rage list...
1682                          * when v_usecount drops back to 0, it
1683                          * will be put back on the real free list
1684                          */
1685                         vp->v_flag &= ~VRAGE;
1686                         vp->v_references = 0;
1687                         vnode_list_remove(vp);
1688                 }
1689         }
1690         if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
1691                 if (vp->v_ubcinfo) {
1692                         vnode_lock_convert(vp);
1693                         memory_object_mark_used(vp->v_ubcinfo->ui_control);
1694                 }
1695         }
1696 out:
1697         vnode_unlock(vp);
1698
1699         return error;
1700 }
1701
1702
1703 boolean_t
1704 vnode_on_reliable_media(vnode_t vp)
1705 {
1706         if (!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL)) {
1707                 return TRUE;
1708         }
1709         return FALSE;
1710 }
1711
1712 static void
1713 vnode_async_list_add(vnode_t vp)
1714 {
1715         vnode_list_lock();
1716
1717         if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
1718                 panic("vnode_async_list_add: %p is in wrong state", vp);
1719         }
1720
1721         TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
1722         vp->v_listflag |= VLIST_ASYNC_WORK;
1723
1724         async_work_vnodes++;
1725
1726         vnode_list_unlock();
1727
1728         wakeup(&vnode_async_work_list);
1729 }
1730
1731
1732 /*
1733  * put the vnode on appropriate free list.
1734  * called with vnode LOCKED
1735  */
1736 static void
1737 vnode_list_add(vnode_t vp)
1738 {
1739         boolean_t need_dead_wakeup = FALSE;
1740
1741 #if DIAGNOSTIC
1742         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1743 #endif
1744
1745 again:
1746
1747         /*
1748          * if it is already on a list or non zero references return
1749          */
1750         if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) {
1751                 return;
1752         }
1753
1754         /*
1755          * In vclean, we might have deferred ditching locked buffers
1756          * because something was still referencing them (indicated by
1757          * usecount).  We can ditch them now.
1758          */
1759         if (ISSET(vp->v_lflag, VL_DEAD)
1760             && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) {
1761                 ++vp->v_iocount;        // Probably not necessary, but harmless
1762 #ifdef JOE_DEBUG
1763                 record_vp(vp, 1);
1764 #endif
1765                 vnode_unlock(vp);
1766                 buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
1767                 vnode_lock(vp);
1768                 vnode_dropiocount(vp);
1769                 goto again;
1770         }
1771
1772         vnode_list_lock();
1773
1774         if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
1775                 /*
1776                  * add the new guy to the appropriate end of the RAGE list
1777                  */
1778                 if ((vp->v_flag & VAGE)) {
1779                         TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
1780                 } else {
1781                         TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
1782                 }
1783
1784                 vp->v_listflag |= VLIST_RAGE;
1785                 ragevnodes++;
1786
1787                 /*
1788                  * reset the timestamp for the last inserted vp on the RAGE
1789                  * queue to let new_vnode know that its not ok to start stealing
1790                  * from this list... as long as we're actively adding to this list
1791                  * we'll push out the vnodes we want to donate to the real free list
1792                  * once we stop pushing, we'll let some time elapse before we start
1793                  * stealing them in the new_vnode routine
1794                  */
1795                 microuptime(&rage_tv);
1796         } else {
1797                 /*
1798                  * if VL_DEAD, insert it at head of the dead list
1799                  * else insert at tail of LRU list or at head if VAGE is set
1800                  */
1801                 if ((vp->v_lflag & VL_DEAD)) {
1802                         TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
1803                         vp->v_listflag |= VLIST_DEAD;
1804                         deadvnodes++;
1805
1806                         if (dead_vnode_wanted) {
1807                                 dead_vnode_wanted--;
1808                                 need_dead_wakeup = TRUE;
1809                         }
1810                 } else if ((vp->v_flag & VAGE)) {
1811                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1812                         vp->v_flag &= ~VAGE;
1813                         freevnodes++;
1814                 } else {
1815                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1816                         freevnodes++;
1817                 }
1818         }
1819         vnode_list_unlock();
1820
1821         if (need_dead_wakeup == TRUE) {
1822                 wakeup_one((caddr_t)&dead_vnode_wanted);
1823         }
1824 }
1825
1826
1827 /*
1828  * remove the vnode from appropriate free list.
1829  * called with vnode LOCKED and
1830  * the list lock held
1831  */
1832 static void
1833 vnode_list_remove_locked(vnode_t vp)
1834 {
1835         if (VONLIST(vp)) {
1836                 /*
1837                  * the v_listflag field is
1838                  * protected by the vnode_list_lock
1839                  */
1840                 if (vp->v_listflag & VLIST_RAGE) {
1841                         VREMRAGE("vnode_list_remove", vp);
1842                 } else if (vp->v_listflag & VLIST_DEAD) {
1843                         VREMDEAD("vnode_list_remove", vp);
1844                 } else if (vp->v_listflag & VLIST_ASYNC_WORK) {
1845                         VREMASYNC_WORK("vnode_list_remove", vp);
1846                 } else {
1847                         VREMFREE("vnode_list_remove", vp);
1848                 }
1849         }
1850 }
1851
1852
1853 /*
1854  * remove the vnode from appropriate free list.
1855  * called with vnode LOCKED
1856  */
1857 static void
1858 vnode_list_remove(vnode_t vp)
1859 {
1860 #if DIAGNOSTIC
1861         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1862 #endif
1863         /*
1864          * we want to avoid taking the list lock
1865          * in the case where we're not on the free
1866          * list... this will be true for most
1867          * directories and any currently in use files
1868          *
1869          * we're guaranteed that we can't go from
1870          * the not-on-list state to the on-list
1871          * state since we hold the vnode lock...
1872          * all calls to vnode_list_add are done
1873          * under the vnode lock... so we can
1874          * check for that condition (the prevelant one)
1875          * without taking the list lock
1876          */
1877         if (VONLIST(vp)) {
1878                 vnode_list_lock();
1879                 /*
1880                  * however, we're not guaranteed that
1881                  * we won't go from the on-list state
1882                  * to the not-on-list state until we
1883                  * hold the vnode_list_lock... this
1884                  * is due to "new_vnode" removing vnodes
1885                  * from the free list uder the list_lock
1886                  * w/o the vnode lock... so we need to
1887                  * check again whether we're currently
1888                  * on the free list
1889                  */
1890                 vnode_list_remove_locked(vp);
1891
1892                 vnode_list_unlock();
1893         }
1894 }
1895
1896
1897 void
1898 vnode_rele(vnode_t vp)
1899 {
1900         vnode_rele_internal(vp, 0, 0, 0);
1901 }
1902
1903
1904 void
1905 vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
1906 {
1907         vnode_rele_internal(vp, fmode, dont_reenter, 0);
1908 }
1909
1910
1911 void
1912 vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
1913 {
1914         if (!locked) {
1915                 vnode_lock_spin(vp);
1916         }
1917 #if DIAGNOSTIC
1918         else {
1919                 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1920         }
1921 #endif
1922         if (--vp->v_usecount < 0) {
1923                 panic("vnode_rele_ext: vp %p usecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
1924         }
1925
1926         if (fmode & FWRITE) {
1927                 if (--vp->v_writecount < 0) {
1928                         panic("vnode_rele_ext: vp %p writecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
1929                 }
1930         }
1931         if (fmode & O_EVTONLY) {
1932                 if (--vp->v_kusecount < 0) {
1933                         panic("vnode_rele_ext: vp %p kusecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
1934                 }
1935         }
1936         if (vp->v_kusecount > vp->v_usecount) {
1937                 panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d).  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
1938         }
1939
1940         if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
1941                 /*
1942                  * vnode is still busy... if we're the last
1943                  * usecount, mark for a future call to VNOP_INACTIVE
1944                  * when the iocount finally drops to 0
1945                  */
1946                 if (vp->v_usecount == 0) {
1947                         vp->v_lflag |= VL_NEEDINACTIVE;
1948                         vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
1949                 }
1950                 goto done;
1951         }
1952         vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
1953
1954         if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) {
1955                 /*
1956                  * vnode is being cleaned, or
1957                  * we've requested that we don't reenter
1958                  * the filesystem on this release...in
1959                  * the latter case, we'll mark the vnode aged
1960                  */
1961                 if (dont_reenter) {
1962                         if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) {
1963                                 vp->v_lflag |= VL_NEEDINACTIVE;
1964
1965                                 if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) {
1966                                         vnode_async_list_add(vp);
1967                                         goto done;
1968                                 }
1969                         }
1970                         vp->v_flag |= VAGE;
1971                 }
1972                 vnode_list_add(vp);
1973
1974                 goto done;
1975         }
1976         /*
1977          * at this point both the iocount and usecount
1978          * are zero
1979          * pick up an iocount so that we can call
1980          * VNOP_INACTIVE with the vnode lock unheld
1981          */
1982         vp->v_iocount++;
1983 #ifdef JOE_DEBUG
1984         record_vp(vp, 1);
1985 #endif
1986         vp->v_lflag &= ~VL_NEEDINACTIVE;
1987         vnode_unlock(vp);
1988
1989         VNOP_INACTIVE(vp, vfs_context_current());
1990
1991         vnode_lock_spin(vp);
1992         /*
1993          * because we dropped the vnode lock to call VNOP_INACTIVE
1994          * the state of the vnode may have changed... we may have
1995          * picked up an iocount, usecount or the MARKTERM may have
1996          * been set... we need to reevaluate the reference counts
1997          * to determine if we can call vnode_reclaim_internal at
1998          * this point... if the reference counts are up, we'll pick
1999          * up the MARKTERM state when they get subsequently dropped
2000          */
2001         if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
2002             ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
2003                 struct  uthread *ut;
2004
2005                 ut = get_bsdthread_info(current_thread());
2006
2007                 if (ut->uu_defer_reclaims) {
2008                         vp->v_defer_reclaimlist = ut->uu_vreclaims;
2009                         ut->uu_vreclaims = vp;
2010                         goto done;
2011                 }
2012                 vnode_lock_convert(vp);
2013                 vnode_reclaim_internal(vp, 1, 1, 0);
2014         }
2015         vnode_dropiocount(vp);
2016         vnode_list_add(vp);
2017 done:
2018         if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
2019                 if (vp->v_ubcinfo) {
2020                         vnode_lock_convert(vp);
2021                         memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
2022                 }
2023         }
2024         if (!locked) {
2025                 vnode_unlock(vp);
2026         }
2027         return;
2028 }
2029
2030 /*
2031  * Remove any vnodes in the vnode table belonging to mount point mp.
2032  *
2033  * If MNT_NOFORCE is specified, there should not be any active ones,
2034  * return error if any are found (nb: this is a user error, not a
2035  * system error). If MNT_FORCE is specified, detach any active vnodes
2036  * that are found.
2037  */
2038 #if DIAGNOSTIC
2039 int busyprt = 0;        /* print out busy vnodes */
2040 #endif
2041
2042 int
2043 vflush(struct mount *mp, struct vnode *skipvp, int flags)
2044 {
2045         struct vnode *vp;
2046         int busy = 0;
2047         int reclaimed = 0;
2048         int retval;
2049         unsigned int vid;
2050
2051         /*
2052          * See comments in vnode_iterate() for the rationale for this lock
2053          */
2054         mount_iterate_lock(mp);
2055
2056         mount_lock(mp);
2057         vnode_iterate_setup(mp);
2058         /*
2059          * On regular unmounts(not forced) do a
2060          * quick check for vnodes to be in use. This
2061          * preserves the caching of vnodes. automounter
2062          * tries unmounting every so often to see whether
2063          * it is still busy or not.
2064          */
2065         if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
2066                 if (vnode_umount_preflight(mp, skipvp, flags)) {
2067                         vnode_iterate_clear(mp);
2068                         mount_unlock(mp);
2069                         mount_iterate_unlock(mp);
2070                         return EBUSY;
2071                 }
2072         }
2073 loop:
2074         /* If it returns 0 then there is nothing to do */
2075         retval = vnode_iterate_prepare(mp);
2076
2077         if (retval == 0) {
2078                 vnode_iterate_clear(mp);
2079                 mount_unlock(mp);
2080                 mount_iterate_unlock(mp);
2081                 return retval;
2082         }
2083
2084         /* iterate over all the vnodes */
2085         while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
2086                 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
2087                 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
2088                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
2089
2090                 if ((vp->v_mount != mp) || (vp == skipvp)) {
2091                         continue;
2092                 }
2093                 vid = vp->v_id;
2094                 mount_unlock(mp);
2095
2096                 vnode_lock_spin(vp);
2097
2098                 // If vnode is already terminating, wait for it...
2099                 while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
2100                         vp->v_lflag |= VL_TERMWANT;
2101                         msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
2102                 }
2103
2104                 if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) {
2105                         vnode_unlock(vp);
2106                         mount_lock(mp);
2107                         continue;
2108                 }
2109
2110                 /*
2111                  * If requested, skip over vnodes marked VSYSTEM.
2112                  * Skip over all vnodes marked VNOFLUSH.
2113                  */
2114                 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
2115                     (vp->v_flag & VNOFLUSH))) {
2116                         vnode_unlock(vp);
2117                         mount_lock(mp);
2118                         continue;
2119                 }
2120                 /*
2121                  * If requested, skip over vnodes marked VSWAP.
2122                  */
2123                 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
2124                         vnode_unlock(vp);
2125                         mount_lock(mp);
2126                         continue;
2127                 }
2128                 /*
2129                  * If requested, skip over vnodes marked VROOT.
2130                  */
2131                 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
2132                         vnode_unlock(vp);
2133                         mount_lock(mp);
2134                         continue;
2135                 }
2136                 /*
2137                  * If WRITECLOSE is set, only flush out regular file
2138                  * vnodes open for writing.
2139                  */
2140                 if ((flags & WRITECLOSE) &&
2141                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
2142                         vnode_unlock(vp);
2143                         mount_lock(mp);
2144                         continue;
2145                 }
2146                 /*
2147                  * If the real usecount is 0, all we need to do is clear
2148                  * out the vnode data structures and we are done.
2149                  */
2150                 if (((vp->v_usecount == 0) ||
2151                     ((vp->v_usecount - vp->v_kusecount) == 0))) {
2152                         vnode_lock_convert(vp);
2153                         vp->v_iocount++;        /* so that drain waits for * other iocounts */
2154 #ifdef JOE_DEBUG
2155                         record_vp(vp, 1);
2156 #endif
2157                         vnode_reclaim_internal(vp, 1, 1, 0);
2158                         vnode_dropiocount(vp);
2159                         vnode_list_add(vp);
2160                         vnode_unlock(vp);
2161
2162                         reclaimed++;
2163                         mount_lock(mp);
2164                         continue;
2165                 }
2166                 /*
2167                  * If FORCECLOSE is set, forcibly close the vnode.
2168                  * For block or character devices, revert to an
2169                  * anonymous device. For all other files, just kill them.
2170                  */
2171                 if (flags & FORCECLOSE) {
2172                         vnode_lock_convert(vp);
2173
2174                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
2175                                 vp->v_iocount++;        /* so that drain waits * for other iocounts */
2176 #ifdef JOE_DEBUG
2177                                 record_vp(vp, 1);
2178 #endif
2179                                 vnode_abort_advlocks(vp);
2180                                 vnode_reclaim_internal(vp, 1, 1, 0);
2181                                 vnode_dropiocount(vp);
2182                                 vnode_list_add(vp);
2183                                 vnode_unlock(vp);
2184                         } else {
2185                                 vclean(vp, 0);
2186                                 vp->v_lflag &= ~VL_DEAD;
2187                                 vp->v_op = spec_vnodeop_p;
2188                                 vp->v_flag |= VDEVFLUSH;
2189                                 vnode_unlock(vp);
2190                         }
2191                         mount_lock(mp);
2192                         continue;
2193                 }
2194 #if DIAGNOSTIC
2195                 if (busyprt) {
2196                         vprint("vflush: busy vnode", vp);
2197                 }
2198 #endif
2199                 vnode_unlock(vp);
2200                 mount_lock(mp);
2201                 busy++;
2202         }
2203
2204         /* At this point the worker queue is completed */
2205         if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
2206                 busy = 0;
2207                 reclaimed = 0;
2208                 (void)vnode_iterate_reloadq(mp);
2209                 /* returned with mount lock held */
2210                 goto loop;
2211         }
2212
2213         /* if new vnodes were created in between retry the reclaim */
2214         if (vnode_iterate_reloadq(mp) != 0) {
2215                 if (!(busy && ((flags & FORCECLOSE) == 0))) {
2216                         goto loop;
2217                 }
2218         }
2219         vnode_iterate_clear(mp);
2220         mount_unlock(mp);
2221         mount_iterate_unlock(mp);
2222
2223         if (busy && ((flags & FORCECLOSE) == 0)) {
2224                 return EBUSY;
2225         }
2226         return 0;
2227 }
2228
2229 long num_recycledvnodes = 0;
2230 /*
2231  * Disassociate the underlying file system from a vnode.
2232  * The vnode lock is held on entry.
2233  */
2234 static void
2235 vclean(vnode_t vp, int flags)
2236 {
2237         vfs_context_t ctx = vfs_context_current();
2238         int active;
2239         int need_inactive;
2240         int already_terminating;
2241         int clflags = 0;
2242 #if NAMEDSTREAMS
2243         int is_namedstream;
2244 #endif
2245
2246         /*
2247          * Check to see if the vnode is in use.
2248          * If so we have to reference it before we clean it out
2249          * so that its count cannot fall to zero and generate a
2250          * race against ourselves to recycle it.
2251          */
2252         active = vp->v_usecount;
2253
2254         /*
2255          * just in case we missed sending a needed
2256          * VNOP_INACTIVE, we'll do it now
2257          */
2258         need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
2259
2260         vp->v_lflag &= ~VL_NEEDINACTIVE;
2261
2262         /*
2263          * Prevent the vnode from being recycled or
2264          * brought into use while we clean it out.
2265          */
2266         already_terminating = (vp->v_lflag & VL_TERMINATE);
2267
2268         vp->v_lflag |= VL_TERMINATE;
2269
2270 #if NAMEDSTREAMS
2271         is_namedstream = vnode_isnamedstream(vp);
2272 #endif
2273
2274         vnode_unlock(vp);
2275
2276         OSAddAtomicLong(1, &num_recycledvnodes);
2277
2278         if (flags & DOCLOSE) {
2279                 clflags |= IO_NDELAY;
2280         }
2281         if (flags & REVOKEALL) {
2282                 clflags |= IO_REVOKE;
2283         }
2284
2285         if (active && (flags & DOCLOSE)) {
2286                 VNOP_CLOSE(vp, clflags, ctx);
2287         }
2288
2289         /*
2290          * Clean out any buffers associated with the vnode.
2291          */
2292         if (flags & DOCLOSE) {
2293 #if NFSCLIENT
2294                 if (vp->v_tag == VT_NFS) {
2295                         nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
2296                 } else
2297 #endif
2298                 {
2299                         VNOP_FSYNC(vp, MNT_WAIT, ctx);
2300
2301                         /*
2302                          * If the vnode is still in use (by the journal for
2303                          * example) we don't want to invalidate locked buffers
2304                          * here.  In that case, either the journal will tidy them
2305                          * up, or we will deal with it when the usecount is
2306                          * finally released in vnode_rele_internal.
2307                          */
2308                         buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
2309                 }
2310                 if (UBCINFOEXISTS(vp)) {
2311                         /*
2312                          * Clean the pages in VM.
2313                          */
2314                         (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
2315                 }
2316         }
2317         if (active || need_inactive) {
2318                 VNOP_INACTIVE(vp, ctx);
2319         }
2320
2321 #if NAMEDSTREAMS
2322         if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
2323                 vnode_t pvp = vp->v_parent;
2324
2325                 /* Delete the shadow stream file before we reclaim its vnode */
2326                 if (vnode_isshadow(vp)) {
2327                         vnode_relenamedstream(pvp, vp);
2328                 }
2329
2330                 /*
2331                  * No more streams associated with the parent.  We
2332                  * have a ref on it, so its identity is stable.
2333                  * If the parent is on an opaque volume, then we need to know
2334                  * whether it has associated named streams.
2335                  */
2336                 if (vfs_authopaque(pvp->v_mount)) {
2337                         vnode_lock_spin(pvp);
2338                         pvp->v_lflag &= ~VL_HASSTREAMS;
2339                         vnode_unlock(pvp);
2340                 }
2341         }
2342 #endif
2343
2344         /*
2345          * Destroy ubc named reference
2346          * cluster_release is done on this path
2347          * along with dropping the reference on the ucred
2348          * (and in the case of forced unmount of an mmap-ed file,
2349          * the ubc reference on the vnode is dropped here too).
2350          */
2351         ubc_destroy_named(vp);
2352
2353 #if CONFIG_TRIGGERS
2354         /*
2355          * cleanup trigger info from vnode (if any)
2356          */
2357         if (vp->v_resolve) {
2358                 vnode_resolver_detach(vp);
2359         }
2360 #endif
2361
2362         /*
2363          * Reclaim the vnode.
2364          */
2365         if (VNOP_RECLAIM(vp, ctx)) {
2366                 panic("vclean: cannot reclaim");
2367         }
2368
2369         // make sure the name & parent ptrs get cleaned out!
2370         vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE);
2371
2372         vnode_lock(vp);
2373
2374         /*
2375          * Remove the vnode from any mount list it might be on.  It is not
2376          * safe to do this any earlier because unmount needs to wait for
2377          * any vnodes to terminate and it cannot do that if it cannot find
2378          * them.
2379          */
2380         insmntque(vp, (struct mount *)0);
2381
2382         vp->v_mount = dead_mountp;
2383         vp->v_op = dead_vnodeop_p;
2384         vp->v_tag = VT_NON;
2385         vp->v_data = NULL;
2386
2387         vp->v_lflag |= VL_DEAD;
2388         vp->v_flag &= ~VISDIRTY;
2389
2390         if (already_terminating == 0) {
2391                 vp->v_lflag &= ~VL_TERMINATE;
2392                 /*
2393                  * Done with purge, notify sleepers of the grim news.
2394                  */
2395                 if (vp->v_lflag & VL_TERMWANT) {
2396                         vp->v_lflag &= ~VL_TERMWANT;
2397                         wakeup(&vp->v_lflag);
2398                 }
2399         }
2400 }
2401
2402 /*
2403  * Eliminate all activity associated with  the requested vnode
2404  * and with all vnodes aliased to the requested vnode.
2405  */
2406 int
2407 #if DIAGNOSTIC
2408 vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
2409 #else
2410 vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
2411 #endif
2412 {
2413         struct vnode *vq;
2414         int vid;
2415
2416 #if DIAGNOSTIC
2417         if ((flags & REVOKEALL) == 0) {
2418                 panic("vnop_revoke");
2419         }
2420 #endif
2421
2422         if (vnode_isaliased(vp)) {
2423                 /*
2424                  * If a vgone (or vclean) is already in progress,
2425                  * return an immediate error
2426                  */
2427                 if (vp->v_lflag & VL_TERMINATE) {
2428                         return ENOENT;
2429                 }
2430
2431                 /*
2432                  * Ensure that vp will not be vgone'd while we
2433                  * are eliminating its aliases.
2434                  */
2435                 SPECHASH_LOCK();
2436                 while ((vp->v_specflags & SI_ALIASED)) {
2437                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2438                                 if (vq->v_rdev != vp->v_rdev ||
2439                                     vq->v_type != vp->v_type || vp == vq) {
2440                                         continue;
2441                                 }
2442                                 vid = vq->v_id;
2443                                 SPECHASH_UNLOCK();
2444                                 if (vnode_getwithvid(vq, vid)) {
2445                                         SPECHASH_LOCK();
2446                                         break;
2447                                 }
2448                                 vnode_lock(vq);
2449                                 if (!(vq->v_lflag & VL_TERMINATE)) {
2450                                         vnode_reclaim_internal(vq, 1, 1, 0);
2451                                 }
2452                                 vnode_put_locked(vq);
2453                                 vnode_unlock(vq);
2454                                 SPECHASH_LOCK();
2455                                 break;
2456                         }
2457                 }
2458                 SPECHASH_UNLOCK();
2459         }
2460         vnode_lock(vp);
2461         if (vp->v_lflag & VL_TERMINATE) {
2462                 vnode_unlock(vp);
2463                 return ENOENT;
2464         }
2465         vnode_reclaim_internal(vp, 1, 0, REVOKEALL);
2466         vnode_unlock(vp);
2467
2468         return 0;
2469 }
2470
2471 /*
2472  * Recycle an unused vnode to the front of the free list.
2473  * Release the passed interlock if the vnode will be recycled.
2474  */
2475 int
2476 vnode_recycle(struct vnode *vp)
2477 {
2478         vnode_lock_spin(vp);
2479
2480         if (vp->v_iocount || vp->v_usecount) {
2481                 vp->v_lflag |= VL_MARKTERM;
2482                 vnode_unlock(vp);
2483                 return 0;
2484         }
2485         vnode_lock_convert(vp);
2486         vnode_reclaim_internal(vp, 1, 0, 0);
2487
2488         vnode_unlock(vp);
2489
2490         return 1;
2491 }
2492
2493 static int
2494 vnode_reload(vnode_t vp)
2495 {
2496         vnode_lock_spin(vp);
2497
2498         if ((vp->v_iocount > 1) || vp->v_usecount) {
2499                 vnode_unlock(vp);
2500                 return 0;
2501         }
2502         if (vp->v_iocount <= 0) {
2503                 panic("vnode_reload with no iocount %d", vp->v_iocount);
2504         }
2505
2506         /* mark for release when iocount is dopped */
2507         vp->v_lflag |= VL_MARKTERM;
2508         vnode_unlock(vp);
2509
2510         return 1;
2511 }
2512
2513
2514 static void
2515 vgone(vnode_t vp, int flags)
2516 {
2517         struct vnode *vq;
2518         struct vnode *vx;
2519
2520         /*
2521          * Clean out the filesystem specific data.
2522          * vclean also takes care of removing the
2523          * vnode from any mount list it might be on
2524          */
2525         vclean(vp, flags | DOCLOSE);
2526
2527         /*
2528          * If special device, remove it from special device alias list
2529          * if it is on one.
2530          */
2531         if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
2532                 SPECHASH_LOCK();
2533                 if (*vp->v_hashchain == vp) {
2534                         *vp->v_hashchain = vp->v_specnext;
2535                 } else {
2536                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2537                                 if (vq->v_specnext != vp) {
2538                                         continue;
2539                                 }
2540                                 vq->v_specnext = vp->v_specnext;
2541                                 break;
2542                         }
2543                         if (vq == NULL) {
2544                                 panic("missing bdev");
2545                         }
2546                 }
2547                 if (vp->v_specflags & SI_ALIASED) {
2548                         vx = NULL;
2549                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2550                                 if (vq->v_rdev != vp->v_rdev ||
2551                                     vq->v_type != vp->v_type) {
2552                                         continue;
2553                                 }
2554                                 if (vx) {
2555                                         break;
2556                                 }
2557                                 vx = vq;
2558                         }
2559                         if (vx == NULL) {
2560                                 panic("missing alias");
2561                         }
2562                         if (vq == NULL) {
2563                                 vx->v_specflags &= ~SI_ALIASED;
2564                         }
2565                         vp->v_specflags &= ~SI_ALIASED;
2566                 }
2567                 SPECHASH_UNLOCK();
2568                 {
2569                         struct specinfo *tmp = vp->v_specinfo;
2570                         vp->v_specinfo = NULL;
2571                         FREE_ZONE(tmp, sizeof(struct specinfo), M_SPECINFO);
2572                 }
2573         }
2574 }
2575
2576 /*
2577  * Lookup a vnode by device number.
2578  */
2579 int
2580 check_mountedon(dev_t dev, enum vtype type, int  *errorp)
2581 {
2582         vnode_t vp;
2583         int rc = 0;
2584         int vid;
2585
2586 loop:
2587         SPECHASH_LOCK();
2588         for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
2589                 if (dev != vp->v_rdev || type != vp->v_type) {
2590                         continue;
2591                 }
2592                 vid = vp->v_id;
2593                 SPECHASH_UNLOCK();
2594                 if (vnode_getwithvid(vp, vid)) {
2595                         goto loop;
2596                 }
2597                 vnode_lock_spin(vp);
2598                 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
2599                         vnode_unlock(vp);
2600                         if ((*errorp = vfs_mountedon(vp)) != 0) {
2601                                 rc = 1;
2602                         }
2603                 } else {
2604                         vnode_unlock(vp);
2605                 }
2606                 vnode_put(vp);
2607                 return rc;
2608         }
2609         SPECHASH_UNLOCK();
2610         return 0;
2611 }
2612
2613 /*
2614  * Calculate the total number of references to a special device.
2615  */
2616 int
2617 vcount(vnode_t vp)
2618 {
2619         vnode_t vq, vnext;
2620         int count;
2621         int vid;
2622
2623         if (!vnode_isspec(vp)) {
2624                 return vp->v_usecount - vp->v_kusecount;
2625         }
2626
2627 loop:
2628         if (!vnode_isaliased(vp)) {
2629                 return vp->v_specinfo->si_opencount;
2630         }
2631         count = 0;
2632
2633         SPECHASH_LOCK();
2634         /*
2635          * Grab first vnode and its vid.
2636          */
2637         vq = *vp->v_hashchain;
2638         vid = vq ? vq->v_id : 0;
2639
2640         SPECHASH_UNLOCK();
2641
2642         while (vq) {
2643                 /*
2644                  * Attempt to get the vnode outside the SPECHASH lock.
2645                  */
2646                 if (vnode_getwithvid(vq, vid)) {
2647                         goto loop;
2648                 }
2649                 vnode_lock(vq);
2650
2651                 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
2652                         if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
2653                                 /*
2654                                  * Alias, but not in use, so flush it out.
2655                                  */
2656                                 vnode_reclaim_internal(vq, 1, 1, 0);
2657                                 vnode_put_locked(vq);
2658                                 vnode_unlock(vq);
2659                                 goto loop;
2660                         }
2661                         count += vq->v_specinfo->si_opencount;
2662                 }
2663                 vnode_unlock(vq);
2664
2665                 SPECHASH_LOCK();
2666                 /*
2667                  * must do this with the reference still held on 'vq'
2668                  * so that it can't be destroyed while we're poking
2669                  * through v_specnext
2670                  */
2671                 vnext = vq->v_specnext;
2672                 vid = vnext ? vnext->v_id : 0;
2673
2674                 SPECHASH_UNLOCK();
2675
2676                 vnode_put(vq);
2677
2678                 vq = vnext;
2679         }
2680
2681         return count;
2682 }
2683
2684 int     prtactive = 0;          /* 1 => print out reclaim of active vnodes */
2685
2686 /*
2687  * Print out a description of a vnode.
2688  */
2689 static const char *typename[] =
2690 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
2691
2692 void
2693 vprint(const char *label, struct vnode *vp)
2694 {
2695         char sbuf[64];
2696
2697         if (label != NULL) {
2698                 printf("%s: ", label);
2699         }
2700         printf("type %s, usecount %d, writecount %d",
2701             typename[vp->v_type], vp->v_usecount, vp->v_writecount);
2702         sbuf[0] = '\0';
2703         if (vp->v_flag & VROOT) {
2704                 strlcat(sbuf, "|VROOT", sizeof(sbuf));
2705         }
2706         if (vp->v_flag & VTEXT) {
2707                 strlcat(sbuf, "|VTEXT", sizeof(sbuf));
2708         }
2709         if (vp->v_flag & VSYSTEM) {
2710                 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
2711         }
2712         if (vp->v_flag & VNOFLUSH) {
2713                 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
2714         }
2715         if (vp->v_flag & VBWAIT) {
2716                 strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
2717         }
2718         if (vnode_isaliased(vp)) {
2719                 strlcat(sbuf, "|VALIASED", sizeof(sbuf));
2720         }
2721         if (sbuf[0] != '\0') {
2722                 printf(" flags (%s)", &sbuf[1]);
2723         }
2724 }
2725
2726
2727 int
2728 vn_getpath(struct vnode *vp, char *pathbuf, int *len)
2729 {
2730         return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
2731 }
2732
2733 int
2734 vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len)
2735 {
2736         return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
2737 }
2738
2739 /*
2740  * vn_getpath_fsenter_with_parent will reenter the file system to fine the path of the
2741  * vnode.  It requires that there are IO counts on both the vnode and the directory vnode.
2742  *
2743  * vn_getpath_fsenter is called by MAC hooks to authorize operations for every thing, but
2744  * unlink, rmdir and rename. For these operation the MAC hook  calls vn_getpath. This presents
2745  * problems where if the path can not be found from the name cache, those operations can
2746  * erroneously fail with EPERM even though the call should succeed. When removing or moving
2747  * file system objects with operations such as unlink or rename, those operations need to
2748  * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
2749  * MAC hook from these operations during forced unmount operations can lead to dead
2750  * lock. This happens when the operation starts, IO counts are taken on the containing
2751  * directories and targets. Before the MAC hook is called a forced unmount from another
2752  * thread takes place and blocks on the on going operation's directory vnode in vdrain.
2753  * After which, the MAC hook gets called and calls vn_getpath_fsenter.  vn_getpath_fsenter
2754  * is called with the understanding that there is an IO count on the target. If in
2755  * build_path the directory vnode is no longer in the cache, then the parent object id via
2756  * vnode_getattr from the target is obtain and used to call VFS_VGET to get the parent
2757  * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
2758  * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block
2759  * depending on which version and how it calls the vnode_get family of interfaces.
2760  *
2761  * N.B.  A reasonable interface to use is vnode_getwithvid. This interface was modified to
2762  * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
2763  * cause issues, but there is no guarantee that all or any file systems are doing that.
2764  *
2765  * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
2766  * IO count on the directory vnode by calling build_path_with_parent.
2767  */
2768
2769 int
2770 vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len)
2771 {
2772         return build_path_with_parent(vp, dvp, pathbuf, *len, len, 0, vfs_context_current());
2773 }
2774
2775 int
2776 vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
2777 {
2778         return ubc_cs_getcdhash(vp, offset, cdhash);
2779 }
2780
2781
2782 static char *extension_table = NULL;
2783 static int   nexts;
2784 static int   max_ext_width;
2785
2786 static int
2787 extension_cmp(const void *a, const void *b)
2788 {
2789         return strlen((const char *)a) - strlen((const char *)b);
2790 }
2791
2792
2793 //
2794 // This is the api LaunchServices uses to inform the kernel
2795 // the list of package extensions to ignore.
2796 //
2797 // Internally we keep the list sorted by the length of the
2798 // the extension (from longest to shortest).  We sort the
2799 // list of extensions so that we can speed up our searches
2800 // when comparing file names -- we only compare extensions
2801 // that could possibly fit into the file name, not all of
2802 // them (i.e. a short 8 character name can't have an 8
2803 // character extension).
2804 //
2805 extern lck_mtx_t *pkg_extensions_lck;
2806
2807 __private_extern__ int
2808 set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
2809 {
2810         char *new_exts, *old_exts;
2811         int error;
2812
2813         if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
2814                 return EINVAL;
2815         }
2816
2817
2818         // allocate one byte extra so we can guarantee null termination
2819         MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK);
2820         if (new_exts == NULL) {
2821                 return ENOMEM;
2822         }
2823
2824         error = copyin(data, new_exts, nentries * maxwidth);
2825         if (error) {
2826                 FREE(new_exts, M_TEMP);
2827                 return error;
2828         }
2829
2830         new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block
2831
2832         qsort(new_exts, nentries, maxwidth, extension_cmp);
2833
2834         lck_mtx_lock(pkg_extensions_lck);
2835
2836         old_exts        = extension_table;
2837         extension_table = new_exts;
2838         nexts           = nentries;
2839         max_ext_width   = maxwidth;
2840
2841         lck_mtx_unlock(pkg_extensions_lck);
2842
2843         if (old_exts) {
2844                 FREE(old_exts, M_TEMP);
2845         }
2846
2847         return 0;
2848 }
2849
2850
2851 int
2852 is_package_name(const char *name, int len)
2853 {
2854         int i, extlen;
2855         const char *ptr, *name_ext;
2856
2857         if (len <= 3) {
2858                 return 0;
2859         }
2860
2861         name_ext = NULL;
2862         for (ptr = name; *ptr != '\0'; ptr++) {
2863                 if (*ptr == '.') {
2864                         name_ext = ptr;
2865                 }
2866         }
2867
2868         // if there is no "." extension, it can't match
2869         if (name_ext == NULL) {
2870                 return 0;
2871         }
2872
2873         // advance over the "."
2874         name_ext++;
2875
2876         lck_mtx_lock(pkg_extensions_lck);
2877
2878         // now iterate over all the extensions to see if any match
2879         ptr = &extension_table[0];
2880         for (i = 0; i < nexts; i++, ptr += max_ext_width) {
2881                 extlen = strlen(ptr);
2882                 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
2883                         // aha, a match!
2884                         lck_mtx_unlock(pkg_extensions_lck);
2885                         return 1;
2886                 }
2887         }
2888
2889         lck_mtx_unlock(pkg_extensions_lck);
2890
2891         // if we get here, no extension matched
2892         return 0;
2893 }
2894
2895 int
2896 vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
2897 {
2898         char *ptr, *end;
2899         int comp = 0;
2900
2901         *component = -1;
2902         if (*path != '/') {
2903                 return EINVAL;
2904         }
2905
2906         end = path + 1;
2907         while (end < path + pathlen && *end != '\0') {
2908                 while (end < path + pathlen && *end == '/' && *end != '\0') {
2909                         end++;
2910                 }
2911
2912                 ptr = end;
2913
2914                 while (end < path + pathlen && *end != '/' && *end != '\0') {
2915                         end++;
2916                 }
2917
2918                 if (end > path + pathlen) {
2919                         // hmm, string wasn't null terminated
2920                         return EINVAL;
2921                 }
2922
2923                 *end = '\0';
2924                 if (is_package_name(ptr, end - ptr)) {
2925                         *component = comp;
2926                         break;
2927                 }
2928
2929                 end++;
2930                 comp++;
2931         }
2932
2933         return 0;
2934 }
2935
2936 /*
2937  * Determine if a name is inappropriate for a searchfs query.
2938  * This list consists of /System currently.
2939  */
2940
2941 int
2942 vn_searchfs_inappropriate_name(const char *name, int len)
2943 {
2944         const char *bad_names[] = { "System" };
2945         int   bad_len[]   = { 6 };
2946         int  i;
2947
2948         for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) {
2949                 if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
2950                         return 1;
2951                 }
2952         }
2953
2954         // if we get here, no name matched
2955         return 0;
2956 }
2957
2958 /*
2959  * Top level filesystem related information gathering.
2960  */
2961 extern unsigned int vfs_nummntops;
2962
2963 /*
2964  * The VFS_NUMMNTOPS shouldn't be at name[1] since
2965  * is a VFS generic variable. Since we no longer support
2966  * VT_UFS, we reserve its value to support this sysctl node.
2967  *
2968  * It should have been:
2969  *    name[0]:  VFS_GENERIC
2970  *    name[1]:  VFS_NUMMNTOPS
2971  */
2972 SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
2973     CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
2974     &vfs_nummntops, 0, "");
2975
2976 int
2977 vfs_sysctl(int *name __unused, u_int namelen __unused,
2978     user_addr_t oldp __unused, size_t *oldlenp __unused,
2979     user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);
2980
2981 int
2982 vfs_sysctl(int *name __unused, u_int namelen __unused,
2983     user_addr_t oldp __unused, size_t *oldlenp __unused,
2984     user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
2985 {
2986         return EINVAL;
2987 }
2988
2989
2990 //
2991 // The following code disallows specific sysctl's that came through
2992 // the direct sysctl interface (vfs_sysctl_node) instead of the newer
2993 // sysctl_vfs_ctlbyfsid() interface.  We can not allow these selectors
2994 // through vfs_sysctl_node() because it passes the user's oldp pointer
2995 // directly to the file system which (for these selectors) casts it
2996 // back to a struct sysctl_req and then proceed to use SYSCTL_IN()
2997 // which jumps through an arbitrary function pointer.  When called
2998 // through the sysctl_vfs_ctlbyfsid() interface this does not happen
2999 // and so it's safe.
3000 //
3001 // Unfortunately we have to pull in definitions from AFP and SMB and
3002 // perform explicit name checks on the file system to determine if
3003 // these selectors are being used.
3004 //
3005
3006 #define AFPFS_VFS_CTL_GETID            0x00020001
3007 #define AFPFS_VFS_CTL_NETCHANGE        0x00020002
3008 #define AFPFS_VFS_CTL_VOLCHANGE        0x00020003
3009
3010 #define SMBFS_SYSCTL_REMOUNT           1
3011 #define SMBFS_SYSCTL_REMOUNT_INFO      2
3012 #define SMBFS_SYSCTL_GET_SERVER_SHARE  3
3013
3014
3015 static int
3016 is_bad_sysctl_name(struct vfstable *vfsp, int selector_name)
3017 {
3018         switch (selector_name) {
3019         case VFS_CTL_QUERY:
3020         case VFS_CTL_TIMEO:
3021         case VFS_CTL_NOLOCKS:
3022         case VFS_CTL_NSTATUS:
3023         case VFS_CTL_SADDR:
3024         case VFS_CTL_DISC:
3025         case VFS_CTL_SERVERINFO:
3026                 return 1;
3027
3028         default:
3029                 break;
3030         }
3031
3032         // the more complicated check for some of SMB's special values
3033         if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
3034                 switch (selector_name) {
3035                 case SMBFS_SYSCTL_REMOUNT:
3036                 case SMBFS_SYSCTL_REMOUNT_INFO:
3037                 case SMBFS_SYSCTL_GET_SERVER_SHARE:
3038                         return 1;
3039                 }
3040         } else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
3041                 switch (selector_name) {
3042                 case AFPFS_VFS_CTL_GETID:
3043                 case AFPFS_VFS_CTL_NETCHANGE:
3044                 case AFPFS_VFS_CTL_VOLCHANGE:
3045                         return 1;
3046                 }
3047         }
3048
3049         //
3050         // If we get here we passed all the checks so the selector is ok
3051         //
3052         return 0;
3053 }
3054
3055
3056 int vfs_sysctl_node SYSCTL_HANDLER_ARGS
3057 {
3058         int *name, namelen;
3059         struct vfstable *vfsp;
3060         int error;
3061         int fstypenum;
3062
3063         fstypenum = oidp->oid_number;
3064         name = arg1;
3065         namelen = arg2;
3066
3067         /* all sysctl names at this level should have at least one name slot for the FS */
3068         if (namelen < 1) {
3069                 return EISDIR; /* overloaded */
3070         }
3071         mount_list_lock();
3072         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
3073                 if (vfsp->vfc_typenum == fstypenum) {
3074                         vfsp->vfc_refcount++;
3075                         break;
3076                 }
3077         }
3078         mount_list_unlock();
3079
3080         if (vfsp == NULL) {
3081                 return ENOTSUP;
3082         }
3083
3084         if (is_bad_sysctl_name(vfsp, name[0])) {
3085                 printf("vfs: bad selector 0x%.8x for old-style sysctl().  use the sysctl-by-fsid interface instead\n", name[0]);
3086                 return EPERM;
3087         }
3088
3089         error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, req->oldptr, &req->oldlen, req->newptr, req->newlen, vfs_context_current());
3090
3091         mount_list_lock();
3092         vfsp->vfc_refcount--;
3093         mount_list_unlock();
3094
3095         return error;
3096 }
3097
3098 /*
3099  * Check to see if a filesystem is mounted on a block device.
3100  */
3101 int
3102 vfs_mountedon(struct vnode *vp)
3103 {
3104         struct vnode *vq;
3105         int error = 0;
3106
3107         SPECHASH_LOCK();
3108         if (vp->v_specflags & SI_MOUNTEDON) {
3109                 error = EBUSY;
3110                 goto out;
3111         }
3112         if (vp->v_specflags & SI_ALIASED) {
3113                 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
3114                         if (vq->v_rdev != vp->v_rdev ||
3115                             vq->v_type != vp->v_type) {
3116                                 continue;
3117                         }
3118                         if (vq->v_specflags & SI_MOUNTEDON) {
3119                                 error = EBUSY;
3120                                 break;
3121                         }
3122                 }
3123         }
3124 out:
3125         SPECHASH_UNLOCK();
3126         return error;
3127 }
3128
3129 struct unmount_info {
3130         int     u_errs; // Total failed unmounts
3131         int     u_busy; // EBUSY failed unmounts
3132 };
3133
3134 static int
3135 unmount_callback(mount_t mp, void *arg)
3136 {
3137         int error;
3138         char *mntname;
3139         struct unmount_info *uip = arg;
3140
3141         mount_ref(mp, 0);
3142         mount_iterdrop(mp);     // avoid vfs_iterate deadlock in dounmount()
3143
3144         MALLOC_ZONE(mntname, void *, MAXPATHLEN, M_NAMEI, M_WAITOK);
3145         if (mntname) {
3146                 strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
3147         }
3148
3149         error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
3150         if (error) {
3151                 uip->u_errs++;
3152                 printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
3153                 if (error == EBUSY) {
3154                         uip->u_busy++;
3155                 }
3156         }
3157         if (mntname) {
3158                 FREE_ZONE(mntname, MAXPATHLEN, M_NAMEI);
3159         }
3160
3161         return VFS_RETURNED;
3162 }
3163
3164 /*
3165  * Unmount all filesystems. The list is traversed in reverse order
3166  * of mounting to avoid dependencies.
3167  * Busy mounts are retried.
3168  */
3169 __private_extern__ void
3170 vfs_unmountall(void)
3171 {
3172         int mounts, sec = 1;
3173         struct unmount_info ui;
3174
3175 retry:
3176         ui.u_errs = ui.u_busy = 0;
3177         vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
3178         mounts = mount_getvfscnt();
3179         if (mounts == 0) {
3180                 return;
3181         }
3182
3183         if (ui.u_busy > 0) {            // Busy mounts - wait & retry
3184                 tsleep(&nummounts, PVFS, "busy mount", sec * hz);
3185                 sec *= 2;
3186                 if (sec <= 32) {
3187                         goto retry;
3188                 }
3189                 printf("Unmounting timed out\n");
3190         } else if (ui.u_errs < mounts) {
3191                 // If the vfs_iterate missed mounts in progress - wait a bit
3192                 tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
3193         }
3194 }
3195
3196 /*
3197  * This routine is called from vnode_pager_deallocate out of the VM
3198  * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
3199  * on a vnode that has a UBCINFO
3200  */
3201 __private_extern__ void
3202 vnode_pager_vrele(vnode_t vp)
3203 {
3204         struct ubc_info *uip;
3205
3206         vnode_lock_spin(vp);
3207
3208         vp->v_lflag &= ~VNAMED_UBC;
3209         if (vp->v_usecount != 0) {
3210                 /*
3211                  * At the eleventh hour, just before the ubcinfo is
3212                  * destroyed, ensure the ubc-specific v_usecount
3213                  * reference has gone.  We use v_usecount != 0 as a hint;
3214                  * ubc_unmap() does nothing if there's no mapping.
3215                  *
3216                  * This case is caused by coming here via forced unmount,
3217                  * versus the usual vm_object_deallocate() path.
3218                  * In the forced unmount case, ubc_destroy_named()
3219                  * releases the pager before memory_object_last_unmap()
3220                  * can be called.
3221                  */
3222                 vnode_unlock(vp);
3223                 ubc_unmap(vp);
3224                 vnode_lock_spin(vp);
3225         }
3226
3227         uip = vp->v_ubcinfo;
3228         vp->v_ubcinfo = UBC_INFO_NULL;
3229
3230         vnode_unlock(vp);
3231
3232         ubc_info_deallocate(uip);
3233 }
3234
3235
3236 #include <sys/disk.h>
3237
3238 u_int32_t rootunit = (u_int32_t)-1;
3239
3240 #if CONFIG_IOSCHED
3241 extern int lowpri_throttle_enabled;
3242 extern int iosched_enabled;
3243 #endif
3244
3245 errno_t
3246 vfs_init_io_attributes(vnode_t devvp, mount_t mp)
3247 {
3248         int     error;
3249         off_t   readblockcnt = 0;
3250         off_t   writeblockcnt = 0;
3251         off_t   readmaxcnt = 0;
3252         off_t   writemaxcnt = 0;
3253         off_t   readsegcnt = 0;
3254         off_t   writesegcnt = 0;
3255         off_t   readsegsize = 0;
3256         off_t   writesegsize = 0;
3257         off_t   alignment = 0;
3258         u_int32_t minsaturationbytecount = 0;
3259         u_int32_t ioqueue_depth = 0;
3260         u_int32_t blksize;
3261         u_int64_t temp;
3262         u_int32_t features;
3263         vfs_context_t ctx = vfs_context_current();
3264         dk_corestorage_info_t cs_info;
3265         boolean_t cs_present = FALSE;;
3266         int isssd = 0;
3267         int isvirtual = 0;
3268
3269
3270         VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
3271         /*
3272          * as a reasonable approximation, only use the lowest bit of the mask
3273          * to generate a disk unit number
3274          */
3275         mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
3276
3277         if (devvp == rootvp) {
3278                 rootunit = mp->mnt_devbsdunit;
3279         }
3280
3281         if (mp->mnt_devbsdunit == rootunit) {
3282                 /*
3283                  * this mount point exists on the same device as the root
3284                  * partition, so it comes under the hard throttle control...
3285                  * this is true even for the root mount point itself
3286                  */
3287                 mp->mnt_kern_flag |= MNTK_ROOTDEV;
3288         }
3289         /*
3290          * force the spec device to re-cache
3291          * the underlying block size in case
3292          * the filesystem overrode the initial value
3293          */
3294         set_fsblocksize(devvp);
3295
3296
3297         if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
3298             (caddr_t)&blksize, 0, ctx))) {
3299                 return error;
3300         }
3301
3302         mp->mnt_devblocksize = blksize;
3303
3304         /*
3305          * set the maximum possible I/O size
3306          * this may get clipped to a smaller value
3307          * based on which constraints are being advertised
3308          * and if those advertised constraints result in a smaller
3309          * limit for a given I/O
3310          */
3311         mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
3312         mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;
3313
3314         if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
3315                 if (isvirtual) {
3316                         mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
3317                 }
3318         }
3319         if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
3320                 if (isssd) {
3321                         mp->mnt_kern_flag |= MNTK_SSD;
3322                 }
3323         }
3324         if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
3325             (caddr_t)&features, 0, ctx))) {
3326                 return error;
3327         }
3328
3329         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
3330             (caddr_t)&readblockcnt, 0, ctx))) {
3331                 return error;
3332         }
3333
3334         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
3335             (caddr_t)&writeblockcnt, 0, ctx))) {
3336                 return error;
3337         }
3338
3339         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
3340             (caddr_t)&readmaxcnt, 0, ctx))) {
3341                 return error;
3342         }
3343
3344         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
3345             (caddr_t)&writemaxcnt, 0, ctx))) {
3346                 return error;
3347         }
3348
3349         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
3350             (caddr_t)&readsegcnt, 0, ctx))) {
3351                 return error;
3352         }
3353
3354         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
3355             (caddr_t)&writesegcnt, 0, ctx))) {
3356                 return error;
3357         }
3358
3359         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
3360             (caddr_t)&readsegsize, 0, ctx))) {
3361                 return error;
3362         }
3363
3364         if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
3365             (caddr_t)&writesegsize, 0, ctx))) {
3366                 return error;
3367         }
3368
3369         if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
3370             (caddr_t)&alignment, 0, ctx))) {
3371                 return error;
3372         }
3373
3374         if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
3375             (caddr_t)&ioqueue_depth, 0, ctx))) {
3376                 return error;
3377         }
3378
3379         if (readmaxcnt) {
3380                 mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
3381         }
3382
3383         if (readblockcnt) {
3384                 temp = readblockcnt * blksize;
3385                 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3386
3387                 if (temp < mp->mnt_maxreadcnt) {
3388                         mp->mnt_maxreadcnt = (u_int32_t)temp;
3389                 }
3390         }
3391
3392         if (writemaxcnt) {
3393                 mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
3394         }
3395
3396         if (writeblockcnt) {
3397                 temp = writeblockcnt * blksize;
3398                 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3399
3400                 if (temp < mp->mnt_maxwritecnt) {
3401                         mp->mnt_maxwritecnt = (u_int32_t)temp;
3402                 }
3403         }
3404
3405         if (readsegcnt) {
3406                 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
3407         } else {
3408                 temp = mp->mnt_maxreadcnt / PAGE_SIZE;
3409
3410                 if (temp > UINT16_MAX) {
3411                         temp = UINT16_MAX;
3412                 }
3413         }
3414         mp->mnt_segreadcnt = (u_int16_t)temp;
3415
3416         if (writesegcnt) {
3417                 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
3418         } else {
3419                 temp = mp->mnt_maxwritecnt / PAGE_SIZE;
3420
3421                 if (temp > UINT16_MAX) {
3422                         temp = UINT16_MAX;
3423                 }
3424         }
3425         mp->mnt_segwritecnt = (u_int16_t)temp;
3426
3427         if (readsegsize) {
3428                 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
3429         } else {
3430                 temp = mp->mnt_maxreadcnt;
3431         }
3432         mp->mnt_maxsegreadsize = (u_int32_t)temp;
3433
3434         if (writesegsize) {
3435                 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
3436         } else {
3437                 temp = mp->mnt_maxwritecnt;
3438         }
3439         mp->mnt_maxsegwritesize = (u_int32_t)temp;
3440
3441         if (alignment) {
3442                 temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
3443         } else {
3444                 temp = 0;
3445         }
3446         mp->mnt_alignmentmask = temp;
3447
3448
3449         if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) {
3450                 temp = ioqueue_depth;
3451         } else {
3452                 temp = MNT_DEFAULT_IOQUEUE_DEPTH;
3453         }
3454
3455         mp->mnt_ioqueue_depth = temp;
3456         mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
3457
3458         if (mp->mnt_ioscale > 1) {
3459                 printf("ioqueue_depth = %d,   ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
3460         }
3461
3462         if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
3463                 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
3464         }
3465
3466         if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
3467                 mp->mnt_minsaturationbytecount = minsaturationbytecount;
3468         } else {
3469                 mp->mnt_minsaturationbytecount = 0;
3470         }
3471
3472         if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
3473                 cs_present = TRUE;
3474         }
3475
3476         if (features & DK_FEATURE_UNMAP) {
3477                 mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED;
3478
3479                 if (cs_present == TRUE) {
3480                         mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
3481                 }
3482         }
3483         if (cs_present == TRUE) {
3484                 /*
3485                  * for now we'll use the following test as a proxy for
3486                  * the underlying drive being FUSION in nature
3487                  */
3488                 if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) {
3489                         mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
3490                 }
3491         } else {
3492                 /* Check for APFS Fusion */
3493                 dk_apfs_flavour_t flavour;
3494                 if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
3495                     (flavour == DK_APFS_FUSION)) {
3496                         mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
3497                 }
3498         }
3499
3500 #if CONFIG_IOSCHED
3501         if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
3502                 mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED;
3503                 throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
3504         }
3505 #endif /* CONFIG_IOSCHED */
3506         return error;
3507 }
3508
3509 static struct klist fs_klist;
3510 lck_grp_t *fs_klist_lck_grp;
3511 lck_mtx_t *fs_klist_lock;
3512
3513 void
3514 vfs_event_init(void)
3515 {
3516         klist_init(&fs_klist);
3517         fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
3518         fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
3519 }
3520
3521 void
3522 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
3523 {
3524         if (event == VQ_DEAD || event == VQ_NOTRESP) {
3525                 struct mount *mp = vfs_getvfs(fsid);
3526                 if (mp) {
3527                         mount_lock_spin(mp);
3528                         if (data) {
3529                                 mp->mnt_kern_flag &= ~MNT_LNOTRESP;     // Now responding
3530                         } else {
3531                                 mp->mnt_kern_flag |= MNT_LNOTRESP;      // Not responding
3532                         }
3533                         mount_unlock(mp);
3534                 }
3535         }
3536
3537         lck_mtx_lock(fs_klist_lock);
3538         KNOTE(&fs_klist, event);
3539         lck_mtx_unlock(fs_klist_lock);
3540 }
3541
3542 /*
3543  * return the number of mounted filesystems.
3544  */
3545 static int
3546 sysctl_vfs_getvfscnt(void)
3547 {
3548         return mount_getvfscnt();
3549 }
3550
3551
3552 static int
3553 mount_getvfscnt(void)
3554 {
3555         int ret;
3556
3557         mount_list_lock();
3558         ret = nummounts;
3559         mount_list_unlock();
3560         return ret;
3561 }
3562
3563
3564
3565 static int
3566 mount_fillfsids(fsid_t *fsidlst, int count)
3567 {
3568         struct mount *mp;
3569         int actual = 0;
3570
3571         actual = 0;
3572         mount_list_lock();
3573         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3574                 if (actual <= count) {
3575                         fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
3576                         actual++;
3577                 }
3578         }
3579         mount_list_unlock();
3580         return actual;
3581 }
3582
3583 /*
3584  * fill in the array of fsid_t's up to a max of 'count', the actual
3585  * number filled in will be set in '*actual'.  If there are more fsid_t's
3586  * than room in fsidlst then ENOMEM will be returned and '*actual' will
3587  * have the actual count.
3588  * having *actual filled out even in the error case is depended upon.
3589  */
3590 static int
3591 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
3592 {
3593         struct mount *mp;
3594
3595         *actual = 0;
3596         mount_list_lock();
3597         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3598                 (*actual)++;
3599                 if (*actual <= count) {
3600                         fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
3601                 }
3602         }
3603         mount_list_unlock();
3604         return *actual <= count ? 0 : ENOMEM;
3605 }
3606
3607 static int
3608 sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
3609     __unused int arg2, struct sysctl_req *req)
3610 {
3611         int actual, error;
3612         size_t space;
3613         fsid_t *fsidlst;
3614
3615         /* This is a readonly node. */
3616         if (req->newptr != USER_ADDR_NULL) {
3617                 return EPERM;
3618         }
3619
3620         /* they are querying us so just return the space required. */
3621         if (req->oldptr == USER_ADDR_NULL) {
3622                 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3623                 return 0;
3624         }
3625 again:
3626         /*
3627          * Retrieve an accurate count of the amount of space required to copy
3628          * out all the fsids in the system.
3629          */
3630         space = req->oldlen;
3631         req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3632
3633         /* they didn't give us enough space. */
3634         if (space < req->oldlen) {
3635                 return ENOMEM;
3636         }
3637
3638         MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
3639         if (fsidlst == NULL) {
3640                 return ENOMEM;
3641         }
3642
3643         error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3644             &actual);
3645         /*
3646          * If we get back ENOMEM, then another mount has been added while we
3647          * slept in malloc above.  If this is the case then try again.
3648          */
3649         if (error == ENOMEM) {
3650                 FREE(fsidlst, M_TEMP);
3651                 req->oldlen = space;
3652                 goto again;
3653         }
3654         if (error == 0) {
3655                 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3656         }
3657         FREE(fsidlst, M_TEMP);
3658         return error;
3659 }
3660
3661 /*
3662  * Do a sysctl by fsid.
3663  */
3664 static int
3665 sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
3666     struct sysctl_req *req)
3667 {
3668         union union_vfsidctl vc;
3669         struct mount *mp;
3670         struct vfsstatfs *sp;
3671         int *name, flags, namelen;
3672         int error = 0, gotref = 0;
3673         vfs_context_t ctx = vfs_context_current();
3674         proc_t p = req->p;      /* XXX req->p != current_proc()? */
3675         boolean_t is_64_bit;
3676
3677         name = arg1;
3678         namelen = arg2;
3679         is_64_bit = proc_is64bit(p);
3680
3681         error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
3682         if (error) {
3683                 goto out;
3684         }
3685         if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
3686                 error = EINVAL;
3687                 goto out;
3688         }
3689         mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
3690         if (mp == NULL) {
3691                 error = ENOENT;
3692                 goto out;
3693         }
3694         gotref = 1;
3695         /* reset so that the fs specific code can fetch it. */
3696         req->newidx = 0;
3697         /*
3698          * Note if this is a VFS_CTL then we pass the actual sysctl req
3699          * in for "oldp" so that the lower layer can DTRT and use the
3700          * SYSCTL_IN/OUT routines.
3701          */
3702         if (mp->mnt_op->vfs_sysctl != NULL) {
3703                 if (is_64_bit) {
3704                         if (vfs_64bitready(mp)) {
3705                                 error = mp->mnt_op->vfs_sysctl(name, namelen,
3706                                     CAST_USER_ADDR_T(req),
3707                                     NULL, USER_ADDR_NULL, 0,
3708                                     ctx);
3709                         } else {
3710                                 error = ENOTSUP;
3711                         }
3712                 } else {
3713                         error = mp->mnt_op->vfs_sysctl(name, namelen,
3714                             CAST_USER_ADDR_T(req),
3715                             NULL, USER_ADDR_NULL, 0,
3716                             ctx);
3717                 }
3718                 if (error != ENOTSUP) {
3719                         goto out;
3720                 }
3721         }
3722         switch (name[0]) {
3723         case VFS_CTL_UMOUNT:
3724                 req->newidx = 0;
3725                 if (is_64_bit) {
3726                         req->newptr = vc.vc64.vc_ptr;
3727                         req->newlen = (size_t)vc.vc64.vc_len;
3728                 } else {
3729                         req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3730                         req->newlen = vc.vc32.vc_len;
3731                 }
3732                 error = SYSCTL_IN(req, &flags, sizeof(flags));
3733                 if (error) {
3734                         break;
3735                 }
3736
3737                 mount_ref(mp, 0);
3738                 mount_iterdrop(mp);
3739                 gotref = 0;
3740                 /* safedounmount consumes a ref */
3741                 error = safedounmount(mp, flags, ctx);
3742                 break;
3743         case VFS_CTL_STATFS:
3744                 req->newidx = 0;
3745                 if (is_64_bit) {
3746                         req->newptr = vc.vc64.vc_ptr;
3747                         req->newlen = (size_t)vc.vc64.vc_len;
3748                 } else {
3749                         req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3750                         req->newlen = vc.vc32.vc_len;
3751                 }
3752                 error = SYSCTL_IN(req, &flags, sizeof(flags));
3753                 if (error) {
3754                         break;
3755                 }
3756                 sp = &mp->mnt_vfsstat;
3757                 if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) &&
3758                     (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) {
3759                         goto out;
3760                 }
3761                 if (is_64_bit) {
3762                         struct user64_statfs sfs;
3763                         bzero(&sfs, sizeof(sfs));
3764                         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3765                         sfs.f_type = mp->mnt_vtable->vfc_typenum;
3766                         sfs.f_bsize = (user64_long_t)sp->f_bsize;
3767                         sfs.f_iosize = (user64_long_t)sp->f_iosize;
3768                         sfs.f_blocks = (user64_long_t)sp->f_blocks;
3769                         sfs.f_bfree = (user64_long_t)sp->f_bfree;
3770                         sfs.f_bavail = (user64_long_t)sp->f_bavail;
3771                         sfs.f_files = (user64_long_t)sp->f_files;
3772                         sfs.f_ffree = (user64_long_t)sp->f_ffree;
3773                         sfs.f_fsid = sp->f_fsid;
3774                         sfs.f_owner = sp->f_owner;
3775 #ifdef NFSCLIENT
3776                         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3777                                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
3778                         } else
3779 #endif
3780                         {
3781                                 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3782                         }
3783                         strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3784                         strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3785
3786                         error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3787                 } else {
3788                         struct user32_statfs sfs;
3789                         bzero(&sfs, sizeof(sfs));
3790                         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3791                         sfs.f_type = mp->mnt_vtable->vfc_typenum;
3792
3793                         /*
3794                          * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
3795                          * have to fudge the numbers here in that case.   We inflate the blocksize in order
3796                          * to reflect the filesystem size as best we can.
3797                          */
3798                         if (sp->f_blocks > INT_MAX) {
3799                                 int             shift;
3800
3801                                 /*
3802                                  * Work out how far we have to shift the block count down to make it fit.
3803                                  * Note that it's possible to have to shift so far that the resulting
3804                                  * blocksize would be unreportably large.  At that point, we will clip
3805                                  * any values that don't fit.
3806                                  *
3807                                  * For safety's sake, we also ensure that f_iosize is never reported as
3808                                  * being smaller than f_bsize.
3809                                  */
3810                                 for (shift = 0; shift < 32; shift++) {
3811                                         if ((sp->f_blocks >> shift) <= INT_MAX) {
3812                                                 break;
3813                                         }
3814                                         if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) {
3815                                                 break;
3816                                         }
3817                                 }
3818 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
3819                                 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
3820                                 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
3821                                 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
3822 #undef __SHIFT_OR_CLIP
3823                                 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift);
3824                                 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
3825                         } else {
3826                                 sfs.f_bsize = (user32_long_t)sp->f_bsize;
3827                                 sfs.f_iosize = (user32_long_t)sp->f_iosize;
3828                                 sfs.f_blocks = (user32_long_t)sp->f_blocks;
3829                                 sfs.f_bfree = (user32_long_t)sp->f_bfree;
3830                                 sfs.f_bavail = (user32_long_t)sp->f_bavail;
3831                         }
3832                         sfs.f_files = (user32_long_t)sp->f_files;
3833                         sfs.f_ffree = (user32_long_t)sp->f_ffree;
3834                         sfs.f_fsid = sp->f_fsid;
3835                         sfs.f_owner = sp->f_owner;
3836
3837 #ifdef NFSCLIENT
3838                         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3839                                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
3840                         } else
3841 #endif
3842                         {
3843                                 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3844                         }
3845                         strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3846                         strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3847
3848                         error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3849                 }
3850                 break;
3851         default:
3852                 error = ENOTSUP;
3853                 goto out;
3854         }
3855 out:
3856         if (gotref != 0) {
3857                 mount_iterdrop(mp);
3858         }
3859         return error;
3860 }
3861
3862 static int      filt_fsattach(struct knote *kn, struct kevent_internal_s *kev);
3863 static void     filt_fsdetach(struct knote *kn);
3864 static int      filt_fsevent(struct knote *kn, long hint);
3865 static int      filt_fstouch(struct knote *kn, struct kevent_internal_s *kev);
3866 static int      filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
3867 SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
3868         .f_attach = filt_fsattach,
3869         .f_detach = filt_fsdetach,
3870         .f_event = filt_fsevent,
3871         .f_touch = filt_fstouch,
3872         .f_process = filt_fsprocess,
3873 };
3874
3875 static int
3876 filt_fsattach(struct knote *kn, __unused struct kevent_internal_s *kev)
3877 {
3878         lck_mtx_lock(fs_klist_lock);
3879         KNOTE_ATTACH(&fs_klist, kn);
3880         lck_mtx_unlock(fs_klist_lock);
3881
3882         /*
3883          * filter only sees future events,
3884          * so it can't be fired already.
3885          */
3886         return 0;
3887 }
3888
3889 static void
3890 filt_fsdetach(struct knote *kn)
3891 {
3892         lck_mtx_lock(fs_klist_lock);
3893         KNOTE_DETACH(&fs_klist, kn);
3894         lck_mtx_unlock(fs_klist_lock);
3895 }
3896
3897 static int
3898 filt_fsevent(struct knote *kn, long hint)
3899 {
3900         /*
3901          * Backwards compatibility:
3902          * Other filters would do nothing if kn->kn_sfflags == 0
3903          */
3904
3905         if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
3906                 kn->kn_fflags |= hint;
3907         }
3908
3909         return kn->kn_fflags != 0;
3910 }
3911
3912 static int
3913 filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
3914 {
3915         int res;
3916
3917         lck_mtx_lock(fs_klist_lock);
3918
3919         kn->kn_sfflags = kev->fflags;
3920
3921         /*
3922          * the above filter function sets bits even if nobody is looking for them.
3923          * Just preserve those bits even in the new mask is more selective
3924          * than before.
3925          *
3926          * For compatibility with previous implementations, we leave kn_fflags
3927          * as they were before.
3928          */
3929         //if (kn->kn_sfflags)
3930         //      kn->kn_fflags &= kn->kn_sfflags;
3931         res = (kn->kn_fflags != 0);
3932
3933         lck_mtx_unlock(fs_klist_lock);
3934
3935         return res;
3936 }
3937
3938 static int
3939 filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
3940 {
3941 #pragma unused(data)
3942         int res;
3943
3944         lck_mtx_lock(fs_klist_lock);
3945         res = (kn->kn_fflags != 0);
3946         if (res) {
3947                 *kev = kn->kn_kevent;
3948                 kn->kn_flags |= EV_CLEAR; /* automatic */
3949                 kn->kn_fflags = 0;
3950                 kn->kn_data = 0;
3951         }
3952         lck_mtx_unlock(fs_klist_lock);
3953         return res;
3954 }
3955
3956 static int
3957 sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
3958     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3959 {
3960         int out, error;
3961         pid_t pid;
3962         proc_t p;
3963
3964         /* We need a pid. */
3965         if (req->newptr == USER_ADDR_NULL) {
3966                 return EINVAL;
3967         }
3968
3969         error = SYSCTL_IN(req, &pid, sizeof(pid));
3970         if (error) {
3971                 return error;
3972         }
3973
3974         p = proc_find(pid < 0 ? -pid : pid);
3975         if (p == NULL) {
3976                 return ESRCH;
3977         }
3978
3979         /*
3980          * Fetching the value is ok, but we only fetch if the old
3981          * pointer is given.
3982          */
3983         if (req->oldptr != USER_ADDR_NULL) {
3984                 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
3985                 proc_rele(p);
3986                 error = SYSCTL_OUT(req, &out, sizeof(out));
3987                 return error;
3988         }
3989
3990         /* cansignal offers us enough security. */
3991         if (p != req->p && proc_suser(req->p) != 0) {
3992                 proc_rele(p);
3993                 return EPERM;
3994         }
3995
3996         if (pid < 0) {
3997                 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
3998         } else {
3999                 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
4000         }
4001         proc_rele(p);
4002
4003         return 0;
4004 }
4005
4006 static int
4007 sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
4008 {
4009         int *name, namelen;
4010         struct vfstable *vfsp;
4011         struct vfsconf vfsc = {};
4012
4013         (void)oidp;
4014         name = arg1;
4015         namelen = arg2;
4016
4017         if (namelen < 1) {
4018                 return EISDIR;
4019         } else if (namelen > 1) {
4020                 return ENOTDIR;
4021         }
4022
4023         mount_list_lock();
4024         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
4025                 if (vfsp->vfc_typenum == name[0]) {
4026                         break;
4027                 }
4028         }
4029
4030         if (vfsp == NULL) {
4031                 mount_list_unlock();
4032                 return ENOTSUP;
4033         }
4034
4035         vfsc.vfc_reserved1 = 0;
4036         bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
4037         vfsc.vfc_typenum = vfsp->vfc_typenum;
4038         vfsc.vfc_refcount = vfsp->vfc_refcount;
4039         vfsc.vfc_flags = vfsp->vfc_flags;
4040         vfsc.vfc_reserved2 = 0;
4041         vfsc.vfc_reserved3 = 0;
4042
4043         mount_list_unlock();
4044         return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf));
4045 }
4046
4047 /* the vfs.generic. branch. */
4048 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
4049 /* retreive a list of mounted filesystem fsid_t */
4050 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
4051     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
4052     NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
4053 /* perform operations on filesystem via fsid_t */
4054 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
4055     sysctl_vfs_ctlbyfsid, "ctlbyfsid");
4056 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
4057     NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
4058 SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
4059     CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
4060     &maxvfstypenum, 0, "");
4061 SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
4062 SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
4063     CTLFLAG_RD | CTLFLAG_LOCKED,
4064     sysctl_vfs_generic_conf, "");
4065
4066 /* Indicate that the root file system unmounted cleanly */
4067 static int vfs_root_unmounted_cleanly = 0;
4068 SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
4069
4070 void
4071 vfs_set_root_unmounted_cleanly(void)
4072 {
4073         vfs_root_unmounted_cleanly = 1;
4074 }
4075
4076 /*
4077  * Print vnode state.
4078  */
4079 void
4080 vn_print_state(struct vnode *vp, const char *fmt, ...)
4081 {
4082         va_list ap;
4083         char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
4084         char fs_name[MFSNAMELEN];
4085
4086         va_start(ap, fmt);
4087         vprintf(fmt, ap);
4088         va_end(ap);
4089         printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
4090         printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
4091         /* Counts .. */
4092         printf("    iocount %d, usecount %d, kusecount %d references %d\n",
4093             vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
4094         printf("    writecount %d, numoutput %d\n", vp->v_writecount,
4095             vp->v_numoutput);
4096         /* Flags */
4097         printf("    flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
4098             vp->v_lflag, vp->v_listflag);
4099
4100         if (vp->v_mount == NULL || vp->v_mount == dead_mountp) {
4101                 strlcpy(fs_name, "deadfs", MFSNAMELEN);
4102         } else {
4103                 vfs_name(vp->v_mount, fs_name);
4104         }
4105
4106         printf("    v_data 0x%0llx %s\n",
4107             (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0),
4108             perm_str);
4109         printf("    v_mount 0x%0llx %s vfs_name %s\n",
4110             (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0),
4111             perm_str, fs_name);
4112 }
4113
4114 long num_reusedvnodes = 0;
4115
4116
4117 static vnode_t
4118 process_vp(vnode_t vp, int want_vp, int *deferred)
4119 {
4120         unsigned int  vpid;
4121
4122         *deferred = 0;
4123
4124         vpid = vp->v_id;
4125
4126         vnode_list_remove_locked(vp);
4127
4128         vnode_list_unlock();
4129
4130         vnode_lock_spin(vp);
4131
4132         /*
4133          * We could wait for the vnode_lock after removing the vp from the freelist
4134          * and the vid is bumped only at the very end of reclaim. So it is  possible
4135          * that we are looking at a vnode that is being terminated. If so skip it.
4136          */
4137         if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
4138             VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
4139                 /*
4140                  * we lost the race between dropping the list lock
4141                  * and picking up the vnode_lock... someone else
4142                  * used this vnode and it is now in a new state
4143                  */
4144                 vnode_unlock(vp);
4145
4146                 return NULLVP;
4147         }
4148         if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) {
4149                 /*
4150                  * we did a vnode_rele_ext that asked for
4151                  * us not to reenter the filesystem during
4152                  * the release even though VL_NEEDINACTIVE was
4153                  * set... we'll do it here by doing a
4154                  * vnode_get/vnode_put
4155                  *
4156                  * pick up an iocount so that we can call
4157                  * vnode_put and drive the VNOP_INACTIVE...
4158                  * vnode_put will either leave us off
4159                  * the freelist if a new ref comes in,
4160                  * or put us back on the end of the freelist
4161                  * or recycle us if we were marked for termination...
4162                  * so we'll just go grab a new candidate
4163                  */
4164                 vp->v_iocount++;
4165 #ifdef JOE_DEBUG
4166                 record_vp(vp, 1);
4167 #endif
4168                 vnode_put_locked(vp);
4169                 vnode_unlock(vp);
4170
4171                 return NULLVP;
4172         }
4173         /*
4174          * Checks for anyone racing us for recycle
4175          */
4176         if (vp->v_type != VBAD) {
4177                 if (want_vp && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
4178                         vnode_async_list_add(vp);
4179                         vnode_unlock(vp);
4180
4181                         *deferred = 1;
4182
4183                         return NULLVP;
4184                 }
4185                 if (vp->v_lflag & VL_DEAD) {
4186                         panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
4187                 }
4188
4189                 vnode_lock_convert(vp);
4190                 (void)vnode_reclaim_internal(vp, 1, want_vp, 0);
4191
4192                 if (want_vp) {
4193                         if ((VONLIST(vp))) {
4194                                 panic("new_vnode(%p): vp on list", vp);
4195                         }
4196                         if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
4197                             (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
4198                                 panic("new_vnode(%p): free vnode still referenced", vp);
4199                         }
4200                         if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
4201                                 panic("new_vnode(%p): vnode seems to be on mount list", vp);
4202                         }
4203                         if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
4204                                 panic("new_vnode(%p): vnode still hooked into the name cache", vp);
4205                         }
4206                 } else {
4207                         vnode_unlock(vp);
4208                         vp = NULLVP;
4209                 }
4210         }
4211         return vp;
4212 }
4213
4214 __attribute__((noreturn))
4215 static void
4216 async_work_continue(void)
4217 {
4218         struct async_work_lst *q;
4219         int     deferred;
4220         vnode_t vp;
4221
4222         q = &vnode_async_work_list;
4223
4224         for (;;) {
4225                 vnode_list_lock();
4226
4227                 if (TAILQ_EMPTY(q)) {
4228                         assert_wait(q, (THREAD_UNINT));
4229
4230                         vnode_list_unlock();
4231
4232                         thread_block((thread_continue_t)async_work_continue);
4233
4234                         continue;
4235                 }
4236                 async_work_handled++;
4237
4238                 vp = TAILQ_FIRST(q);
4239
4240                 vp = process_vp(vp, 0, &deferred);
4241
4242                 if (vp != NULLVP) {
4243                         panic("found VBAD vp (%p) on async queue", vp);
4244                 }
4245         }
4246 }
4247
4248
4249 static int
4250 new_vnode(vnode_t *vpp)
4251 {
4252         vnode_t vp;
4253         uint32_t retries = 0, max_retries = 100;                /* retry incase of tablefull */
4254         int force_alloc = 0, walk_count = 0;
4255         boolean_t need_reliable_vp = FALSE;
4256         int deferred;
4257         struct timeval initial_tv;
4258         struct timeval current_tv;
4259         proc_t  curproc = current_proc();
4260
4261         initial_tv.tv_sec = 0;
4262 retry:
4263         vp = NULLVP;
4264
4265         vnode_list_lock();
4266
4267         if (need_reliable_vp == TRUE) {
4268                 async_work_timed_out++;
4269         }
4270
4271         if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) {
4272                 struct timespec ts;
4273
4274                 if (!TAILQ_EMPTY(&vnode_dead_list)) {
4275                         /*
4276                          * Can always reuse a dead one
4277                          */
4278                         vp = TAILQ_FIRST(&vnode_dead_list);
4279                         goto steal_this_vp;
4280                 }
4281                 /*
4282                  * no dead vnodes available... if we're under
4283                  * the limit, we'll create a new vnode
4284                  */
4285                 numvnodes++;
4286                 vnode_list_unlock();
4287
4288                 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK);
4289                 bzero((char *)vp, sizeof(*vp));
4290                 VLISTNONE(vp);          /* avoid double queue removal */
4291                 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
4292
4293                 TAILQ_INIT(&vp->v_ncchildren);
4294
4295                 klist_init(&vp->v_knotes);
4296                 nanouptime(&ts);
4297                 vp->v_id = ts.tv_nsec;
4298                 vp->v_flag = VSTANDARD;
4299
4300 #if CONFIG_MACF
4301                 if (mac_vnode_label_init_needed(vp)) {
4302                         mac_vnode_label_init(vp);
4303                 }
4304 #endif /* MAC */
4305
4306                 vp->v_iocount = 1;
4307                 goto done;
4308         }
4309         microuptime(&current_tv);
4310
4311 #define MAX_WALK_COUNT 1000
4312
4313         if (!TAILQ_EMPTY(&vnode_rage_list) &&
4314             (ragevnodes >= rage_limit ||
4315             (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
4316                 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
4317                         if (!(vp->v_listflag & VLIST_RAGE)) {
4318                                 panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
4319                         }
4320
4321                         // if we're a dependency-capable process, skip vnodes that can
4322                         // cause recycling deadlocks. (i.e. this process is diskimages
4323                         // helper and the vnode is in a disk image).  Querying the
4324                         // mnt_kern_flag for the mount's virtual device status
4325                         // is safer than checking the mnt_dependent_process, which
4326                         // may not be updated if there are multiple devnode layers
4327                         // in between the disk image and the final consumer.
4328
4329                         if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
4330                             (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4331                                 /*
4332                                  * if need_reliable_vp == TRUE, then we've already sent one or more
4333                                  * non-reliable vnodes to the async thread for processing and timed
4334                                  * out waiting for a dead vnode to show up.  Use the MAX_WALK_COUNT
4335                                  * mechanism to first scan for a reliable vnode before forcing
4336                                  * a new vnode to be created
4337                                  */
4338                                 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
4339                                         break;
4340                                 }
4341                         }
4342
4343                         // don't iterate more than MAX_WALK_COUNT vnodes to
4344                         // avoid keeping the vnode list lock held for too long.
4345
4346                         if (walk_count++ > MAX_WALK_COUNT) {
4347                                 vp = NULL;
4348                                 break;
4349                         }
4350                 }
4351         }
4352
4353         if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
4354                 /*
4355                  * Pick the first vp for possible reuse
4356                  */
4357                 walk_count = 0;
4358                 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
4359                         // if we're a dependency-capable process, skip vnodes that can
4360                         // cause recycling deadlocks. (i.e. this process is diskimages
4361                         // helper and the vnode is in a disk image).  Querying the
4362                         // mnt_kern_flag for the mount's virtual device status
4363                         // is safer than checking the mnt_dependent_process, which
4364                         // may not be updated if there are multiple devnode layers
4365                         // in between the disk image and the final consumer.
4366
4367                         if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
4368                             (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4369                                 /*
4370                                  * if need_reliable_vp == TRUE, then we've already sent one or more
4371                                  * non-reliable vnodes to the async thread for processing and timed
4372                                  * out waiting for a dead vnode to show up.  Use the MAX_WALK_COUNT
4373                                  * mechanism to first scan for a reliable vnode before forcing
4374                                  * a new vnode to be created
4375                                  */
4376                                 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
4377                                         break;
4378                                 }
4379                         }
4380
4381                         // don't iterate more than MAX_WALK_COUNT vnodes to
4382                         // avoid keeping the vnode list lock held for too long.
4383
4384                         if (walk_count++ > MAX_WALK_COUNT) {
4385                                 vp = NULL;
4386                                 break;
4387                         }
4388                 }
4389         }
4390
4391         //
4392         // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
4393         // then we're trying to create a vnode on behalf of a
4394         // process like diskimages-helper that has file systems
4395         // mounted on top of itself (and thus we can't reclaim
4396         // vnodes in the file systems on top of us).  if we can't
4397         // find a vnode to reclaim then we'll just have to force
4398         // the allocation.
4399         //
4400         if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
4401                 force_alloc = 1;
4402                 vnode_list_unlock();
4403                 goto retry;
4404         }
4405
4406         if (vp == NULL) {
4407                 /*
4408                  * we've reached the system imposed maximum number of vnodes
4409                  * but there isn't a single one available
4410                  * wait a bit and then retry... if we can't get a vnode
4411                  * after our target number of retries, than log a complaint
4412                  */
4413                 if (++retries <= max_retries) {
4414                         vnode_list_unlock();
4415                         delay_for_interval(1, 1000 * 1000);
4416                         goto retry;
4417                 }
4418
4419                 vnode_list_unlock();
4420                 tablefull("vnode");
4421                 log(LOG_EMERG, "%d desired, %d numvnodes, "
4422                     "%d free, %d dead, %d async, %d rage\n",
4423                     desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);
4424 #if CONFIG_JETSAM
4425
4426 #if DEVELOPMENT || DEBUG
4427                 if (bootarg_no_vnode_jetsam) {
4428                         panic("vnode table is full\n");
4429                 }
4430 #endif /* DEVELOPMENT || DEBUG */
4431
4432                 /*
4433                  * Running out of vnodes tends to make a system unusable. Start killing
4434                  * processes that jetsam knows are killable.
4435                  */
4436                 if (memorystatus_kill_on_vnode_limit() == FALSE) {
4437                         /*
4438                          * If jetsam can't find any more processes to kill and there
4439                          * still aren't any free vnodes, panic. Hopefully we'll get a
4440                          * panic log to tell us why we ran out.
4441                          */
4442                         panic("vnode table is full\n");
4443                 }
4444
4445                 /*
4446                  * Now that we've killed someone, wait a bit and continue looking
4447                  * (with fewer retries before trying another kill).
4448                  */
4449                 delay_for_interval(3, 1000 * 1000);
4450                 retries = 0;
4451                 max_retries = 10;
4452                 goto retry;
4453 #endif
4454
4455                 *vpp = NULL;
4456                 return ENFILE;
4457         }
4458 steal_this_vp:
4459         if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) {
4460                 if (deferred) {
4461                         int     elapsed_msecs;
4462                         struct timeval elapsed_tv;
4463
4464                         if (initial_tv.tv_sec == 0) {
4465                                 microuptime(&initial_tv);
4466                         }
4467
4468                         vnode_list_lock();
4469
4470                         dead_vnode_waited++;
4471                         dead_vnode_wanted++;
4472
4473                         /*
4474                          * note that we're only going to explicitly wait 10ms
4475                          * for a dead vnode to become available, since even if one
4476                          * isn't available, a reliable vnode might now be available
4477                          * at the head of the VRAGE or free lists... if so, we
4478                          * can satisfy the new_vnode request with less latency then waiting
4479                          * for the full 100ms duration we're ultimately willing to tolerate
4480                          */
4481                         assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
4482
4483                         vnode_list_unlock();
4484
4485                         thread_block(THREAD_CONTINUE_NULL);
4486
4487                         microuptime(&elapsed_tv);
4488
4489                         timevalsub(&elapsed_tv, &initial_tv);
4490                         elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000;
4491
4492                         if (elapsed_msecs >= 100) {
4493                                 /*
4494                                  * we've waited long enough... 100ms is
4495                                  * somewhat arbitrary for this case, but the
4496                                  * normal worst case latency used for UI
4497                                  * interaction is 100ms, so I've chosen to
4498                                  * go with that.
4499                                  *
4500                                  * setting need_reliable_vp to TRUE
4501                                  * forces us to find a reliable vnode
4502                                  * that we can process synchronously, or
4503                                  * to create a new one if the scan for
4504                                  * a reliable one hits the scan limit
4505                                  */
4506                                 need_reliable_vp = TRUE;
4507                         }
4508                 }
4509                 goto retry;
4510         }
4511         OSAddAtomicLong(1, &num_reusedvnodes);
4512
4513
4514 #if CONFIG_MACF
4515         /*
4516          * We should never see VL_LABELWAIT or VL_LABEL here.
4517          * as those operations hold a reference.
4518          */
4519         assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
4520         assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
4521         if (vp->v_lflag & VL_LABELED) {
4522                 vnode_lock_convert(vp);
4523                 mac_vnode_label_recycle(vp);
4524         } else if (mac_vnode_label_init_needed(vp)) {
4525                 vnode_lock_convert(vp);
4526                 mac_vnode_label_init(vp);
4527         }
4528
4529 #endif /* MAC */
4530
4531         vp->v_iocount = 1;
4532         vp->v_lflag = 0;
4533         vp->v_writecount = 0;
4534         vp->v_references = 0;
4535         vp->v_iterblkflags = 0;
4536         vp->v_flag = VSTANDARD;
4537         /* vbad vnodes can point to dead_mountp */
4538         vp->v_mount = NULL;
4539         vp->v_defer_reclaimlist = (vnode_t)0;
4540
4541         vnode_unlock(vp);
4542
4543 done:
4544         *vpp = vp;
4545
4546         return 0;
4547 }
4548
4549 void
4550 vnode_lock(vnode_t vp)
4551 {
4552         lck_mtx_lock(&vp->v_lock);
4553 }
4554
4555 void
4556 vnode_lock_spin(vnode_t vp)
4557 {
4558         lck_mtx_lock_spin(&vp->v_lock);
4559 }
4560
4561 void
4562 vnode_unlock(vnode_t vp)
4563 {
4564         lck_mtx_unlock(&vp->v_lock);
4565 }
4566
4567
4568
4569 int
4570 vnode_get(struct vnode *vp)
4571 {
4572         int retval;
4573
4574         vnode_lock_spin(vp);
4575         retval = vnode_get_locked(vp);
4576         vnode_unlock(vp);
4577
4578         return retval;
4579 }
4580
4581 int
4582 vnode_get_locked(struct vnode *vp)
4583 {
4584 #if DIAGNOSTIC
4585         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4586 #endif
4587         if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
4588                 return ENOENT;
4589         }
4590
4591         if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) {
4592                 panic("v_iocount overflow");
4593         }
4594
4595 #ifdef JOE_DEBUG
4596         record_vp(vp, 1);
4597 #endif
4598         return 0;
4599 }
4600
4601 /*
4602  * vnode_getwithvid() cuts in line in front of a vnode drain (that is,
4603  * while the vnode is draining, but at no point after that) to prevent
4604  * deadlocks when getting vnodes from filesystem hashes while holding
4605  * resources that may prevent other iocounts from being released.
4606  */
4607 int
4608 vnode_getwithvid(vnode_t vp, uint32_t vid)
4609 {
4610         return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
4611 }
4612
4613 /*
4614  * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode
4615  * drain; it exists for use in the VFS name cache, where we really do want to block behind
4616  * vnode drain to prevent holding off an unmount.
4617  */
4618 int
4619 vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
4620 {
4621         return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
4622 }
4623
4624 int
4625 vnode_getwithref(vnode_t vp)
4626 {
4627         return vget_internal(vp, 0, 0);
4628 }
4629
4630
4631 __private_extern__ int
4632 vnode_getalways(vnode_t vp)
4633 {
4634         return vget_internal(vp, 0, VNODE_ALWAYS);
4635 }
4636
4637 int
4638 vnode_put(vnode_t vp)
4639 {
4640         int retval;
4641
4642         vnode_lock_spin(vp);
4643         retval = vnode_put_locked(vp);
4644         vnode_unlock(vp);
4645
4646         return retval;
4647 }
4648
4649 static inline void
4650 vn_set_dead(vnode_t vp)
4651 {
4652         vp->v_mount = NULL;
4653         vp->v_op = dead_vnodeop_p;
4654         vp->v_tag = VT_NON;
4655         vp->v_data = NULL;
4656         vp->v_type = VBAD;
4657         vp->v_lflag |= VL_DEAD;
4658 }
4659
4660 int
4661 vnode_put_locked(vnode_t vp)
4662 {
4663         vfs_context_t ctx = vfs_context_current();      /* hoist outside loop */
4664
4665 #if DIAGNOSTIC
4666         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4667 #endif
4668 retry:
4669         if (vp->v_iocount < 1) {
4670                 panic("vnode_put(%p): iocount < 1", vp);
4671         }
4672
4673         if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
4674                 vnode_dropiocount(vp);
4675                 return 0;
4676         }
4677         if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
4678                 vp->v_lflag &= ~VL_NEEDINACTIVE;
4679                 vnode_unlock(vp);
4680
4681                 VNOP_INACTIVE(vp, ctx);
4682
4683                 vnode_lock_spin(vp);
4684                 /*
4685                  * because we had to drop the vnode lock before calling
4686                  * VNOP_INACTIVE, the state of this vnode may have changed...
4687                  * we may pick up both VL_MARTERM and either
4688                  * an iocount or a usecount while in the VNOP_INACTIVE call
4689                  * we don't want to call vnode_reclaim_internal on a vnode
4690                  * that has active references on it... so loop back around
4691                  * and reevaluate the state
4692                  */
4693                 goto retry;
4694         }
4695         vp->v_lflag &= ~VL_NEEDINACTIVE;
4696
4697         if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
4698                 vnode_lock_convert(vp);
4699                 vnode_reclaim_internal(vp, 1, 1, 0);
4700         }
4701         vnode_dropiocount(vp);
4702         vnode_list_add(vp);
4703
4704         return 0;
4705 }
4706
4707 /* is vnode_t in use by others?  */
4708 int
4709 vnode_isinuse(vnode_t vp, int refcnt)
4710 {
4711         return vnode_isinuse_locked(vp, refcnt, 0);
4712 }
4713
4714 int
4715 vnode_usecount(vnode_t vp)
4716 {
4717         return vp->v_usecount;
4718 }
4719
4720 int
4721 vnode_iocount(vnode_t vp)
4722 {
4723         return vp->v_iocount;
4724 }
4725
4726 static int
4727 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
4728 {
4729         int retval = 0;
4730
4731         if (!locked) {
4732                 vnode_lock_spin(vp);
4733         }
4734         if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
4735                 retval = 1;
4736                 goto out;
4737         }
4738         if (vp->v_type == VREG) {
4739                 retval = ubc_isinuse_locked(vp, refcnt, 1);
4740         }
4741
4742 out:
4743         if (!locked) {
4744                 vnode_unlock(vp);
4745         }
4746         return retval;
4747 }
4748
4749
4750 /* resume vnode_t */
4751 errno_t
4752 vnode_resume(vnode_t vp)
4753 {
4754         if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
4755                 vnode_lock_spin(vp);
4756                 vp->v_lflag &= ~VL_SUSPENDED;
4757                 vp->v_owner = NULL;
4758                 vnode_unlock(vp);
4759
4760                 wakeup(&vp->v_iocount);
4761         }
4762         return 0;
4763 }
4764
4765 /* suspend vnode_t
4766  * Please do not use on more than one vnode at a time as it may
4767  * cause deadlocks.
4768  * xxx should we explicity prevent this from happening?
4769  */
4770
4771 errno_t
4772 vnode_suspend(vnode_t vp)
4773 {
4774         if (vp->v_lflag & VL_SUSPENDED) {
4775                 return EBUSY;
4776         }
4777
4778         vnode_lock_spin(vp);
4779
4780         /*
4781          * xxx is this sufficient to check if a vnode_drain is
4782          * progress?
4783          */
4784
4785         if (vp->v_owner == NULL) {
4786                 vp->v_lflag |= VL_SUSPENDED;
4787                 vp->v_owner = current_thread();
4788         }
4789         vnode_unlock(vp);
4790
4791         return 0;
4792 }
4793
4794 /*
4795  * Release any blocked locking requests on the vnode.
4796  * Used for forced-unmounts.
4797  *
4798  * XXX  What about network filesystems?
4799  */
4800 static void
4801 vnode_abort_advlocks(vnode_t vp)
4802 {
4803         if (vp->v_flag & VLOCKLOCAL) {
4804                 lf_abort_advlocks(vp);
4805         }
4806 }
4807
4808
4809 static errno_t
4810 vnode_drain(vnode_t vp)
4811 {
4812         if (vp->v_lflag & VL_DRAIN) {
4813                 panic("vnode_drain: recursive drain");
4814                 return ENOENT;
4815         }
4816         vp->v_lflag |= VL_DRAIN;
4817         vp->v_owner = current_thread();
4818
4819         while (vp->v_iocount > 1) {
4820                 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
4821         }
4822
4823         vp->v_lflag &= ~VL_DRAIN;
4824
4825         return 0;
4826 }
4827
4828
4829 /*
4830  * if the number of recent references via vnode_getwithvid or vnode_getwithref
4831  * exceeds this threshold, than 'UN-AGE' the vnode by removing it from
4832  * the LRU list if it's currently on it... once the iocount and usecount both drop
4833  * to 0, it will get put back on the end of the list, effectively making it younger
4834  * this allows us to keep actively referenced vnodes in the list without having
4835  * to constantly remove and add to the list each time a vnode w/o a usecount is
4836  * referenced which costs us taking and dropping a global lock twice.
4837  * However, if the vnode is marked DIRTY, we want to pull it out much earlier
4838  */
4839 #define UNAGE_THRESHHOLD        25
4840 #define UNAGE_DIRTYTHRESHHOLD    6
4841
4842 errno_t
4843 vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
4844 {
4845         int nodead = vflags & VNODE_NODEAD;
4846         int nosusp = vflags & VNODE_NOSUSPEND;
4847         int always = vflags & VNODE_ALWAYS;
4848         int beatdrain = vflags & VNODE_DRAINO;
4849         int withvid = vflags & VNODE_WITHID;
4850
4851         for (;;) {
4852                 int sleepflg = 0;
4853
4854                 /*
4855                  * if it is a dead vnode with deadfs
4856                  */
4857                 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
4858                         return ENOENT;
4859                 }
4860                 /*
4861                  * will return VL_DEAD ones
4862                  */
4863                 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) {
4864                         break;
4865                 }
4866                 /*
4867                  * if suspended vnodes are to be failed
4868                  */
4869                 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
4870                         return ENOENT;
4871                 }
4872                 /*
4873                  * if you are the owner of drain/suspend/termination , can acquire iocount
4874                  * check for VL_TERMINATE; it does not set owner
4875                  */
4876                 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
4877                     (vp->v_owner == current_thread())) {
4878                         break;
4879                 }
4880
4881                 if (always != 0) {
4882                         break;
4883                 }
4884
4885                 /*
4886                  * If this vnode is getting drained, there are some cases where
4887                  * we can't block or, in case of tty vnodes, want to be
4888                  * interruptible.
4889                  */
4890                 if (vp->v_lflag & VL_DRAIN) {
4891                         /*
4892                          * In some situations, we want to get an iocount
4893                          * even if the vnode is draining to prevent deadlock,
4894                          * e.g. if we're in the filesystem, potentially holding
4895                          * resources that could prevent other iocounts from
4896                          * being released.
4897                          */
4898                         if (beatdrain) {
4899                                 break;
4900                         }
4901                         /*
4902                          * Don't block if the vnode's mount point is unmounting as
4903                          * we may be the thread the unmount is itself waiting on
4904                          * Only callers who pass in vids (at this point, we've already
4905                          * handled nosusp and nodead) are expecting error returns
4906                          * from this function, so only we can only return errors for
4907                          * those. ENODEV is intended to inform callers that the call
4908                          * failed because an unmount is in progress.
4909                          */
4910                         if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
4911                                 return ENODEV;
4912                         }
4913
4914                         if (vnode_istty(vp)) {
4915                                 sleepflg = PCATCH;
4916                         }
4917                 }
4918
4919                 vnode_lock_convert(vp);
4920
4921                 if (vp->v_lflag & VL_TERMINATE) {
4922                         int error;
4923
4924                         vp->v_lflag |= VL_TERMWANT;
4925
4926                         error = msleep(&vp->v_lflag, &vp->v_lock,
4927                             (PVFS | sleepflg), "vnode getiocount", NULL);
4928                         if (error) {
4929                                 return error;
4930                         }
4931                 } else {
4932                         msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
4933                 }
4934         }
4935         if (withvid && vid != vp->v_id) {
4936                 return ENOENT;
4937         }
4938         if (++vp->v_references >= UNAGE_THRESHHOLD ||
4939             (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD)) {
4940                 vp->v_references = 0;
4941                 vnode_list_remove(vp);
4942         }
4943         vp->v_iocount++;
4944 #ifdef JOE_DEBUG
4945         record_vp(vp, 1);
4946 #endif
4947         return 0;
4948 }
4949
4950 static void
4951 vnode_dropiocount(vnode_t vp)
4952 {
4953         if (vp->v_iocount < 1) {
4954                 panic("vnode_dropiocount(%p): v_iocount < 1", vp);
4955         }
4956
4957         vp->v_iocount--;
4958 #ifdef JOE_DEBUG
4959         record_vp(vp, -1);
4960 #endif
4961         if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
4962                 wakeup(&vp->v_iocount);
4963         }
4964 }
4965
4966
4967 void
4968 vnode_reclaim(struct vnode * vp)
4969 {
4970         vnode_reclaim_internal(vp, 0, 0, 0);
4971 }
4972
4973 __private_extern__
4974 void
4975 vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
4976 {
4977         int isfifo = 0;
4978
4979         if (!locked) {
4980                 vnode_lock(vp);
4981         }
4982
4983         if (vp->v_lflag & VL_TERMINATE) {
4984                 panic("vnode reclaim in progress");
4985         }
4986         vp->v_lflag |= VL_TERMINATE;
4987
4988         vn_clearunionwait(vp, 1);
4989
4990         vnode_drain(vp);
4991
4992         isfifo = (vp->v_type == VFIFO);
4993
4994         if (vp->v_type != VBAD) {
4995                 vgone(vp, flags);               /* clean and reclaim the vnode */
4996         }
4997         /*
4998          * give the vnode a new identity so that vnode_getwithvid will fail
4999          * on any stale cache accesses...
5000          * grab the list_lock so that if we're in "new_vnode"
5001          * behind the list_lock trying to steal this vnode, the v_id is stable...
5002          * once new_vnode drops the list_lock, it will block trying to take
5003          * the vnode lock until we release it... at that point it will evaluate
5004          * whether the v_vid has changed
5005          * also need to make sure that the vnode isn't on a list where "new_vnode"
5006          * can find it after the v_id has been bumped until we are completely done
5007          * with the vnode (i.e. putting it back on a list has to be the very last
5008          * thing we do to this vnode... many of the callers of vnode_reclaim_internal
5009          * are holding an io_count on the vnode... they need to drop the io_count
5010          * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
5011          * they are completely done with the vnode
5012          */
5013         vnode_list_lock();
5014
5015         vnode_list_remove_locked(vp);
5016         vp->v_id++;
5017
5018         vnode_list_unlock();
5019
5020         if (isfifo) {
5021                 struct fifoinfo * fip;
5022
5023                 fip = vp->v_fifoinfo;
5024                 vp->v_fifoinfo = NULL;
5025                 FREE(fip, M_TEMP);
5026         }
5027         vp->v_type = VBAD;
5028
5029         if (vp->v_data) {
5030                 panic("vnode_reclaim_internal: cleaned vnode isn't");
5031         }
5032         if (vp->v_numoutput) {
5033                 panic("vnode_reclaim_internal: clean vnode has pending I/O's");
5034         }
5035         if (UBCINFOEXISTS(vp)) {
5036                 panic("vnode_reclaim_internal: ubcinfo not cleaned");
5037         }
5038         if (vp->v_parent) {
5039                 panic("vnode_reclaim_internal: vparent not removed");
5040         }
5041         if (vp->v_name) {
5042                 panic("vnode_reclaim_internal: vname not removed");
5043         }
5044
5045         vp->v_socket = NULL;
5046
5047         vp->v_lflag &= ~VL_TERMINATE;
5048         vp->v_owner = NULL;
5049
5050         KNOTE(&vp->v_knotes, NOTE_REVOKE);
5051
5052         /* Make sure that when we reuse the vnode, no knotes left over */
5053         klist_init(&vp->v_knotes);
5054
5055         if (vp->v_lflag & VL_TERMWANT) {
5056                 vp->v_lflag &= ~VL_TERMWANT;
5057                 wakeup(&vp->v_lflag);
5058         }
5059         if (!reuse) {
5060                 /*
5061                  * make sure we get on the
5062                  * dead list if appropriate
5063                  */
5064                 vnode_list_add(vp);
5065         }
5066         if (!locked) {
5067                 vnode_unlock(vp);
5068         }
5069 }
5070
5071 static int
5072 vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
5073     int init_vnode)
5074 {
5075         int error;
5076         int insert = 1;
5077         int existing_vnode;
5078         vnode_t vp;
5079         vnode_t nvp;
5080         vnode_t dvp;
5081         struct  uthread *ut;
5082         struct componentname *cnp;
5083         struct vnode_fsparam *param = (struct vnode_fsparam *)data;
5084 #if CONFIG_TRIGGERS
5085         struct vnode_trigger_param *tinfo = NULL;
5086 #endif
5087         if (*vpp) {
5088                 vp = *vpp;
5089                 *vpp = NULLVP;
5090                 existing_vnode = 1;
5091         } else {
5092                 existing_vnode = 0;
5093         }
5094
5095         if (init_vnode) {
5096                 /* Do quick sanity check on the parameters. */
5097                 if ((param == NULL) || (param->vnfs_vtype == VBAD)) {
5098                         error = EINVAL;
5099                         goto error_out;
5100                 }
5101
5102 #if CONFIG_TRIGGERS
5103                 if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
5104                         tinfo = (struct vnode_trigger_param *)data;
5105
5106                         /* Validate trigger vnode input */
5107                         if ((param->vnfs_vtype != VDIR) ||
5108                             (tinfo->vnt_resolve_func == NULL) ||
5109                             (tinfo->vnt_flags & ~VNT_VALID_MASK)) {
5110                                 error = EINVAL;
5111                                 goto error_out;
5112                         }
5113                         /* Fall through a normal create (params will be the same) */
5114                         flavor = VNCREATE_FLAVOR;
5115                         size = VCREATESIZE;
5116                 }
5117 #endif
5118                 if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) {
5119                         error = EINVAL;
5120                         goto error_out;
5121                 }
5122         }
5123
5124         if (!existing_vnode) {
5125                 if ((error = new_vnode(&vp))) {
5126                         return error;
5127                 }
5128                 if (!init_vnode) {
5129                         /* Make it so that it can be released by a vnode_put) */
5130                         vn_set_dead(vp);
5131                         *vpp = vp;
5132                         return 0;
5133                 }
5134         } else {
5135                 /*
5136                  * A vnode obtained by vnode_create_empty has been passed to
5137                  * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
5138                  * this point, it is set back on any error.
5139                  *
5140                  * N.B. vnode locking - We make the same assumptions as the
5141                  * "unsplit" vnode_create did - i.e. it is safe to update the
5142                  * vnode's fields without the vnode lock. This vnode has been
5143                  * out and about with the filesystem and hopefully nothing
5144                  * was done to the vnode between the vnode_create_empty and
5145                  * now when it has come in through vnode_initialize.
5146                  */
5147                 vp->v_lflag &= ~VL_DEAD;
5148         }
5149
5150         dvp = param->vnfs_dvp;
5151         cnp = param->vnfs_cnp;
5152
5153         vp->v_op = param->vnfs_vops;
5154         vp->v_type = param->vnfs_vtype;
5155         vp->v_data = param->vnfs_fsnode;
5156
5157         if (param->vnfs_markroot) {
5158                 vp->v_flag |= VROOT;
5159         }
5160         if (param->vnfs_marksystem) {
5161                 vp->v_flag |= VSYSTEM;
5162         }
5163         if (vp->v_type == VREG) {
5164                 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
5165                 if (error) {
5166 #ifdef JOE_DEBUG
5167                         record_vp(vp, 1);
5168 #endif
5169                         vn_set_dead(vp);
5170
5171                         vnode_put(vp);
5172                         return error;
5173                 }
5174                 if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) {
5175                         memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
5176                 }
5177         }
5178 #ifdef JOE_DEBUG
5179         record_vp(vp, 1);
5180 #endif
5181
5182 #if CONFIG_TRIGGERS
5183         /*
5184          * For trigger vnodes, attach trigger info to vnode
5185          */
5186         if ((vp->v_type == VDIR) && (tinfo != NULL)) {
5187                 /*
5188                  * Note: has a side effect of incrementing trigger count on the
5189                  * mount if successful, which we would need to undo on a
5190                  * subsequent failure.
5191                  */
5192 #ifdef JOE_DEBUG
5193                 record_vp(vp, -1);
5194 #endif
5195                 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
5196                 if (error) {
5197                         printf("vnode_create: vnode_resolver_create() err %d\n", error);
5198                         vn_set_dead(vp);
5199 #ifdef JOE_DEBUG
5200                         record_vp(vp, 1);
5201 #endif
5202                         vnode_put(vp);
5203                         return error;
5204                 }
5205         }
5206 #endif
5207         if (vp->v_type == VCHR || vp->v_type == VBLK) {
5208                 vp->v_tag = VT_DEVFS;           /* callers will reset if needed (bdevvp) */
5209
5210                 if ((nvp = checkalias(vp, param->vnfs_rdev))) {
5211                         /*
5212                          * if checkalias returns a vnode, it will be locked
5213                          *
5214                          * first get rid of the unneeded vnode we acquired
5215                          */
5216                         vp->v_data = NULL;
5217                         vp->v_op = spec_vnodeop_p;
5218                         vp->v_type = VBAD;
5219                         vp->v_lflag = VL_DEAD;
5220                         vp->v_data = NULL;
5221                         vp->v_tag = VT_NON;
5222                         vnode_put(vp);
5223
5224                         /*
5225                          * switch to aliased vnode and finish
5226                          * preparing it
5227                          */
5228                         vp = nvp;
5229
5230                         vclean(vp, 0);
5231                         vp->v_op = param->vnfs_vops;
5232                         vp->v_type = param->vnfs_vtype;
5233                         vp->v_data = param->vnfs_fsnode;
5234                         vp->v_lflag = 0;
5235                         vp->v_mount = NULL;
5236                         insmntque(vp, param->vnfs_mp);
5237                         insert = 0;
5238                         vnode_unlock(vp);
5239                 }
5240
5241                 if (VCHR == vp->v_type) {
5242                         u_int maj = major(vp->v_rdev);
5243
5244                         if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) {
5245                                 vp->v_flag |= VISTTY;
5246                         }
5247                 }
5248         }
5249
5250         if (vp->v_type == VFIFO) {
5251                 struct fifoinfo *fip;
5252
5253                 MALLOC(fip, struct fifoinfo *,
5254                     sizeof(*fip), M_TEMP, M_WAITOK);
5255                 bzero(fip, sizeof(struct fifoinfo));
5256                 vp->v_fifoinfo = fip;
5257         }
5258         /* The file systems must pass the address of the location where
5259          * they store the vnode pointer. When we add the vnode into the mount
5260          * list and name cache they become discoverable. So the file system node
5261          * must have the connection to vnode setup by then
5262          */
5263         *vpp = vp;
5264
5265         /* Add fs named reference. */
5266         if (param->vnfs_flags & VNFS_ADDFSREF) {
5267                 vp->v_lflag |= VNAMED_FSHASH;
5268         }
5269         if (param->vnfs_mp) {
5270                 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) {
5271                         vp->v_flag |= VLOCKLOCAL;
5272                 }
5273                 if (insert) {
5274                         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5275                                 panic("insmntque: vp on the free list\n");
5276                         }
5277
5278                         /*
5279                          * enter in mount vnode list
5280                          */
5281                         insmntque(vp, param->vnfs_mp);
5282                 }
5283         }
5284         if (dvp && vnode_ref(dvp) == 0) {
5285                 vp->v_parent = dvp;
5286         }
5287         if (cnp) {
5288                 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
5289                         /*
5290                          * enter into name cache
5291                          * we've got the info to enter it into the name cache now
5292                          * cache_enter_create will pick up an extra reference on
5293                          * the name entered into the string cache
5294                          */
5295                         vp->v_name = cache_enter_create(dvp, vp, cnp);
5296                 } else {
5297                         vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
5298                 }
5299
5300                 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) {
5301                         vp->v_flag |= VISUNION;
5302                 }
5303         }
5304         if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
5305                 /*
5306                  * this vnode is being created as cacheable in the name cache
5307                  * this allows us to re-enter it in the cache
5308                  */
5309                 vp->v_flag |= VNCACHEABLE;
5310         }
5311         ut = get_bsdthread_info(current_thread());
5312
5313         if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
5314             (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
5315                 /*
5316                  * process has indicated that it wants any
5317                  * vnodes created on its behalf to be rapidly
5318                  * aged to reduce the impact on the cached set
5319                  * of vnodes
5320                  *
5321                  * if UT_KERN_RAGE_VNODES is set, then the
5322                  * kernel internally wants vnodes to be rapidly
5323                  * aged, even if the process hasn't requested
5324                  * this
5325                  */
5326                 vp->v_flag |= VRAGE;
5327         }
5328
5329 #if CONFIG_SECLUDED_MEMORY
5330         switch (secluded_for_filecache) {
5331         case 0:
5332                 /*
5333                  * secluded_for_filecache == 0:
5334                  * + no file contents in secluded pool
5335                  */
5336                 break;
5337         case 1:
5338                 /*
5339                  * secluded_for_filecache == 1:
5340                  * + no files from /
5341                  * + files from /Applications/ are OK
5342                  * + files from /Applications/Camera are not OK
5343                  * + no files that are open for write
5344                  */
5345                 if (vnode_vtype(vp) == VREG &&
5346                     vnode_mount(vp) != NULL &&
5347                     (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
5348                         /* not from root filesystem: eligible for secluded pages */
5349                         memory_object_mark_eligible_for_secluded(
5350                                 ubc_getobject(vp, UBC_FLAGS_NONE),
5351                                 TRUE);
5352                 }
5353                 break;
5354         case 2:
5355                 /*
5356                  * secluded_for_filecache == 2:
5357                  * + all read-only files OK, except:
5358                  *      + dyld_shared_cache_arm64*
5359                  *      + Camera
5360                  *      + mediaserverd
5361                  */
5362                 if (vnode_vtype(vp) == VREG) {
5363                         memory_object_mark_eligible_for_secluded(
5364                                 ubc_getobject(vp, UBC_FLAGS_NONE),
5365                                 TRUE);
5366                 }
5367                 break;
5368         default:
5369                 break;
5370         }
5371 #endif /* CONFIG_SECLUDED_MEMORY */
5372
5373         return 0;
5374
5375 error_out:
5376         if (existing_vnode) {
5377                 vnode_put(vp);
5378         }
5379         return error;
5380 }
5381
5382 /* USAGE:
5383  * The following api creates a vnode and associates all the parameter specified in vnode_fsparam
5384  * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
5385  * is obsoleted by this.
5386  */
5387 int
5388 vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5389 {
5390         *vpp = NULLVP;
5391         return vnode_create_internal(flavor, size, data, vpp, 1);
5392 }
5393
5394 int
5395 vnode_create_empty(vnode_t *vpp)
5396 {
5397         *vpp = NULLVP;
5398         return vnode_create_internal(VNCREATE_FLAVOR, VCREATESIZE, NULL,
5399                    vpp, 0);
5400 }
5401
5402 int
5403 vnode_initialize(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5404 {
5405         if (*vpp == NULLVP) {
5406                 panic("NULL vnode passed to vnode_initialize");
5407         }
5408 #if DEVELOPMENT || DEBUG
5409         /*
5410          * We lock to check that vnode is fit for unlocked use in
5411          * vnode_create_internal.
5412          */
5413         vnode_lock_spin(*vpp);
5414         VNASSERT(((*vpp)->v_iocount == 1), *vpp,
5415             ("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
5416         VNASSERT(((*vpp)->v_usecount == 0), *vpp,
5417             ("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
5418         VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
5419             ("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
5420             (*vpp)->v_lflag));
5421         VNASSERT(((*vpp)->v_data == NULL), *vpp,
5422             ("vnode_initialize : v_data not NULL"));
5423         vnode_unlock(*vpp);
5424 #endif
5425         return vnode_create_internal(flavor, size, data, vpp, 1);
5426 }
5427
5428 int
5429 vnode_addfsref(vnode_t vp)
5430 {
5431         vnode_lock_spin(vp);
5432         if (vp->v_lflag & VNAMED_FSHASH) {
5433                 panic("add_fsref: vp already has named reference");
5434         }
5435         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5436                 panic("addfsref: vp on the free list\n");
5437         }
5438         vp->v_lflag |= VNAMED_FSHASH;
5439         vnode_unlock(vp);
5440         return 0;
5441 }
5442 int
5443 vnode_removefsref(vnode_t vp)
5444 {
5445         vnode_lock_spin(vp);
5446         if ((vp->v_lflag & VNAMED_FSHASH) == 0) {
5447                 panic("remove_fsref: no named reference");
5448         }
5449         vp->v_lflag &= ~VNAMED_FSHASH;
5450         vnode_unlock(vp);
5451         return 0;
5452 }
5453
5454
5455 int
5456 vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
5457 {
5458         mount_t mp;
5459         int ret = 0;
5460         fsid_t * fsid_list;
5461         int count, actualcount, i;
5462         void * allocmem;
5463         int indx_start, indx_stop, indx_incr;
5464         int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
5465
5466         count = mount_getvfscnt();
5467         count += 10;
5468
5469         fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
5470         allocmem = (void *)fsid_list;
5471
5472         actualcount = mount_fillfsids(fsid_list, count);
5473
5474         /*
5475          * Establish the iteration direction
5476          * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
5477          */
5478         if (flags & VFS_ITERATE_TAIL_FIRST) {
5479                 indx_start = actualcount - 1;
5480                 indx_stop = -1;
5481                 indx_incr = -1;
5482         } else { /* Head first by default */
5483                 indx_start = 0;
5484                 indx_stop = actualcount;
5485                 indx_incr = 1;
5486         }
5487
5488         for (i = indx_start; i != indx_stop; i += indx_incr) {
5489                 /* obtain the mount point with iteration reference */
5490                 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
5491
5492                 if (mp == (struct mount *)0) {
5493                         continue;
5494                 }
5495                 mount_lock(mp);
5496                 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
5497                         mount_unlock(mp);
5498                         mount_iterdrop(mp);
5499                         continue;
5500                 }
5501                 mount_unlock(mp);
5502
5503                 /* iterate over all the vnodes */
5504                 ret = callout(mp, arg);
5505
5506                 /*
5507                  * Drop the iterref here if the callback didn't do it.
5508                  * Note: If cb_dropref is set the mp may no longer exist.
5509                  */
5510                 if (!cb_dropref) {
5511                         mount_iterdrop(mp);
5512                 }
5513
5514                 switch (ret) {
5515                 case VFS_RETURNED:
5516                 case VFS_RETURNED_DONE:
5517                         if (ret == VFS_RETURNED_DONE) {
5518                                 ret = 0;
5519                                 goto out;
5520                         }
5521                         break;
5522
5523                 case VFS_CLAIMED_DONE:
5524                         ret = 0;
5525                         goto out;
5526                 case VFS_CLAIMED:
5527                 default:
5528                         break;
5529                 }
5530                 ret = 0;
5531         }
5532
5533 out:
5534         kfree(allocmem, (count * sizeof(fsid_t)));
5535         return ret;
5536 }
5537
5538 /*
5539  * Update the vfsstatfs structure in the mountpoint.
5540  * MAC: Parameter eventtype added, indicating whether the event that
5541  * triggered this update came from user space, via a system call
5542  * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
5543  */
5544 int
5545 vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
5546 {
5547         struct vfs_attr va;
5548         int             error;
5549
5550         /*
5551          * Request the attributes we want to propagate into
5552          * the per-mount vfsstat structure.
5553          */
5554         VFSATTR_INIT(&va);
5555         VFSATTR_WANTED(&va, f_iosize);
5556         VFSATTR_WANTED(&va, f_blocks);
5557         VFSATTR_WANTED(&va, f_bfree);
5558         VFSATTR_WANTED(&va, f_bavail);
5559         VFSATTR_WANTED(&va, f_bused);
5560         VFSATTR_WANTED(&va, f_files);
5561         VFSATTR_WANTED(&va, f_ffree);
5562         VFSATTR_WANTED(&va, f_bsize);
5563         VFSATTR_WANTED(&va, f_fssubtype);
5564
5565         if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
5566                 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
5567                 return error;
5568         }
5569 #if CONFIG_MACF
5570         if (eventtype == VFS_USER_EVENT) {
5571                 error = mac_mount_check_getattr(ctx, mp, &va);
5572                 if (error != 0) {
5573                         return error;
5574                 }
5575         }
5576 #endif
5577         /*
5578          * Unpack into the per-mount structure.
5579          *
5580          * We only overwrite these fields, which are likely to change:
5581          *      f_blocks
5582          *      f_bfree
5583          *      f_bavail
5584          *      f_bused
5585          *      f_files
5586          *      f_ffree
5587          *
5588          * And these which are not, but which the FS has no other way
5589          * of providing to us:
5590          *      f_bsize
5591          *      f_iosize
5592          *      f_fssubtype
5593          *
5594          */
5595         if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
5596                 /* 4822056 - protect against malformed server mount */
5597                 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
5598         } else {
5599                 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
5600         }
5601         if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
5602                 mp->mnt_vfsstat.f_iosize = va.f_iosize;
5603         } else {
5604                 mp->mnt_vfsstat.f_iosize = 1024 * 1024;         /* 1MB sensible I/O size */
5605         }
5606         if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) {
5607                 mp->mnt_vfsstat.f_blocks = va.f_blocks;
5608         }
5609         if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
5610                 mp->mnt_vfsstat.f_bfree = va.f_bfree;
5611         }
5612         if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) {
5613                 mp->mnt_vfsstat.f_bavail = va.f_bavail;
5614         }
5615         if (VFSATTR_IS_SUPPORTED(&va, f_bused)) {
5616                 mp->mnt_vfsstat.f_bused = va.f_bused;
5617         }
5618         if (VFSATTR_IS_SUPPORTED(&va, f_files)) {
5619                 mp->mnt_vfsstat.f_files = va.f_files;
5620         }
5621         if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) {
5622                 mp->mnt_vfsstat.f_ffree = va.f_ffree;
5623         }
5624
5625         /* this is unlikely to change, but has to be queried for */
5626         if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) {
5627                 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
5628         }
5629
5630         return 0;
5631 }
5632
5633 int
5634 mount_list_add(mount_t mp)
5635 {
5636         int res;
5637
5638         mount_list_lock();
5639         if (system_inshutdown != 0) {
5640                 res = -1;
5641         } else {
5642                 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
5643                 nummounts++;
5644                 res = 0;
5645         }
5646         mount_list_unlock();
5647
5648         return res;
5649 }
5650
5651 void
5652 mount_list_remove(mount_t mp)
5653 {
5654         mount_list_lock();
5655         TAILQ_REMOVE(&mountlist, mp, mnt_list);
5656         nummounts--;
5657         mp->mnt_list.tqe_next = NULL;
5658         mp->mnt_list.tqe_prev = NULL;
5659         mount_list_unlock();
5660 }
5661
5662 mount_t
5663 mount_lookupby_volfsid(int volfs_id, int withref)
5664 {
5665         mount_t cur_mount = (mount_t)0;
5666         mount_t mp;
5667
5668         mount_list_lock();
5669         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
5670                 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
5671                     (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
5672                     (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
5673                         cur_mount = mp;
5674                         if (withref) {
5675                                 if (mount_iterref(cur_mount, 1)) {
5676                                         cur_mount = (mount_t)0;
5677                                         mount_list_unlock();
5678                                         goto out;
5679                                 }
5680                         }
5681                         break;
5682                 }
5683         }
5684         mount_list_unlock();
5685         if (withref && (cur_mount != (mount_t)0)) {
5686                 mp = cur_mount;
5687                 if (vfs_busy(mp, LK_NOWAIT) != 0) {
5688                         cur_mount = (mount_t)0;
5689                 }
5690                 mount_iterdrop(mp);
5691         }
5692 out:
5693         return cur_mount;
5694 }
5695
5696 mount_t
5697 mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
5698 {
5699         mount_t retmp = (mount_t)0;
5700         mount_t mp;
5701
5702         if (!locked) {
5703                 mount_list_lock();
5704         }
5705         TAILQ_FOREACH(mp, &mountlist, mnt_list)
5706         if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
5707             mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
5708                 retmp = mp;
5709                 if (withref) {
5710                         if (mount_iterref(retmp, 1)) {
5711                                 retmp = (mount_t)0;
5712                         }
5713                 }
5714                 goto out;
5715         }
5716 out:
5717         if (!locked) {
5718                 mount_list_unlock();
5719         }
5720         return retmp;
5721 }
5722
5723 errno_t
5724 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
5725 {
5726         struct nameidata nd;
5727         int error;
5728         u_int32_t ndflags = 0;
5729
5730         if (ctx == NULL) {
5731                 return EINVAL;
5732         }
5733
5734         if (flags & VNODE_LOOKUP_NOFOLLOW) {
5735                 ndflags = NOFOLLOW;
5736         } else {
5737                 ndflags = FOLLOW;
5738         }
5739
5740         if (flags & VNODE_LOOKUP_NOCROSSMOUNT) {
5741                 ndflags |= NOCROSSMOUNT;
5742         }
5743
5744         if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
5745                 ndflags |= CN_NBMOUNTLOOK;
5746         }
5747
5748         /* XXX AUDITVNPATH1 needed ? */
5749         NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
5750             CAST_USER_ADDR_T(path), ctx);
5751
5752         if ((error = namei(&nd))) {
5753                 return error;
5754         }
5755         *vpp = nd.ni_vp;
5756         nameidone(&nd);
5757
5758         return 0;
5759 }
5760
5761 errno_t
5762 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
5763 {
5764         struct nameidata nd;
5765         int error;
5766         u_int32_t ndflags = 0;
5767         int lflags = flags;
5768
5769         if (ctx == NULL) {              /* XXX technically an error */
5770                 ctx = vfs_context_current();
5771         }
5772
5773         if (fmode & O_NOFOLLOW) {
5774                 lflags |= VNODE_LOOKUP_NOFOLLOW;
5775         }
5776
5777         if (lflags & VNODE_LOOKUP_NOFOLLOW) {
5778                 ndflags = NOFOLLOW;
5779         } else {
5780                 ndflags = FOLLOW;
5781         }
5782
5783         if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) {
5784                 ndflags |= NOCROSSMOUNT;
5785         }
5786
5787         if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
5788                 ndflags |= CN_NBMOUNTLOOK;
5789         }
5790
5791         /* XXX AUDITVNPATH1 needed ? */
5792         NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
5793             CAST_USER_ADDR_T(path), ctx);
5794
5795         if ((error = vn_open(&nd, fmode, cmode))) {
5796                 *vpp = NULL;
5797         } else {
5798                 *vpp = nd.ni_vp;
5799         }
5800
5801         return error;
5802 }
5803
5804 errno_t
5805 vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
5806 {
5807         int error;
5808
5809         if (ctx == NULL) {
5810                 ctx = vfs_context_current();
5811         }
5812
5813         error = vn_close(vp, flags, ctx);
5814         vnode_put(vp);
5815         return error;
5816 }
5817
5818 errno_t
5819 vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
5820 {
5821         struct vnode_attr       va;
5822         int                     error;
5823
5824         VATTR_INIT(&va);
5825         VATTR_WANTED(&va, va_modify_time);
5826         error = vnode_getattr(vp, &va, ctx);
5827         if (!error) {
5828                 *mtime = va.va_modify_time;
5829         }
5830         return error;
5831 }
5832
5833 errno_t
5834 vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
5835 {
5836         struct vnode_attr       va;
5837         int                     error;
5838
5839         VATTR_INIT(&va);
5840         VATTR_WANTED(&va, va_flags);
5841         error = vnode_getattr(vp, &va, ctx);
5842         if (!error) {
5843                 *flags = va.va_flags;
5844         }
5845         return error;
5846 }
5847
5848 /*
5849  * Returns:     0                       Success
5850  *      vnode_getattr:???
5851  */
5852 errno_t
5853 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
5854 {
5855         struct vnode_attr       va;
5856         int                     error;
5857
5858         VATTR_INIT(&va);
5859         VATTR_WANTED(&va, va_data_size);
5860         error = vnode_getattr(vp, &va, ctx);
5861         if (!error) {
5862                 *sizep = va.va_data_size;
5863         }
5864         return error;
5865 }
5866
5867 errno_t
5868 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
5869 {
5870         struct vnode_attr       va;
5871
5872         VATTR_INIT(&va);
5873         VATTR_SET(&va, va_data_size, size);
5874         va.va_vaflags = ioflag & 0xffff;
5875         return vnode_setattr(vp, &va, ctx);
5876 }
5877
5878 int
5879 vnode_setdirty(vnode_t vp)
5880 {
5881         vnode_lock_spin(vp);
5882         vp->v_flag |= VISDIRTY;
5883         vnode_unlock(vp);
5884         return 0;
5885 }
5886
5887 int
5888 vnode_cleardirty(vnode_t vp)
5889 {
5890         vnode_lock_spin(vp);
5891         vp->v_flag &= ~VISDIRTY;
5892         vnode_unlock(vp);
5893         return 0;
5894 }
5895
5896 int
5897 vnode_isdirty(vnode_t vp)
5898 {
5899         int dirty;
5900
5901         vnode_lock_spin(vp);
5902         dirty = (vp->v_flag & VISDIRTY) ? 1 : 0;
5903         vnode_unlock(vp);
5904
5905         return dirty;
5906 }
5907
5908 static int
5909 vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
5910 {
5911         /* Only use compound VNOP for compound operation */
5912         if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
5913                 *vpp = NULLVP;
5914                 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
5915         } else {
5916                 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
5917         }
5918 }
5919
5920 /*
5921  * Create a filesystem object of arbitrary type with arbitrary attributes in
5922  * the spevied directory with the specified name.
5923  *
5924  * Parameters:  dvp                     Pointer to the vnode of the directory
5925  *                                      in which to create the object.
5926  *              vpp                     Pointer to the area into which to
5927  *                                      return the vnode of the created object.
5928  *              cnp                     Component name pointer from the namei
5929  *                                      data structure, containing the name to
5930  *                                      use for the create object.
5931  *              vap                     Pointer to the vnode_attr structure
5932  *                                      describing the object to be created,
5933  *                                      including the type of object.
5934  *              flags                   VN_* flags controlling ACL inheritance
5935  *                                      and whether or not authorization is to
5936  *                                      be required for the operation.
5937  *
5938  * Returns:     0                       Success
5939  *              !0                      errno value
5940  *
5941  * Implicit:    *vpp                    Contains the vnode of the object that
5942  *                                      was created, if successful.
5943  *              *cnp                    May be modified by the underlying VFS.
5944  *              *vap                    May be modified by the underlying VFS.
5945  *                                      modified by either ACL inheritance or
5946  *
5947  *
5948  *                                      be modified, even if the operation is
5949  *
5950  *
5951  * Notes:       The kauth_filesec_t in 'vap', if any, is in host byte order.
5952  *
5953  *              Modification of '*cnp' and '*vap' by the underlying VFS is
5954  *              strongly discouraged.
5955  *
5956  * XXX:         This function is a 'vn_*' function; it belongs in vfs_vnops.c
5957  *
5958  * XXX:         We should enummerate the possible errno values here, and where
5959  *              in the code they originated.
5960  */
5961 errno_t
5962 vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
5963 {
5964         errno_t error, old_error;
5965         vnode_t vp = (vnode_t)0;
5966         boolean_t batched;
5967         struct componentname *cnp;
5968         uint32_t defaulted;
5969
5970         cnp = &ndp->ni_cnd;
5971         error = 0;
5972         batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
5973
5974         KAUTH_DEBUG("%p    CREATE - '%s'", dvp, cnp->cn_nameptr);
5975
5976         if (flags & VN_CREATE_NOINHERIT) {
5977                 vap->va_vaflags |= VA_NOINHERIT;
5978         }
5979         if (flags & VN_CREATE_NOAUTH) {
5980                 vap->va_vaflags |= VA_NOAUTH;
5981         }
5982         /*
5983          * Handle ACL inheritance, initialize vap.
5984          */
5985         error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
5986         if (error) {
5987                 return error;
5988         }
5989
5990         if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
5991                 panic("Open parameters, but not a regular file.");
5992         }
5993         if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
5994                 panic("Mode for open, but not trying to open...");
5995         }
5996
5997
5998         /*
5999          * Create the requested node.
6000          */
6001         switch (vap->va_type) {
6002         case VREG:
6003                 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
6004                 break;
6005         case VDIR:
6006                 error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
6007                 break;
6008         case VSOCK:
6009         case VFIFO:
6010         case VBLK:
6011         case VCHR:
6012                 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
6013                 break;
6014         default:
6015                 panic("vnode_create: unknown vtype %d", vap->va_type);
6016         }
6017         if (error != 0) {
6018                 KAUTH_DEBUG("%p    CREATE - error %d returned by filesystem", dvp, error);
6019                 goto out;
6020         }
6021
6022         vp = *vpp;
6023         old_error = error;
6024
6025 #if CONFIG_MACF
6026         if (!(flags & VN_CREATE_NOLABEL)) {
6027                 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
6028                 if (error) {
6029                         goto error;
6030                 }
6031         }
6032 #endif
6033
6034         /*
6035          * If some of the requested attributes weren't handled by the VNOP,
6036          * use our fallback code.
6037          */
6038         if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
6039                 KAUTH_DEBUG("     CREATE - doing fallback with ACL %p", vap->va_acl);
6040                 error = vnode_setattr_fallback(*vpp, vap, ctx);
6041         }
6042 #if CONFIG_MACF
6043 error:
6044 #endif
6045         if ((error != 0) && (vp != (vnode_t)0)) {
6046                 /* If we've done a compound open, close */
6047                 if (batched && (old_error == 0) && (vap->va_type == VREG)) {
6048                         VNOP_CLOSE(vp, fmode, ctx);
6049                 }
6050
6051                 /* Need to provide notifications if a create succeeded */
6052                 if (!batched) {
6053                         *vpp = (vnode_t) 0;
6054                         vnode_put(vp);
6055                         vp = NULLVP;
6056                 }
6057         }
6058
6059         /*
6060          * For creation VNOPs, this is the equivalent of
6061          * lookup_handle_found_vnode.
6062          */
6063         if (kdebug_enable && *vpp) {
6064                 kdebug_lookup(*vpp, cnp);
6065         }
6066
6067 out:
6068         vn_attribute_cleanup(vap, defaulted);
6069
6070         return error;
6071 }
6072
6073 static kauth_scope_t    vnode_scope;
6074 static int      vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
6075     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
6076 static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
6077     vnode_t vp, vnode_t dvp, int *errorp);
6078
6079 typedef struct _vnode_authorize_context {
6080         vnode_t         vp;
6081         struct vnode_attr *vap;
6082         vnode_t         dvp;
6083         struct vnode_attr *dvap;
6084         vfs_context_t   ctx;
6085         int             flags;
6086         int             flags_valid;
6087 #define _VAC_IS_OWNER           (1<<0)
6088 #define _VAC_IN_GROUP           (1<<1)
6089 #define _VAC_IS_DIR_OWNER       (1<<2)
6090 #define _VAC_IN_DIR_GROUP       (1<<3)
6091 #define _VAC_NO_VNODE_POINTERS  (1<<4)
6092 } *vauth_ctx;
6093
6094 void
6095 vnode_authorize_init(void)
6096 {
6097         vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
6098 }
6099
6100 #define VATTR_PREPARE_DEFAULTED_UID             0x1
6101 #define VATTR_PREPARE_DEFAULTED_GID             0x2
6102 #define VATTR_PREPARE_DEFAULTED_MODE            0x4
6103
6104 int
6105 vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
6106 {
6107         kauth_acl_t nacl = NULL, oacl = NULL;
6108         int error;
6109
6110         /*
6111          * Handle ACL inheritance.
6112          */
6113         if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
6114                 /* save the original filesec */
6115                 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6116                         oacl = vap->va_acl;
6117                 }
6118
6119                 vap->va_acl = NULL;
6120                 if ((error = kauth_acl_inherit(dvp,
6121                     oacl,
6122                     &nacl,
6123                     vap->va_type == VDIR,
6124                     ctx)) != 0) {
6125                         KAUTH_DEBUG("%p    CREATE - error %d processing inheritance", dvp, error);
6126                         return error;
6127                 }
6128
6129                 /*
6130                  * If the generated ACL is NULL, then we can save ourselves some effort
6131                  * by clearing the active bit.
6132                  */
6133                 if (nacl == NULL) {
6134                         VATTR_CLEAR_ACTIVE(vap, va_acl);
6135                 } else {
6136                         vap->va_base_acl = oacl;
6137                         VATTR_SET(vap, va_acl, nacl);
6138                 }
6139         }
6140
6141         error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
6142         if (error) {
6143                 vn_attribute_cleanup(vap, *defaulted_fieldsp);
6144         }
6145
6146         return error;
6147 }
6148
6149 void
6150 vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
6151 {
6152         /*
6153          * If the caller supplied a filesec in vap, it has been replaced
6154          * now by the post-inheritance copy.  We need to put the original back
6155          * and free the inherited product.
6156          */
6157         kauth_acl_t nacl, oacl;
6158
6159         if (VATTR_IS_ACTIVE(vap, va_acl)) {
6160                 nacl = vap->va_acl;
6161                 oacl = vap->va_base_acl;
6162
6163                 if (oacl) {
6164                         VATTR_SET(vap, va_acl, oacl);
6165                         vap->va_base_acl = NULL;
6166                 } else {
6167                         VATTR_CLEAR_ACTIVE(vap, va_acl);
6168                 }
6169
6170                 if (nacl != NULL) {
6171                         kauth_acl_free(nacl);
6172                 }
6173         }
6174
6175         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
6176                 VATTR_CLEAR_ACTIVE(vap, va_mode);
6177         }
6178         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
6179                 VATTR_CLEAR_ACTIVE(vap, va_gid);
6180         }
6181         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
6182                 VATTR_CLEAR_ACTIVE(vap, va_uid);
6183         }
6184
6185         return;
6186 }
6187
6188 int
6189 vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved)
6190 {
6191 #if !CONFIG_MACF
6192 #pragma unused(cnp)
6193 #endif
6194         int error = 0;
6195
6196         /*
6197          * Normally, unlinking of directories is not supported.
6198          * However, some file systems may have limited support.
6199          */
6200         if ((vp->v_type == VDIR) &&
6201             !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
6202                 return EPERM; /* POSIX */
6203         }
6204
6205         /* authorize the delete operation */
6206 #if CONFIG_MACF
6207         if (!error) {
6208                 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
6209         }
6210 #endif /* MAC */
6211         if (!error) {
6212                 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6213         }
6214
6215         return error;
6216 }
6217
6218 int
6219 vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved)
6220 {
6221         /* Open of existing case */
6222         kauth_action_t action;
6223         int error = 0;
6224         if (cnp->cn_ndp == NULL) {
6225                 panic("NULL ndp");
6226         }
6227         if (reserved != NULL) {
6228                 panic("reserved not NULL.");
6229         }
6230
6231 #if CONFIG_MACF
6232         /* XXX may do duplicate work here, but ignore that for now (idempotent) */
6233         if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
6234                 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
6235                 if (error) {
6236                         return error;
6237                 }
6238         }
6239 #endif
6240
6241         if ((fmode & O_DIRECTORY) && vp->v_type != VDIR) {
6242                 return ENOTDIR;
6243         }
6244
6245         if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
6246                 return EOPNOTSUPP;    /* Operation not supported on socket */
6247         }
6248
6249         if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
6250                 return ELOOP;         /* O_NOFOLLOW was specified and the target is a symbolic link */
6251         }
6252
6253         /* disallow write operations on directories */
6254         if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
6255                 return EISDIR;
6256         }
6257
6258         if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
6259                 if (vp->v_type != VDIR) {
6260                         return ENOTDIR;
6261                 }
6262         }
6263
6264 #if CONFIG_MACF
6265         /* If a file being opened is a shadow file containing
6266          * namedstream data, ignore the macf checks because it
6267          * is a kernel internal file and access should always
6268          * be allowed.
6269          */
6270         if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
6271                 error = mac_vnode_check_open(ctx, vp, fmode);
6272                 if (error) {
6273                         return error;
6274                 }
6275         }
6276 #endif
6277
6278         /* compute action to be authorized */
6279         action = 0;
6280         if (fmode & FREAD) {
6281                 action |= KAUTH_VNODE_READ_DATA;
6282         }
6283         if (fmode & (FWRITE | O_TRUNC)) {
6284                 /*
6285                  * If we are writing, appending, and not truncating,
6286                  * indicate that we are appending so that if the
6287                  * UF_APPEND or SF_APPEND bits are set, we do not deny
6288                  * the open.
6289                  */
6290                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
6291                         action |= KAUTH_VNODE_APPEND_DATA;
6292                 } else {
6293                         action |= KAUTH_VNODE_WRITE_DATA;
6294                 }
6295         }
6296         error = vnode_authorize(vp, NULL, action, ctx);
6297 #if NAMEDSTREAMS
6298         if (error == EACCES) {
6299                 /*
6300                  * Shadow files may exist on-disk with a different UID/GID
6301                  * than that of the current context.  Verify that this file
6302                  * is really a shadow file.  If it was created successfully
6303                  * then it should be authorized.
6304                  */
6305                 if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) {
6306                         error = vnode_verifynamedstream(vp);
6307                 }
6308         }
6309 #endif
6310
6311         return error;
6312 }
6313
6314 int
6315 vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6316 {
6317 #if !CONFIG_MACF
6318 #pragma unused(vap)
6319 #endif
6320         /* Creation case */
6321         int error;
6322
6323         if (cnp->cn_ndp == NULL) {
6324                 panic("NULL cn_ndp");
6325         }
6326         if (reserved != NULL) {
6327                 panic("reserved not NULL.");
6328         }
6329
6330         /* Only validate path for creation if we didn't do a complete lookup */
6331         if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
6332                 error = lookup_validate_creation_path(cnp->cn_ndp);
6333                 if (error) {
6334                         return error;
6335                 }
6336         }
6337
6338 #if CONFIG_MACF
6339         error = mac_vnode_check_create(ctx, dvp, cnp, vap);
6340         if (error) {
6341                 return error;
6342         }
6343 #endif /* CONFIG_MACF */
6344
6345         return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
6346 }
6347
6348 int
6349 vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6350     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6351     vfs_context_t ctx, void *reserved)
6352 {
6353         return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
6354 }
6355
6356 int
6357 vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6358     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6359     vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6360 {
6361         return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
6362 }
6363
6364 int
6365 vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path,
6366     struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path,
6367     vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6368 {
6369         int error = 0;
6370         int moving = 0;
6371         bool swap = flags & VFS_RENAME_SWAP;
6372
6373         if (reserved != NULL) {
6374                 panic("Passed something other than NULL as reserved field!");
6375         }
6376
6377         /*
6378          * Avoid renaming "." and "..".
6379          *
6380          * XXX No need to check for this in the FS.  We should always have the leaves
6381          * in VFS in this case.
6382          */
6383         if (fvp->v_type == VDIR &&
6384             ((fdvp == fvp) ||
6385             (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
6386             ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) {
6387                 error = EINVAL;
6388                 goto out;
6389         }
6390
6391         if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
6392                 error = lookup_validate_creation_path(tcnp->cn_ndp);
6393                 if (error) {
6394                         goto out;
6395                 }
6396         }
6397
6398         /***** <MACF> *****/
6399 #if CONFIG_MACF
6400         error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
6401         if (error) {
6402                 goto out;
6403         }
6404         if (swap) {
6405                 error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
6406                 if (error) {
6407                         goto out;
6408                 }
6409         }
6410 #endif
6411         /***** </MACF> *****/
6412
6413         /***** <MiscChecks> *****/
6414         if (tvp != NULL) {
6415                 if (!swap) {
6416                         if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
6417                                 error = ENOTDIR;
6418                                 goto out;
6419                         } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
6420                                 error = EISDIR;
6421                                 goto out;
6422                         }
6423                 }
6424         } else if (swap) {
6425                 /*
6426                  * Caller should have already checked this and returned
6427                  * ENOENT.  If we send back ENOENT here, caller will retry
6428                  * which isn't what we want so we send back EINVAL here
6429                  * instead.
6430                  */
6431                 error = EINVAL;
6432                 goto out;
6433         }
6434
6435         if (fvp == tdvp) {
6436                 error = EINVAL;
6437                 goto out;
6438         }
6439
6440         /*
6441          * The following edge case is caught here:
6442          * (to cannot be a descendent of from)
6443          *
6444          *       o fdvp
6445          *      /
6446          *     /
6447          *    o fvp
6448          *     \
6449          *      \
6450          *       o tdvp
6451          *      /
6452          *     /
6453          *    o tvp
6454          */
6455         if (tdvp->v_parent == fvp) {
6456                 error = EINVAL;
6457                 goto out;
6458         }
6459
6460         if (swap && fdvp->v_parent == tvp) {
6461                 error = EINVAL;
6462                 goto out;
6463         }
6464         /***** </MiscChecks> *****/
6465
6466         /***** <Kauth> *****/
6467
6468         /*
6469          * As part of the Kauth step, we call out to allow 3rd-party
6470          * fileop notification of "about to rename".  This is needed
6471          * in the event that 3rd-parties need to know that the DELETE
6472          * authorization is actually part of a rename.  It's important
6473          * that we guarantee that the DELETE call-out will always be
6474          * made if the WILL_RENAME call-out is made.  Another fileop
6475          * call-out will be performed once the operation is completed.
6476          * We can ignore the result of kauth_authorize_fileop().
6477          *
6478          * N.B. We are passing the vnode and *both* paths to each
6479          * call; kauth_authorize_fileop() extracts the "from" path
6480          * when posting a KAUTH_FILEOP_WILL_RENAME notification.
6481          * As such, we only post these notifications if all of the
6482          * information we need is provided.
6483          */
6484
6485         if (swap) {
6486                 kauth_action_t f = 0, t = 0;
6487
6488                 /*
6489                  * Directories changing parents need ...ADD_SUBDIR...  to
6490                  * permit changing ".."
6491                  */
6492                 if (fdvp != tdvp) {
6493                         if (vnode_isdir(fvp)) {
6494                                 f = KAUTH_VNODE_ADD_SUBDIRECTORY;
6495                         }
6496                         if (vnode_isdir(tvp)) {
6497                                 t = KAUTH_VNODE_ADD_SUBDIRECTORY;
6498                         }
6499                 }
6500                 if (to_path != NULL) {
6501                         kauth_authorize_fileop(vfs_context_ucred(ctx),
6502                             KAUTH_FILEOP_WILL_RENAME,
6503                             (uintptr_t)fvp,
6504                             (uintptr_t)to_path);
6505                 }
6506                 error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
6507                 if (error) {
6508                         goto out;
6509                 }
6510                 if (from_path != NULL) {
6511                         kauth_authorize_fileop(vfs_context_ucred(ctx),
6512                             KAUTH_FILEOP_WILL_RENAME,
6513                             (uintptr_t)tvp,
6514                             (uintptr_t)from_path);
6515                 }
6516                 error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
6517                 if (error) {
6518                         goto out;
6519                 }
6520                 f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6521                 t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6522                 if (fdvp == tdvp) {
6523                         error = vnode_authorize(fdvp, NULL, f | t, ctx);
6524                 } else {
6525                         error = vnode_authorize(fdvp, NULL, t, ctx);
6526                         if (error) {
6527                                 goto out;
6528                         }
6529                         error = vnode_authorize(tdvp, NULL, f, ctx);
6530                 }
6531                 if (error) {
6532                         goto out;
6533                 }
6534         } else {
6535                 error = 0;
6536                 if ((tvp != NULL) && vnode_isdir(tvp)) {
6537                         if (tvp != fdvp) {
6538                                 moving = 1;
6539                         }
6540                 } else if (tdvp != fdvp) {
6541                         moving = 1;
6542                 }
6543
6544                 /*
6545                  * must have delete rights to remove the old name even in
6546                  * the simple case of fdvp == tdvp.
6547                  *
6548                  * If fvp is a directory, and we are changing it's parent,
6549                  * then we also need rights to rewrite its ".." entry as well.
6550                  */
6551                 if (to_path != NULL) {
6552                         kauth_authorize_fileop(vfs_context_ucred(ctx),
6553                             KAUTH_FILEOP_WILL_RENAME,
6554                             (uintptr_t)fvp,
6555                             (uintptr_t)to_path);
6556                 }
6557                 if (vnode_isdir(fvp)) {
6558                         if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6559                                 goto out;
6560                         }
6561                 } else {
6562                         if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
6563                                 goto out;
6564                         }
6565                 }
6566                 if (moving) {
6567                         /* moving into tdvp or tvp, must have rights to add */
6568                         if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
6569                             NULL,
6570                             vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
6571                             ctx)) != 0) {
6572                                 goto out;
6573                         }
6574                 } else {
6575                         /* node staying in same directory, must be allowed to add new name */
6576                         if ((error = vnode_authorize(fdvp, NULL,
6577                             vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
6578                                 goto out;
6579                         }
6580                 }
6581                 /* overwriting tvp */
6582                 if ((tvp != NULL) && !vnode_isdir(tvp) &&
6583                     ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
6584                         goto out;
6585                 }
6586         }
6587
6588         /***** </Kauth> *****/
6589
6590         /* XXX more checks? */
6591 out:
6592         return error;
6593 }
6594
6595 int
6596 vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6597 {
6598 #if !CONFIG_MACF
6599 #pragma unused(vap)
6600 #endif
6601         int error;
6602
6603         if (reserved != NULL) {
6604                 panic("reserved not NULL in vn_authorize_mkdir()");
6605         }
6606
6607         /* XXX A hack for now, to make shadow files work */
6608         if (cnp->cn_ndp == NULL) {
6609                 return 0;
6610         }
6611
6612         if (vnode_compound_mkdir_available(dvp)) {
6613                 error = lookup_validate_creation_path(cnp->cn_ndp);
6614                 if (error) {
6615                         goto out;
6616                 }
6617         }
6618
6619 #if CONFIG_MACF
6620         error = mac_vnode_check_create(ctx,
6621             dvp, cnp, vap);
6622         if (error) {
6623                 goto out;
6624         }
6625 #endif
6626
6627         /* authorize addition of a directory to the parent */
6628         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6629                 goto out;
6630         }
6631
6632 out:
6633         return error;
6634 }
6635
6636 int
6637 vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved)
6638 {
6639 #if CONFIG_MACF
6640         int error;
6641 #else
6642 #pragma unused(cnp)
6643 #endif
6644         if (reserved != NULL) {
6645                 panic("Non-NULL reserved argument to vn_authorize_rmdir()");
6646         }
6647
6648         if (vp->v_type != VDIR) {
6649                 /*
6650                  * rmdir only deals with directories
6651                  */
6652                 return ENOTDIR;
6653         }
6654
6655         if (dvp == vp) {
6656                 /*
6657                  * No rmdir "." please.
6658                  */
6659                 return EINVAL;
6660         }
6661
6662 #if CONFIG_MACF
6663         error = mac_vnode_check_unlink(ctx, dvp,
6664             vp, cnp);
6665         if (error) {
6666                 return error;
6667         }
6668 #endif
6669
6670         return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6671 }
6672
6673 /*
6674  * Authorizer for directory cloning. This does not use vnodes but instead
6675  * uses prefilled vnode attributes from the filesystem.
6676  *
6677  * The same function is called to set up the attributes required, perform the
6678  * authorization and cleanup (if required)
6679  */
6680 int
6681 vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
6682     struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
6683     dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
6684     __unused void *reserved)
6685 {
6686         int error;
6687         int is_suser = vfs_context_issuser(ctx);
6688
6689         if (vattr_op == OP_VATTR_SETUP) {
6690                 VATTR_INIT(vap);
6691
6692                 /*
6693                  * When ACL inheritence is implemented, both vap->va_acl and
6694                  * dvap->va_acl will be required (even as superuser).
6695                  */
6696                 VATTR_WANTED(vap, va_type);
6697                 VATTR_WANTED(vap, va_mode);
6698                 VATTR_WANTED(vap, va_flags);
6699                 VATTR_WANTED(vap, va_uid);
6700                 VATTR_WANTED(vap, va_gid);
6701                 if (dvap) {
6702                         VATTR_INIT(dvap);
6703                         VATTR_WANTED(dvap, va_flags);
6704                 }
6705
6706                 if (!is_suser) {
6707                         /*
6708                          * If not superuser, we have to evaluate ACLs and
6709                          * need the target directory gid to set the initial
6710                          * gid of the new object.
6711                          */
6712                         VATTR_WANTED(vap, va_acl);
6713                         if (dvap) {
6714                                 VATTR_WANTED(dvap, va_gid);
6715                         }
6716                 } else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6717                         VATTR_WANTED(dvap, va_gid);
6718                 }
6719                 return 0;
6720         } else if (vattr_op == OP_VATTR_CLEANUP) {
6721                 return 0; /* Nothing to do for now */
6722         }
6723
6724         /* dvap isn't used for authorization */
6725         error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
6726
6727         if (error) {
6728                 return error;
6729         }
6730
6731         /*
6732          * vn_attribute_prepare should be able to accept attributes as well as
6733          * vnodes but for now we do this inline.
6734          */
6735         if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6736                 /*
6737                  * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
6738                  * owner is set, that owner takes ownership of all new files.
6739                  */
6740                 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6741                     (mp->mnt_fsowner != KAUTH_UID_NONE)) {
6742                         VATTR_SET(vap, va_uid, mp->mnt_fsowner);
6743                 } else {
6744                         /* default owner is current user */
6745                         VATTR_SET(vap, va_uid,
6746                             kauth_cred_getuid(vfs_context_ucred(ctx)));
6747                 }
6748
6749                 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6750                     (mp->mnt_fsgroup != KAUTH_GID_NONE)) {
6751                         VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
6752                 } else {
6753                         /*
6754                          * default group comes from parent object,
6755                          * fallback to current user
6756                          */
6757                         if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
6758                                 VATTR_SET(vap, va_gid, dvap->va_gid);
6759                         } else {
6760                                 VATTR_SET(vap, va_gid,
6761                                     kauth_cred_getgid(vfs_context_ucred(ctx)));
6762                         }
6763                 }
6764         }
6765
6766         /* Inherit SF_RESTRICTED bit from destination directory only */
6767         if (VATTR_IS_ACTIVE(vap, va_flags)) {
6768                 VATTR_SET(vap, va_flags,
6769                     ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */
6770                 if (VATTR_IS_ACTIVE(dvap, va_flags)) {
6771                         VATTR_SET(vap, va_flags,
6772                             vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
6773                 }
6774         } else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
6775                 VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
6776         }
6777
6778         return 0;
6779 }
6780
6781
6782 /*
6783  * Authorize an operation on a vnode.
6784  *
6785  * This is KPI, but here because it needs vnode_scope.
6786  *
6787  * Returns:     0                       Success
6788  *      kauth_authorize_action:EPERM    ...
6789  *      xlate => EACCES                 Permission denied
6790  *      kauth_authorize_action:0        Success
6791  *      kauth_authorize_action:         Depends on callback return; this is
6792  *                                      usually only vnode_authorize_callback(),
6793  *                                      but may include other listerners, if any
6794  *                                      exist.
6795  *              EROFS
6796  *              EACCES
6797  *              EPERM
6798  *              ???
6799  */
6800 int
6801 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
6802 {
6803         int     error, result;
6804
6805         /*
6806          * We can't authorize against a dead vnode; allow all operations through so that
6807          * the correct error can be returned.
6808          */
6809         if (vp->v_type == VBAD) {
6810                 return 0;
6811         }
6812
6813         error = 0;
6814         result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
6815             (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
6816         if (result == EPERM) {          /* traditional behaviour */
6817                 result = EACCES;
6818         }
6819         /* did the lower layers give a better error return? */
6820         if ((result != 0) && (error != 0)) {
6821                 return error;
6822         }
6823         return result;
6824 }
6825
6826 /*
6827  * Test for vnode immutability.
6828  *
6829  * The 'append' flag is set when the authorization request is constrained
6830  * to operations which only request the right to append to a file.
6831  *
6832  * The 'ignore' flag is set when an operation modifying the immutability flags
6833  * is being authorized.  We check the system securelevel to determine which
6834  * immutability flags we can ignore.
6835  */
6836 static int
6837 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
6838 {
6839         int     mask;
6840
6841         /* start with all bits precluding the operation */
6842         mask = IMMUTABLE | APPEND;
6843
6844         /* if appending only, remove the append-only bits */
6845         if (append) {
6846                 mask &= ~APPEND;
6847         }
6848
6849         /* ignore only set when authorizing flags changes */
6850         if (ignore) {
6851                 if (securelevel <= 0) {
6852                         /* in insecure state, flags do not inhibit changes */
6853                         mask = 0;
6854                 } else {
6855                         /* in secure state, user flags don't inhibit */
6856                         mask &= ~(UF_IMMUTABLE | UF_APPEND);
6857                 }
6858         }
6859         KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
6860         if ((vap->va_flags & mask) != 0) {
6861                 return EPERM;
6862         }
6863         return 0;
6864 }
6865
6866 static int
6867 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
6868 {
6869         int result;
6870
6871         /* default assumption is not-owner */
6872         result = 0;
6873
6874         /*
6875          * If the filesystem has given us a UID, we treat this as authoritative.
6876          */
6877         if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
6878                 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
6879         }
6880         /* we could test the owner UUID here if we had a policy for it */
6881
6882         return result;
6883 }
6884
6885 /*
6886  * vauth_node_group
6887  *
6888  * Description: Ask if a cred is a member of the group owning the vnode object
6889  *
6890  * Parameters:          vap             vnode attribute
6891  *                              vap->va_gid     group owner of vnode object
6892  *                      cred            credential to check
6893  *                      ismember        pointer to where to put the answer
6894  *                      idontknow       Return this if we can't get an answer
6895  *
6896  * Returns:             0               Success
6897  *                      idontknow       Can't get information
6898  *      kauth_cred_ismember_gid:?       Error from kauth subsystem
6899  *      kauth_cred_ismember_gid:?       Error from kauth subsystem
6900  */
6901 static int
6902 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow)
6903 {
6904         int     error;
6905         int     result;
6906
6907         error = 0;
6908         result = 0;
6909
6910         /*
6911          * The caller is expected to have asked the filesystem for a group
6912          * at some point prior to calling this function.  The answer may
6913          * have been that there is no group ownership supported for the
6914          * vnode object, in which case we return
6915          */
6916         if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
6917                 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
6918                 /*
6919                  * Credentials which are opted into external group membership
6920                  * resolution which are not known to the external resolver
6921                  * will result in an ENOENT error.  We translate this into
6922                  * the appropriate 'idontknow' response for our caller.
6923                  *
6924                  * XXX We do not make a distinction here between an ENOENT
6925                  * XXX arising from a response from the external resolver,
6926                  * XXX and an ENOENT which is internally generated.  This is
6927                  * XXX a deficiency of the published kauth_cred_ismember_gid()
6928                  * XXX KPI which can not be overcome without new KPI.  For
6929                  * XXX all currently known cases, however, this wil result
6930                  * XXX in correct behaviour.
6931                  */
6932                 if (error == ENOENT) {
6933                         error = idontknow;
6934                 }
6935         }
6936         /*
6937          * XXX We could test the group UUID here if we had a policy for it,
6938          * XXX but this is problematic from the perspective of synchronizing
6939          * XXX group UUID and POSIX GID ownership of a file and keeping the
6940          * XXX values coherent over time.  The problem is that the local
6941          * XXX system will vend transient group UUIDs for unknown POSIX GID
6942          * XXX values, and these are not persistent, whereas storage of values
6943          * XXX is persistent.  One potential solution to this is a local
6944          * XXX (persistent) replica of remote directory entries and vended
6945          * XXX local ids in a local directory server (think in terms of a
6946          * XXX caching DNS server).
6947          */
6948
6949         if (!error) {
6950                 *ismember = result;
6951         }
6952         return error;
6953 }
6954
6955 static int
6956 vauth_file_owner(vauth_ctx vcp)
6957 {
6958         int result;
6959
6960         if (vcp->flags_valid & _VAC_IS_OWNER) {
6961                 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
6962         } else {
6963                 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
6964
6965                 /* cache our result */
6966                 vcp->flags_valid |= _VAC_IS_OWNER;
6967                 if (result) {
6968                         vcp->flags |= _VAC_IS_OWNER;
6969                 } else {
6970                         vcp->flags &= ~_VAC_IS_OWNER;
6971                 }
6972         }
6973         return result;
6974 }
6975
6976
6977 /*
6978  * vauth_file_ingroup
6979  *
6980  * Description: Ask if a user is a member of the group owning the directory
6981  *
6982  * Parameters:          vcp             The vnode authorization context that
6983  *                                      contains the user and directory info
6984  *                              vcp->flags_valid        Valid flags
6985  *                              vcp->flags              Flags values
6986  *                              vcp->vap                File vnode attributes
6987  *                              vcp->ctx                VFS Context (for user)
6988  *                      ismember        pointer to where to put the answer
6989  *                      idontknow       Return this if we can't get an answer
6990  *
6991  * Returns:             0               Success
6992  *              vauth_node_group:?      Error from vauth_node_group()
6993  *
6994  * Implicit returns:    *ismember       0       The user is not a group member
6995  *                                      1       The user is a group member
6996  */
6997 static int
6998 vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
6999 {
7000         int     error;
7001
7002         /* Check for a cached answer first, to avoid the check if possible */
7003         if (vcp->flags_valid & _VAC_IN_GROUP) {
7004                 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
7005                 error = 0;
7006         } else {
7007                 /* Otherwise, go look for it */
7008                 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
7009
7010                 if (!error) {
7011                         /* cache our result */
7012                         vcp->flags_valid |= _VAC_IN_GROUP;
7013                         if (*ismember) {
7014                                 vcp->flags |= _VAC_IN_GROUP;
7015                         } else {
7016                                 vcp->flags &= ~_VAC_IN_GROUP;
7017                         }
7018                 }
7019         }
7020         return error;
7021 }
7022
7023 static int
7024 vauth_dir_owner(vauth_ctx vcp)
7025 {
7026         int result;
7027
7028         if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
7029                 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
7030         } else {
7031                 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
7032
7033                 /* cache our result */
7034                 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
7035                 if (result) {
7036                         vcp->flags |= _VAC_IS_DIR_OWNER;
7037                 } else {
7038                         vcp->flags &= ~_VAC_IS_DIR_OWNER;
7039                 }
7040         }
7041         return result;
7042 }
7043
7044 /*
7045  * vauth_dir_ingroup
7046  *
7047  * Description: Ask if a user is a member of the group owning the directory
7048  *
7049  * Parameters:          vcp             The vnode authorization context that
7050  *                                      contains the user and directory info
7051  *                              vcp->flags_valid        Valid flags
7052  *                              vcp->flags              Flags values
7053  *                              vcp->dvap               Dir vnode attributes
7054  *                              vcp->ctx                VFS Context (for user)
7055  *                      ismember        pointer to where to put the answer
7056  *                      idontknow       Return this if we can't get an answer
7057  *
7058  * Returns:             0               Success
7059  *              vauth_node_group:?      Error from vauth_node_group()
7060  *
7061  * Implicit returns:    *ismember       0       The user is not a group member
7062  *                                      1       The user is a group member
7063  */
7064 static int
7065 vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
7066 {
7067         int     error;
7068
7069         /* Check for a cached answer first, to avoid the check if possible */
7070         if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
7071                 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
7072                 error = 0;
7073         } else {
7074                 /* Otherwise, go look for it */
7075                 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
7076
7077                 if (!error) {
7078                         /* cache our result */
7079                         vcp->flags_valid |= _VAC_IN_DIR_GROUP;
7080                         if (*ismember) {
7081                                 vcp->flags |= _VAC_IN_DIR_GROUP;
7082                         } else {
7083                                 vcp->flags &= ~_VAC_IN_DIR_GROUP;
7084                         }
7085                 }
7086         }
7087         return error;
7088 }
7089
7090 /*
7091  * Test the posix permissions in (vap) to determine whether (credential)
7092  * may perform (action)
7093  */
7094 static int
7095 vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
7096 {
7097         struct vnode_attr *vap;
7098         int needed, error, owner_ok, group_ok, world_ok, ismember;
7099 #ifdef KAUTH_DEBUG_ENABLE
7100         const char *where = "uninitialized";
7101 # define _SETWHERE(c)   where = c;
7102 #else
7103 # define _SETWHERE(c)
7104 #endif
7105
7106         /* checking file or directory? */
7107         if (on_dir) {
7108                 vap = vcp->dvap;
7109         } else {
7110                 vap = vcp->vap;
7111         }
7112
7113         error = 0;
7114
7115         /*
7116          * We want to do as little work here as possible.  So first we check
7117          * which sets of permissions grant us the access we need, and avoid checking
7118          * whether specific permissions grant access when more generic ones would.
7119          */
7120
7121         /* owner permissions */
7122         needed = 0;
7123         if (action & VREAD) {
7124                 needed |= S_IRUSR;
7125         }
7126         if (action & VWRITE) {
7127                 needed |= S_IWUSR;
7128         }
7129         if (action & VEXEC) {
7130                 needed |= S_IXUSR;
7131         }
7132         owner_ok = (needed & vap->va_mode) == needed;
7133
7134         /* group permissions */
7135         needed = 0;
7136         if (action & VREAD) {
7137                 needed |= S_IRGRP;
7138         }
7139         if (action & VWRITE) {
7140                 needed |= S_IWGRP;
7141         }
7142         if (action & VEXEC) {
7143                 needed |= S_IXGRP;
7144         }
7145         group_ok = (needed & vap->va_mode) == needed;
7146
7147         /* world permissions */
7148         needed = 0;
7149         if (action & VREAD) {
7150                 needed |= S_IROTH;
7151         }
7152         if (action & VWRITE) {
7153                 needed |= S_IWOTH;
7154         }
7155         if (action & VEXEC) {
7156                 needed |= S_IXOTH;
7157         }
7158         world_ok = (needed & vap->va_mode) == needed;
7159
7160         /* If granted/denied by all three, we're done */
7161         if (owner_ok && group_ok && world_ok) {
7162                 _SETWHERE("all");
7163                 goto out;
7164         }
7165         if (!owner_ok && !group_ok && !world_ok) {
7166                 _SETWHERE("all");
7167                 error = EACCES;
7168                 goto out;
7169         }
7170
7171         /* Check ownership (relatively cheap) */
7172         if ((on_dir && vauth_dir_owner(vcp)) ||
7173             (!on_dir && vauth_file_owner(vcp))) {
7174                 _SETWHERE("user");
7175                 if (!owner_ok) {
7176                         error = EACCES;
7177                 }
7178                 goto out;
7179         }
7180
7181         /* Not owner; if group and world both grant it we're done */
7182         if (group_ok && world_ok) {
7183                 _SETWHERE("group/world");
7184                 goto out;
7185         }
7186         if (!group_ok && !world_ok) {
7187                 _SETWHERE("group/world");
7188                 error = EACCES;
7189                 goto out;
7190         }
7191
7192         /* Check group membership (most expensive) */
7193         ismember = 0;   /* Default to allow, if the target has no group owner */
7194
7195         /*
7196          * In the case we can't get an answer about the user from the call to
7197          * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
7198          * the side of caution, rather than simply granting access, or we will
7199          * fail to correctly implement exclusion groups, so we set the third
7200          * parameter on the basis of the state of 'group_ok'.
7201          */
7202         if (on_dir) {
7203                 error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
7204         } else {
7205                 error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
7206         }
7207         if (error) {
7208                 if (!group_ok) {
7209                         ismember = 1;
7210                 }
7211                 error = 0;
7212         }
7213         if (ismember) {
7214                 _SETWHERE("group");
7215                 if (!group_ok) {
7216                         error = EACCES;
7217                 }
7218                 goto out;
7219         }
7220
7221         /* Not owner, not in group, use world result */
7222         _SETWHERE("world");
7223         if (!world_ok) {
7224                 error = EACCES;
7225         }
7226
7227         /* FALLTHROUGH */
7228
7229 out:
7230         KAUTH_DEBUG("%p    %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
7231             vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
7232             (action & VREAD)  ? "r" : "-",
7233             (action & VWRITE) ? "w" : "-",
7234             (action & VEXEC)  ? "x" : "-",
7235             needed,
7236             (vap->va_mode & S_IRUSR) ? "r" : "-",
7237             (vap->va_mode & S_IWUSR) ? "w" : "-",
7238             (vap->va_mode & S_IXUSR) ? "x" : "-",
7239             (vap->va_mode & S_IRGRP) ? "r" : "-",
7240             (vap->va_mode & S_IWGRP) ? "w" : "-",
7241             (vap->va_mode & S_IXGRP) ? "x" : "-",
7242             (vap->va_mode & S_IROTH) ? "r" : "-",
7243             (vap->va_mode & S_IWOTH) ? "w" : "-",
7244             (vap->va_mode & S_IXOTH) ? "x" : "-",
7245             kauth_cred_getuid(vcp->ctx->vc_ucred),
7246             on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
7247             on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
7248         return error;
7249 }
7250
7251 /*
7252  * Authorize the deletion of the node vp from the directory dvp.
7253  *
7254  * We assume that:
7255  * - Neither the node nor the directory are immutable.
7256  * - The user is not the superuser.
7257  *
7258  * The precedence of factors for authorizing or denying delete for a credential
7259  *
7260  * 1) Explicit ACE on the node. (allow or deny DELETE)
7261  * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
7262  *
7263  *    If there are conflicting ACEs on the node and the directory, the node
7264  *    ACE wins.
7265  *
7266  * 3) Sticky bit on the directory.
7267  *    Deletion is not permitted if the directory is sticky and the caller is
7268  *    not owner of the node or directory. The sticky bit rules are like a deny
7269  *    delete ACE except lower in priority than ACL's either allowing or denying
7270  *    delete.
7271  *
7272  * 4) POSIX permisions on the directory.
7273  *
7274  * As an optimization, we cache whether or not delete child is permitted
7275  * on directories. This enables us to skip directory ACL and POSIX checks
7276  * as we already have the result from those checks. However, we always check the
7277  * node ACL and, if the directory has the sticky bit set, we always check its
7278  * ACL (even for a directory with an authorized delete child). Furthermore,
7279  * caching the delete child authorization is independent of the sticky bit
7280  * being set as it is only applicable in determining whether the node can be
7281  * deleted or not.
7282  */
7283 static int
7284 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
7285 {
7286         struct vnode_attr       *vap = vcp->vap;
7287         struct vnode_attr       *dvap = vcp->dvap;
7288         kauth_cred_t            cred = vcp->ctx->vc_ucred;
7289         struct kauth_acl_eval   eval;
7290         int                     error, ismember;
7291
7292         /* Check the ACL on the node first */
7293         if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7294                 eval.ae_requested = KAUTH_VNODE_DELETE;
7295                 eval.ae_acl = &vap->va_acl->acl_ace[0];
7296                 eval.ae_count = vap->va_acl->acl_entrycount;
7297                 eval.ae_options = 0;
7298                 if (vauth_file_owner(vcp)) {
7299                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7300                 }
7301                 /*
7302                  * We use ENOENT as a marker to indicate we could not get
7303                  * information in order to delay evaluation until after we
7304                  * have the ACL evaluation answer.  Previously, we would
7305                  * always deny the operation at this point.
7306                  */
7307                 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7308                         return error;
7309                 }
7310                 if (error == ENOENT) {
7311                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7312                 } else if (ismember) {
7313                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7314                 }
7315                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7316                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7317                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7318                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7319
7320                 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
7321                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
7322                         return error;
7323                 }
7324
7325                 switch (eval.ae_result) {
7326                 case KAUTH_RESULT_DENY:
7327                         KAUTH_DEBUG("%p    DENIED - denied by ACL", vcp->vp);
7328                         return EACCES;
7329                 case KAUTH_RESULT_ALLOW:
7330                         KAUTH_DEBUG("%p    ALLOWED - granted by ACL", vcp->vp);
7331                         return 0;
7332                 case KAUTH_RESULT_DEFER:
7333                 default:
7334                         /* Defer to directory */
7335                         KAUTH_DEBUG("%p    DEFERRED - by file ACL", vcp->vp);
7336                         break;
7337                 }
7338         }
7339
7340         /*
7341          * Without a sticky bit, a previously authorized delete child is
7342          * sufficient to authorize this delete.
7343          *
7344          * If the sticky bit is set, a directory ACL which allows delete child
7345          * overrides a (potential) sticky bit deny. The authorized delete child
7346          * cannot tell us if it was authorized because of an explicit delete
7347          * child allow ACE or because of POSIX permisions so we have to check
7348          * the directory ACL everytime if the directory has a sticky bit.
7349          */
7350         if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
7351                 KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
7352                 return 0;
7353         }
7354
7355         /* check the ACL on the directory */
7356         if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
7357                 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
7358                 eval.ae_acl = &dvap->va_acl->acl_ace[0];
7359                 eval.ae_count = dvap->va_acl->acl_entrycount;
7360                 eval.ae_options = 0;
7361                 if (vauth_dir_owner(vcp)) {
7362                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7363                 }
7364                 /*
7365                  * We use ENOENT as a marker to indicate we could not get
7366                  * information in order to delay evaluation until after we
7367                  * have the ACL evaluation answer.  Previously, we would
7368                  * always deny the operation at this point.
7369                  */
7370                 if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7371                         return error;
7372                 }
7373                 if (error == ENOENT) {
7374                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7375                 } else if (ismember) {
7376                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7377                 }
7378                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7379                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7380                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7381                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7382
7383                 /*
7384                  * If there is no entry, we are going to defer to other
7385                  * authorization mechanisms.
7386                  */
7387                 error = kauth_acl_evaluate(cred, &eval);
7388
7389                 if (error != 0) {
7390                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
7391                         return error;
7392                 }
7393                 switch (eval.ae_result) {
7394                 case KAUTH_RESULT_DENY:
7395                         KAUTH_DEBUG("%p    DENIED - denied by directory ACL", vcp->vp);
7396                         return EACCES;
7397                 case KAUTH_RESULT_ALLOW:
7398                         KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL", vcp->vp);
7399                         if (!cached_delete_child && vcp->dvp) {
7400                                 vnode_cache_authorized_action(vcp->dvp,
7401                                     vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
7402                         }
7403                         return 0;
7404                 case KAUTH_RESULT_DEFER:
7405                 default:
7406                         /* Deferred by directory ACL */
7407                         KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
7408                         break;
7409                 }
7410         }
7411
7412         /*
7413          * From this point, we can't explicitly allow and if we reach the end
7414          * of the function without a denial, then the delete is authorized.
7415          */
7416         if (!cached_delete_child) {
7417                 if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
7418                         KAUTH_DEBUG("%p    DENIED - denied by posix permisssions", vcp->vp);
7419                         return EACCES;
7420                 }
7421                 /*
7422                  * Cache the authorized action on the vnode if allowed by the
7423                  * directory ACL or POSIX permissions. It is correct to cache
7424                  * this action even if sticky bit would deny deleting the node.
7425                  */
7426                 if (vcp->dvp) {
7427                         vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
7428                             KAUTH_VNODE_DELETE_CHILD);
7429                 }
7430         }
7431
7432         /* enforce sticky bit behaviour */
7433         if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
7434                 KAUTH_DEBUG("%p    DENIED - sticky bit rules (user %d  file %d  dir %d)",
7435                     vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
7436                 return EACCES;
7437         }
7438
7439         /* not denied, must be OK */
7440         return 0;
7441 }
7442
7443
7444 /*
7445  * Authorize an operation based on the node's attributes.
7446  */
7447 static int
7448 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
7449 {
7450         struct vnode_attr       *vap = vcp->vap;
7451         kauth_cred_t            cred = vcp->ctx->vc_ucred;
7452         struct kauth_acl_eval   eval;
7453         int                     error, ismember;
7454         mode_t                  posix_action;
7455
7456         /*
7457          * If we are the file owner, we automatically have some rights.
7458          *
7459          * Do we need to expand this to support group ownership?
7460          */
7461         if (vauth_file_owner(vcp)) {
7462                 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
7463         }
7464
7465         /*
7466          * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
7467          * mask the latter.  If TAKE_OWNERSHIP is requested the caller is about to
7468          * change ownership to themselves, and WRITE_SECURITY is implicitly
7469          * granted to the owner.  We need to do this because at this point
7470          * WRITE_SECURITY may not be granted as the caller is not currently
7471          * the owner.
7472          */
7473         if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
7474             (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) {
7475                 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
7476         }
7477
7478         if (acl_rights == 0) {
7479                 KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
7480                 return 0;
7481         }
7482
7483         /* if we have an ACL, evaluate it */
7484         if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7485                 eval.ae_requested = acl_rights;
7486                 eval.ae_acl = &vap->va_acl->acl_ace[0];
7487                 eval.ae_count = vap->va_acl->acl_entrycount;
7488                 eval.ae_options = 0;
7489                 if (vauth_file_owner(vcp)) {
7490                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7491                 }
7492                 /*
7493                  * We use ENOENT as a marker to indicate we could not get
7494                  * information in order to delay evaluation until after we
7495                  * have the ACL evaluation answer.  Previously, we would
7496                  * always deny the operation at this point.
7497                  */
7498                 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7499                         return error;
7500                 }
7501                 if (error == ENOENT) {
7502                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7503                 } else if (ismember) {
7504                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7505                 }
7506                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7507                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7508                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7509                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7510
7511                 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
7512                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
7513                         return error;
7514                 }
7515
7516                 switch (eval.ae_result) {
7517                 case KAUTH_RESULT_DENY:
7518                         KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
7519                         return EACCES;         /* deny, deny, counter-allege */
7520                 case KAUTH_RESULT_ALLOW:
7521                         KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
7522                         return 0;
7523                 case KAUTH_RESULT_DEFER:
7524                 default:
7525                         /* Effectively the same as !delete_child_denied */
7526                         KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
7527                         break;
7528                 }
7529
7530                 *found_deny = eval.ae_found_deny;
7531
7532                 /* fall through and evaluate residual rights */
7533         } else {
7534                 /* no ACL, everything is residual */
7535                 eval.ae_residual = acl_rights;
7536         }
7537
7538         /*
7539          * Grant residual rights that have been pre-authorized.
7540          */
7541         eval.ae_residual &= ~preauth_rights;
7542
7543         /*
7544          * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
7545          */
7546         if (vauth_file_owner(vcp)) {
7547                 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
7548         }
7549
7550         if (eval.ae_residual == 0) {
7551                 KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
7552                 return 0;
7553         }
7554
7555         /*
7556          * Bail if we have residual rights that can't be granted by posix permissions,
7557          * or aren't presumed granted at this point.
7558          *
7559          * XXX these can be collapsed for performance
7560          */
7561         if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
7562                 KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
7563                 return EACCES;
7564         }
7565         if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
7566                 KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
7567                 return EACCES;
7568         }
7569
7570 #if DIAGNOSTIC
7571         if (eval.ae_residual & KAUTH_VNODE_DELETE) {
7572                 panic("vnode_authorize: can't be checking delete permission here");
7573         }
7574 #endif
7575
7576         /*
7577          * Compute the fallback posix permissions that will satisfy the remaining
7578          * rights.
7579          */
7580         posix_action = 0;
7581         if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
7582             KAUTH_VNODE_LIST_DIRECTORY |
7583             KAUTH_VNODE_READ_EXTATTRIBUTES)) {
7584                 posix_action |= VREAD;
7585         }
7586         if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
7587             KAUTH_VNODE_ADD_FILE |
7588             KAUTH_VNODE_ADD_SUBDIRECTORY |
7589             KAUTH_VNODE_DELETE_CHILD |
7590             KAUTH_VNODE_WRITE_ATTRIBUTES |
7591             KAUTH_VNODE_WRITE_EXTATTRIBUTES)) {
7592                 posix_action |= VWRITE;
7593         }
7594         if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
7595             KAUTH_VNODE_SEARCH)) {
7596                 posix_action |= VEXEC;
7597         }
7598
7599         if (posix_action != 0) {
7600                 return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
7601         } else {
7602                 KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
7603                     vcp->vp,
7604                     (eval.ae_residual & KAUTH_VNODE_READ_DATA)
7605                     ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
7606                     (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
7607                     ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
7608                     (eval.ae_residual & KAUTH_VNODE_EXECUTE)
7609                     ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
7610                     (eval.ae_residual & KAUTH_VNODE_DELETE)
7611                     ? " DELETE" : "",
7612                     (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
7613                     ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
7614                     (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
7615                     ? " DELETE_CHILD" : "",
7616                     (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
7617                     ? " READ_ATTRIBUTES" : "",
7618                     (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
7619                     ? " WRITE_ATTRIBUTES" : "",
7620                     (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
7621                     ? " READ_EXTATTRIBUTES" : "",
7622                     (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
7623                     ? " WRITE_EXTATTRIBUTES" : "",
7624                     (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
7625                     ? " READ_SECURITY" : "",
7626                     (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
7627                     ? " WRITE_SECURITY" : "",
7628                     (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
7629                     ? " CHECKIMMUTABLE" : "",
7630                     (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
7631                     ? " CHANGE_OWNER" : "");
7632         }
7633
7634         /*
7635          * Lack of required Posix permissions implies no reason to deny access.
7636          */
7637         return 0;
7638 }
7639
7640 /*
7641  * Check for file immutability.
7642  */
7643 static int
7644 vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore)
7645 {
7646         int error;
7647         int append;
7648
7649         /*
7650          * Perform immutability checks for operations that change data.
7651          *
7652          * Sockets, fifos and devices require special handling.
7653          */
7654         switch (vap->va_type) {
7655         case VSOCK:
7656         case VFIFO:
7657         case VBLK:
7658         case VCHR:
7659                 /*
7660                  * Writing to these nodes does not change the filesystem data,
7661                  * so forget that it's being tried.
7662                  */
7663                 rights &= ~KAUTH_VNODE_WRITE_DATA;
7664                 break;
7665         default:
7666                 break;
7667         }
7668
7669         error = 0;
7670         if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
7671                 /* check per-filesystem options if possible */
7672                 if (mp != NULL) {
7673                         /* check for no-EA filesystems */
7674                         if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
7675                             (vfs_flags(mp) & MNT_NOUSERXATTR)) {
7676                                 KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vp);
7677                                 error = EACCES;  /* User attributes disabled */
7678                                 goto out;
7679                         }
7680                 }
7681
7682                 /*
7683                  * check for file immutability. first, check if the requested rights are
7684                  * allowable for a UF_APPEND file.
7685                  */
7686                 append = 0;
7687                 if (vap->va_type == VDIR) {
7688                         if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
7689                                 append = 1;
7690                         }
7691                 } else {
7692                         if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
7693                                 append = 1;
7694                         }
7695                 }
7696                 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
7697                         KAUTH_DEBUG("%p    DENIED - file is immutable", vp);
7698                         goto out;
7699                 }
7700         }
7701 out:
7702         return error;
7703 }
7704
7705 /*
7706  * Handle authorization actions for filesystems that advertise that the
7707  * server will be enforcing.
7708  *
7709  * Returns:     0                       Authorization should be handled locally
7710  *              1                       Authorization was handled by the FS
7711  *
7712  * Note:        Imputed returns will only occur if the authorization request
7713  *              was handled by the FS.
7714  *
7715  * Imputed:     *resultp, modified      Return code from FS when the request is
7716  *                                      handled by the FS.
7717  *              VNOP_ACCESS:???
7718  *              VNOP_OPEN:???
7719  */
7720 static int
7721 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
7722 {
7723         int     error;
7724
7725         /*
7726          * If the vp is a device node, socket or FIFO it actually represents a local
7727          * endpoint, so we need to handle it locally.
7728          */
7729         switch (vp->v_type) {
7730         case VBLK:
7731         case VCHR:
7732         case VSOCK:
7733         case VFIFO:
7734                 return 0;
7735         default:
7736                 break;
7737         }
7738
7739         /*
7740          * In the advisory request case, if the filesystem doesn't think it's reliable
7741          * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
7742          */
7743         if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
7744                 return 0;
7745         }
7746
7747         /*
7748          * Let the filesystem have a say in the matter.  It's OK for it to not implemnent
7749          * VNOP_ACCESS, as most will authorise inline with the actual request.
7750          */
7751         if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
7752                 *resultp = error;
7753                 KAUTH_DEBUG("%p    DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
7754                 return 1;
7755         }
7756
7757         /*
7758          * Typically opaque filesystems do authorisation in-line, but exec is a special case.  In
7759          * order to be reasonably sure that exec will be permitted, we try a bit harder here.
7760          */
7761         if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
7762                 /* try a VNOP_OPEN for readonly access */
7763                 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
7764                         *resultp = error;
7765                         KAUTH_DEBUG("%p    DENIED - EXECUTE denied because file could not be opened readonly", vp);
7766                         return 1;
7767                 }
7768                 VNOP_CLOSE(vp, FREAD, ctx);
7769         }
7770
7771         /*
7772          * We don't have any reason to believe that the request has to be denied at this point,
7773          * so go ahead and allow it.
7774          */
7775         *resultp = 0;
7776         KAUTH_DEBUG("%p    ALLOWED - bypassing access check for non-local filesystem", vp);
7777         return 1;
7778 }
7779
7780
7781
7782
7783 /*
7784  * Returns:     KAUTH_RESULT_ALLOW
7785  *              KAUTH_RESULT_DENY
7786  *
7787  * Imputed:     *arg3, modified         Error code in the deny case
7788  *              EROFS                   Read-only file system
7789  *              EACCES                  Permission denied
7790  *              EPERM                   Operation not permitted [no execute]
7791  *      vnode_getattr:ENOMEM            Not enough space [only if has filesec]
7792  *      vnode_getattr:???
7793  *      vnode_authorize_opaque:*arg2    ???
7794  *      vnode_authorize_checkimmutable:???
7795  *      vnode_authorize_delete:???
7796  *      vnode_authorize_simple:???
7797  */
7798
7799
7800 static int
7801 vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
7802     kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
7803     uintptr_t arg3)
7804 {
7805         vfs_context_t   ctx;
7806         vnode_t         cvp = NULLVP;
7807         vnode_t         vp, dvp;
7808         int             result = KAUTH_RESULT_DENY;
7809         int             parent_iocount = 0;
7810         int             parent_action; /* In case we need to use namedstream's data fork for cached rights*/
7811
7812         ctx = (vfs_context_t)arg0;
7813         vp = (vnode_t)arg1;
7814         dvp = (vnode_t)arg2;
7815
7816         /*
7817          * if there are 2 vnodes passed in, we don't know at
7818          * this point which rights to look at based on the
7819          * combined action being passed in... defer until later...
7820          * otherwise check the kauth 'rights' cache hung
7821          * off of the vnode we're interested in... if we've already
7822          * been granted the right we're currently interested in,
7823          * we can just return success... otherwise we'll go through
7824          * the process of authorizing the requested right(s)... if that
7825          * succeeds, we'll add the right(s) to the cache.
7826          * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
7827          */
7828         if (dvp && vp) {
7829                 goto defer;
7830         }
7831         if (dvp) {
7832                 cvp = dvp;
7833         } else {
7834                 /*
7835                  * For named streams on local-authorization volumes, rights are cached on the parent;
7836                  * authorization is determined by looking at the parent's properties anyway, so storing
7837                  * on the parent means that we don't recompute for the named stream and that if
7838                  * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
7839                  * stream to flush its cache separately.  If we miss in the cache, then we authorize
7840                  * as if there were no cached rights (passing the named stream vnode and desired rights to
7841                  * vnode_authorize_callback_int()).
7842                  *
7843                  * On an opaquely authorized volume, we don't know the relationship between the
7844                  * data fork's properties and the rights granted on a stream.  Thus, named stream vnodes
7845                  * on such a volume are authorized directly (rather than using the parent) and have their
7846                  * own caches.  When a named stream vnode is created, we mark the parent as having a named
7847                  * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
7848                  * find the stream and flush its cache.
7849                  */
7850                 if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
7851                         cvp = vnode_getparent(vp);
7852                         if (cvp != NULLVP) {
7853                                 parent_iocount = 1;
7854                         } else {
7855                                 cvp = NULL;
7856                                 goto defer; /* If we can't use the parent, take the slow path */
7857                         }
7858
7859                         /* Have to translate some actions */
7860                         parent_action = action;
7861                         if (parent_action & KAUTH_VNODE_READ_DATA) {
7862                                 parent_action &= ~KAUTH_VNODE_READ_DATA;
7863                                 parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
7864                         }
7865                         if (parent_action & KAUTH_VNODE_WRITE_DATA) {
7866                                 parent_action &= ~KAUTH_VNODE_WRITE_DATA;
7867                                 parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
7868                         }
7869                 } else {
7870                         cvp = vp;
7871                 }
7872         }
7873
7874         if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
7875                 result = KAUTH_RESULT_ALLOW;
7876                 goto out;
7877         }
7878 defer:
7879         result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
7880
7881         if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
7882                 KAUTH_DEBUG("%p - caching action = %x", cvp, action);
7883                 vnode_cache_authorized_action(cvp, ctx, action);
7884         }
7885
7886 out:
7887         if (parent_iocount) {
7888                 vnode_put(cvp);
7889         }
7890
7891         return result;
7892 }
7893
7894 static int
7895 vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
7896     kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
7897     int noimmutable, int parent_authorized_for_delete_child)
7898 {
7899         int result;
7900
7901         /*
7902          * Check for immutability.
7903          *
7904          * In the deletion case, parent directory immutability vetoes specific
7905          * file rights.
7906          */
7907         if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
7908             noimmutable)) != 0) {
7909                 goto out;
7910         }
7911
7912         if ((rights & KAUTH_VNODE_DELETE) &&
7913             !parent_authorized_for_delete_child) {
7914                 result = vnode_authorize_checkimmutable(mp, vcp->dvap,
7915                     KAUTH_VNODE_DELETE_CHILD, 0);
7916                 if (result) {
7917                         goto out;
7918                 }
7919         }
7920
7921         /*
7922          * Clear rights that have been authorized by reaching this point, bail if nothing left to
7923          * check.
7924          */
7925         rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
7926         if (rights == 0) {
7927                 goto out;
7928         }
7929
7930         /*
7931          * If we're not the superuser, authorize based on file properties;
7932          * note that even if parent_authorized_for_delete_child is TRUE, we
7933          * need to check on the node itself.
7934          */
7935         if (!is_suser) {
7936                 /* process delete rights */
7937                 if ((rights & KAUTH_VNODE_DELETE) &&
7938                     ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
7939                         goto out;
7940                 }
7941
7942                 /* process remaining rights */
7943                 if ((rights & ~KAUTH_VNODE_DELETE) &&
7944                     (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
7945                         goto out;
7946                 }
7947         } else {
7948                 /*
7949                  * Execute is only granted to root if one of the x bits is set.  This check only
7950                  * makes sense if the posix mode bits are actually supported.
7951                  */
7952                 if ((rights & KAUTH_VNODE_EXECUTE) &&
7953                     (vcp->vap->va_type == VREG) &&
7954                     VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
7955                     !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
7956                         result = EPERM;
7957                         KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
7958                         goto out;
7959                 }
7960
7961                 /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
7962                 *found_deny = TRUE;
7963
7964                 KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vp);
7965         }
7966 out:
7967         return result;
7968 }
7969
7970 static int
7971 vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
7972     vnode_t vp, vnode_t dvp, int *errorp)
7973 {
7974         struct _vnode_authorize_context auth_context;
7975         vauth_ctx               vcp;
7976         kauth_cred_t            cred;
7977         kauth_ace_rights_t      rights;
7978         struct vnode_attr       va, dva;
7979         int                     result;
7980         int                     noimmutable;
7981         boolean_t               parent_authorized_for_delete_child = FALSE;
7982         boolean_t               found_deny = FALSE;
7983         boolean_t               parent_ref = FALSE;
7984         boolean_t               is_suser = FALSE;
7985
7986         vcp = &auth_context;
7987         vcp->ctx = ctx;
7988         vcp->vp = vp;
7989         vcp->dvp = dvp;
7990         /*
7991          * Note that we authorize against the context, not the passed cred
7992          * (the same thing anyway)
7993          */
7994         cred = ctx->vc_ucred;
7995
7996         VATTR_INIT(&va);
7997         vcp->vap = &va;
7998         VATTR_INIT(&dva);
7999         vcp->dvap = &dva;
8000
8001         vcp->flags = vcp->flags_valid = 0;
8002
8003 #if DIAGNOSTIC
8004         if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) {
8005                 panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
8006         }
8007 #endif
8008
8009         KAUTH_DEBUG("%p  AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
8010             vp, vfs_context_proc(ctx)->p_comm,
8011             (action & KAUTH_VNODE_ACCESS)               ? "access" : "auth",
8012             (action & KAUTH_VNODE_READ_DATA)            ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
8013             (action & KAUTH_VNODE_WRITE_DATA)           ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
8014             (action & KAUTH_VNODE_EXECUTE)              ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
8015             (action & KAUTH_VNODE_DELETE)               ? " DELETE" : "",
8016             (action & KAUTH_VNODE_APPEND_DATA)          ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
8017             (action & KAUTH_VNODE_DELETE_CHILD)         ? " DELETE_CHILD" : "",
8018             (action & KAUTH_VNODE_READ_ATTRIBUTES)      ? " READ_ATTRIBUTES" : "",
8019             (action & KAUTH_VNODE_WRITE_ATTRIBUTES)     ? " WRITE_ATTRIBUTES" : "",
8020             (action & KAUTH_VNODE_READ_EXTATTRIBUTES)   ? " READ_EXTATTRIBUTES" : "",
8021             (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES)  ? " WRITE_EXTATTRIBUTES" : "",
8022             (action & KAUTH_VNODE_READ_SECURITY)        ? " READ_SECURITY" : "",
8023             (action & KAUTH_VNODE_WRITE_SECURITY)       ? " WRITE_SECURITY" : "",
8024             (action & KAUTH_VNODE_CHANGE_OWNER)         ? " CHANGE_OWNER" : "",
8025             (action & KAUTH_VNODE_NOIMMUTABLE)          ? " (noimmutable)" : "",
8026             vnode_isdir(vp) ? "directory" : "file",
8027             vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
8028
8029         /*
8030          * Extract the control bits from the action, everything else is
8031          * requested rights.
8032          */
8033         noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
8034         rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
8035
8036         if (rights & KAUTH_VNODE_DELETE) {
8037 #if DIAGNOSTIC
8038                 if (dvp == NULL) {
8039                         panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
8040                 }
8041 #endif
8042                 /*
8043                  * check to see if we've already authorized the parent
8044                  * directory for deletion of its children... if so, we
8045                  * can skip a whole bunch of work... we will still have to
8046                  * authorize that this specific child can be removed
8047                  */
8048                 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
8049                         parent_authorized_for_delete_child = TRUE;
8050                 }
8051         } else {
8052                 vcp->dvp = NULLVP;
8053                 vcp->dvap = NULL;
8054         }
8055
8056         /*
8057          * Check for read-only filesystems.
8058          */
8059         if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
8060             (vp->v_mount->mnt_flag & MNT_RDONLY) &&
8061             ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
8062             (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
8063             (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
8064                 result = EROFS;
8065                 goto out;
8066         }
8067
8068         /*
8069          * Check for noexec filesystems.
8070          */
8071         if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
8072                 result = EACCES;
8073                 goto out;
8074         }
8075
8076         /*
8077          * Handle cases related to filesystems with non-local enforcement.
8078          * This call can return 0, in which case we will fall through to perform a
8079          * check based on VNOP_GETATTR data.  Otherwise it returns 1 and sets
8080          * an appropriate result, at which point we can return immediately.
8081          */
8082         if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) {
8083                 goto out;
8084         }
8085
8086         /*
8087          * If the vnode is a namedstream (extended attribute) data vnode (eg.
8088          * a resource fork), *_DATA becomes *_EXTATTRIBUTES.
8089          */
8090         if (vnode_isnamedstream(vp)) {
8091                 if (rights & KAUTH_VNODE_READ_DATA) {
8092                         rights &= ~KAUTH_VNODE_READ_DATA;
8093                         rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
8094                 }
8095                 if (rights & KAUTH_VNODE_WRITE_DATA) {
8096                         rights &= ~KAUTH_VNODE_WRITE_DATA;
8097                         rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
8098                 }
8099
8100                 /*
8101                  * Point 'vp' to the namedstream's parent for ACL checking
8102                  */
8103                 if ((vp->v_parent != NULL) &&
8104                     (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
8105                         parent_ref = TRUE;
8106                         vcp->vp = vp = vp->v_parent;
8107                 }
8108         }
8109
8110         if (vfs_context_issuser(ctx)) {
8111                 /*
8112                  * if we're not asking for execute permissions or modifications,
8113                  * then we're done, this action is authorized.
8114                  */
8115                 if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
8116                         goto success;
8117                 }
8118
8119                 is_suser = TRUE;
8120         }
8121
8122         /*
8123          * Get vnode attributes and extended security information for the vnode
8124          * and directory if required.
8125          *
8126          * If we're root we only want mode bits and flags for checking
8127          * execute and immutability.
8128          */
8129         VATTR_WANTED(&va, va_mode);
8130         VATTR_WANTED(&va, va_flags);
8131         if (!is_suser) {
8132                 VATTR_WANTED(&va, va_uid);
8133                 VATTR_WANTED(&va, va_gid);
8134                 VATTR_WANTED(&va, va_acl);
8135         }
8136         if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
8137                 KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
8138                 goto out;
8139         }
8140         VATTR_WANTED(&va, va_type);
8141         VATTR_RETURN(&va, va_type, vnode_vtype(vp));
8142
8143         if (vcp->dvp) {
8144                 VATTR_WANTED(&dva, va_mode);
8145                 VATTR_WANTED(&dva, va_flags);
8146                 if (!is_suser) {
8147                         VATTR_WANTED(&dva, va_uid);
8148                         VATTR_WANTED(&dva, va_gid);
8149                         VATTR_WANTED(&dva, va_acl);
8150                 }
8151                 if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
8152                         KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
8153                         goto out;
8154                 }
8155                 VATTR_WANTED(&dva, va_type);
8156                 VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
8157         }
8158
8159         result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
8160             &found_deny, noimmutable, parent_authorized_for_delete_child);
8161 out:
8162         if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
8163                 kauth_acl_free(va.va_acl);
8164         }
8165         if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) {
8166                 kauth_acl_free(dva.va_acl);
8167         }
8168
8169         if (result) {
8170                 if (parent_ref) {
8171                         vnode_put(vp);
8172                 }
8173                 *errorp = result;
8174                 KAUTH_DEBUG("%p    DENIED - auth denied", vp);
8175                 return KAUTH_RESULT_DENY;
8176         }
8177         if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
8178                 /*
8179                  * if we were successfully granted the right to search this directory
8180                  * and there were NO ACL DENYs for search and the posix permissions also don't
8181                  * deny execute, we can synthesize a global right that allows anyone to
8182                  * traverse this directory during a pathname lookup without having to
8183                  * match the credential associated with this cache of rights.
8184                  *
8185                  * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
8186                  * only if we actually check ACLs which we don't for root. As
8187                  * a workaround, the lookup fast path checks for root.
8188                  */
8189                 if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
8190                     ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
8191                     (S_IXUSR | S_IXGRP | S_IXOTH))) {
8192                         vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
8193                 }
8194         }
8195 success:
8196         if (parent_ref) {
8197                 vnode_put(vp);
8198         }
8199
8200         /*
8201          * Note that this implies that we will allow requests for no rights, as well as
8202          * for rights that we do not recognise.  There should be none of these.
8203          */
8204         KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
8205         return KAUTH_RESULT_ALLOW;
8206 }
8207
8208 int
8209 vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap,
8210     kauth_action_t action, vfs_context_t ctx)
8211 {
8212         VATTR_INIT(vap);
8213         VATTR_WANTED(vap, va_type);
8214         VATTR_WANTED(vap, va_mode);
8215         VATTR_WANTED(vap, va_flags);
8216         if (dvap) {
8217                 VATTR_INIT(dvap);
8218                 if (action & KAUTH_VNODE_DELETE) {
8219                         VATTR_WANTED(dvap, va_type);
8220                         VATTR_WANTED(dvap, va_mode);
8221                         VATTR_WANTED(dvap, va_flags);
8222                 }
8223         } else if (action & KAUTH_VNODE_DELETE) {
8224                 return EINVAL;
8225         }
8226
8227         if (!vfs_context_issuser(ctx)) {
8228                 VATTR_WANTED(vap, va_uid);
8229                 VATTR_WANTED(vap, va_gid);
8230                 VATTR_WANTED(vap, va_acl);
8231                 if (dvap && (action & KAUTH_VNODE_DELETE)) {
8232                         VATTR_WANTED(dvap, va_uid);
8233                         VATTR_WANTED(dvap, va_gid);
8234                         VATTR_WANTED(dvap, va_acl);
8235                 }
8236         }
8237
8238         return 0;
8239 }
8240
8241 int
8242 vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp,
8243     kauth_action_t action, vfs_context_t ctx)
8244 {
8245         struct _vnode_authorize_context auth_context;
8246         vauth_ctx vcp;
8247         kauth_ace_rights_t rights;
8248         int noimmutable;
8249         boolean_t found_deny;
8250         boolean_t is_suser = FALSE;
8251         int result = 0;
8252
8253         vcp = &auth_context;
8254         vcp->ctx = ctx;
8255         vcp->vp = NULLVP;
8256         vcp->vap = vap;
8257         vcp->dvp = NULLVP;
8258         vcp->dvap = dvap;
8259         vcp->flags = vcp->flags_valid = 0;
8260
8261         noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
8262         rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
8263
8264         /*
8265          * Check for read-only filesystems.
8266          */
8267         if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
8268             mp && (mp->mnt_flag & MNT_RDONLY) &&
8269             ((vap->va_type == VREG) || (vap->va_type == VDIR) ||
8270             (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) ||
8271             (rights & KAUTH_VNODE_DELETE_CHILD))) {
8272                 result = EROFS;
8273                 goto out;
8274         }
8275
8276         /*
8277          * Check for noexec filesystems.
8278          */
8279         if ((rights & KAUTH_VNODE_EXECUTE) &&
8280             (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
8281                 result = EACCES;
8282                 goto out;
8283         }
8284
8285         if (vfs_context_issuser(ctx)) {
8286                 /*
8287                  * if we're not asking for execute permissions or modifications,
8288                  * then we're done, this action is authorized.
8289                  */
8290                 if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
8291                         goto out;
8292                 }
8293                 is_suser = TRUE;
8294         } else {
8295                 if (!VATTR_IS_SUPPORTED(vap, va_uid) ||
8296                     !VATTR_IS_SUPPORTED(vap, va_gid) ||
8297                     (mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
8298                         panic("vnode attrs not complete for vnode_attr_authorize\n");
8299                 }
8300         }
8301
8302         result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
8303             &found_deny, noimmutable, FALSE);
8304
8305         if (result == EPERM) {
8306                 result = EACCES;
8307         }
8308 out:
8309         return result;
8310 }
8311
8312
8313 int
8314 vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
8315 {
8316         return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
8317 }
8318
8319 /*
8320  * Check that the attribute information in vattr can be legally applied to
8321  * a new file by the context.
8322  */
8323 static int
8324 vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
8325 {
8326         int             error;
8327         int             has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
8328         uint32_t        inherit_flags;
8329         kauth_cred_t    cred;
8330         guid_t          changer;
8331         mount_t         dmp;
8332         struct vnode_attr dva;
8333
8334         error = 0;
8335
8336         if (defaulted_fieldsp) {
8337                 *defaulted_fieldsp = 0;
8338         }
8339
8340         defaulted_owner = defaulted_group = defaulted_mode = 0;
8341
8342         inherit_flags = 0;
8343
8344         /*
8345          * Require that the filesystem support extended security to apply any.
8346          */
8347         if (!vfs_extendedsecurity(dvp->v_mount) &&
8348             (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
8349                 error = EINVAL;
8350                 goto out;
8351         }
8352
8353         /*
8354          * Default some fields.
8355          */
8356         dmp = dvp->v_mount;
8357
8358         /*
8359          * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
8360          * owner takes ownership of all new files.
8361          */
8362         if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
8363                 VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
8364                 defaulted_owner = 1;
8365         } else {
8366                 if (!VATTR_IS_ACTIVE(vap, va_uid)) {
8367                         /* default owner is current user */
8368                         VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
8369                         defaulted_owner = 1;
8370                 }
8371         }
8372
8373         /*
8374          * We need the dvp's va_flags and *may* need the gid of the directory,
8375          * we ask for both here.
8376          */
8377         VATTR_INIT(&dva);
8378         VATTR_WANTED(&dva, va_gid);
8379         VATTR_WANTED(&dva, va_flags);
8380         if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
8381                 goto out;
8382         }
8383
8384         /*
8385          * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that
8386          * group takes ownership of all new files.
8387          */
8388         if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
8389                 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
8390                 defaulted_group = 1;
8391         } else {
8392                 if (!VATTR_IS_ACTIVE(vap, va_gid)) {
8393                         /* default group comes from parent object, fallback to current user */
8394                         if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
8395                                 VATTR_SET(vap, va_gid, dva.va_gid);
8396                         } else {
8397                                 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
8398                         }
8399                         defaulted_group = 1;
8400                 }
8401         }
8402
8403         if (!VATTR_IS_ACTIVE(vap, va_flags)) {
8404                 VATTR_SET(vap, va_flags, 0);
8405         }
8406
8407         /* Determine if SF_RESTRICTED should be inherited from the parent
8408          * directory. */
8409         if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
8410                 inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED);
8411         }
8412
8413         /* default mode is everything, masked with current umask */
8414         if (!VATTR_IS_ACTIVE(vap, va_mode)) {
8415                 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
8416                 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
8417                 defaulted_mode = 1;
8418         }
8419         /* set timestamps to now */
8420         if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
8421                 nanotime(&vap->va_create_time);
8422                 VATTR_SET_ACTIVE(vap, va_create_time);
8423         }
8424
8425         /*
8426          * Check for attempts to set nonsensical fields.
8427          */
8428         if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
8429                 error = EINVAL;
8430                 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
8431                     vap->va_active & ~VNODE_ATTR_NEWOBJ);
8432                 goto out;
8433         }
8434
8435         /*
8436          * Quickly check for the applicability of any enforcement here.
8437          * Tests below maintain the integrity of the local security model.
8438          */
8439         if (vfs_authopaque(dvp->v_mount)) {
8440                 goto out;
8441         }
8442
8443         /*
8444          * We need to know if the caller is the superuser, or if the work is
8445          * otherwise already authorised.
8446          */
8447         cred = vfs_context_ucred(ctx);
8448         if (noauth) {
8449                 /* doing work for the kernel */
8450                 has_priv_suser = 1;
8451         } else {
8452                 has_priv_suser = vfs_context_issuser(ctx);
8453         }
8454
8455
8456         if (VATTR_IS_ACTIVE(vap, va_flags)) {
8457                 if (has_priv_suser) {
8458                         if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
8459                                 error = EPERM;
8460                                 KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
8461                                 goto out;
8462                         }
8463                 } else {
8464                         if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
8465                                 error = EPERM;
8466                                 KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
8467                                 goto out;
8468                         }
8469                 }
8470         }
8471
8472         /* if not superuser, validate legality of new-item attributes */
8473         if (!has_priv_suser) {
8474                 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
8475                         /* setgid? */
8476                         if (vap->va_mode & S_ISGID) {
8477                                 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8478                                         KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8479                                         goto out;
8480                                 }
8481                                 if (!ismember) {
8482                                         KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
8483                                         error = EPERM;
8484                                         goto out;
8485                                 }
8486                         }
8487
8488                         /* setuid? */
8489                         if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
8490                                 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8491                                 error = EPERM;
8492                                 goto out;
8493                         }
8494                 }
8495                 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
8496                         KAUTH_DEBUG("  DENIED - cannot create new item owned by %d", vap->va_uid);
8497                         error = EPERM;
8498                         goto out;
8499                 }
8500                 if (!defaulted_group) {
8501                         if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8502                                 KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
8503                                 goto out;
8504                         }
8505                         if (!ismember) {
8506                                 KAUTH_DEBUG("  DENIED - cannot create new item with group %d - not a member", vap->va_gid);
8507                                 error = EPERM;
8508                                 goto out;
8509                         }
8510                 }
8511
8512                 /* initialising owner/group UUID */
8513                 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8514                         if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
8515                                 KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
8516                                 /* XXX ENOENT here - no GUID - should perhaps become EPERM */
8517                                 goto out;
8518                         }
8519                         if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
8520                                 KAUTH_DEBUG("  ERROR - cannot create item with supplied owner UUID - not us");
8521                                 error = EPERM;
8522                                 goto out;
8523                         }
8524                 }
8525                 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8526                         if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
8527                                 KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
8528                                 goto out;
8529                         }
8530                         if (!ismember) {
8531                                 KAUTH_DEBUG("  ERROR - cannot create item with supplied group UUID - not a member");
8532                                 error = EPERM;
8533                                 goto out;
8534                         }
8535                 }
8536         }
8537 out:
8538         if (inherit_flags) {
8539                 /* Apply SF_RESTRICTED to the file if its parent directory was
8540                  * restricted.  This is done at the end so that root is not
8541                  * required if this flag is only set due to inheritance. */
8542                 VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags));
8543         }
8544         if (defaulted_fieldsp) {
8545                 if (defaulted_mode) {
8546                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
8547                 }
8548                 if (defaulted_group) {
8549                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID;
8550                 }
8551                 if (defaulted_owner) {
8552                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID;
8553                 }
8554         }
8555         return error;
8556 }
8557
8558 /*
8559  * Check that the attribute information in vap can be legally written by the
8560  * context.
8561  *
8562  * Call this when you're not sure about the vnode_attr; either its contents
8563  * have come from an unknown source, or when they are variable.
8564  *
8565  * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
8566  * must be authorized to be permitted to write the vattr.
8567  */
8568 int
8569 vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
8570 {
8571         struct vnode_attr ova;
8572         kauth_action_t  required_action;
8573         int             error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
8574         guid_t          changer;
8575         gid_t           group;
8576         uid_t           owner;
8577         mode_t          newmode;
8578         kauth_cred_t    cred;
8579         uint32_t        fdelta;
8580
8581         VATTR_INIT(&ova);
8582         required_action = 0;
8583         error = 0;
8584
8585         /*
8586          * Quickly check for enforcement applicability.
8587          */
8588         if (vfs_authopaque(vp->v_mount)) {
8589                 goto out;
8590         }
8591
8592         /*
8593          * Check for attempts to set nonsensical fields.
8594          */
8595         if (vap->va_active & VNODE_ATTR_RDONLY) {
8596                 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
8597                 error = EINVAL;
8598                 goto out;
8599         }
8600
8601         /*
8602          * We need to know if the caller is the superuser.
8603          */
8604         cred = vfs_context_ucred(ctx);
8605         has_priv_suser = kauth_cred_issuser(cred);
8606
8607         /*
8608          * If any of the following are changing, we need information from the old file:
8609          * va_uid
8610          * va_gid
8611          * va_mode
8612          * va_uuuid
8613          * va_guuid
8614          */
8615         if (VATTR_IS_ACTIVE(vap, va_uid) ||
8616             VATTR_IS_ACTIVE(vap, va_gid) ||
8617             VATTR_IS_ACTIVE(vap, va_mode) ||
8618             VATTR_IS_ACTIVE(vap, va_uuuid) ||
8619             VATTR_IS_ACTIVE(vap, va_guuid)) {
8620                 VATTR_WANTED(&ova, va_mode);
8621                 VATTR_WANTED(&ova, va_uid);
8622                 VATTR_WANTED(&ova, va_gid);
8623                 VATTR_WANTED(&ova, va_uuuid);
8624                 VATTR_WANTED(&ova, va_guuid);
8625                 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
8626         }
8627
8628         /*
8629          * If timestamps are being changed, we need to know who the file is owned
8630          * by.
8631          */
8632         if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8633             VATTR_IS_ACTIVE(vap, va_change_time) ||
8634             VATTR_IS_ACTIVE(vap, va_modify_time) ||
8635             VATTR_IS_ACTIVE(vap, va_access_time) ||
8636             VATTR_IS_ACTIVE(vap, va_backup_time) ||
8637             VATTR_IS_ACTIVE(vap, va_addedtime)) {
8638                 VATTR_WANTED(&ova, va_uid);
8639 #if 0   /* enable this when we support UUIDs as official owners */
8640                 VATTR_WANTED(&ova, va_uuuid);
8641 #endif
8642                 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
8643         }
8644
8645         /*
8646          * If flags are being changed, we need the old flags.
8647          */
8648         if (VATTR_IS_ACTIVE(vap, va_flags)) {
8649                 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
8650                 VATTR_WANTED(&ova, va_flags);
8651         }
8652
8653         /*
8654          * If ACLs are being changed, we need the old ACLs.
8655          */
8656         if (VATTR_IS_ACTIVE(vap, va_acl)) {
8657                 KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
8658                 VATTR_WANTED(&ova, va_acl);
8659         }
8660
8661         /*
8662          * If the size is being set, make sure it's not a directory.
8663          */
8664         if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8665                 /* size is only meaningful on regular files, don't permit otherwise */
8666                 if (!vnode_isreg(vp)) {
8667                         KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
8668                         error = vnode_isdir(vp) ? EISDIR : EINVAL;
8669                         goto out;
8670                 }
8671         }
8672
8673         /*
8674          * Get old data.
8675          */
8676         KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
8677         if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
8678                 KAUTH_DEBUG("  ERROR - got %d trying to get attributes", error);
8679                 goto out;
8680         }
8681
8682         /*
8683          * Size changes require write access to the file data.
8684          */
8685         if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8686                 /* if we can't get the size, or it's different, we need write access */
8687                 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
8688                 required_action |= KAUTH_VNODE_WRITE_DATA;
8689         }
8690
8691         /*
8692          * Changing timestamps?
8693          *
8694          * Note that we are only called to authorize user-requested time changes;
8695          * side-effect time changes are not authorized.  Authorisation is only
8696          * required for existing files.
8697          *
8698          * Non-owners are not permitted to change the time on an existing
8699          * file to anything other than the current time.
8700          */
8701         if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8702             VATTR_IS_ACTIVE(vap, va_change_time) ||
8703             VATTR_IS_ACTIVE(vap, va_modify_time) ||
8704             VATTR_IS_ACTIVE(vap, va_access_time) ||
8705             VATTR_IS_ACTIVE(vap, va_backup_time) ||
8706             VATTR_IS_ACTIVE(vap, va_addedtime)) {
8707                 /*
8708                  * The owner and root may set any timestamps they like,
8709                  * provided that the file is not immutable.  The owner still needs
8710                  * WRITE_ATTRIBUTES (implied by ownership but still deniable).
8711                  */
8712                 if (has_priv_suser || vauth_node_owner(&ova, cred)) {
8713                         KAUTH_DEBUG("ATTR - root or owner changing timestamps");
8714                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
8715                 } else {
8716                         /* just setting the current time? */
8717                         if (vap->va_vaflags & VA_UTIMES_NULL) {
8718                                 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
8719                                 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
8720                         } else {
8721                                 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
8722                                 error = EACCES;
8723                                 goto out;
8724                         }
8725                 }
8726         }
8727
8728         /*
8729          * Changing file mode?
8730          */
8731         if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
8732                 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
8733
8734                 /*
8735                  * Mode changes always have the same basic auth requirements.
8736                  */
8737                 if (has_priv_suser) {
8738                         KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
8739                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
8740                 } else {
8741                         /* need WRITE_SECURITY */
8742                         KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
8743                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
8744                 }
8745
8746                 /*
8747                  * Can't set the setgid bit if you're not in the group and not root.  Have to have
8748                  * existing group information in the case we're not setting it right now.
8749                  */
8750                 if (vap->va_mode & S_ISGID) {
8751                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
8752                         if (!has_priv_suser) {
8753                                 if (VATTR_IS_ACTIVE(vap, va_gid)) {
8754                                         group = vap->va_gid;
8755                                 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
8756                                         group = ova.va_gid;
8757                                 } else {
8758                                         KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
8759                                         error = EINVAL;
8760                                         goto out;
8761                                 }
8762                                 /*
8763                                  * This might be too restrictive; WRITE_SECURITY might be implied by
8764                                  * membership in this case, rather than being an additional requirement.
8765                                  */
8766                                 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
8767                                         KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8768                                         goto out;
8769                                 }
8770                                 if (!ismember) {
8771                                         KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", group);
8772                                         error = EPERM;
8773                                         goto out;
8774                                 }
8775                         }
8776                 }
8777
8778                 /*
8779                  * Can't set the setuid bit unless you're root or the file's owner.
8780                  */
8781                 if (vap->va_mode & S_ISUID) {
8782                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
8783                         if (!has_priv_suser) {
8784                                 if (VATTR_IS_ACTIVE(vap, va_uid)) {
8785                                         owner = vap->va_uid;
8786                                 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
8787                                         owner = ova.va_uid;
8788                                 } else {
8789                                         KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
8790                                         error = EINVAL;
8791                                         goto out;
8792                                 }
8793                                 if (owner != kauth_cred_getuid(cred)) {
8794                                         /*
8795                                          * We could allow this if WRITE_SECURITY is permitted, perhaps.
8796                                          */
8797                                         KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8798                                         error = EPERM;
8799                                         goto out;
8800                                 }
8801                         }
8802                 }
8803         }
8804
8805         /*
8806          * Validate/mask flags changes.  This checks that only the flags in
8807          * the UF_SETTABLE mask are being set, and preserves the flags in
8808          * the SF_SETTABLE case.
8809          *
8810          * Since flags changes may be made in conjunction with other changes,
8811          * we will ask the auth code to ignore immutability in the case that
8812          * the SF_* flags are not set and we are only manipulating the file flags.
8813          *
8814          */
8815         if (VATTR_IS_ACTIVE(vap, va_flags)) {
8816                 /* compute changing flags bits */
8817                 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
8818                         fdelta = vap->va_flags ^ ova.va_flags;
8819                 } else {
8820                         fdelta = vap->va_flags;
8821                 }
8822
8823                 if (fdelta != 0) {
8824                         KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
8825                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
8826
8827                         /* check that changing bits are legal */
8828                         if (has_priv_suser) {
8829                                 /*
8830                                  * The immutability check will prevent us from clearing the SF_*
8831                                  * flags unless the system securelevel permits it, so just check
8832                                  * for legal flags here.
8833                                  */
8834                                 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
8835                                         error = EPERM;
8836                                         KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
8837                                         goto out;
8838                                 }
8839                         } else {
8840                                 if (fdelta & ~UF_SETTABLE) {
8841                                         error = EPERM;
8842                                         KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
8843                                         goto out;
8844                                 }
8845                         }
8846                         /*
8847                          * If the caller has the ability to manipulate file flags,
8848                          * security is not reduced by ignoring them for this operation.
8849                          *
8850                          * A more complete test here would consider the 'after' states of the flags
8851                          * to determine whether it would permit the operation, but this becomes
8852                          * very complex.
8853                          *
8854                          * Ignoring immutability is conditional on securelevel; this does not bypass
8855                          * the SF_* flags if securelevel > 0.
8856                          */
8857                         required_action |= KAUTH_VNODE_NOIMMUTABLE;
8858                 }
8859         }
8860
8861         /*
8862          * Validate ownership information.
8863          */
8864         chowner = 0;
8865         chgroup = 0;
8866         clear_suid = 0;
8867         clear_sgid = 0;
8868
8869         /*
8870          * uid changing
8871          * Note that if the filesystem didn't give us a UID, we expect that it doesn't
8872          * support them in general, and will ignore it if/when we try to set it.
8873          * We might want to clear the uid out of vap completely here.
8874          */
8875         if (VATTR_IS_ACTIVE(vap, va_uid)) {
8876                 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
8877                         if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
8878                                 KAUTH_DEBUG("  DENIED - non-superuser cannot change ownershipt to a third party");
8879                                 error = EPERM;
8880                                 goto out;
8881                         }
8882                         chowner = 1;
8883                 }
8884                 clear_suid = 1;
8885         }
8886
8887         /*
8888          * gid changing
8889          * Note that if the filesystem didn't give us a GID, we expect that it doesn't
8890          * support them in general, and will ignore it if/when we try to set it.
8891          * We might want to clear the gid out of vap completely here.
8892          */
8893         if (VATTR_IS_ACTIVE(vap, va_gid)) {
8894                 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
8895                         if (!has_priv_suser) {
8896                                 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8897                                         KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
8898                                         goto out;
8899                                 }
8900                                 if (!ismember) {
8901                                         KAUTH_DEBUG("  DENIED - group change from %d to %d but not a member of target group",
8902                                             ova.va_gid, vap->va_gid);
8903                                         error = EPERM;
8904                                         goto out;
8905                                 }
8906                         }
8907                         chgroup = 1;
8908                 }
8909                 clear_sgid = 1;
8910         }
8911
8912         /*
8913          * Owner UUID being set or changed.
8914          */
8915         if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8916                 /* if the owner UUID is not actually changing ... */
8917                 if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
8918                         if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) {
8919                                 goto no_uuuid_change;
8920                         }
8921
8922                         /*
8923                          * If the current owner UUID is a null GUID, check
8924                          * it against the UUID corresponding to the owner UID.
8925                          */
8926                         if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
8927                             VATTR_IS_SUPPORTED(&ova, va_uid)) {
8928                                 guid_t uid_guid;
8929
8930                                 if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
8931                                     kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
8932                                         goto no_uuuid_change;
8933                                 }
8934                         }
8935                 }
8936
8937                 /*
8938                  * The owner UUID cannot be set by a non-superuser to anything other than
8939                  * their own or a null GUID (to "unset" the owner UUID).
8940                  * Note that file systems must be prepared to handle the
8941                  * null UUID case in a manner appropriate for that file
8942                  * system.
8943                  */
8944                 if (!has_priv_suser) {
8945                         if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
8946                                 KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
8947                                 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
8948                                 goto out;
8949                         }
8950                         if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
8951                             !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
8952                                 KAUTH_DEBUG("  ERROR - cannot set supplied owner UUID - not us / null");
8953                                 error = EPERM;
8954                                 goto out;
8955                         }
8956                 }
8957                 chowner = 1;
8958                 clear_suid = 1;
8959         }
8960 no_uuuid_change:
8961         /*
8962          * Group UUID being set or changed.
8963          */
8964         if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8965                 /* if the group UUID is not actually changing ... */
8966                 if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
8967                         if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) {
8968                                 goto no_guuid_change;
8969                         }
8970
8971                         /*
8972                          * If the current group UUID is a null UUID, check
8973                          * it against the UUID corresponding to the group GID.
8974                          */
8975                         if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
8976                             VATTR_IS_SUPPORTED(&ova, va_gid)) {
8977                                 guid_t gid_guid;
8978
8979                                 if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
8980                                     kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
8981                                         goto no_guuid_change;
8982                                 }
8983                         }
8984                 }
8985
8986                 /*
8987                  * The group UUID cannot be set by a non-superuser to anything other than
8988                  * one of which they are a member or a null GUID (to "unset"
8989                  * the group UUID).
8990                  * Note that file systems must be prepared to handle the
8991                  * null UUID case in a manner appropriate for that file
8992                  * system.
8993                  */
8994                 if (!has_priv_suser) {
8995                         if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) {
8996                                 ismember = 1;
8997                         } else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
8998                                 KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
8999                                 goto out;
9000                         }
9001                         if (!ismember) {
9002                                 KAUTH_DEBUG("  ERROR - cannot set supplied group UUID - not a member / null");
9003                                 error = EPERM;
9004                                 goto out;
9005                         }
9006                 }
9007                 chgroup = 1;
9008         }
9009 no_guuid_change:
9010
9011         /*
9012          * Compute authorisation for group/ownership changes.
9013          */
9014         if (chowner || chgroup || clear_suid || clear_sgid) {
9015                 if (has_priv_suser) {
9016                         KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
9017                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
9018                 } else {
9019                         if (chowner) {
9020                                 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
9021                                 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
9022                         }
9023                         if (chgroup && !chowner) {
9024                                 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
9025                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9026                         }
9027                 }
9028
9029                 /*
9030                  * clear set-uid and set-gid bits. POSIX only requires this for
9031                  * non-privileged processes but we do it even for root.
9032                  */
9033                 if (VATTR_IS_ACTIVE(vap, va_mode)) {
9034                         newmode = vap->va_mode;
9035                 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
9036                         newmode = ova.va_mode;
9037                 } else {
9038                         KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
9039                         newmode = 0;
9040                 }
9041
9042                 /* chown always clears setuid/gid bits. An exception is made for
9043                  * setattrlist executed by a root process to set <uid, gid, mode> on a file:
9044                  * setattrlist is allowed to set the new mode on the file and change (chown)
9045                  * uid/gid.
9046                  */
9047                 if (newmode & (S_ISUID | S_ISGID)) {
9048                         if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) {
9049                                 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
9050                                     newmode, newmode & ~(S_ISUID | S_ISGID));
9051                                 newmode &= ~(S_ISUID | S_ISGID);
9052                         }
9053                         VATTR_SET(vap, va_mode, newmode);
9054                 }
9055         }
9056
9057         /*
9058          * Authorise changes in the ACL.
9059          */
9060         if (VATTR_IS_ACTIVE(vap, va_acl)) {
9061                 /* no existing ACL */
9062                 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
9063                         /* adding an ACL */
9064                         if (vap->va_acl != NULL) {
9065                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9066                                 KAUTH_DEBUG("CHMOD - adding ACL");
9067                         }
9068
9069                         /* removing an existing ACL */
9070                 } else if (vap->va_acl == NULL) {
9071                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
9072                         KAUTH_DEBUG("CHMOD - removing ACL");
9073
9074                         /* updating an existing ACL */
9075                 } else {
9076                         if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
9077                                 /* entry count changed, must be different */
9078                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9079                                 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
9080                         } else if (vap->va_acl->acl_entrycount > 0) {
9081                                 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
9082                                 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
9083                                     sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
9084                                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
9085                                         KAUTH_DEBUG("CHMOD - changing ACL entries");
9086                                 }
9087                         }
9088                 }
9089         }
9090
9091         /*
9092          * Other attributes that require authorisation.
9093          */
9094         if (VATTR_IS_ACTIVE(vap, va_encoding)) {
9095                 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
9096         }
9097
9098 out:
9099         if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) {
9100                 kauth_acl_free(ova.va_acl);
9101         }
9102         if (error == 0) {
9103                 *actionp = required_action;
9104         }
9105         return error;
9106 }
9107
9108 static int
9109 setlocklocal_callback(struct vnode *vp, __unused void *cargs)
9110 {
9111         vnode_lock_spin(vp);
9112         vp->v_flag |= VLOCKLOCAL;
9113         vnode_unlock(vp);
9114
9115         return VNODE_RETURNED;
9116 }
9117
9118 void
9119 vfs_setlocklocal(mount_t mp)
9120 {
9121         mount_lock_spin(mp);
9122         mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
9123         mount_unlock(mp);
9124
9125         /*
9126          * The number of active vnodes is expected to be
9127          * very small when vfs_setlocklocal is invoked.
9128          */
9129         vnode_iterate(mp, 0, setlocklocal_callback, NULL);
9130 }
9131
9132 void
9133 vfs_setcompoundopen(mount_t mp)
9134 {
9135         mount_lock_spin(mp);
9136         mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN;
9137         mount_unlock(mp);
9138 }
9139
9140 void
9141 vnode_setswapmount(vnode_t vp)
9142 {
9143         mount_lock(vp->v_mount);
9144         vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT;
9145         mount_unlock(vp->v_mount);
9146 }
9147
9148
9149 int64_t
9150 vnode_getswappin_avail(vnode_t vp)
9151 {
9152         int64_t max_swappin_avail = 0;
9153
9154         mount_lock(vp->v_mount);
9155         if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) {
9156                 max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
9157         }
9158         mount_unlock(vp->v_mount);
9159
9160         return max_swappin_avail;
9161 }
9162
9163
9164 void
9165 vn_setunionwait(vnode_t vp)
9166 {
9167         vnode_lock_spin(vp);
9168         vp->v_flag |= VISUNION;
9169         vnode_unlock(vp);
9170 }
9171
9172
9173 void
9174 vn_checkunionwait(vnode_t vp)
9175 {
9176         vnode_lock_spin(vp);
9177         while ((vp->v_flag & VISUNION) == VISUNION) {
9178                 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
9179         }
9180         vnode_unlock(vp);
9181 }
9182
9183 void
9184 vn_clearunionwait(vnode_t vp, int locked)
9185 {
9186         if (!locked) {
9187                 vnode_lock_spin(vp);
9188         }
9189         if ((vp->v_flag & VISUNION) == VISUNION) {
9190                 vp->v_flag &= ~VISUNION;
9191                 wakeup((caddr_t)&vp->v_flag);
9192         }
9193         if (!locked) {
9194                 vnode_unlock(vp);
9195         }
9196 }
9197
9198 /*
9199  * Removes orphaned apple double files during a rmdir
9200  * Works by:
9201  * 1. vnode_suspend().
9202  * 2. Call VNOP_READDIR() till the end of directory is reached.
9203  * 3. Check if the directory entries returned are regular files with name starting with "._".  If not, return ENOTEMPTY.
9204  * 4. Continue (2) and (3) till end of directory is reached.
9205  * 5. If all the entries in the directory were files with "._" name, delete all the files.
9206  * 6. vnode_resume()
9207  * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
9208  */
9209
9210 errno_t
9211 rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag)
9212 {
9213 #define UIO_BUFF_SIZE 2048
9214         uio_t auio = NULL;
9215         int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
9216         int open_flag = 0, full_erase_flag = 0;
9217         char uio_buf[UIO_SIZEOF(1)];
9218         char *rbuf = NULL;
9219         void *dir_pos;
9220         void *dir_end;
9221         struct dirent *dp;
9222         errno_t error;
9223
9224         error = vnode_suspend(vp);
9225
9226         /*
9227          * restart_flag is set so that the calling rmdir sleeps and resets
9228          */
9229         if (error == EBUSY) {
9230                 *restart_flag = 1;
9231         }
9232         if (error != 0) {
9233                 return error;
9234         }
9235
9236         /*
9237          * set up UIO
9238          */
9239         MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
9240         if (rbuf) {
9241                 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
9242                     &uio_buf[0], sizeof(uio_buf));
9243         }
9244         if (!rbuf || !auio) {
9245                 error = ENOMEM;
9246                 goto outsc;
9247         }
9248
9249         uio_setoffset(auio, 0);
9250
9251         eofflag = 0;
9252
9253         if ((error = VNOP_OPEN(vp, FREAD, ctx))) {
9254                 goto outsc;
9255         } else {
9256                 open_flag = 1;
9257         }
9258
9259         /*
9260          * First pass checks if all files are appleDouble files.
9261          */
9262
9263         do {
9264                 siz = UIO_BUFF_SIZE;
9265                 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9266                 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9267
9268                 if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) {
9269                         goto outsc;
9270                 }
9271
9272                 if (uio_resid(auio) != 0) {
9273                         siz -= uio_resid(auio);
9274                 }
9275
9276                 /*
9277                  * Iterate through directory
9278                  */
9279                 dir_pos = (void*) rbuf;
9280                 dir_end = (void*) (rbuf + siz);
9281                 dp = (struct dirent*) (dir_pos);
9282
9283                 if (dir_pos == dir_end) {
9284                         eofflag = 1;
9285                 }
9286
9287                 while (dir_pos < dir_end) {
9288                         /*
9289                          * Check for . and .. as well as directories
9290                          */
9291                         if (dp->d_ino != 0 &&
9292                             !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
9293                             (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
9294                                 /*
9295                                  * Check for irregular files and ._ files
9296                                  * If there is a ._._ file abort the op
9297                                  */
9298                                 if (dp->d_namlen < 2 ||
9299                                     strncmp(dp->d_name, "._", 2) ||
9300                                     (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
9301                                         error = ENOTEMPTY;
9302                                         goto outsc;
9303                                 }
9304                         }
9305                         dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
9306                         dp = (struct dirent*)dir_pos;
9307                 }
9308
9309                 /*
9310                  * workaround for HFS/NFS setting eofflag before end of file
9311                  */
9312                 if (vp->v_tag == VT_HFS && nentries > 2) {
9313                         eofflag = 0;
9314                 }
9315
9316                 if (vp->v_tag == VT_NFS) {
9317                         if (eofflag && !full_erase_flag) {
9318                                 full_erase_flag = 1;
9319                                 eofflag = 0;
9320                                 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9321                         } else if (!eofflag && full_erase_flag) {
9322                                 full_erase_flag = 0;
9323                         }
9324                 }
9325         } while (!eofflag);
9326         /*
9327          * If we've made it here all the files in the dir are ._ files.
9328          * We can delete the files even though the node is suspended
9329          * because we are the owner of the file.
9330          */
9331
9332         uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9333         eofflag = 0;
9334         full_erase_flag = 0;
9335
9336         do {
9337                 siz = UIO_BUFF_SIZE;
9338                 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9339                 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9340
9341                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);
9342
9343                 if (error != 0) {
9344                         goto outsc;
9345                 }
9346
9347                 if (uio_resid(auio) != 0) {
9348                         siz -= uio_resid(auio);
9349                 }
9350
9351                 /*
9352                  * Iterate through directory
9353                  */
9354                 dir_pos = (void*) rbuf;
9355                 dir_end = (void*) (rbuf + siz);
9356                 dp = (struct dirent*) dir_pos;
9357
9358                 if (dir_pos == dir_end) {
9359                         eofflag = 1;
9360                 }
9361
9362                 while (dir_pos < dir_end) {
9363                         /*
9364                          * Check for . and .. as well as directories
9365                          */
9366                         if (dp->d_ino != 0 &&
9367                             !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
9368                             (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
9369                             ) {
9370                                 error = unlink1(ctx, vp,
9371                                     CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
9372                                     VNODE_REMOVE_SKIP_NAMESPACE_EVENT |
9373                                     VNODE_REMOVE_NO_AUDIT_PATH);
9374
9375                                 if (error && error != ENOENT) {
9376                                         goto outsc;
9377                                 }
9378                         }
9379                         dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
9380                         dp = (struct dirent*)dir_pos;
9381                 }
9382
9383                 /*
9384                  * workaround for HFS/NFS setting eofflag before end of file
9385                  */
9386                 if (vp->v_tag == VT_HFS && nentries > 2) {
9387                         eofflag = 0;
9388                 }
9389
9390                 if (vp->v_tag == VT_NFS) {
9391                         if (eofflag && !full_erase_flag) {
9392                                 full_erase_flag = 1;
9393                                 eofflag = 0;
9394                                 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9395                         } else if (!eofflag && full_erase_flag) {
9396                                 full_erase_flag = 0;
9397                         }
9398                 }
9399         } while (!eofflag);
9400
9401
9402         error = 0;
9403
9404 outsc:
9405         if (open_flag) {
9406                 VNOP_CLOSE(vp, FREAD, ctx);
9407         }
9408
9409         if (auio) {
9410                 uio_free(auio);
9411         }
9412         FREE(rbuf, M_TEMP);
9413
9414         vnode_resume(vp);
9415
9416
9417         return error;
9418 }
9419
9420
9421 void
9422 lock_vnode_and_post(vnode_t vp, int kevent_num)
9423 {
9424         /* Only take the lock if there's something there! */
9425         if (vp->v_knotes.slh_first != NULL) {
9426                 vnode_lock(vp);
9427                 KNOTE(&vp->v_knotes, kevent_num);
9428                 vnode_unlock(vp);
9429         }
9430 }
9431
9432 void panic_print_vnodes(void);
9433
9434 /* define PANIC_PRINTS_VNODES only if investigation is required. */
9435 #ifdef PANIC_PRINTS_VNODES
9436
9437 static const char *
9438 __vtype(uint16_t vtype)
9439 {
9440         switch (vtype) {
9441         case VREG:
9442                 return "R";
9443         case VDIR:
9444                 return "D";
9445         case VBLK:
9446                 return "B";
9447         case VCHR:
9448                 return "C";
9449         case VLNK:
9450                 return "L";
9451         case VSOCK:
9452                 return "S";
9453         case VFIFO:
9454                 return "F";
9455         case VBAD:
9456                 return "x";
9457         case VSTR:
9458                 return "T";
9459         case VCPLX:
9460                 return "X";
9461         default:
9462                 return "?";
9463         }
9464 }
9465
9466 /*
9467  * build a path from the bottom up
9468  * NOTE: called from the panic path - no alloc'ing of memory and no locks!
9469  */
9470 static char *
9471 __vpath(vnode_t vp, char *str, int len, int depth)
9472 {
9473         int vnm_len;
9474         const char *src;
9475         char *dst;
9476
9477         if (len <= 0) {
9478                 return str;
9479         }
9480         /* str + len is the start of the string we created */
9481         if (!vp->v_name) {
9482                 return str + len;
9483         }
9484
9485         /* follow mount vnodes to get the full path */
9486         if ((vp->v_flag & VROOT)) {
9487                 if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
9488                         return __vpath(vp->v_mount->mnt_vnodecovered,
9489                                    str, len, depth + 1);
9490                 }
9491                 return str + len;
9492         }
9493
9494         src = vp->v_name;
9495         vnm_len = strlen(src);
9496         if (vnm_len > len) {
9497                 /* truncate the name to fit in the string */
9498                 src += (vnm_len - len);
9499                 vnm_len = len;
9500         }
9501
9502         /* start from the back and copy just characters (no NULLs) */
9503
9504         /* this will chop off leaf path (file) names */
9505         if (depth > 0) {
9506                 dst = str + len - vnm_len;
9507                 memcpy(dst, src, vnm_len);
9508                 len -= vnm_len;
9509         } else {
9510                 dst = str + len;
9511         }
9512
9513         if (vp->v_parent && len > 1) {
9514                 /* follow parents up the chain */
9515                 len--;
9516                 *(dst - 1) = '/';
9517                 return __vpath(vp->v_parent, str, len, depth + 1);
9518         }
9519
9520         return dst;
9521 }
9522
9523 #define SANE_VNODE_PRINT_LIMIT 5000
9524 void
9525 panic_print_vnodes(void)
9526 {
9527         mount_t mnt;
9528         vnode_t vp;
9529         int nvnodes = 0;
9530         const char *type;
9531         char *nm;
9532         char vname[257];
9533
9534         paniclog_append_noflush("\n***** VNODES *****\n"
9535             "TYPE UREF ICNT PATH\n");
9536
9537         /* NULL-terminate the path name */
9538         vname[sizeof(vname) - 1] = '\0';
9539
9540         /*
9541          * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
9542          */
9543         TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
9544                 if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
9545                         paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
9546                             &mountlist, mnt);
9547                         break;
9548                 }
9549
9550                 TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
9551                         if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
9552                                 paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
9553                                     &mnt->mnt_vnodelist, vp);
9554                                 break;
9555                         }
9556
9557                         if (++nvnodes > SANE_VNODE_PRINT_LIMIT) {
9558                                 return;
9559                         }
9560                         type = __vtype(vp->v_type);
9561                         nm = __vpath(vp, vname, sizeof(vname) - 1, 0);
9562                         paniclog_append_noflush("%s %0d %0d %s\n",
9563                             type, vp->v_usecount, vp->v_iocount, nm);
9564                 }
9565         }
9566 }
9567
9568 #else /* !PANIC_PRINTS_VNODES */
9569 void
9570 panic_print_vnodes(void)
9571 {
9572         return;
9573 }
9574 #endif
9575
9576
9577 #ifdef JOE_DEBUG
9578 static void
9579 record_vp(vnode_t vp, int count)
9580 {
9581         struct uthread *ut;
9582
9583 #if CONFIG_TRIGGERS
9584         if (vp->v_resolve) {
9585                 return;
9586         }
9587 #endif
9588         if ((vp->v_flag & VSYSTEM)) {
9589                 return;
9590         }
9591
9592         ut = get_bsdthread_info(current_thread());
9593         ut->uu_iocount += count;
9594
9595         if (count == 1) {
9596                 if (ut->uu_vpindex < 32) {
9597                         OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);
9598
9599                         ut->uu_vps[ut->uu_vpindex] = vp;
9600                         ut->uu_vpindex++;
9601                 }
9602         }
9603 }
9604 #endif
9605
9606
9607 #if CONFIG_TRIGGERS
9608
9609 #define TRIG_DEBUG 0
9610
9611 #if TRIG_DEBUG
9612 #define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
9613 #else
9614 #define TRIG_LOG(...)
9615 #endif
9616
9617 /*
9618  * Resolver result functions
9619  */
9620
9621 resolver_result_t
9622 vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
9623 {
9624         /*
9625          * |<---   32   --->|<---  28  --->|<- 4 ->|
9626          *      sequence        auxiliary    status
9627          */
9628         return (((uint64_t)seq) << 32) |
9629                (((uint64_t)(aux & 0x0fffffff)) << 4) |
9630                (uint64_t)(stat & 0x0000000F);
9631 }
9632
9633 enum resolver_status
9634 vfs_resolver_status(resolver_result_t result)
9635 {
9636         /* lower 4 bits is status */
9637         return result & 0x0000000F;
9638 }
9639
9640 uint32_t
9641 vfs_resolver_sequence(resolver_result_t result)
9642 {
9643         /* upper 32 bits is sequence */
9644         return (uint32_t)(result >> 32);
9645 }
9646
9647 int
9648 vfs_resolver_auxiliary(resolver_result_t result)
9649 {
9650         /* 28 bits of auxiliary */
9651         return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
9652 }
9653
9654 /*
9655  * SPI
9656  * Call in for resolvers to update vnode trigger state
9657  */
9658 int
9659 vnode_trigger_update(vnode_t vp, resolver_result_t result)
9660 {
9661         vnode_resolve_t rp;
9662         uint32_t seq;
9663         enum resolver_status stat;
9664
9665         if (vp->v_resolve == NULL) {
9666                 return EINVAL;
9667         }
9668
9669         stat = vfs_resolver_status(result);
9670         seq = vfs_resolver_sequence(result);
9671
9672         if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
9673                 return EINVAL;
9674         }
9675
9676         rp = vp->v_resolve;
9677         lck_mtx_lock(&rp->vr_lock);
9678
9679         if (seq > rp->vr_lastseq) {
9680                 if (stat == RESOLVER_RESOLVED) {
9681                         rp->vr_flags |= VNT_RESOLVED;
9682                 } else {
9683                         rp->vr_flags &= ~VNT_RESOLVED;
9684                 }
9685
9686                 rp->vr_lastseq = seq;
9687         }
9688
9689         lck_mtx_unlock(&rp->vr_lock);
9690
9691         return 0;
9692 }
9693
9694 static int
9695 vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
9696 {
9697         int error;
9698
9699         vnode_lock_spin(vp);
9700         if (vp->v_resolve != NULL) {
9701                 vnode_unlock(vp);
9702                 return EINVAL;
9703         } else {
9704                 vp->v_resolve = rp;
9705         }
9706         vnode_unlock(vp);
9707
9708         if (ref) {
9709                 error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
9710                 if (error != 0) {
9711                         panic("VNODE_REF_FORCE didn't help...");
9712                 }
9713         }
9714
9715         return 0;
9716 }
9717
9718 /*
9719  * VFS internal interfaces for vnode triggers
9720  *
9721  * vnode must already have an io count on entry
9722  * v_resolve is stable when io count is non-zero
9723  */
9724 static int
9725 vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
9726 {
9727         vnode_resolve_t rp;
9728         int result;
9729         char byte;
9730
9731 #if 1
9732         /* minimum pointer test (debugging) */
9733         if (tinfo->vnt_data) {
9734                 byte = *((char *)tinfo->vnt_data);
9735         }
9736 #endif
9737         MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK);
9738         if (rp == NULL) {
9739                 return ENOMEM;
9740         }
9741
9742         lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);
9743
9744         rp->vr_resolve_func = tinfo->vnt_resolve_func;
9745         rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
9746         rp->vr_rearm_func = tinfo->vnt_rearm_func;
9747         rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
9748         rp->vr_data = tinfo->vnt_data;
9749         rp->vr_lastseq = 0;
9750         rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
9751         if (external) {
9752                 rp->vr_flags |= VNT_EXTERNAL;
9753         }
9754
9755         result = vnode_resolver_attach(vp, rp, external);
9756         if (result != 0) {
9757                 goto out;
9758         }
9759
9760         if (mp) {
9761                 OSAddAtomic(1, &mp->mnt_numtriggers);
9762         }
9763
9764         return result;
9765
9766 out:
9767         FREE(rp, M_TEMP);
9768         return result;
9769 }
9770
9771 static void
9772 vnode_resolver_release(vnode_resolve_t rp)
9773 {
9774         /*
9775          * Give them a chance to free any private data
9776          */
9777         if (rp->vr_data && rp->vr_reclaim_func) {
9778                 rp->vr_reclaim_func(NULLVP, rp->vr_data);
9779         }
9780
9781         lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
9782         FREE(rp, M_TEMP);
9783 }
9784
9785 /* Called after the vnode has been drained */
9786 static void
9787 vnode_resolver_detach(vnode_t vp)
9788 {
9789         vnode_resolve_t rp;
9790         mount_t mp;
9791
9792         mp = vnode_mount(vp);
9793
9794         vnode_lock(vp);
9795         rp = vp->v_resolve;
9796         vp->v_resolve = NULL;
9797         vnode_unlock(vp);
9798
9799         if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
9800                 vnode_rele_ext(vp, O_EVTONLY, 1);
9801         }
9802
9803         vnode_resolver_release(rp);
9804
9805         /* Keep count of active trigger vnodes per mount */
9806         OSAddAtomic(-1, &mp->mnt_numtriggers);
9807 }
9808
9809 __private_extern__
9810 void
9811 vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
9812 {
9813         vnode_resolve_t rp;
9814         resolver_result_t result;
9815         enum resolver_status status;
9816         uint32_t seq;
9817
9818         if ((vp->v_resolve == NULL) ||
9819             (vp->v_resolve->vr_rearm_func == NULL) ||
9820             (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
9821                 return;
9822         }
9823
9824         rp = vp->v_resolve;
9825         lck_mtx_lock(&rp->vr_lock);
9826
9827         /*
9828          * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
9829          */
9830         if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
9831                 lck_mtx_unlock(&rp->vr_lock);
9832                 return;
9833         }
9834
9835         /* Check if this vnode is already armed */
9836         if ((rp->vr_flags & VNT_RESOLVED) == 0) {
9837                 lck_mtx_unlock(&rp->vr_lock);
9838                 return;
9839         }
9840
9841         lck_mtx_unlock(&rp->vr_lock);
9842
9843         result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
9844         status = vfs_resolver_status(result);
9845         seq = vfs_resolver_sequence(result);
9846
9847         lck_mtx_lock(&rp->vr_lock);
9848         if (seq > rp->vr_lastseq) {
9849                 if (status == RESOLVER_UNRESOLVED) {
9850                         rp->vr_flags &= ~VNT_RESOLVED;
9851                 }
9852                 rp->vr_lastseq = seq;
9853         }
9854         lck_mtx_unlock(&rp->vr_lock);
9855 }
9856
9857 __private_extern__
9858 int
9859 vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
9860 {
9861         vnode_resolve_t rp;
9862         enum path_operation op;
9863         resolver_result_t result;
9864         enum resolver_status status;
9865         uint32_t seq;
9866
9867         /* Only trigger on topmost vnodes */
9868         if ((vp->v_resolve == NULL) ||
9869             (vp->v_resolve->vr_resolve_func == NULL) ||
9870             (vp->v_mountedhere != NULL)) {
9871                 return 0;
9872         }
9873
9874         rp = vp->v_resolve;
9875         lck_mtx_lock(&rp->vr_lock);
9876
9877         /* Check if this vnode is already resolved */
9878         if (rp->vr_flags & VNT_RESOLVED) {
9879                 lck_mtx_unlock(&rp->vr_lock);
9880                 return 0;
9881         }
9882
9883         lck_mtx_unlock(&rp->vr_lock);
9884
9885 #if CONFIG_MACF
9886         int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
9887         if (rv != 0) {
9888                 return rv;
9889         }
9890 #endif
9891
9892         /*
9893          * XXX
9894          * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
9895          * is there anyway to know this???
9896          * there can also be other legitimate lookups in parallel
9897          *
9898          * XXX - should we call this on a separate thread with a timeout?
9899          *
9900          * XXX - should we use ISLASTCN to pick the op value???  Perhaps only leafs should
9901          * get the richer set and non-leafs should get generic OP_LOOKUP?  TBD
9902          */
9903         op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;
9904
9905         result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
9906         status = vfs_resolver_status(result);
9907         seq = vfs_resolver_sequence(result);
9908
9909         lck_mtx_lock(&rp->vr_lock);
9910         if (seq > rp->vr_lastseq) {
9911                 if (status == RESOLVER_RESOLVED) {
9912                         rp->vr_flags |= VNT_RESOLVED;
9913                 }
9914                 rp->vr_lastseq = seq;
9915         }
9916         lck_mtx_unlock(&rp->vr_lock);
9917
9918         /* On resolver errors, propagate the error back up */
9919         return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
9920 }
9921
9922 static int
9923 vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
9924 {
9925         vnode_resolve_t rp;
9926         resolver_result_t result;
9927         enum resolver_status status;
9928         uint32_t seq;
9929
9930         if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) {
9931                 return 0;
9932         }
9933
9934         rp = vp->v_resolve;
9935         lck_mtx_lock(&rp->vr_lock);
9936
9937         /* Check if this vnode is already resolved */
9938         if ((rp->vr_flags & VNT_RESOLVED) == 0) {
9939                 printf("vnode_trigger_unresolve: not currently resolved\n");
9940                 lck_mtx_unlock(&rp->vr_lock);
9941                 return 0;
9942         }
9943
9944         rp->vr_flags |= VNT_VFS_UNMOUNTED;
9945
9946         lck_mtx_unlock(&rp->vr_lock);
9947
9948         /*
9949          * XXX
9950          * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
9951          * there can also be other legitimate lookups in parallel
9952          *
9953          * XXX - should we call this on a separate thread with a timeout?
9954          */
9955
9956         result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
9957         status = vfs_resolver_status(result);
9958         seq = vfs_resolver_sequence(result);
9959
9960         lck_mtx_lock(&rp->vr_lock);
9961         if (seq > rp->vr_lastseq) {
9962                 if (status == RESOLVER_UNRESOLVED) {
9963                         rp->vr_flags &= ~VNT_RESOLVED;
9964                 }
9965                 rp->vr_lastseq = seq;
9966         }
9967         rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
9968         lck_mtx_unlock(&rp->vr_lock);
9969
9970         /* On resolver errors, propagate the error back up */
9971         return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
9972 }
9973
9974 static int
9975 triggerisdescendant(mount_t mp, mount_t rmp)
9976 {
9977         int match = FALSE;
9978
9979         /*
9980          * walk up vnode covered chain looking for a match
9981          */
9982         name_cache_lock_shared();
9983
9984         while (1) {
9985                 vnode_t vp;
9986
9987                 /* did we encounter "/" ? */
9988                 if (mp->mnt_flag & MNT_ROOTFS) {
9989                         break;
9990                 }
9991
9992                 vp = mp->mnt_vnodecovered;
9993                 if (vp == NULLVP) {
9994                         break;
9995                 }
9996
9997                 mp = vp->v_mount;
9998                 if (mp == rmp) {
9999                         match = TRUE;
10000                         break;
10001                 }
10002         }
10003
10004         name_cache_unlock();
10005
10006         return match;
10007 }
10008
10009 struct trigger_unmount_info {
10010         vfs_context_t   ctx;
10011         mount_t         top_mp;
10012         vnode_t         trigger_vp;
10013         mount_t         trigger_mp;
10014         uint32_t        trigger_vid;
10015         int             flags;
10016 };
10017
10018 static int
10019 trigger_unmount_callback(mount_t mp, void * arg)
10020 {
10021         struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
10022         boolean_t mountedtrigger = FALSE;
10023
10024         /*
10025          * When we encounter the top level mount we're done
10026          */
10027         if (mp == infop->top_mp) {
10028                 return VFS_RETURNED_DONE;
10029         }
10030
10031         if ((mp->mnt_vnodecovered == NULL) ||
10032             (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
10033                 return VFS_RETURNED;
10034         }
10035
10036         if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
10037             (mp->mnt_vnodecovered->v_resolve != NULL) &&
10038             (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
10039                 mountedtrigger = TRUE;
10040         }
10041         vnode_put(mp->mnt_vnodecovered);
10042
10043         /*
10044          * When we encounter a mounted trigger, check if its under the top level mount
10045          */
10046         if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
10047                 return VFS_RETURNED;
10048         }
10049
10050         /*
10051          * Process any pending nested mount (now that its not referenced)
10052          */
10053         if ((infop->trigger_vp != NULLVP) &&
10054             (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
10055                 vnode_t vp = infop->trigger_vp;
10056                 int error;
10057
10058                 infop->trigger_vp = NULLVP;
10059
10060                 if (mp == vp->v_mountedhere) {
10061                         vnode_put(vp);
10062                         printf("trigger_unmount_callback: unexpected match '%s'\n",
10063                             mp->mnt_vfsstat.f_mntonname);
10064                         return VFS_RETURNED;
10065                 }
10066                 if (infop->trigger_mp != vp->v_mountedhere) {
10067                         vnode_put(vp);
10068                         printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
10069                             infop->trigger_mp, vp->v_mountedhere);
10070                         goto savenext;
10071                 }
10072
10073                 error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
10074                 vnode_put(vp);
10075                 if (error) {
10076                         printf("unresolving: '%s', err %d\n",
10077                             vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
10078                             "???", error);
10079                         return VFS_RETURNED_DONE; /* stop iteration on errors */
10080                 }
10081         }
10082 savenext:
10083         /*
10084          * We can't call resolver here since we hold a mount iter
10085          * ref on mp so save its covered vp for later processing
10086          */
10087         infop->trigger_vp = mp->mnt_vnodecovered;
10088         if ((infop->trigger_vp != NULLVP) &&
10089             (vnode_getwithref(infop->trigger_vp) == 0)) {
10090                 if (infop->trigger_vp->v_mountedhere == mp) {
10091                         infop->trigger_vid = infop->trigger_vp->v_id;
10092                         infop->trigger_mp = mp;
10093                 }
10094                 vnode_put(infop->trigger_vp);
10095         }
10096
10097         return VFS_RETURNED;
10098 }
10099
10100 /*
10101  * Attempt to unmount any trigger mounts nested underneath a mount.
10102  * This is a best effort attempt and no retries are performed here.
10103  *
10104  * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull)
10105  */
10106 __private_extern__
10107 void
10108 vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
10109 {
10110         struct trigger_unmount_info info;
10111
10112         /* Must have trigger vnodes */
10113         if (mp->mnt_numtriggers == 0) {
10114                 return;
10115         }
10116         /* Avoid recursive requests (by checking covered vnode) */
10117         if ((mp->mnt_vnodecovered != NULL) &&
10118             (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
10119                 boolean_t recursive = FALSE;
10120
10121                 if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
10122                     (mp->mnt_vnodecovered->v_resolve != NULL) &&
10123                     (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
10124                         recursive = TRUE;
10125                 }
10126                 vnode_put(mp->mnt_vnodecovered);
10127                 if (recursive) {
10128                         return;
10129                 }
10130         }
10131
10132         /*
10133          * Attempt to unmount any nested trigger mounts (best effort)
10134          */
10135         info.ctx = ctx;
10136         info.top_mp = mp;
10137         info.trigger_vp = NULLVP;
10138         info.trigger_vid = 0;
10139         info.trigger_mp = NULL;
10140         info.flags = flags;
10141
10142         (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
10143
10144         /*
10145          * Process remaining nested mount (now that its not referenced)
10146          */
10147         if ((info.trigger_vp != NULLVP) &&
10148             (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
10149                 vnode_t vp = info.trigger_vp;
10150
10151                 if (info.trigger_mp == vp->v_mountedhere) {
10152                         (void) vnode_trigger_unresolve(vp, flags, ctx);
10153                 }
10154                 vnode_put(vp);
10155         }
10156 }
10157
10158 int
10159 vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx)
10160 {
10161         struct nameidata nd;
10162         int res;
10163         vnode_t rvp, vp;
10164         struct vnode_trigger_param vtp;
10165
10166         /*
10167          * Must be called for trigger callback, wherein rwlock is held
10168          */
10169         lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
10170
10171         TRIG_LOG("Adding trigger at %s\n", relpath);
10172         TRIG_LOG("Trying VFS_ROOT\n");
10173
10174         /*
10175          * We do a lookup starting at the root of the mountpoint, unwilling
10176          * to cross into other mountpoints.
10177          */
10178         res = VFS_ROOT(mp, &rvp, ctx);
10179         if (res != 0) {
10180                 goto out;
10181         }
10182
10183         TRIG_LOG("Trying namei\n");
10184
10185         NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE,
10186             CAST_USER_ADDR_T(relpath), ctx);
10187         nd.ni_dvp = rvp;
10188         res = namei(&nd);
10189         if (res != 0) {
10190                 vnode_put(rvp);
10191                 goto out;
10192         }
10193
10194         vp = nd.ni_vp;
10195         nameidone(&nd);
10196         vnode_put(rvp);
10197
10198         TRIG_LOG("Trying vnode_resolver_create()\n");
10199
10200         /*
10201          * Set up blob.  vnode_create() takes a larger structure
10202          * with creation info, and we needed something different
10203          * for this case.  One needs to win, or we need to munge both;
10204          * vnode_create() wins.
10205          */
10206         bzero(&vtp, sizeof(vtp));
10207         vtp.vnt_resolve_func = vtip->vti_resolve_func;
10208         vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
10209         vtp.vnt_rearm_func = vtip->vti_rearm_func;
10210         vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
10211         vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
10212         vtp.vnt_data = vtip->vti_data;
10213         vtp.vnt_flags = vtip->vti_flags;
10214
10215         res = vnode_resolver_create(mp, vp, &vtp, TRUE);
10216         vnode_put(vp);
10217 out:
10218         TRIG_LOG("Returning %d\n", res);
10219         return res;
10220 }
10221
10222 #endif /* CONFIG_TRIGGERS */
10223
10224 vm_offset_t
10225 kdebug_vnode(vnode_t vp)
10226 {
10227         return VM_KERNEL_ADDRPERM(vp);
10228 }
10229
10230 static int flush_cache_on_write = 0;
10231 SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
10232     CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0,
10233     "always flush the drive cache on writes to uncached files");
10234
10235 int
10236 vnode_should_flush_after_write(vnode_t vp, int ioflag)
10237 {
10238         return flush_cache_on_write
10239                && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp));
10240 }
10241
10242 /*
10243  * sysctl for use by disk I/O tracing tools to get the list of existing
10244  * vnodes' paths
10245  */
10246
10247 struct vnode_trace_paths_context {
10248         uint64_t count;
10249         long path[MAXPATHLEN / sizeof(long) + 1];  /* + 1 in case sizeof (long) does not divide MAXPATHLEN */
10250 };
10251
10252 static int
10253 vnode_trace_path_callback(struct vnode *vp, void *arg)
10254 {
10255         int len, rv;
10256         struct vnode_trace_paths_context *ctx;
10257
10258         ctx = arg;
10259
10260         len = sizeof(ctx->path);
10261         rv = vn_getpath(vp, (char *)ctx->path, &len);
10262         /* vn_getpath() NUL-terminates, and len includes the NUL */
10263
10264         if (!rv) {
10265                 kdebug_vfs_lookup(ctx->path, len, vp,
10266                     KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);
10267
10268                 if (++(ctx->count) == 1000) {
10269                         thread_yield_to_preemption();
10270                         ctx->count = 0;
10271                 }
10272         }
10273
10274         return VNODE_RETURNED;
10275 }
10276
10277 static int
10278 vfs_trace_paths_callback(mount_t mp, void *arg)
10279 {
10280         if (mp->mnt_flag & MNT_LOCAL) {
10281                 vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
10282         }
10283
10284         return VFS_RETURNED;
10285 }
10286
10287 static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS {
10288         struct vnode_trace_paths_context ctx;
10289
10290         (void)oidp;
10291         (void)arg1;
10292         (void)arg2;
10293         (void)req;
10294
10295         if (!kauth_cred_issuser(kauth_cred_get())) {
10296                 return EPERM;
10297         }
10298
10299         if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) {
10300                 return EINVAL;
10301         }
10302
10303         bzero(&ctx, sizeof(struct vnode_trace_paths_context));
10304
10305         vfs_iterate(0, vfs_trace_paths_callback, &ctx);
10306
10307         return 0;
10308 }
10309
10310 SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");