bsd/vfs/vfs_subr.c

   1 /*
   2  * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  67  */
  68 /*
  69  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  70  * support for mandatory and extensible security protections.  This notice
  71  * is included in support of clause 2.2 (b) of the Apple Public License,
  72  * Version 2.0.
  73  */
  74
  75 /*
  76  * External virtual filesystem routines
  77  */
  78
  79
  80 #include <sys/param.h>
  81 #include <sys/systm.h>
  82 #include <sys/proc_internal.h>
  83 #include <sys/kauth.h>
  84 #include <sys/mount_internal.h>
  85 #include <sys/time.h>
  86 #include <sys/lock.h>
  87 #include <sys/vnode.h>
  88 #include <sys/vnode_internal.h>
  89 #include <sys/stat.h>
  90 #include <sys/namei.h>
  91 #include <sys/ucred.h>
  92 #include <sys/buf_internal.h>
  93 #include <sys/errno.h>
  94 #include <sys/malloc.h>
  95 #include <sys/uio_internal.h>
  96 #include <sys/uio.h>
  97 #include <sys/domain.h>
  98 #include <sys/mbuf.h>
  99 #include <sys/syslog.h>
 100 #include <sys/ubc_internal.h>
 101 #include <sys/vm.h>
 102 #include <sys/sysctl.h>
 103 #include <sys/filedesc.h>
 104 #include <sys/event.h>
 105 #include <sys/kdebug.h>
 106 #include <sys/kauth.h>
 107 #include <sys/user.h>
 108 #include <sys/kern_memorystatus.h>
 109 #include <miscfs/fifofs/fifo.h>
 110
 111 #include <string.h>
 112 #include <machine/spl.h>
 113
 114
 115 #include <kern/assert.h>
 116
 117 #include <miscfs/specfs/specdev.h>
 118
 119 #include <mach/mach_types.h>
 120 #include <mach/memory_object_types.h>
 121 #include <mach/memory_object_control.h>
 122
 123 #include <kern/kalloc.h>        /* kalloc()/kfree() */
 124 #include <kern/clock.h>         /* delay_for_interval() */
 125 #include <libkern/OSAtomic.h>   /* OSAddAtomic() */
 126
 127
 128 #ifdef JOE_DEBUG
 129 #include <libkern/OSDebug.h>
 130 #endif
 131
 132 #include <vm/vm_protos.h>       /* vnode_pager_vrele() */
 133
 134 #if CONFIG_MACF
 135 #include <security/mac_framework.h>
 136 #endif
 137
 138 extern lck_grp_t *vnode_lck_grp;
 139 extern lck_attr_t *vnode_lck_attr;
 140
 141 #if CONFIG_TRIGGERS
 142 extern lck_grp_t *trigger_vnode_lck_grp;
 143 extern lck_attr_t *trigger_vnode_lck_attr;
 144 #endif
 145
 146 extern lck_mtx_t * mnt_list_mtx_lock;
 147
 148 enum vtype iftovt_tab[16] = {
 149         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 150         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 151 };
 152 int     vttoif_tab[9] = {
 153         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 154         S_IFSOCK, S_IFIFO, S_IFMT,
 155 };
 156
 157
 158 /* XXX These should be in a BSD accessible Mach header, but aren't. */
 159 extern void             memory_object_mark_used(
 160         memory_object_control_t         control);
 161
 162 extern void             memory_object_mark_unused(
 163         memory_object_control_t         control,
 164         boolean_t                       rage);
 165
 166
  167 /* XXX next prototype should be from <nfs/nfs.h> */
 168 extern int       nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
 169
  170 /* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
 171 __private_extern__ void qsort(
 172     void * array,
 173     size_t nmembers,
 174     size_t member_size,
 175     int (*)(const void *, const void *));
 176
 177 extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
 178 __private_extern__ void vntblinit(void);
 179 __private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1,
 180                         unsigned int val2);
 181 __private_extern__ int unlink1(vfs_context_t, struct nameidata *, int);
 182
 183 extern int system_inshutdown;
 184
 185 static void vnode_list_add(vnode_t);
 186 static void vnode_list_remove(vnode_t);
 187 static void vnode_list_remove_locked(vnode_t);
 188
 189 static errno_t vnode_drain(vnode_t);
 190 static void vgone(vnode_t, int flags);
 191 static void vclean(vnode_t vp, int flag);
 192 static void vnode_reclaim_internal(vnode_t, int, int, int);
 193
 194 static void vnode_dropiocount (vnode_t);
 195
 196 static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
 197 static int  vnode_reload(vnode_t);
 198 static int  vnode_isinuse_locked(vnode_t, int, int);
 199
 200 static void insmntque(vnode_t vp, mount_t mp);
 201 static int mount_getvfscnt(void);
 202 static int mount_fillfsids(fsid_t *, int );
 203 static void vnode_iterate_setup(mount_t);
 204 int vnode_umount_preflight(mount_t, vnode_t, int);
 205 static int vnode_iterate_prepare(mount_t);
 206 static int vnode_iterate_reloadq(mount_t);
 207 static void vnode_iterate_clear(mount_t);
 208 static mount_t vfs_getvfs_locked(fsid_t *);
 209 static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
 210                 struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
 211 static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
 212
 213 errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 214
 215 #ifdef JOE_DEBUG
 216 static void record_vp(vnode_t vp, int count);
 217 #endif
 218
 219 #if CONFIG_TRIGGERS
 220 static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
 221 static void vnode_resolver_detach(vnode_t);
 222 #endif
 223
 224 TAILQ_HEAD(freelst, vnode) vnode_free_list;     /* vnode free list */
 225 TAILQ_HEAD(deadlst, vnode) vnode_dead_list;     /* vnode dead list */
 226
 227 TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
 228 struct timeval rage_tv;
 229 int     rage_limit = 0;
 230 int     ragevnodes = 0;
 231
 232 #define RAGE_LIMIT_MIN  100
 233 #define RAGE_TIME_LIMIT 5
 234
 235 struct mntlist mountlist;                       /* mounted filesystem list */
 236 static int nummounts = 0;
 237
 238 #if DIAGNOSTIC
 239 #define VLISTCHECK(fun, vp, list)       \
 240         if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
 241                 panic("%s: %s vnode not on %slist", (fun), (list), (list));
 242 #else
 243 #define VLISTCHECK(fun, vp, list)
 244 #endif /* DIAGNOSTIC */
 245
 246 #define VLISTNONE(vp)   \
 247         do {    \
 248                 (vp)->v_freelist.tqe_next = (struct vnode *)0;  \
 249                 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;   \
 250         } while(0)
 251
 252 #define VONLIST(vp)     \
 253         ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
 254
 255 /* remove a vnode from free vnode list */
 256 #define VREMFREE(fun, vp)       \
 257         do {    \
 258                 VLISTCHECK((fun), (vp), "free");        \
 259                 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);       \
 260                 VLISTNONE((vp));        \
 261                 freevnodes--;   \
 262         } while(0)
 263
 264
 265
 266 /* remove a vnode from dead vnode list */
 267 #define VREMDEAD(fun, vp)       \
 268         do {    \
 269                 VLISTCHECK((fun), (vp), "dead");        \
 270                 TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);       \
 271                 VLISTNONE((vp));        \
 272                 vp->v_listflag &= ~VLIST_DEAD;  \
 273                 deadvnodes--;   \
 274         } while(0)
 275
 276
 277 /* remove a vnode from rage vnode list */
 278 #define VREMRAGE(fun, vp)       \
 279         do {    \
 280                 if ( !(vp->v_listflag & VLIST_RAGE))                    \
 281                         panic("VREMRAGE: vp not on rage list");         \
 282                 VLISTCHECK((fun), (vp), "rage");                        \
 283                 TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);       \
 284                 VLISTNONE((vp));                \
 285                 vp->v_listflag &= ~VLIST_RAGE;  \
 286                 ragevnodes--;                   \
 287         } while(0)
 288
 289
 290 /*
 291  * vnodetarget hasn't been used in a long time, but
 292  * it was exported for some reason... I'm leaving in
 293  * place for now...  it should be deprecated out of the
 294  * exports and removed eventually.
 295  */
 296 u_int32_t vnodetarget;          /* target for vnreclaim() */
 297 #define VNODE_FREE_TARGET       20      /* Default value for vnodetarget */
 298
 299 /*
 300  * We need quite a few vnodes on the free list to sustain the
 301  * rapid stat() the compilation process does, and still benefit from the name
 302  * cache. Having too few vnodes on the free list causes serious disk
 303  * thrashing as we cycle through them.
 304  */
 305 #define VNODE_FREE_MIN          CONFIG_VNODE_FREE_MIN   /* freelist should have at least this many */
 306
 307 /*
 308  * Initialize the vnode management data structures.
 309  */
 310 __private_extern__ void
 311 vntblinit(void)
 312 {
 313         TAILQ_INIT(&vnode_free_list);
 314         TAILQ_INIT(&vnode_rage_list);
 315         TAILQ_INIT(&vnode_dead_list);
 316         TAILQ_INIT(&mountlist);
 317
 318         if (!vnodetarget)
 319                 vnodetarget = VNODE_FREE_TARGET;
 320
 321         microuptime(&rage_tv);
 322         rage_limit = desiredvnodes / 100;
 323
 324         if (rage_limit < RAGE_LIMIT_MIN)
 325                 rage_limit = RAGE_LIMIT_MIN;
 326
 327         /*
 328          * Scale the vm_object_cache to accomodate the vnodes
 329          * we want to cache
 330          */
 331         (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
 332 }
 333
 334 /* Reset the VM Object Cache with the values passed in */
 335 __private_extern__ kern_return_t
 336 reset_vmobjectcache(unsigned int val1, unsigned int val2)
 337 {
 338         vm_size_t oval = val1 - VNODE_FREE_MIN;
 339         vm_size_t nval;
 340
 341         if (val1 == val2) {
 342                 return KERN_SUCCESS;
 343         }
 344
 345         if(val2 < VNODE_FREE_MIN)
 346                 nval = 0;
 347         else
 348                 nval = val2 - VNODE_FREE_MIN;
 349
 350         return(adjust_vm_object_cache(oval, nval));
 351 }
 352
 353
 354 /* the timeout is in 10 msecs */
 355 int
 356 vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) {
 357         int error = 0;
 358         struct timespec ts;
 359
 360         KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
 361
 362         if (vp->v_numoutput > output_target) {
 363
 364                 slpflag |= PDROP;
 365
 366                 vnode_lock_spin(vp);
 367
 368                 while ((vp->v_numoutput > output_target) && error == 0) {
 369                         if (output_target)
 370                                 vp->v_flag |= VTHROTTLED;
 371                         else
 372                                 vp->v_flag |= VBWAIT;
 373
 374                         ts.tv_sec = (slptimeout/100);
 375                         ts.tv_nsec = (slptimeout % 1000)  * 10 * NSEC_PER_USEC * 1000 ;
 376                         error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
 377
 378                         vnode_lock_spin(vp);
 379                 }
 380                 vnode_unlock(vp);
 381         }
 382         KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
 383
 384         return error;
 385 }
 386
 387
 388 void
 389 vnode_startwrite(vnode_t vp) {
 390
 391         OSAddAtomic(1, &vp->v_numoutput);
 392 }
 393
 394
 395 void
 396 vnode_writedone(vnode_t vp)
 397 {
 398         if (vp) {
 399                 int need_wakeup = 0;
 400
 401                 OSAddAtomic(-1, &vp->v_numoutput);
 402
 403                 vnode_lock_spin(vp);
 404
 405                 if (vp->v_numoutput < 0)
 406                         panic("vnode_writedone: numoutput < 0");
 407
 408                 if ((vp->v_flag & VTHROTTLED)) {
 409                         vp->v_flag &= ~VTHROTTLED;
 410                         need_wakeup = 1;
 411                 }
 412                 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
 413                         vp->v_flag &= ~VBWAIT;
 414                         need_wakeup = 1;
 415                 }
 416                 vnode_unlock(vp);
 417
 418                 if (need_wakeup)
 419                         wakeup((caddr_t)&vp->v_numoutput);
 420         }
 421 }
 422
 423
 424
 425 int
 426 vnode_hasdirtyblks(vnode_t vp)
 427 {
 428         struct cl_writebehind *wbp;
 429
 430         /*
 431          * Not taking the buf_mtxp as there is little
 432          * point doing it. Even if the lock is taken the
 433          * state can change right after that. If their
 434          * needs to be a synchronization, it must be driven
 435          * by the caller
 436          */
 437         if (vp->v_dirtyblkhd.lh_first)
 438                 return (1);
 439
 440         if (!UBCINFOEXISTS(vp))
 441                 return (0);
 442
 443         wbp = vp->v_ubcinfo->cl_wbehind;
 444
 445         if (wbp && (wbp->cl_number || wbp->cl_scmap))
 446                 return (1);
 447
 448         return (0);
 449 }
 450
 451 int
 452 vnode_hascleanblks(vnode_t vp)
 453 {
 454         /*
 455          * Not taking the buf_mtxp as there is little
 456          * point doing it. Even if the lock is taken the
 457          * state can change right after that. If their
 458          * needs to be a synchronization, it must be driven
 459          * by the caller
 460          */
 461         if (vp->v_cleanblkhd.lh_first)
 462                 return (1);
 463         return (0);
 464 }
 465
 466 void
 467 vnode_iterate_setup(mount_t mp)
 468 {
 469         while (mp->mnt_lflag & MNT_LITER) {
 470                 mp->mnt_lflag |= MNT_LITERWAIT;
 471                 msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", NULL);
 472         }
 473
 474         mp->mnt_lflag |= MNT_LITER;
 475
 476 }
 477
 478 int
 479 vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
 480 {
 481         vnode_t vp;
 482
 483         TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 484                 /* disable preflight only for udf, a hack to be removed after 4073176 is fixed */
 485                 if (vp->v_tag == VT_UDF)
 486                         return 0;
 487                 if (vp->v_type == VDIR)
 488                         continue;
 489                 if (vp == skipvp)
 490                         continue;
 491                 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
 492             (vp->v_flag & VNOFLUSH)))
 493                         continue;
 494                 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP))
 495                         continue;
 496                 if ((flags & WRITECLOSE) &&
 497             (vp->v_writecount == 0 || vp->v_type != VREG))
 498                         continue;
 499                 /* Look for busy vnode */
 500         if (((vp->v_usecount != 0) &&
 501             ((vp->v_usecount - vp->v_kusecount) != 0)))
 502                         return(1);
 503                 }
 504
 505         return(0);
 506 }
 507
 508 /*
 509  * This routine prepares iteration by moving all the vnodes to worker queue
 510  * called with mount lock held
 511  */
 512 int
 513 vnode_iterate_prepare(mount_t mp)
 514 {
 515         vnode_t vp;
 516
 517         if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
 518                 /* nothing to do */
 519                 return (0);
 520         }
 521
 522         vp = TAILQ_FIRST(&mp->mnt_vnodelist);
 523         vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
 524         mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
 525         mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
 526
 527         TAILQ_INIT(&mp->mnt_vnodelist);
 528         if (mp->mnt_newvnodes.tqh_first != NULL)
 529                 panic("vnode_iterate_prepare: newvnode when entering vnode");
 530         TAILQ_INIT(&mp->mnt_newvnodes);
 531
 532         return (1);
 533 }
 534
 535
 536 /* called with mount lock held */
 537 int
 538 vnode_iterate_reloadq(mount_t mp)
 539 {
 540         int moved = 0;
 541
 542         /* add the remaining entries in workerq to the end of mount vnode list */
 543         if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
 544                 struct vnode * mvp;
 545                 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
 546
 547                 /* Joining the workerque entities to mount vnode list */
 548                 if (mvp)
 549                         mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
 550                 else
 551                         mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
 552                 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
 553                 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
 554                 TAILQ_INIT(&mp->mnt_workerqueue);
 555         }
 556
 557         /* add the newvnodes to the head of mount vnode list */
 558         if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
 559                 struct vnode * nlvp;
 560                 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
 561
 562                 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
 563                 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
 564                 if(mp->mnt_vnodelist.tqh_first)
 565                         mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
 566                 else
 567                         mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
 568                 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
 569                 TAILQ_INIT(&mp->mnt_newvnodes);
 570                 moved = 1;
 571         }
 572
 573         return(moved);
 574 }
 575
 576
 577 void
 578 vnode_iterate_clear(mount_t mp)
 579 {
 580         mp->mnt_lflag &= ~MNT_LITER;
 581         if (mp->mnt_lflag & MNT_LITERWAIT) {
 582                 mp->mnt_lflag &= ~MNT_LITERWAIT;
 583                 wakeup(mp);
 584         }
 585 }
 586
 587
 588 int
 589 vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
 590               void *arg)
 591 {
 592         struct vnode *vp;
 593         int vid, retval;
 594         int ret = 0;
 595
 596         mount_lock(mp);
 597
 598         vnode_iterate_setup(mp);
 599
 600         /* it is returns 0 then there is nothing to do */
 601         retval = vnode_iterate_prepare(mp);
 602
 603         if (retval == 0)  {
 604                 vnode_iterate_clear(mp);
 605                 mount_unlock(mp);
 606                 return(ret);
 607         }
 608
 609         /* iterate over all the vnodes */
 610         while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
 611                 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
 612                 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
 613                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
 614                 vid = vp->v_id;
 615                 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
 616                         continue;
 617                 }
 618                 mount_unlock(mp);
 619
 620                 if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) {
 621                         mount_lock(mp);
 622                         continue;
 623                 }
 624                 if (flags & VNODE_RELOAD) {
 625                         /*
 626                          * we're reloading the filesystem
 627                          * cast out any inactive vnodes...
 628                          */
 629                         if (vnode_reload(vp)) {
 630                                 /* vnode will be recycled on the refcount drop */
 631                                 vnode_put(vp);
 632                                 mount_lock(mp);
 633                                 continue;
 634                         }
 635                 }
 636
 637                 retval = callout(vp, arg);
 638
 639                 switch (retval) {
 640                   case VNODE_RETURNED:
 641                   case VNODE_RETURNED_DONE:
 642                           vnode_put(vp);
 643                           if (retval == VNODE_RETURNED_DONE) {
 644                                 mount_lock(mp);
 645                                 ret = 0;
 646                                 goto out;
 647                           }
 648                           break;
 649
 650                   case VNODE_CLAIMED_DONE:
 651                                 mount_lock(mp);
 652                                 ret = 0;
 653                                 goto out;
 654                   case VNODE_CLAIMED:
 655                   default:
 656                                 break;
 657                 }
 658                 mount_lock(mp);
 659         }
 660
 661 out:
 662         (void)vnode_iterate_reloadq(mp);
 663         vnode_iterate_clear(mp);
 664         mount_unlock(mp);
 665         return (ret);
 666 }
 667
 668 void
 669 mount_lock_renames(mount_t mp)
 670 {
 671         lck_mtx_lock(&mp->mnt_renamelock);
 672 }
 673
 674 void
 675 mount_unlock_renames(mount_t mp)
 676 {
 677         lck_mtx_unlock(&mp->mnt_renamelock);
 678 }
 679
 680 void
 681 mount_lock(mount_t mp)
 682 {
 683         lck_mtx_lock(&mp->mnt_mlock);
 684 }
 685
 686 void
 687 mount_lock_spin(mount_t mp)
 688 {
 689         lck_mtx_lock_spin(&mp->mnt_mlock);
 690 }
 691
 692 void
 693 mount_unlock(mount_t mp)
 694 {
 695         lck_mtx_unlock(&mp->mnt_mlock);
 696 }
 697
 698
 699 void
 700 mount_ref(mount_t mp, int locked)
 701 {
 702         if ( !locked)
 703                 mount_lock_spin(mp);
 704
 705         mp->mnt_count++;
 706
 707         if ( !locked)
 708                 mount_unlock(mp);
 709 }
 710
 711
 712 void
 713 mount_drop(mount_t mp, int locked)
 714 {
 715         if ( !locked)
 716                 mount_lock_spin(mp);
 717
 718         mp->mnt_count--;
 719
 720         if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN))
 721                 wakeup(&mp->mnt_lflag);
 722
 723         if ( !locked)
 724                 mount_unlock(mp);
 725 }
 726
 727
 728 int
 729 mount_iterref(mount_t mp, int locked)
 730 {
 731         int retval = 0;
 732
 733         if (!locked)
 734                 mount_list_lock();
 735         if (mp->mnt_iterref < 0) {
 736                 retval = 1;
 737         } else {
 738                 mp->mnt_iterref++;
 739         }
 740         if (!locked)
 741                 mount_list_unlock();
 742         return(retval);
 743 }
 744
 745 int
 746 mount_isdrained(mount_t mp, int locked)
 747 {
 748         int retval;
 749
 750         if (!locked)
 751                 mount_list_lock();
 752         if (mp->mnt_iterref < 0)
 753                 retval = 1;
 754         else
 755                 retval = 0;
 756         if (!locked)
 757                 mount_list_unlock();
 758         return(retval);
 759 }
 760
 761 void
 762 mount_iterdrop(mount_t mp)
 763 {
 764         mount_list_lock();
 765         mp->mnt_iterref--;
 766         wakeup(&mp->mnt_iterref);
 767         mount_list_unlock();
 768 }
 769
 770 void
 771 mount_iterdrain(mount_t mp)
 772 {
 773         mount_list_lock();
 774         while (mp->mnt_iterref)
 775                 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
 776         /* mount iterations drained */
 777         mp->mnt_iterref = -1;
 778         mount_list_unlock();
 779 }
 780 void
 781 mount_iterreset(mount_t mp)
 782 {
 783         mount_list_lock();
 784         if (mp->mnt_iterref == -1)
 785                 mp->mnt_iterref = 0;
 786         mount_list_unlock();
 787 }
 788
 789 /* always called with  mount lock held */
 790 int
 791 mount_refdrain(mount_t mp)
 792 {
 793         if (mp->mnt_lflag & MNT_LDRAIN)
 794                 panic("already in drain");
 795         mp->mnt_lflag |= MNT_LDRAIN;
 796
 797         while (mp->mnt_count)
 798                 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
 799
 800         if (mp->mnt_vnodelist.tqh_first != NULL)
 801                  panic("mount_refdrain: dangling vnode");
 802
 803         mp->mnt_lflag &= ~MNT_LDRAIN;
 804
 805         return(0);
 806 }
 807
 808 /* Tags the mount point as not supportine extended readdir for NFS exports */
 809 void
 810 mount_set_noreaddirext(mount_t mp) {
 811         mount_lock (mp);
 812         mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
 813         mount_unlock (mp);
 814 }
 815
 816 /*
 817  * Mark a mount point as busy. Used to synchronize access and to delay
 818  * unmounting.
 819  */
 820 int
 821 vfs_busy(mount_t mp, int flags)
 822 {
 823
 824 restart:
 825         if (mp->mnt_lflag & MNT_LDEAD)
 826                 return(ENOENT);
 827
 828         if (mp->mnt_lflag & MNT_LUNMOUNT) {
 829                 if (flags & LK_NOWAIT)
 830                         return (ENOENT);
 831
 832                 mount_lock(mp);
 833
 834                 if (mp->mnt_lflag & MNT_LDEAD) {
 835                         mount_unlock(mp);
 836                         return(ENOENT);
 837                 }
 838                 if (mp->mnt_lflag & MNT_LUNMOUNT) {
 839                         mp->mnt_lflag |= MNT_LWAIT;
 840                         /*
 841                          * Since all busy locks are shared except the exclusive
 842                          * lock granted when unmounting, the only place that a
 843                          * wakeup needs to be done is at the release of the
 844                          * exclusive lock at the end of dounmount.
 845                          */
 846                         msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
 847                         return (ENOENT);
 848                 }
 849                 mount_unlock(mp);
 850         }
 851
 852         lck_rw_lock_shared(&mp->mnt_rwlock);
 853
 854         /*
 855          * until we are granted the rwlock, it's possible for the mount point to
 856          * change state, so reevaluate before granting the vfs_busy
 857          */
 858         if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
 859                 lck_rw_done(&mp->mnt_rwlock);
 860                 goto restart;
 861         }
 862         return (0);
 863 }
 864
 865 /*
 866  * Free a busy filesystem.
 867  */
 868
 869 void
 870 vfs_unbusy(mount_t mp)
 871 {
 872         lck_rw_done(&mp->mnt_rwlock);
 873 }
 874
 875
 876
 877 static void
 878 vfs_rootmountfailed(mount_t mp) {
 879
 880         mount_list_lock();
 881         mp->mnt_vtable->vfc_refcount--;
 882         mount_list_unlock();
 883
 884         vfs_unbusy(mp);
 885
 886         mount_lock_destroy(mp);
 887
 888 #if CONFIG_MACF
 889         mac_mount_label_destroy(mp);
 890 #endif
 891
 892         FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
 893 }
 894
 895 /*
 896  * Lookup a filesystem type, and if found allocate and initialize
 897  * a mount structure for it.
 898  *
 899  * Devname is usually updated by mount(8) after booting.
 900  */
 901 static mount_t
 902 vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
 903 {
 904         mount_t mp;
 905
 906         mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK);
 907         bzero((char *)mp, sizeof(struct mount));
 908
 909         /* Initialize the default IO constraints */
 910         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
 911         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
 912         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
 913         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
 914         mp->mnt_devblocksize = DEV_BSIZE;
 915         mp->mnt_alignmentmask = PAGE_MASK;
 916         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
 917         mp->mnt_ioscale = 1;
 918         mp->mnt_ioflags = 0;
 919         mp->mnt_realrootvp = NULLVP;
 920         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
 921         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
 922         mp->mnt_devbsdunit = 0;
 923
 924         mount_lock_init(mp);
 925         (void)vfs_busy(mp, LK_NOWAIT);
 926
 927         TAILQ_INIT(&mp->mnt_vnodelist);
 928         TAILQ_INIT(&mp->mnt_workerqueue);
 929         TAILQ_INIT(&mp->mnt_newvnodes);
 930
 931         mp->mnt_vtable = vfsp;
 932         mp->mnt_op = vfsp->vfc_vfsops;
 933         mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
 934         mp->mnt_vnodecovered = NULLVP;
 935         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
 936         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 937
 938         mount_list_lock();
 939         vfsp->vfc_refcount++;
 940         mount_list_unlock();
 941
 942         strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
 943         mp->mnt_vfsstat.f_mntonname[0] = '/';
 944         /* XXX const poisoning layering violation */
 945         (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);
 946
 947 #if CONFIG_MACF
 948         mac_mount_label_init(mp);
 949         mac_mount_label_associate(vfs_context_kernel(), mp);
 950 #endif
 951         return (mp);
 952 }
 953
 954 errno_t
 955 vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
 956 {
 957         struct vfstable *vfsp;
 958
 959         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 960                 if (!strncmp(vfsp->vfc_name, fstypename,
 961                              sizeof(vfsp->vfc_name)))
 962                         break;
 963         if (vfsp == NULL)
 964                 return (ENODEV);
 965
 966         *mpp = vfs_rootmountalloc_internal(vfsp, devname);
 967
 968         if (*mpp)
 969                 return (0);
 970
 971         return (ENOMEM);
 972 }
 973
 974
 975 /*
 976  * Find an appropriate filesystem to use for the root. If a filesystem
 977  * has not been preselected, walk through the list of known filesystems
 978  * trying those that have mountroot routines, and try them until one
 979  * works or we have tried them all.
      *
      * When the global "mountroot" hook is non-NULL it is called instead and
      * its result is returned unchanged.
      *
      * Returns:     0               a root filesystem was mounted
      *              ENODEV          every candidate filesystem type failed
      *              other           error from the mountroot hook or bdevvp()
 980  */
 981 extern int (*mountroot)(void);
 982
 983 int
 984 vfs_mountroot(void)
 985 {
 986 #if CONFIG_MACF
 987         struct vnode *vp;
 988 #endif
 989         struct vfstable *vfsp;
 990         vfs_context_t ctx = vfs_context_kernel();
 991         struct vfs_attr vfsattr;
 992         int     error;
 993         mount_t mp;
 994         vnode_t bdevvp_rootvp;
 995
 996         if (mountroot != NULL) {
 997                 /*
 998                  * used for netboot which follows a different set of rules
 999                  */
1000                 error = (*mountroot)();
1001                 return (error);
1002         }
1003         if ((error = bdevvp(rootdev, &rootvp))) {
1004                 printf("vfs_mountroot: can't setup bdevvp\n");
1005                 return (error);
1006         }
1007         /*
1008          * 4951998 - code we call in vfc_mountroot may replace rootvp
1009          * so keep a local copy for some house keeping.
1010          */
1011         bdevvp_rootvp = rootvp;
1012
1013         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1014                 if (vfsp->vfc_mountroot == NULL)
1015                         continue;
1016
                     /*
                      * a fresh mount is allocated for every candidate type; it
                      * comes back busy and is either unbusied below on success
                      * or torn down by vfs_rootmountfailed() on failure
                      */
1017                 mp = vfs_rootmountalloc_internal(vfsp, "root_device");
1018                 mp->mnt_devvp = rootvp;
1019
1020                 if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) {
1021                         if ( bdevvp_rootvp != rootvp ) {
1022                                 /*
1023                                  * rootvp changed...
1024                                  *   bump the iocount and fix up mnt_devvp for the
1025                                  *   new rootvp (it will already have a usecount taken)...
1026                                  *   drop the iocount and the usecount on the original
1027                                  *   since we are no longer going to use it...
1028                                  */
1029                                 vnode_getwithref(rootvp);
1030                                 mp->mnt_devvp = rootvp;
1031
1032                                 vnode_rele(bdevvp_rootvp);
1033                                 vnode_put(bdevvp_rootvp);
1034                         }
1035                         mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;
1036
1037                         vfs_unbusy(mp);
1038
1039                         mount_list_add(mp);
1040
1041                         /*
1042                          *   cache the IO attributes for the underlying physical media...
1043                          *   an error return indicates the underlying driver doesn't
1044                          *   support all the queries necessary... however, reasonable
1045                          *   defaults will have been set, so no reason to bail or care
1046                          */
1047                         vfs_init_io_attributes(rootvp, mp);
1048
1049                         /*
1050                          * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
1051                          */
1052                         if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1053                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1054                         }
1055                         if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1056                                 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1057                         }
1058
1059                         /*
1060                          * Probe root file system for additional features.
1061                          */
1062                         (void)VFS_START(mp, 0, ctx);
1063
                             /*
                              * a capability only counts when its bit is set in both
                              * the "capabilities" and the "valid" words
                              */
1064                         VFSATTR_INIT(&vfsattr);
1065                         VFSATTR_WANTED(&vfsattr, f_capabilities);
1066                         if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1067                             VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1068                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1069                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1070                                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1071                                 }
1072 #if NAMEDSTREAMS
1073                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1074                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1075                                         mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1076                                 }
1077 #endif
1078                                 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1079                                     (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1080                                         mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1081                                 }
1082                         }
1083
1084                         /*
1085                          * get rid of iocount reference returned
1086                          * by bdevvp (or picked up by us on the substituted
1087                          * rootvp)... it (or we) will have also taken
1088                          * a usecount reference which we want to keep
1089                          */
1090                         vnode_put(rootvp);
1091
1092 #if CONFIG_MACF
1093                         if ((vfs_flags(mp) & MNT_MULTILABEL) == 0)
1094                                 return (0);
1095
                             /*
                              * NOTE(review): on both error paths below, vfs_unbusy(mp)
                              * has already run and mp is on the mount list; "goto fail"
                              * then reaches vfs_rootmountfailed(mp), which unbusies and
                              * frees mp again.  Confirm what state dounmount() leaves
                              * mp in before relying on this path.
                              */
1096                         error = VFS_ROOT(mp, &vp, ctx);
1097                         if (error) {
1098                                 printf("%s() VFS_ROOT() returned %d\n",
1099                                     __func__, error);
1100                                 dounmount(mp, MNT_FORCE, 0, ctx);
1101                                 goto fail;
1102                         }
1103                         error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
1104                         /*
1105                          * get rid of reference provided by VFS_ROOT
1106                          */
1107                         vnode_put(vp);
1108
1109                         if (error) {
1110                                 printf("%s() vnode_label() returned %d\n",
1111                                     __func__, error);
1112                                 dounmount(mp, MNT_FORCE, 0, ctx);
1113                                 goto fail;
1114                         }
1115 #endif
1116                         return (0);
1117                 }
1118 #if CONFIG_MACF
1119 fail:
1120 #endif
                     /*
                      * this candidate failed: discard its mount structure and let
                      * the loop move on to the next filesystem type; EINVAL is not
                      * logged
                      */
1121                 vfs_rootmountfailed(mp);
1122
1123                 if (error != EINVAL)
1124                         printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
1125         }
1126         return (ENODEV);
1127 }
1128
1129 /*
1130  * Lookup a mount point by filesystem identifier.
1131  */
1132
1133 struct mount *
1134 vfs_getvfs(fsid_t *fsid)
1135 {
1136         return (mount_list_lookupby_fsid(fsid, 0, 0));
1137 }
1138
1139 static struct mount *
1140 vfs_getvfs_locked(fsid_t *fsid)
1141 {
1142         return(mount_list_lookupby_fsid(fsid, 1, 0));
1143 }
1144
1145 struct mount *
1146 vfs_getvfs_by_mntonname(char *path)
1147 {
1148         mount_t retmp = (mount_t)0;
1149         mount_t mp;
1150
1151         mount_list_lock();
1152         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1153                 if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
1154                                         sizeof(mp->mnt_vfsstat.f_mntonname))) {
1155                         retmp = mp;
1156                         if (mount_iterref(retmp, 1))
1157                                 retmp = NULL;
1158                         goto out;
1159                 }
1160         }
1161 out:
1162         mount_list_unlock();
1163         return (retmp);
1164 }
1165
1166 /* generation number for creation of new fsids */
1167 u_short mntid_gen = 0;
1168 /*
1169  * Get a new unique fsid
1170  */
1171 void
1172 vfs_getnewfsid(struct mount *mp)
1173 {
1174
1175         fsid_t tfsid;
1176         int mtype;
1177         mount_t nmp;
1178
1179         mount_list_lock();
1180
1181         /* generate a new fsid */
1182         mtype = mp->mnt_vtable->vfc_typenum;
1183         if (++mntid_gen == 0)
1184                 mntid_gen++;
1185         tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1186         tfsid.val[1] = mtype;
1187
1188         TAILQ_FOREACH(nmp, &mountlist, mnt_list) {
1189                 while (vfs_getvfs_locked(&tfsid)) {
1190                         if (++mntid_gen == 0)
1191                                 mntid_gen++;
1192                         tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1193                 }
1194         }
1195         mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
1196         mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
1197         mount_list_unlock();
1198 }
1199
1200 /*
1201  * Routines having to do with the management of the vnode table.
1202  */
1203 extern int (**dead_vnodeop_p)(void *);     /* defined outside this part of the file; presumably the operations vector for dead vnodes -- TODO confirm */
1204 long numvnodes, freevnodes, deadvnodes;    /* freevnodes and deadvnodes are incremented by vnode_list_add(); numvnodes is not updated in this part of the file */
1205
1206
1207 /*
1208  * Move a vnode from one mount queue to another.
      *
      * Two mutually exclusive cases, both run under the affected mount's lock:
      *
      *  - vp is attached to a live mount (v_mount is neither NULL nor
      *    dead_mountp): vp is unlinked from that mount's vnode queue,
      *    VNAMED_MOUNT is cleared, the mount reference taken at insertion is
      *    dropped, and the function returns.  vp is NOT attached to mp on
      *    this path and vp->v_mount is left unchanged.
      *
      *  - otherwise: vp->v_mount is set to mp and, when mp is non-NULL, vp
      *    is inserted at the head of mnt_newvnodes if MNT_LITER is set in
      *    mnt_lflag, else at the head of mnt_vnodelist; VNAMED_MOUNT is set
      *    and a mount reference is taken.
1209  */
1210 static void
1211 insmntque(vnode_t vp, mount_t mp)
1212 {
1213         mount_t lmp;
1214         /*
1215          * Delete from old mount point vnode list, if on one.
1216          */
1217         if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
1218                 if ((vp->v_lflag & VNAMED_MOUNT) == 0)
1219                         panic("insmntque: vp not in mount vnode list");
1220                 vp->v_lflag &= ~VNAMED_MOUNT;
1221
1222                 mount_lock_spin(lmp);
1223
1224                 mount_drop(lmp, 1);
1225
                     /*
                      * vp may sit on any of the mount's three queues.  When it is
                      * the last element, TAILQ_REMOVE needs the right queue head
                      * (the head's last-pointer must be fixed up), so the queue is
                      * identified by comparing vp against each queue's tail.
                      */
1226                 if (vp->v_mntvnodes.tqe_next == NULL) {
1227                         if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp)
1228                                 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
1229                         else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp)
1230                                 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
1231                         else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp)
1232                                 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
1233                  } else {
                             /*
                              * not the last element: unlinking only touches the
                              * neighbouring links, so it is done by hand without
                              * knowing which queue vp is on
                              */
1234                         vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
1235                         *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
1236                 }
1237                 vp->v_mntvnodes.tqe_next = NULL;
1238                 vp->v_mntvnodes.tqe_prev = NULL;
1239                 mount_unlock(lmp);
1240                 return;
1241         }
1242
1243         /*
1244          * Insert into list of vnodes for the new mount point, if available.
1245          */
1246         if ((vp->v_mount = mp) != NULL) {
1247                 mount_lock_spin(mp);
                     /* NOTE(review): this fires only when BOTH link fields are non-NULL */
1248                 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0))
1249                         panic("vp already in mount list");
1250                 if (mp->mnt_lflag & MNT_LITER)
1251                         TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
1252                 else
1253                         TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
                     /* note that the VNAMED_MOUNT check runs after vp has been linked in */
1254                 if (vp->v_lflag & VNAMED_MOUNT)
1255                         panic("insmntque: vp already in mount vnode list");
1256                 vp->v_lflag |= VNAMED_MOUNT;
1257                 mount_ref(mp, 1);
1258                 mount_unlock(mp);
1259         }
1260 }
1261
1262
1263 /*
1264  * Create a vnode for a block device.
1265  * Used for root filesystem, argdev, and swap areas.
1266  * Also used for memory file system special devices.
1267  */
1268 int
1269 bdevvp(dev_t dev, vnode_t *vpp)
1270 {
1271         vnode_t nvp;
1272         int     error;
1273         struct vnode_fsparam vfsp;
1274         struct vfs_context context;
1275
1276         if (dev == NODEV) {
1277                 *vpp = NULLVP;
1278                 return (ENODEV);
1279         }
1280
1281         context.vc_thread = current_thread();
1282         context.vc_ucred = FSCRED;
1283
1284         vfsp.vnfs_mp = (struct mount *)0;
1285         vfsp.vnfs_vtype = VBLK;
1286         vfsp.vnfs_str = "bdevvp";
1287         vfsp.vnfs_dvp = NULL;
1288         vfsp.vnfs_fsnode = NULL;
1289         vfsp.vnfs_cnp = NULL;
1290         vfsp.vnfs_vops = spec_vnodeop_p;
1291         vfsp.vnfs_rdev = dev;
1292         vfsp.vnfs_filesize = 0;
1293
1294         vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;
1295
1296         vfsp.vnfs_marksystem = 0;
1297         vfsp.vnfs_markroot = 0;
1298
1299         if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) {
1300                 *vpp = NULLVP;
1301                 return (error);
1302         }
1303         vnode_lock_spin(nvp);
1304         nvp->v_flag |= VBDEVVP;
1305         nvp->v_tag = VT_NON;    /* set this to VT_NON so during aliasing it can be replaced */
1306         vnode_unlock(nvp);
1307         if ( (error = vnode_ref(nvp)) ) {
1308                 panic("bdevvp failed: vnode_ref");
1309                 return (error);
1310         }
1311         if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) {
1312                 panic("bdevvp failed: fsync");
1313                 return (error);
1314         }
1315         if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) {
1316                 panic("bdevvp failed: invalidateblks");
1317                 return (error);
1318         }
1319
1320 #if CONFIG_MACF
1321         /*
1322          * XXXMAC: We can't put a MAC check here, the system will
1323          * panic without this vnode.
1324          */
1325 #endif /* MAC */
1326
1327         if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) {
1328                 panic("bdevvp failed: open");
1329                 return (error);
1330         }
1331         *vpp = nvp;
1332
1333         return (0);
1334 }
1335
1336 /*
1337  * Check to see if the new vnode represents a special device
1338  * for which we already have a vnode (either because of
1339  * bdevvp() or because of a different vnode representing
1340  * the same block device). If such an alias exists, deallocate
1341  * the existing contents and return the aliased vnode. The
1342  * caller is responsible for filling it with its new contents.
      *
      * Returns:     NULLVP    nvp was given its own specinfo and linked onto
      *                        the spec hash chain (marked SI_ALIASED together
      *                        with the existing vnode if one was found)
      *              vp        an existing alias tagged VT_NON and flagged
      *                        VBDEVVP or VDEVFLUSH; on this path the iocount
      *                        from vnode_getwithvid() and the vnode lock taken
      *                        below are both still held
1343  */
1344 static vnode_t
1345 checkalias(struct vnode *nvp, dev_t nvp_rdev)
1346 {
1347         struct vnode *vp;
1348         struct vnode **vpp;
1349         struct specinfo *sin = NULL;
1350         int vid = 0;
1351
1352         vpp = &speclisth[SPECHASH(nvp_rdev)];
1353 loop:
1354         SPECHASH_LOCK();
1355
             /* look for a vnode of the same type on the same device; remember its v_id */
1356         for (vp = *vpp; vp; vp = vp->v_specnext) {
1357                 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1358                         vid = vp->v_id;
1359                         break;
1360                 }
1361         }
1362         SPECHASH_UNLOCK();
1363
1364         if (vp) {
1365 found_alias:
                     /* the hash lock is no longer held, so revalidate vp by id; start over on failure */
1366                 if (vnode_getwithvid(vp,vid)) {
1367                         goto loop;
1368                 }
1369                 /*
1370                  * Termination state is checked in vnode_getwithvid
1371                  */
1372                 vnode_lock(vp);
1373
1374                 /*
1375                  * Alias, but not in use, so flush it out.
1376                  */
1377                 if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
1378                         vnode_reclaim_internal(vp, 1, 1, 0);
1379                         vnode_put_locked(vp);
1380                         vnode_unlock(vp);
1381                         goto loop;
1382                 }
1383
1384         }
             /* no alias, or an alias that is not a replaceable VT_NON vnode: nvp gets its own specinfo */
1385         if (vp == NULL || vp->v_tag != VT_NON) {
                     /* sin survives the retry paths, so it is allocated at most once */
1386                 if (sin == NULL) {
1387                         MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo),
1388                                         M_SPECINFO, M_WAITOK);
1389                 }
1390
1391                 nvp->v_specinfo = sin;
1392                 bzero(nvp->v_specinfo, sizeof(struct specinfo));
1393                 nvp->v_rdev = nvp_rdev;
1394                 nvp->v_specflags = 0;
1395                 nvp->v_speclastr = -1;
1396                 nvp->v_specinfo->si_opencount = 0;
1397                 nvp->v_specinfo->si_initted = 0;
1398                 nvp->v_specinfo->si_throttleable = 0;
1399
1400                 SPECHASH_LOCK();
1401
1402                 /* We dropped the lock, someone could have added */
1403                 if (vp == NULLVP) {
1404                         for (vp = *vpp; vp; vp = vp->v_specnext) {
1405                                 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1406                                         vid = vp->v_id;
1407                                         SPECHASH_UNLOCK();
1408                                         goto found_alias;
1409                                 }
1410                         }
1411                 }
1412
                     /* push nvp onto the head of the hash chain */
1413                 nvp->v_hashchain = vpp;
1414                 nvp->v_specnext = *vpp;
1415                 *vpp = nvp;
1416
1417                 if (vp != NULLVP) {
1418                         nvp->v_specflags |= SI_ALIASED;
1419                         vp->v_specflags |= SI_ALIASED;
1420                         SPECHASH_UNLOCK();
1421                         vnode_put_locked(vp);
1422                         vnode_unlock(vp);
1423                 } else {
1424                         SPECHASH_UNLOCK();
1425                 }
1426
1427                 return (NULLVP);
1428         }
1429
             /*
              * NOTE(review): sin is non-NULL here only after the "goto found_alias"
              * above; in that case nvp->v_specinfo still points at sin when it is
              * freed.  Confirm the caller overwrites nvp's spec fields before use.
              */
1430         if (sin) {
1431                 FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO);
1432         }
1433
             /* vp is returned with its lock and iocount still held */
1434         if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0)
1435                 return(vp);
1436
1437         panic("checkalias with VT_NON vp that shouldn't: %p", vp);
1438
1439         return (vp);
1440 }
1441
1442
1443 /*
1444  * Get a reference on a particular vnode and lock it if requested.
1445  * If the vnode was on the inactive list, remove it from the list.
1446  * If the vnode was on the free list, remove it from the list and
1447  * move it to inactive list as needed.
1448  * The vnode lock bit is set if the vnode is being eliminated in
1449  * vgone. The process is awakened when the transition is completed,
1450  * and an error returned to indicate that the vnode is no longer
1451  * usable (possibly having been changed to a new file system type).
1452  */
1453 int
1454 vget_internal(vnode_t vp, int vid, int vflags)
1455 {
1456         int error = 0;
1457
1458         vnode_lock_spin(vp);
1459
1460         if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0))
1461                 /*
1462                  * vnode to be returned only if it has writers opened
1463                  */
1464                 error = EINVAL;
1465         else
1466                 error = vnode_getiocount(vp, vid, vflags);
1467
1468         vnode_unlock(vp);
1469
1470         return (error);
1471 }
1472
1473 /*
1474  * Returns:     0                       Success
1475  *              ENOENT                  No such file or directory [terminating]
1476  */
1477 int
1478 vnode_ref(vnode_t vp)
1479 {
1480
1481         return (vnode_ref_ext(vp, 0, 0));
1482 }
1483
1484 /*
1485  * Returns:     0                       Success
1486  *              ENOENT                  No such file or directory [terminating]
1487  */
1488 int
1489 vnode_ref_ext(vnode_t vp, int fmode, int flags)
1490 {
1491         int     error = 0;
1492
1493         vnode_lock_spin(vp);
1494
1495         /*
1496          * once all the current call sites have been fixed to insure they have
1497          * taken an iocount, we can toughen this assert up and insist that the
1498          * iocount is non-zero... a non-zero usecount doesn't insure correctness
1499          */
1500         if (vp->v_iocount <= 0 && vp->v_usecount <= 0)
1501                 panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
1502
1503         /*
1504          * if you are the owner of drain/termination, can acquire usecount
1505          */
1506         if ((flags & VNODE_REF_FORCE) == 0) {
1507                 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
1508                         if (vp->v_owner != current_thread()) {
1509                                 error = ENOENT;
1510                                 goto out;
1511                         }
1512                 }
1513         }
1514         vp->v_usecount++;
1515
1516         if (fmode & FWRITE) {
1517                 if (++vp->v_writecount <= 0)
1518                         panic("vnode_ref_ext: v_writecount");
1519         }
1520         if (fmode & O_EVTONLY) {
1521                 if (++vp->v_kusecount <= 0)
1522                         panic("vnode_ref_ext: v_kusecount");
1523         }
1524         if (vp->v_flag & VRAGE) {
1525                 struct  uthread *ut;
1526
1527                 ut = get_bsdthread_info(current_thread());
1528
1529                 if ( !(current_proc()->p_lflag & P_LRAGE_VNODES) &&
1530                      !(ut->uu_flag & UT_RAGE_VNODES)) {
1531                         /*
1532                          * a 'normal' process accessed this vnode
1533                          * so make sure its no longer marked
1534                          * for rapid aging...  also, make sure
1535                          * it gets removed from the rage list...
1536                          * when v_usecount drops back to 0, it
1537                          * will be put back on the real free list
1538                          */
1539                         vp->v_flag &= ~VRAGE;
1540                         vp->v_references = 0;
1541                         vnode_list_remove(vp);
1542                 }
1543         }
1544         if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
1545
1546                 if (vp->v_ubcinfo) {
1547                         vnode_lock_convert(vp);
1548                         memory_object_mark_used(vp->v_ubcinfo->ui_control);
1549                 }
1550         }
1551 out:
1552         vnode_unlock(vp);
1553
1554         return (error);
1555 }
1556
1557
1558 /*
1559  * put the vnode on appropriate free list.
1560  * called with vnode LOCKED
1561  */
1562 static void
1563 vnode_list_add(vnode_t vp)
1564 {
1565 #if DIAGNOSTIC
1566         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1567 #endif
1568         /*
1569          * if it is already on a list or non zero references return
1570          */
1571         if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE))
1572                 return;
1573
1574         vnode_list_lock();
1575
1576         if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
1577                 /*
1578                  * add the new guy to the appropriate end of the RAGE list
1579                  */
1580                 if ((vp->v_flag & VAGE))
1581                         TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
1582                 else
1583                         TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
1584
1585                 vp->v_listflag |= VLIST_RAGE;
1586                 ragevnodes++;
1587
1588                 /*
1589                  * reset the timestamp for the last inserted vp on the RAGE
1590                  * queue to let new_vnode know that its not ok to start stealing
1591                  * from this list... as long as we're actively adding to this list
1592                  * we'll push out the vnodes we want to donate to the real free list
1593                  * once we stop pushing, we'll let some time elapse before we start
1594                  * stealing them in the new_vnode routine
1595                  */
1596                 microuptime(&rage_tv);
1597         } else {
1598                 /*
1599                  * if VL_DEAD, insert it at head of the dead list
1600                  * else insert at tail of LRU list or at head if VAGE is set
1601                  */
1602                 if ( (vp->v_lflag & VL_DEAD)) {
1603                         TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
1604                         vp->v_listflag |= VLIST_DEAD;
1605                         deadvnodes++;
1606                 } else if ((vp->v_flag & VAGE)) {
1607                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1608                         vp->v_flag &= ~VAGE;
1609                         freevnodes++;
1610                 } else {
1611                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1612                         freevnodes++;
1613                 }
1614         }
1615         vnode_list_unlock();
1616 }
1617
1618
1619 /*
1620  * remove the vnode from appropriate free list.
1621  * called with vnode LOCKED and
1622  * the list lock held
1623  */
1624 static void
1625 vnode_list_remove_locked(vnode_t vp)
1626 {
1627         if (VONLIST(vp)) {
1628                 /*
1629                  * the v_listflag field is
1630                  * protected by the vnode_list_lock
1631                  */
1632                 if (vp->v_listflag & VLIST_RAGE)
1633                         VREMRAGE("vnode_list_remove", vp);
1634                 else if (vp->v_listflag & VLIST_DEAD)
1635                         VREMDEAD("vnode_list_remove", vp);
1636                 else
1637                         VREMFREE("vnode_list_remove", vp);
1638         }
1639 }
1640
1641
1642 /*
1643  * remove the vnode from appropriate free list.
1644  * called with vnode LOCKED
1645  */
1646 static void
1647 vnode_list_remove(vnode_t vp)
1648 {
1649 #if DIAGNOSTIC
1650         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1651 #endif
1652         /*
1653          * we want to avoid taking the list lock
1654          * in the case where we're not on the free
1655          * list... this will be true for most
1656          * directories and any currently in use files
1657          *
1658          * we're guaranteed that we can't go from
1659          * the not-on-list state to the on-list
1660          * state since we hold the vnode lock...
1661          * all calls to vnode_list_add are done
1662          * under the vnode lock... so we can
1663          * check for that condition (the prevelant one)
1664          * without taking the list lock
1665          */
1666         if (VONLIST(vp)) {
1667                 vnode_list_lock();
1668                 /*
1669                  * however, we're not guaranteed that
1670                  * we won't go from the on-list state
1671                  * to the not-on-list state until we
1672                  * hold the vnode_list_lock... this
1673                  * is due to "new_vnode" removing vnodes
1674                  * from the free list uder the list_lock
1675                  * w/o the vnode lock... so we need to
1676                  * check again whether we're currently
1677                  * on the free list
1678                  */
1679                 vnode_list_remove_locked(vp);
1680
1681                 vnode_list_unlock();
1682         }
1683 }
1684
1685
1686 void
1687 vnode_rele(vnode_t vp)
1688 {
1689         vnode_rele_internal(vp, 0, 0, 0);
1690 }
1691
1692
1693 void
1694 vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
1695 {
1696         vnode_rele_internal(vp, fmode, dont_reenter, 0);
1697 }
1698
1699
/*
 * Drop one usecount reference on 'vp' and, when that was the last
 * reference of any kind, give the filesystem its VNOP_INACTIVE call
 * and possibly reclaim the vnode.
 *
 *   fmode        - FWRITE additionally drops v_writecount,
 *                  O_EVTONLY additionally drops v_kusecount
 *   dont_reenter - non-zero: VNOP_INACTIVE is not called here; the
 *                  vnode is flagged VL_NEEDINACTIVE (unless it is
 *                  already terminating, dead or marked for
 *                  termination) and VAGE, then put on a vnode list
 *   locked       - non-zero: the caller already holds the vnode lock;
 *                  it is still held when this function returns
 *
 * Panics if a counter goes negative or if v_kusecount ends up larger
 * than v_usecount.  Note that the panic strings below name
 * "vnode_rele_ext" although they are raised from this function.
 */
1700 void
1701 vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
1702 {
1703
1704         if ( !locked)
1705                 vnode_lock_spin(vp);
1706 #if DIAGNOSTIC
1707         else
1708                 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1709 #endif
1710         if (--vp->v_usecount < 0)
1711                 panic("vnode_rele_ext: vp %p usecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp,  vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
1712
1713         if (fmode & FWRITE) {
1714                 if (--vp->v_writecount < 0)
1715                         panic("vnode_rele_ext: vp %p writecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp,  vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
1716         }
1717         if (fmode & O_EVTONLY) {
1718                 if (--vp->v_kusecount < 0)
1719                         panic("vnode_rele_ext: vp %p kusecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp,  vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
1720         }
1721         if (vp->v_kusecount > vp->v_usecount)
1722                 panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d).  v_tag = %d, v_type = %d, v_flag = %x.",vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
1723
1724         if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
1725                 /*
1726                  * vnode is still busy... if we're the last
1727                  * usecount, mark for a future call to VNOP_INACTIVE
1728                  * when the iocount finally drops to 0
1729                  */
1730                 if (vp->v_usecount == 0) {
1731                         vp->v_lflag |= VL_NEEDINACTIVE;
1732                         vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
1733                 }
1734                 goto done;
1735         }
        /* no iocount and no usecount remain from here on */
1736         vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
1737
1738         if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) {
1739                 /*
1740                  * vnode is being cleaned, or
1741                  * we've requested that we don't reenter
1742                  * the filesystem on this release... in
1743                  * this case, we'll mark the vnode aged
1744                  * if it's been marked for termination
1745                  */
1746                 if (dont_reenter) {
1747                         if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) )
1748                                 vp->v_lflag |= VL_NEEDINACTIVE;
1749                         vp->v_flag |= VAGE;
1750                 }
1751                 vnode_list_add(vp);
1752
1753                 goto done;
1754         }
1755         /*
1756          * at this point both the iocount and usecount
1757          * are zero
1758          * pick up an iocount so that we can call
1759          * VNOP_INACTIVE with the vnode lock unheld
1760          */
1761         vp->v_iocount++;
1762 #ifdef JOE_DEBUG
1763         record_vp(vp, 1);
1764 #endif
1765         vp->v_lflag &= ~VL_NEEDINACTIVE;
1766         vnode_unlock(vp);
1767
1768         VNOP_INACTIVE(vp, vfs_context_current());
1769
1770         vnode_lock_spin(vp);
1771         /*
1772          * because we dropped the vnode lock to call VNOP_INACTIVE
1773          * the state of the vnode may have changed... we may have
1774          * picked up an iocount, usecount or the MARKTERM may have
1775          * been set... we need to reevaluate the reference counts
1776          * to determine if we can call vnode_reclaim_internal at
1777          * this point... if the reference counts are up, we'll pick
1778          * up the MARKTERM state when they get subsequently dropped
1779          */
1780         if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) &&
1781              ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
1782                 struct  uthread *ut;
1783
1784                 ut = get_bsdthread_info(current_thread());
1785
                /*
                 * reclaim deferred for this thread: push the vnode on the
                 * thread's uu_vreclaims list.  The jump to 'done' skips
                 * vnode_dropiocount(), so the iocount taken above stays
                 * held on this path -- presumably whoever drains the list
                 * drops it; confirm against that code.
                 */
1786                 if (ut->uu_defer_reclaims) {
1787                         vp->v_defer_reclaimlist = ut->uu_vreclaims;
1788                         ut->uu_vreclaims = vp;
1789                         goto done;
1790                 }
1791                 vnode_lock_convert(vp);
1792                 vnode_reclaim_internal(vp, 1, 1, 0);
1793         }
1794         vnode_dropiocount(vp);
1795         vnode_list_add(vp);
1796 done:
        /*
         * common exit: a regular, non-system file with no usecount left
         * has its memory object marked unused (if it has ubc info)
         */
1797         if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
1798
1799                 if (vp->v_ubcinfo) {
1800                         vnode_lock_convert(vp);
1801                         memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
1802                 }
1803         }
        /* only release the lock if this function took it */
1804         if ( !locked)
1805                 vnode_unlock(vp);
1806         return;
1807 }
1808
1809 /*
1810  * Remove any vnodes in the vnode table belonging to mount point mp.
1811  *
1812  * If MNT_NOFORCE is specified, there should not be any active ones,
1813  * return error if any are found (nb: this is a user error, not a
1814  * system error). If MNT_FORCE is specified, detach any active vnodes
1815  * that are found.
1816  */
1817 #if DIAGNOSTIC
1818 int busyprt = 0;        /* print out busy vnodes */
1819 #if 0
1820 struct ctldebug debug1 = { "busyprt", &busyprt };
1821 #endif /* 0 */
1822 #endif
1823
/*
 * vflush, as implemented below:
 *   mp     - mount whose vnodes are flushed
 *   skipvp - vnode that is never touched (compared by pointer)
 *   flags  - FORCECLOSE, SKIPSYSTEM, SKIPSWAP, SKIPROOT, WRITECLOSE
 *
 * Returns 0, or EBUSY when busy vnodes remain and FORCECLOSE was not
 * given (also returned straight from the unmount preflight check).
 * The mount lock is taken and released internally; it is dropped
 * while an individual vnode is examined under its own lock.
 */
1824 int
1825 vflush(struct mount *mp, struct vnode *skipvp, int flags)
1826 {
1827         struct vnode *vp;
1828         int busy = 0;
1829         int reclaimed = 0;
1830         int retval;
1831         unsigned int vid;
1832
1833         mount_lock(mp);
1834         vnode_iterate_setup(mp);
1835         /*
1836          * On regular unmounts(not forced) do a
1837          * quick check for vnodes to be in use. This
1838          * preserves the caching of vnodes. automounter
1839          * tries unmounting every so often to see whether
1840          * it is still busy or not.
1841          */
1842         if (((flags & FORCECLOSE)==0)  && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
1843                 if (vnode_umount_preflight(mp, skipvp, flags)) {
1844                         vnode_iterate_clear(mp);
1845                         mount_unlock(mp);
1846                         return(EBUSY);
1847                 }
1848         }
1849 loop:
1850         /* if it returns 0 then there is nothing to do */
1851         retval = vnode_iterate_prepare(mp);
1852
1853         if (retval == 0)  {
1854                 vnode_iterate_clear(mp);
1855                 mount_unlock(mp);
1856                 return(retval);
1857         }
1858
1859         /* iterate over all the vnodes */
1860         while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
1861
                /* move the vnode back onto the mount's vnode list before looking at it */
1862                 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
1863                 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
1864                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
1865
1866                 if ( (vp->v_mount != mp) || (vp == skipvp)) {
1867                         continue;
1868                 }
                /*
                 * remember the id: the mount lock is dropped before the vnode
                 * lock is taken, and a changed id below means the vnode was
                 * recycled in between
                 */
1869                 vid = vp->v_id;
1870                 mount_unlock(mp);
1871
1872                 vnode_lock_spin(vp);
1873
1874                 if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) {
1875                                 vnode_unlock(vp);
1876                                 mount_lock(mp);
1877                                 continue;
1878                 }
1879
1880                 /*
1881                  * If requested, skip over vnodes marked VSYSTEM.
1882                  * Skip over all vnodes marked VNOFLUSH.
1883                  */
                /*
                 * NOTE(review): as coded, VNOFLUSH vnodes are skipped only
                 * when SKIPSYSTEM is set, not unconditionally.
                 */
1884                 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
1885                     (vp->v_flag & VNOFLUSH))) {
1886                         vnode_unlock(vp);
1887                         mount_lock(mp);
1888                         continue;
1889                 }
1890                 /*
1891                  * If requested, skip over vnodes marked VSWAP.
1892                  */
1893                 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
1894                         vnode_unlock(vp);
1895                         mount_lock(mp);
1896                         continue;
1897                 }
1898                 /*
1899                  * If requested, skip over vnodes marked VROOT.
1900                  */
1901                 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
1902                         vnode_unlock(vp);
1903                         mount_lock(mp);
1904                         continue;
1905                 }
1906                 /*
1907                  * If WRITECLOSE is set, only flush out regular file
1908                  * vnodes open for writing.
1909                  */
1910                 if ((flags & WRITECLOSE) &&
1911                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
1912                         vnode_unlock(vp);
1913                         mount_lock(mp);
1914                         continue;
1915                 }
1916                 /*
1917                  * If the real usecount is 0, all we need to do is clear
1918                  * out the vnode data structures and we are done.
1919                  */
                /* "real" usecount: v_usecount with the v_kusecount references discounted */
1920                 if (((vp->v_usecount == 0) ||
1921                     ((vp->v_usecount - vp->v_kusecount) == 0))) {
1922
1923                         vnode_lock_convert(vp);
1924                         vp->v_iocount++;        /* so that drain waits for * other iocounts */
1925 #ifdef JOE_DEBUG
1926                         record_vp(vp, 1);
1927 #endif
1928                         vnode_reclaim_internal(vp, 1, 1, 0);
1929                         vnode_dropiocount(vp);
1930                         vnode_list_add(vp);
1931                         vnode_unlock(vp);
1932
1933                         reclaimed++;
1934                         mount_lock(mp);
1935                         continue;
1936                 }
1937                 /*
1938                  * If FORCECLOSE is set, forcibly close the vnode.
1939                  * For block or character devices, revert to an
1940                  * anonymous device. For all other files, just kill them.
1941                  */
1942                 if (flags & FORCECLOSE) {
1943                         vnode_lock_convert(vp);
1944
1945                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
1946                                 vp->v_iocount++;        /* so that drain waits * for other iocounts */
1947 #ifdef JOE_DEBUG
1948                                 record_vp(vp, 1);
1949 #endif
1950                                 vnode_reclaim_internal(vp, 1, 1, 0);
1951                                 vnode_dropiocount(vp);
1952                                 vnode_list_add(vp);
1953                                 vnode_unlock(vp);
1954                         } else {
                                /*
                                 * device vnode: clean it, then undo the VL_DEAD
                                 * marking and switch it to the spec vnode ops
                                 */
1955                                 vclean(vp, 0);
1956                                 vp->v_lflag &= ~VL_DEAD;
1957                                 vp->v_op = spec_vnodeop_p;
1958                                 vp->v_flag |= VDEVFLUSH;
1959                                 vnode_unlock(vp);
1960                         }
1961                         mount_lock(mp);
1962                         continue;
1963                 }
1964 #if DIAGNOSTIC
1965                 if (busyprt)
1966                         vprint("vflush: busy vnode", vp);
1967 #endif
                /* still in use and not forced: count it as busy */
1968                 vnode_unlock(vp);
1969                 mount_lock(mp);
1970                 busy++;
1971         }
1972
1973         /* At this point the worker queue is completed */
        /*
         * unforced flush that found busy vnodes but also reclaimed some:
         * reset the counters and make another pass
         */
1974         if (busy && ((flags & FORCECLOSE)==0) && reclaimed) {
1975                 busy = 0;
1976                 reclaimed = 0;
1977                 (void)vnode_iterate_reloadq(mp);
1978                 /* returned with mount lock held */
1979                 goto loop;
1980         }
1981
1982         /* if new vnodes were created in between retry the reclaim */
1983         if ( vnode_iterate_reloadq(mp) != 0) {
1984                 if (!(busy && ((flags & FORCECLOSE)==0)))
1985                         goto loop;
1986         }
1987         vnode_iterate_clear(mp);
1988         mount_unlock(mp);
1989
1990         if (busy && ((flags & FORCECLOSE)==0))
1991                 return (EBUSY);
1992         return (0);
1993 }
1994
/* incremented (atomically) once per vclean() call */
1995 long num_recycledvnodes = 0;
1996 /*
1997  * Disassociate the underlying file system from a vnode.
1998  * The vnode lock is held on entry.
1999  */
/*
 * The vnode lock is dropped while the filesystem is called and is
 * re-taken before the vnode is marked dead; it is held again when
 * vclean() returns.
 *
 *   flags - DOCLOSE:   close the vnode (if it had a usecount) and flush
 *                      and invalidate its buffers and VM pages
 *           REVOKEALL: adds IO_REVOKE to the flags given to VNOP_CLOSE
 *
 * Panics if VNOP_RECLAIM fails.
 */
2000 static void
2001 vclean(vnode_t vp, int flags)
2002 {
2003         vfs_context_t ctx = vfs_context_current();
2004         int active;
2005         int need_inactive;
2006         int already_terminating;
2007         int clflags = 0;
2008 #if NAMEDSTREAMS
2009         int is_namedstream;
2010 #endif
2011
2012         /*
2013          * Check to see if the vnode is in use.
2014          * If so we have to reference it before we clean it out
2015          * so that its count cannot fall to zero and generate a
2016          * race against ourselves to recycle it.
2017          */
2018         active = vp->v_usecount;
2019
2020         /*
2021          * just in case we missed sending a needed
2022          * VNOP_INACTIVE, we'll do it now
2023          */
2024         need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
2025
2026         vp->v_lflag &= ~VL_NEEDINACTIVE;
2027
2028         /*
2029          * Prevent the vnode from being recycled or
2030          * brought into use while we clean it out.
2031          */
        /* remember whether VL_TERMINATE was already set, so it is only cleared at the end if it was set here */
2032         already_terminating = (vp->v_lflag & VL_TERMINATE);
2033
2034         vp->v_lflag |= VL_TERMINATE;
2035
2036         /*
2037          * remove the vnode from any mount list
2038          * it might be on...
2039          */
2040         insmntque(vp, (struct mount *)0);
2041
2042 #if NAMEDSTREAMS
2043         is_namedstream = vnode_isnamedstream(vp);
2044 #endif
2045
        /* everything from here to the vnode_lock() near the end runs without the vnode lock */
2046         vnode_unlock(vp);
2047
2048         OSAddAtomicLong(1, &num_recycledvnodes);
2049
2050         if (flags & DOCLOSE)
2051                 clflags |= IO_NDELAY;
2052         if (flags & REVOKEALL)
2053                 clflags |= IO_REVOKE;
2054
2055         if (active && (flags & DOCLOSE))
2056                 VNOP_CLOSE(vp, clflags, ctx);
2057
2058         /*
2059          * Clean out any buffers associated with the vnode.
2060          */
2061         if (flags & DOCLOSE) {
2062 #if NFSCLIENT
2063                 if (vp->v_tag == VT_NFS)
2064                         nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
2065                 else
2066 #endif
2067                 {
2068                         VNOP_FSYNC(vp, MNT_WAIT, ctx);
2069                         buf_invalidateblks(vp, BUF_WRITE_DATA | BUF_INVALIDATE_LOCKED, 0, 0);
2070                 }
2071                 if (UBCINFOEXISTS(vp))
2072                         /*
2073                          * Clean the pages in VM.
2074                          */
2075                         (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
2076         }
        /* deliver VNOP_INACTIVE if the vnode was in use or one was still owed */
2077         if (active || need_inactive)
2078                 VNOP_INACTIVE(vp, ctx);
2079
2080 #if NAMEDSTREAMS
2081         if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
2082                 vnode_t pvp = vp->v_parent;
2083
2084                 /* Delete the shadow stream file before we reclaim its vnode */
2085                 if (vnode_isshadow(vp)) {
2086                         vnode_relenamedstream(pvp, vp, ctx);
2087                 }
2088
2089                 /*
2090                  * No more streams associated with the parent.  We
2091                  * have a ref on it, so its identity is stable.
2092                  * If the parent is on an opaque volume, then we need to know
2093                  * whether it has associated named streams.
2094                  */
2095                 if (vfs_authopaque(pvp->v_mount)) {
2096                         vnode_lock_spin(pvp);
2097                         pvp->v_lflag &= ~VL_HASSTREAMS;
2098                         vnode_unlock(pvp);
2099                 }
2100         }
2101 #endif
2102
2103         /*
2104          * Destroy ubc named reference
2105          * cluster_release is done on this path
2106          * along with dropping the reference on the ucred
2107          */
2108         ubc_destroy_named(vp);
2109
2110 #if CONFIG_TRIGGERS
2111         /*
2112          * cleanup trigger info from vnode (if any)
2113          */
2114         if (vp->v_resolve)
2115                 vnode_resolver_detach(vp);
2116 #endif
2117
2118         /*
2119          * Reclaim the vnode.
2120          */
2121         if (VNOP_RECLAIM(vp, ctx))
2122                 panic("vclean: cannot reclaim");
2123
2124         // make sure the name & parent ptrs get cleaned out!
2125         vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE);
2126
        /* re-take the vnode lock; it stays held on return */
2127         vnode_lock(vp);
2128
        /* detach the vnode from its filesystem: dead mount, dead ops, no private data */
2129         vp->v_mount = dead_mountp;
2130         vp->v_op = dead_vnodeop_p;
2131         vp->v_tag = VT_NON;
2132         vp->v_data = NULL;
2133
2134         vp->v_lflag |= VL_DEAD;
2135
2136         if (already_terminating == 0) {
2137                 vp->v_lflag &= ~VL_TERMINATE;
2138                 /*
2139                  * Done with purge, notify sleepers of the grim news.
2140                  */
2141                 if (vp->v_lflag & VL_TERMWANT) {
2142                         vp->v_lflag &= ~VL_TERMWANT;
2143                         wakeup(&vp->v_lflag);
2144                 }
2145         }
2146 }
2147
2148 /*
2149  * Eliminate all activity associated with  the requested vnode
2150  * and with all vnodes aliased to the requested vnode.
2151  */
2152 int
2153 #if DIAGNOSTIC
2154 vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
2155 #else
2156 vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
2157 #endif
2158 {
2159         struct vnode *vq;
2160         int vid;
2161
2162 #if DIAGNOSTIC
2163         if ((flags & REVOKEALL) == 0)
2164                 panic("vnop_revoke");
2165 #endif
2166
2167         if (vnode_isaliased(vp)) {
2168                 /*
2169                  * If a vgone (or vclean) is already in progress,
2170                  * return an immediate error
2171                  */
2172                 if (vp->v_lflag & VL_TERMINATE)
2173                         return(ENOENT);
2174
2175                 /*
2176                  * Ensure that vp will not be vgone'd while we
2177                  * are eliminating its aliases.
2178                  */
2179                 SPECHASH_LOCK();
2180                 while ((vp->v_specflags & SI_ALIASED)) {
2181                         for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2182                                 if (vq->v_rdev != vp->v_rdev ||
2183                                     vq->v_type != vp->v_type || vp == vq)
2184                                         continue;
2185                                 vid = vq->v_id;
2186                                 SPECHASH_UNLOCK();
2187                                 if (vnode_getwithvid(vq,vid)){
2188                                         SPECHASH_LOCK();
2189                                         break;
2190                                 }
2191                                 vnode_reclaim_internal(vq, 0, 1, 0);
2192                                 vnode_put(vq);
2193                                 SPECHASH_LOCK();
2194                                 break;
2195                         }
2196                 }
2197                 SPECHASH_UNLOCK();
2198         }
2199         vnode_reclaim_internal(vp, 0, 0, REVOKEALL);
2200
2201         return (0);
2202 }
2203
2204 /*
2205  * Recycle an unused vnode to the front of the free list.
2206  * Release the passed interlock if the vnode will be recycled.
2207  */
2208 int
2209 vnode_recycle(struct vnode *vp)
2210 {
2211         vnode_lock_spin(vp);
2212
2213         if (vp->v_iocount || vp->v_usecount) {
2214                 vp->v_lflag |= VL_MARKTERM;
2215                 vnode_unlock(vp);
2216                 return(0);
2217         }
2218         vnode_lock_convert(vp);
2219         vnode_reclaim_internal(vp, 1, 0, 0);
2220
2221         vnode_unlock(vp);
2222
2223         return (1);
2224 }
2225
2226 static int
2227 vnode_reload(vnode_t vp)
2228 {
2229         vnode_lock_spin(vp);
2230
2231         if ((vp->v_iocount > 1) || vp->v_usecount) {
2232                 vnode_unlock(vp);
2233                 return(0);
2234         }
2235         if (vp->v_iocount <= 0)
2236                 panic("vnode_reload with no iocount %d", vp->v_iocount);
2237
2238         /* mark for release when iocount is dopped */
2239         vp->v_lflag |= VL_MARKTERM;
2240         vnode_unlock(vp);
2241
2242         return (1);
2243 }
2244
2245
/*
 * Clean a vnode with vclean(vp, flags | DOCLOSE) and, if it is a block
 * or character device that still has specinfo, unlink it from its
 * spec hash chain, fix up the SI_ALIASED flags and free the specinfo.
 *
 * Panics ("missing bdev" / "missing alias") if the hash chain does not
 * contain what the flags say it should.
 */
2246 static void
2247 vgone(vnode_t vp, int flags)
2248 {
2249         struct vnode *vq;
2250         struct vnode *vx;
2251
2252         /*
2253          * Clean out the filesystem specific data.
2254          * vclean also takes care of removing the
2255          * vnode from any mount list it might be on
2256          */
2257         vclean(vp, flags | DOCLOSE);
2258
2259         /*
2260          * If special device, remove it from special device alias list
2261          * if it is on one.
2262          */
2263         if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
2264                         SPECHASH_LOCK();
                        /* unlink vp: either it heads the chain or its predecessor must be found */
2265                         if (*vp->v_hashchain == vp) {
2266                                 *vp->v_hashchain = vp->v_specnext;
2267                         } else {
2268                                 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2269                                         if (vq->v_specnext != vp)
2270                                                 continue;
2271                                         vq->v_specnext = vp->v_specnext;
2272                                         break;
2273                                 }
                        /* despite its indentation, this check is inside the else branch */
2274                         if (vq == NULL)
2275                                 panic("missing bdev");
2276                         }
2277                         if (vp->v_specflags & SI_ALIASED) {
                                /*
                                 * look for the remaining vnodes of the same device:
                                 * vx is the first one found; the loop stops early
                                 * (vq != NULL) only when a second one exists
                                 */
2278                                 vx = NULL;
2279                                 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2280                                         if (vq->v_rdev != vp->v_rdev ||
2281                                         vq->v_type != vp->v_type)
2282                                                 continue;
2283                                         if (vx)
2284                                                 break;
2285                                         vx = vq;
2286                                 }
2287                                 if (vx == NULL)
2288                                         panic("missing alias");
                                /* exactly one left: it is no longer aliased */
2289                                 if (vq == NULL)
2290                                         vx->v_specflags &= ~SI_ALIASED;
2291                                 vp->v_specflags &= ~SI_ALIASED;
2292                         }
2293                         SPECHASH_UNLOCK();
2294                         {
                        /* clear the pointer in the vnode before freeing the specinfo */
2295                         struct specinfo *tmp = vp->v_specinfo;
2296                         vp->v_specinfo = NULL;
2297                         FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
2298                         }
2299         }
2300 }
2301
2302 /*
2303  * Lookup a vnode by device number.
2304  */
2305 int
2306 check_mountedon(dev_t dev, enum vtype type, int  *errorp)
2307 {
2308         vnode_t vp;
2309         int rc = 0;
2310         int vid;
2311
2312 loop:
2313         SPECHASH_LOCK();
2314         for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
2315                 if (dev != vp->v_rdev || type != vp->v_type)
2316                         continue;
2317                 vid = vp->v_id;
2318                 SPECHASH_UNLOCK();
2319                 if (vnode_getwithvid(vp,vid))
2320                         goto loop;
2321                 vnode_lock_spin(vp);
2322                 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
2323                         vnode_unlock(vp);
2324                         if ((*errorp = vfs_mountedon(vp)) != 0)
2325                                 rc = 1;
2326                 } else
2327                         vnode_unlock(vp);
2328                 vnode_put(vp);
2329                 return(rc);
2330         }
2331         SPECHASH_UNLOCK();
2332         return (0);
2333 }
2334
2335 /*
2336  * Calculate the total number of references to a special device.
2337  */
/*
 * For a vnode that is not aliased this is simply its specinfo
 * si_opencount.  Otherwise the spec hash chain is walked and the
 * si_opencount of every vnode with the same v_rdev and v_type is
 * summed.  An unused alias met on the way is reclaimed and the whole
 * computation restarts from 'loop', as it does whenever an iocount
 * cannot be obtained on a chain member.
 */
2338 int
2339 vcount(vnode_t vp)
2340 {
2341         vnode_t vq, vnext;
2342         int count;
2343         int vid;
2344
2345 loop:
2346         if (!vnode_isaliased(vp))
2347                 return (vp->v_specinfo->si_opencount);
2348         count = 0;
2349
2350         SPECHASH_LOCK();
2351         /*
2352          * Grab first vnode and its vid.
2353          */
2354         vq = *vp->v_hashchain;
2355         vid = vq ? vq->v_id : 0;
2356
2357         SPECHASH_UNLOCK();
2358
2359         while (vq) {
2360                 /*
2361                  * Attempt to get the vnode outside the SPECHASH lock.
2362                  */
2363                 if (vnode_getwithvid(vq, vid)) {
2364                         goto loop;
2365                 }
2366                 vnode_lock(vq);
2367
2368                 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
                        /* unused = no usecount and only the iocount taken just above */
2369                         if ((vq->v_usecount == 0) && (vq->v_iocount == 1)  && vq != vp) {
2370                                 /*
2371                                  * Alias, but not in use, so flush it out.
2372                                  */
2373                                 vnode_reclaim_internal(vq, 1, 1, 0);
2374                                 vnode_put_locked(vq);
2375                                 vnode_unlock(vq);
2376                                 goto loop;
2377                         }
2378                         count += vq->v_specinfo->si_opencount;
2379                 }
2380                 vnode_unlock(vq);
2381
2382                 SPECHASH_LOCK();
2383                 /*
2384                  * must do this with the reference still held on 'vq'
2385                  * so that it can't be destroyed while we're poking
2386                  * through v_specnext
2387                  */
2388                 vnext = vq->v_specnext;
2389                 vid = vnext ? vnext->v_id : 0;
2390
2391                 SPECHASH_UNLOCK();
2392
2393                 vnode_put(vq);
2394
2395                 vq = vnext;
2396         }
2397
2398         return (count);
2399 }
2400
2401 int     prtactive = 0;          /* 1 => print out reclaim of active vnodes */
2402
2403 /*
2404  * Print out a description of a vnode.
2405  */
2406 static const char *typename[] =
2407    { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
2408
2409 void
2410 vprint(const char *label, struct vnode *vp)
2411 {
2412         char sbuf[64];
2413
2414         if (label != NULL)
2415                 printf("%s: ", label);
2416         printf("type %s, usecount %d, writecount %d",
2417                typename[vp->v_type], vp->v_usecount, vp->v_writecount);
2418         sbuf[0] = '\0';
2419         if (vp->v_flag & VROOT)
2420                 strlcat(sbuf, "|VROOT", sizeof(sbuf));
2421         if (vp->v_flag & VTEXT)
2422                 strlcat(sbuf, "|VTEXT", sizeof(sbuf));
2423         if (vp->v_flag & VSYSTEM)
2424                 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
2425         if (vp->v_flag & VNOFLUSH)
2426                 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
2427         if (vp->v_flag & VBWAIT)
2428                 strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
2429         if (vnode_isaliased(vp))
2430                 strlcat(sbuf, "|VALIASED", sizeof(sbuf));
2431         if (sbuf[0] != '\0')
2432                 printf(" flags (%s)", &sbuf[1]);
2433 }
2434
2435
2436 int
2437 vn_getpath(struct vnode *vp, char *pathbuf, int *len)
2438 {
2439         return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
2440 }
2441
2442 int
2443 vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len)
2444 {
2445         return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
2446 }
2447
2448 int
2449 vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
2450 {
2451         return ubc_cs_getcdhash(vp, offset, cdhash);
2452 }
2453
2454
static char *extension_table=NULL;
static int   nexts;
static int   max_ext_width;

/*
 * qsort() comparator for the package extension table: orders two
 * NUL-terminated strings by length, shorter first.
 *
 * Returns a negative, zero or positive value.  The lengths are
 * compared directly instead of returning their difference, so the
 * result never depends on how an unsigned size_t difference converts
 * to int.
 *
 * NOTE(review): this yields shortest-first, while the description of
 * the table speaks of longest-to-shortest.  is_package_name() walks
 * every entry, so lookups do not depend on the order.
 */
static int
extension_cmp(const void *a, const void *b)
{
    size_t len_a = strlen((const char *)a);
    size_t len_b = strlen((const char *)b);

    return (len_a > len_b) - (len_a < len_b);
}
2464
2465
2466 //
2467 // This is the api LaunchServices uses to inform the kernel
2468 // the list of package extensions to ignore.
2469 //
2470 // Internally we keep the list sorted by the length of the
2471 // the extension (from longest to shortest).  We sort the
2472 // list of extensions so that we can speed up our searches
2473 // when comparing file names -- we only compare extensions
2474 // that could possibly fit into the file name, not all of
2475 // them (i.e. a short 8 character name can't have an 8
2476 // character extension).
2477 //
2478 extern lck_mtx_t *pkg_extensions_lck;
2479
2480 __private_extern__ int
2481 set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
2482 {
2483     char *new_exts, *old_exts;
2484     int error;
2485
2486     if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
2487         return EINVAL;
2488     }
2489
2490
2491     // allocate one byte extra so we can guarantee null termination
2492     MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK);
2493     if (new_exts == NULL) {
2494         return ENOMEM;
2495     }
2496
2497     error = copyin(data, new_exts, nentries * maxwidth);
2498     if (error) {
2499         FREE(new_exts, M_TEMP);
2500         return error;
2501     }
2502
2503     new_exts[(nentries * maxwidth)] = '\0';   // guarantee null termination of the block
2504
2505     qsort(new_exts, nentries, maxwidth, extension_cmp);
2506
2507     lck_mtx_lock(pkg_extensions_lck);
2508
2509     old_exts        = extension_table;
2510     extension_table = new_exts;
2511     nexts           = nentries;
2512     max_ext_width   = maxwidth;
2513
2514     lck_mtx_unlock(pkg_extensions_lck);
2515
2516     if (old_exts) {
2517         FREE(old_exts, M_TEMP);
2518     }
2519
2520     return 0;
2521 }
2522
2523
2524 __private_extern__ int
2525 is_package_name(const char *name, int len)
2526 {
2527     int i, extlen;
2528     const char *ptr, *name_ext;
2529
2530     if (len <= 3) {
2531         return 0;
2532     }
2533
2534     name_ext = NULL;
2535     for(ptr=name; *ptr != '\0'; ptr++) {
2536         if (*ptr == '.') {
2537             name_ext = ptr;
2538         }
2539     }
2540
2541     // if there is no "." extension, it can't match
2542     if (name_ext == NULL) {
2543         return 0;
2544     }
2545
2546     // advance over the "."
2547     name_ext++;
2548
2549     lck_mtx_lock(pkg_extensions_lck);
2550
2551     // now iterate over all the extensions to see if any match
2552     ptr = &extension_table[0];
2553     for(i=0; i < nexts; i++, ptr+=max_ext_width) {
2554         extlen = strlen(ptr);
2555         if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
2556             // aha, a match!
2557             lck_mtx_unlock(pkg_extensions_lck);
2558             return 1;
2559         }
2560     }
2561
2562     lck_mtx_unlock(pkg_extensions_lck);
2563
2564     // if we get here, no extension matched
2565     return 0;
2566 }
2567
2568 int
2569 vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
2570 {
2571     char *ptr, *end;
2572     int comp=0;
2573
2574     *component = -1;
2575     if (*path != '/') {
2576         return EINVAL;
2577     }
2578
2579     end = path + 1;
2580     while(end < path + pathlen && *end != '\0') {
2581         while(end < path + pathlen && *end == '/' && *end != '\0') {
2582             end++;
2583         }
2584
2585         ptr = end;
2586
2587         while(end < path + pathlen && *end != '/' && *end != '\0') {
2588             end++;
2589         }
2590
2591         if (end > path + pathlen) {
2592             // hmm, string wasn't null terminated
2593             return EINVAL;
2594         }
2595
2596         *end = '\0';
2597         if (is_package_name(ptr, end - ptr)) {
2598             *component = comp;
2599             break;
2600         }
2601
2602         end++;
2603         comp++;
2604     }
2605
2606     return 0;
2607 }
2608
/*
 * Determine if a name is inappropriate for a searchfs query.
 * The list of such names currently consists of "System" only.
 *
 *   name - candidate name
 *   len  - its length; must equal the length of a listed name
 *
 * The comparison covers the listed name's terminating NUL as well, so
 * 'name' has to end exactly where the listed name ends.
 * Returns 1 for an inappropriate name, 0 otherwise.
 */

int vn_searchfs_inappropriate_name(const char *name, int len) {
	static const struct {
		const char *text;
		int         length;
	} rejected[] = {
		{ "System", 6 },
	};
	const int count = (int) (sizeof(rejected) / sizeof(rejected[0]));
	int k = 0;

	while (k < count) {
		if (len == rejected[k].length &&
		    strncmp(name, rejected[k].text, strlen(rejected[k].text) + 1) == 0) {
			return 1;
		}
		k++;
	}

	/* nothing in the list matched */
	return 0;
}
2628
2629 /*
2630  * Top level filesystem related information gathering.
2631  */
2632 extern unsigned int vfs_nummntops;
2633
2634 int
2635 vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
2636            user_addr_t newp, size_t newlen, proc_t p)
2637 {
2638         struct vfstable *vfsp;
2639         int *username;
2640         u_int usernamelen;
2641         int error;
2642         struct vfsconf vfsc;
2643
2644         /* All non VFS_GENERIC and in VFS_GENERIC,
2645          * VFS_MAXTYPENUM, VFS_CONF, VFS_SET_PACKAGE_EXTS
2646          * needs to have root priv to have modifiers.
2647          * For rest the userland_sysctl(CTLFLAG_ANYBODY) would cover.
2648          */
2649         if ((newp != USER_ADDR_NULL) && ((name[0] != VFS_GENERIC) ||
2650                         ((name[1] == VFS_MAXTYPENUM) ||
2651                          (name[1] == VFS_CONF) ||
2652                          (name[1] == VFS_SET_PACKAGE_EXTS)))
2653              && (error = suser(kauth_cred_get(), &p->p_acflag))) {
2654                         return(error);
2655         }
2656         /*
2657          * The VFS_NUMMNTOPS shouldn't be at name[0] since
2658          * is a VFS generic variable. So now we must check
2659          * namelen so we don't end up covering any UFS
2660          * variables (sinc UFS vfc_typenum is 1).
2661          *
2662          * It should have been:
2663          *    name[0]:  VFS_GENERIC
2664          *    name[1]:  VFS_NUMMNTOPS
2665          */
2666         if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
2667                 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
2668         }
2669
2670         /* all sysctl names at this level are at least name and field */
2671         if (namelen < 2)
2672                 return (EISDIR);                /* overloaded */
2673         if (name[0] != VFS_GENERIC) {
2674
2675                 mount_list_lock();
2676                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2677                         if (vfsp->vfc_typenum == name[0]) {
2678                                 vfsp->vfc_refcount++;
2679                                 break;
2680                         }
2681                 mount_list_unlock();
2682
2683                 if (vfsp == NULL)
2684                         return (ENOTSUP);
2685
2686                 /* XXX current context proxy for proc p? */
2687                 error = ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2688                             oldp, oldlenp, newp, newlen,
2689                             vfs_context_current()));
2690
2691                 mount_list_lock();
2692                 vfsp->vfc_refcount--;
2693                 mount_list_unlock();
2694                 return error;
2695         }
2696         switch (name[1]) {
2697         case VFS_MAXTYPENUM:
2698                 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
2699         case VFS_CONF:
2700                 if (namelen < 3)
2701                         return (ENOTDIR);       /* overloaded */
2702
2703                 mount_list_lock();
2704                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2705                         if (vfsp->vfc_typenum == name[2])
2706                                 break;
2707
2708                 if (vfsp == NULL) {
2709                         mount_list_unlock();
2710                         return (ENOTSUP);
2711                 }
2712
2713                 vfsc.vfc_reserved1 = 0;
2714                 bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
2715                 vfsc.vfc_typenum = vfsp->vfc_typenum;
2716                 vfsc.vfc_refcount = vfsp->vfc_refcount;
2717                 vfsc.vfc_flags = vfsp->vfc_flags;
2718                 vfsc.vfc_reserved2 = 0;
2719                 vfsc.vfc_reserved3 = 0;
2720
2721                 mount_list_unlock();
2722                 return (sysctl_rdstruct(oldp, oldlenp, newp, &vfsc,
2723                                         sizeof(struct vfsconf)));
2724
2725         case VFS_SET_PACKAGE_EXTS:
2726                 return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]);
2727         }
2728         /*
2729          * We need to get back into the general MIB, so we need to re-prepend
2730          * CTL_VFS to our name and try userland_sysctl().
2731          */
2732         usernamelen = namelen + 1;
2733         MALLOC(username, int *, usernamelen * sizeof(*username),
2734             M_TEMP, M_WAITOK);
2735         bcopy(name, username + 1, namelen * sizeof(*name));
2736         username[0] = CTL_VFS;
2737         error = userland_sysctl(p, username, usernamelen, oldp,
2738                                 oldlenp, newp, newlen, oldlenp);
2739         FREE(username, M_TEMP);
2740         return (error);
2741 }
2742
2743 /*
2744  * Dump vnode list (via sysctl) - defunct
2745  * use "pstat" instead
2746  */
2747 /* ARGSUSED */
2748 int
2749 sysctl_vnode
2750 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req)
2751 {
2752         return(EINVAL);
2753 }
2754
2755 SYSCTL_PROC(_kern, KERN_VNODE, vnode,
2756                 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED,
2757                 0, 0, sysctl_vnode, "S,", "");
2758
2759
2760 /*
2761  * Check to see if a filesystem is mounted on a block device.
2762  */
2763 int
2764 vfs_mountedon(struct vnode *vp)
2765 {
2766         struct vnode *vq;
2767         int error = 0;
2768
2769         SPECHASH_LOCK();
2770         if (vp->v_specflags & SI_MOUNTEDON) {
2771                 error = EBUSY;
2772                 goto out;
2773         }
2774         if (vp->v_specflags & SI_ALIASED) {
2775                 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2776                         if (vq->v_rdev != vp->v_rdev ||
2777                             vq->v_type != vp->v_type)
2778                                 continue;
2779                         if (vq->v_specflags & SI_MOUNTEDON) {
2780                                 error = EBUSY;
2781                                 break;
2782                         }
2783                 }
2784         }
2785 out:
2786         SPECHASH_UNLOCK();
2787         return (error);
2788 }
2789
2790 /*
2791  * Unmount all filesystems. The list is traversed in reverse order
2792  * of mounting to avoid dependencies.
2793  */
2794 __private_extern__ void
2795 vfs_unmountall(void)
2796 {
2797         struct mount *mp;
2798         int error;
2799
2800         /*
2801          * Since this only runs when rebooting, it is not interlocked.
2802          */
2803         mount_list_lock();
2804         while(!TAILQ_EMPTY(&mountlist)) {
2805                 mp = TAILQ_LAST(&mountlist, mntlist);
2806                 mount_list_unlock();
2807                 error = dounmount(mp, MNT_FORCE, 0, vfs_context_current());
2808                 if ((error != 0) && (error != EBUSY)) {
2809                         printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname);
2810                         printf("%d)\n", error);
2811                         mount_list_lock();
2812                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
2813                         continue;
2814                 } else if (error == EBUSY) {
2815                         /* If EBUSY is returned,  the unmount was already in progress */
2816                         printf("unmount of %p failed (", mp);
2817                         printf("BUSY)\n");
2818                 }
2819                 mount_list_lock();
2820         }
2821         mount_list_unlock();
2822 }
2823
2824
2825 /*
2826  * This routine is called from vnode_pager_deallocate out of the VM
2827  * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
2828  * on a vnode that has a UBCINFO
2829  */
2830 __private_extern__ void
2831 vnode_pager_vrele(vnode_t vp)
2832 {
2833         struct ubc_info *uip;
2834
2835         vnode_lock_spin(vp);
2836
2837         vp->v_lflag &= ~VNAMED_UBC;
2838
2839         uip = vp->v_ubcinfo;
2840         vp->v_ubcinfo = UBC_INFO_NULL;
2841
2842         vnode_unlock(vp);
2843
2844         ubc_info_deallocate(uip);
2845 }
2846
2847
2848 #include <sys/disk.h>
2849
/*
 * Disk unit number recorded for the root device; stays at (u_int32_t)-1
 * until vfs_init_io_attributes() is called with devvp == rootvp.
 */
u_int32_t rootunit = (u_int32_t)-1;

/*
 * vfs_init_io_attributes
 *
 * Query the device vnode 'devvp' through a series of DKIOC* ioctls and
 * store the resulting I/O limits and capability flags in the mount 'mp'.
 *
 *   devvp  device vnode the ioctls are issued against
 *   mp     mount whose mnt_* I/O fields are filled in
 *
 * Returns 0 on success, or the error of the first mandatory ioctl that
 * fails.  On such an early return the fields written before the failing
 * ioctl (throttle mask, unit number, MNTK_ROOTDEV, block size, default
 * maximum I/O sizes, virtual/SSD flags) keep their new values.
 * The DKIOCGETTHROTTLEMASK, DKIOCISVIRTUAL and DKIOCISSOLIDSTATE ioctls
 * are best effort: their failure does not abort the function.
 */
errno_t
vfs_init_io_attributes(vnode_t devvp, mount_t mp)
{
        int     error;
        off_t   readblockcnt = 0;
        off_t   writeblockcnt = 0;
        off_t   readmaxcnt = 0;
        off_t   writemaxcnt = 0;
        off_t   readsegcnt = 0;
        off_t   writesegcnt = 0;
        off_t   readsegsize = 0;
        off_t   writesegsize = 0;
        off_t   alignment = 0;
        off_t   ioqueue_depth = 0;
        u_int32_t blksize;
        u_int64_t temp;
        u_int32_t features;
        vfs_context_t ctx = vfs_context_current();
        int isssd = 0;
        int isvirtual = 0;


        /* return value deliberately not examined; note the NULL context */
        VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
        /*
         * as a reasonable approximation, only use the lowest bit of the mask
         * to generate a disk unit number
         */
        mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);

        if (devvp == rootvp)
                rootunit = mp->mnt_devbsdunit;

        if (mp->mnt_devbsdunit == rootunit) {
                /*
                 * this mount point exists on the same device as the root
                 * partition, so it comes under the hard throttle control...
                 * this is true even for the root mount point itself
                 */
                mp->mnt_kern_flag |= MNTK_ROOTDEV;
        }
        /*
         * force the spec device to re-cache
         * the underlying block size in case
         * the filesystem overrode the initial value
         */
        set_fsblocksize(devvp);


        if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
                                (caddr_t)&blksize, 0, ctx)))
                return (error);

        mp->mnt_devblocksize = blksize;

        /*
         * set the maximum possible I/O size
         * this may get clipped to a smaller value
         * based on which constraints are being advertised
         * and if those advertised constraints result in a smaller
         * limit for a given I/O
         */
        mp->mnt_maxreadcnt = MAX_UPL_SIZE * PAGE_SIZE;
        mp->mnt_maxwritecnt = MAX_UPL_SIZE * PAGE_SIZE;

        /* optional capabilities: a failing ioctl just leaves the flag clear */
        if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
                if (isvirtual)
                        mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
        }
        if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
                if (isssd)
                        mp->mnt_kern_flag |= MNTK_SSD;
        }
        /*
         * Mandatory queries: the first failure aborts the function.  All of
         * them are issued before any of the results are applied below.
         */
        if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
                                (caddr_t)&features, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
                                (caddr_t)&readblockcnt, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
                                (caddr_t)&writeblockcnt, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
                                (caddr_t)&readmaxcnt, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
                                (caddr_t)&writemaxcnt, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
                                (caddr_t)&readsegcnt, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
                                (caddr_t)&writesegcnt, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
                                (caddr_t)&readsegsize, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
                                (caddr_t)&writesegsize, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
                                (caddr_t)&alignment, 0, ctx)))
                return (error);

        if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
                                (caddr_t)&ioqueue_depth, 0, ctx)))
                return (error);

        /*
         * Maximum read size: a non-zero byte limit replaces the default
         * (clamped to UINT32_MAX); a non-zero block-count limit can then
         * only lower the result further.
         */
        if (readmaxcnt)
                mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;

        if (readblockcnt) {
                temp = readblockcnt * blksize;
                temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;

                if (temp < mp->mnt_maxreadcnt)
                        mp->mnt_maxreadcnt = (u_int32_t)temp;
        }

        /* maximum write size: same rules as for reads */
        if (writemaxcnt)
                mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;

        if (writeblockcnt) {
                temp = writeblockcnt * blksize;
                temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;

                if (temp < mp->mnt_maxwritecnt)
                        mp->mnt_maxwritecnt = (u_int32_t)temp;
        }

        /*
         * Segment counts: use the advertised value if there is one, else
         * one segment per page of the maximum I/O size; either way the
         * value is clamped to UINT16_MAX.
         */
        if (readsegcnt) {
                temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
        } else {
                temp = mp->mnt_maxreadcnt / PAGE_SIZE;

                if (temp > UINT16_MAX)
                        temp = UINT16_MAX;
        }
        mp->mnt_segreadcnt = (u_int16_t)temp;

        if (writesegcnt) {
                temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
        } else {
                temp = mp->mnt_maxwritecnt / PAGE_SIZE;

                if (temp > UINT16_MAX)
                        temp = UINT16_MAX;
        }
        mp->mnt_segwritecnt = (u_int16_t)temp;

        /* segment sizes default to the maximum I/O size when not advertised */
        if (readsegsize)
                temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
        else
                temp = mp->mnt_maxreadcnt;
        mp->mnt_maxsegreadsize = (u_int32_t)temp;

        if (writesegsize)
                temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
        else
                temp = mp->mnt_maxwritecnt;
        mp->mnt_maxsegwritesize = (u_int32_t)temp;

        /*
         * alignment is stored as a mask (alignment - 1); anything larger
         * than a page is reduced to PAGE_MASK
         */
        if (alignment)
                temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
        else
                temp = 0;
        mp->mnt_alignmentmask = temp;


        /* the queue depth never drops below MNT_DEFAULT_IOQUEUE_DEPTH */
        if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH)
                temp = ioqueue_depth;
        else
                temp = MNT_DEFAULT_IOQUEUE_DEPTH;

        mp->mnt_ioqueue_depth = temp;
        /* ioscale = queue depth in units of the default depth, rounded up */
        mp->mnt_ioscale = (mp->mnt_ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH;

        if (mp->mnt_ioscale > 1)
                printf("ioqueue_depth = %d,   ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);

        if (features & DK_FEATURE_FORCE_UNIT_ACCESS)
                mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
        if (features & DK_FEATURE_UNMAP)
                mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED;
        /* every mandatory ioctl succeeded, so error is 0 at this point */
        return (error);
}
3046
3047 static struct klist fs_klist;
3048 lck_grp_t *fs_klist_lck_grp;
3049 lck_mtx_t *fs_klist_lock;
3050
3051 void
3052 vfs_event_init(void)
3053 {
3054
3055         klist_init(&fs_klist);
3056         fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
3057         fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
3058 }
3059
3060 void
3061 vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data)
3062 {
3063         lck_mtx_lock(fs_klist_lock);
3064         KNOTE(&fs_klist, event);
3065         lck_mtx_unlock(fs_klist_lock);
3066 }
3067
/*
 * Number of mounted filesystems, as reported by mount_getvfscnt().
 */
static int
sysctl_vfs_getvfscnt(void)
{
        int mounted = mount_getvfscnt();

        return mounted;
}
3076
3077
3078 static int
3079 mount_getvfscnt(void)
3080 {
3081         int ret;
3082
3083         mount_list_lock();
3084         ret = nummounts;
3085         mount_list_unlock();
3086         return (ret);
3087
3088 }
3089
3090
3091
3092 static int
3093 mount_fillfsids(fsid_t *fsidlst, int count)
3094 {
3095         struct mount *mp;
3096         int actual=0;
3097
3098         actual = 0;
3099         mount_list_lock();
3100         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3101                 if (actual <= count) {
3102                         fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
3103                         actual++;
3104                 }
3105         }
3106         mount_list_unlock();
3107         return (actual);
3108
3109 }
3110
3111 /*
3112  * fill in the array of fsid_t's up to a max of 'count', the actual
3113  * number filled in will be set in '*actual'.  If there are more fsid_t's
3114  * than room in fsidlst then ENOMEM will be returned and '*actual' will
3115  * have the actual count.
3116  * having *actual filled out even in the error case is depended upon.
3117  */
3118 static int
3119 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
3120 {
3121         struct mount *mp;
3122
3123         *actual = 0;
3124         mount_list_lock();
3125         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3126                 (*actual)++;
3127                 if (*actual <= count)
3128                         fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
3129         }
3130         mount_list_unlock();
3131         return (*actual <= count ? 0 : ENOMEM);
3132 }
3133
3134 static int
3135 sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
3136                 __unused int arg2, struct sysctl_req *req)
3137 {
3138         int actual, error;
3139         size_t space;
3140         fsid_t *fsidlst;
3141
3142         /* This is a readonly node. */
3143         if (req->newptr != USER_ADDR_NULL)
3144                 return (EPERM);
3145
3146         /* they are querying us so just return the space required. */
3147         if (req->oldptr == USER_ADDR_NULL) {
3148                 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3149                 return 0;
3150         }
3151 again:
3152         /*
3153          * Retrieve an accurate count of the amount of space required to copy
3154          * out all the fsids in the system.
3155          */
3156         space = req->oldlen;
3157         req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3158
3159         /* they didn't give us enough space. */
3160         if (space < req->oldlen)
3161                 return (ENOMEM);
3162
3163         MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
3164         if (fsidlst == NULL) {
3165                 return (ENOMEM);
3166         }
3167
3168         error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3169             &actual);
3170         /*
3171          * If we get back ENOMEM, then another mount has been added while we
3172          * slept in malloc above.  If this is the case then try again.
3173          */
3174         if (error == ENOMEM) {
3175                 FREE(fsidlst, M_TEMP);
3176                 req->oldlen = space;
3177                 goto again;
3178         }
3179         if (error == 0) {
3180                 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3181         }
3182         FREE(fsidlst, M_TEMP);
3183         return (error);
3184 }
3185
3186 /*
3187  * Do a sysctl by fsid.
3188  */
3189 static int
3190 sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
3191                 struct sysctl_req *req)
3192 {
3193         union union_vfsidctl vc;
3194         struct mount *mp;
3195         struct vfsstatfs *sp;
3196         int *name, flags, namelen;
3197         int error=0, gotref=0;
3198         vfs_context_t ctx = vfs_context_current();
3199         proc_t p = req->p;      /* XXX req->p != current_proc()? */
3200         boolean_t is_64_bit;
3201
3202         name = arg1;
3203         namelen = arg2;
3204         is_64_bit = proc_is64bit(p);
3205
3206         error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
3207         if (error)
3208                 goto out;
3209         if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
3210                 error = EINVAL;
3211                 goto out;
3212         }
3213         mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
3214         if (mp == NULL) {
3215                 error = ENOENT;
3216                 goto out;
3217         }
3218         gotref = 1;
3219         /* reset so that the fs specific code can fetch it. */
3220         req->newidx = 0;
3221         /*
3222          * Note if this is a VFS_CTL then we pass the actual sysctl req
3223          * in for "oldp" so that the lower layer can DTRT and use the
3224          * SYSCTL_IN/OUT routines.
3225          */
3226         if (mp->mnt_op->vfs_sysctl != NULL) {
3227                 if (is_64_bit) {
3228                         if (vfs_64bitready(mp)) {
3229                                 error = mp->mnt_op->vfs_sysctl(name, namelen,
3230                                     CAST_USER_ADDR_T(req),
3231                                     NULL, USER_ADDR_NULL, 0,
3232                                     ctx);
3233                         }
3234                         else {
3235                                 error = ENOTSUP;
3236                         }
3237                 }
3238                 else {
3239                         error = mp->mnt_op->vfs_sysctl(name, namelen,
3240                             CAST_USER_ADDR_T(req),
3241                             NULL, USER_ADDR_NULL, 0,
3242                             ctx);
3243                 }
3244                 if (error != ENOTSUP) {
3245                         goto out;
3246                 }
3247         }
3248         switch (name[0]) {
3249         case VFS_CTL_UMOUNT:
3250                 req->newidx = 0;
3251                 if (is_64_bit) {
3252                         req->newptr = vc.vc64.vc_ptr;
3253                         req->newlen = (size_t)vc.vc64.vc_len;
3254                 }
3255                 else {
3256                         req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3257                         req->newlen = vc.vc32.vc_len;
3258                 }
3259                 error = SYSCTL_IN(req, &flags, sizeof(flags));
3260                 if (error)
3261                         break;
3262
3263                 mount_ref(mp, 0);
3264                 mount_iterdrop(mp);
3265                 gotref = 0;
3266                 /* safedounmount consumes a ref */
3267                 error = safedounmount(mp, flags, ctx);
3268                 break;
3269         case VFS_CTL_STATFS:
3270                 req->newidx = 0;
3271                 if (is_64_bit) {
3272                         req->newptr = vc.vc64.vc_ptr;
3273                         req->newlen = (size_t)vc.vc64.vc_len;
3274                 }
3275                 else {
3276                         req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3277                         req->newlen = vc.vc32.vc_len;
3278                 }
3279                 error = SYSCTL_IN(req, &flags, sizeof(flags));
3280                 if (error)
3281                         break;
3282                 sp = &mp->mnt_vfsstat;
3283                 if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) &&
3284                     (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))
3285                         goto out;
3286                 if (is_64_bit) {
3287                         struct user64_statfs sfs;
3288                         bzero(&sfs, sizeof(sfs));
3289                         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3290                         sfs.f_type = mp->mnt_vtable->vfc_typenum;
3291                         sfs.f_bsize = (user64_long_t)sp->f_bsize;
3292                         sfs.f_iosize = (user64_long_t)sp->f_iosize;
3293                         sfs.f_blocks = (user64_long_t)sp->f_blocks;
3294                         sfs.f_bfree = (user64_long_t)sp->f_bfree;
3295                         sfs.f_bavail = (user64_long_t)sp->f_bavail;
3296                         sfs.f_files = (user64_long_t)sp->f_files;
3297                         sfs.f_ffree = (user64_long_t)sp->f_ffree;
3298                         sfs.f_fsid = sp->f_fsid;
3299                         sfs.f_owner = sp->f_owner;
3300
3301                         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3302                                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3303                         } else {
3304                                 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3305                         }
3306                         strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3307                         strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3308
3309                         error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3310                 }
3311                 else {
3312                         struct user32_statfs sfs;
3313                         bzero(&sfs, sizeof(sfs));
3314                         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3315                         sfs.f_type = mp->mnt_vtable->vfc_typenum;
3316
3317                         /*
3318                          * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
3319                          * have to fudge the numbers here in that case.   We inflate the blocksize in order
3320                          * to reflect the filesystem size as best we can.
3321                          */
3322                         if (sp->f_blocks > INT_MAX) {
3323                                 int             shift;
3324
3325                                 /*
3326                                  * Work out how far we have to shift the block count down to make it fit.
3327                                  * Note that it's possible to have to shift so far that the resulting
3328                                  * blocksize would be unreportably large.  At that point, we will clip
3329                                  * any values that don't fit.
3330                                  *
3331                                  * For safety's sake, we also ensure that f_iosize is never reported as
3332                                  * being smaller than f_bsize.
3333                                  */
3334                                 for (shift = 0; shift < 32; shift++) {
3335                                         if ((sp->f_blocks >> shift) <= INT_MAX)
3336                                                 break;
3337                                         if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX)
3338                                                 break;
3339                                 }
3340 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
3341                                 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
3342                                 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
3343                                 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
3344 #undef __SHIFT_OR_CLIP
3345                                 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift);
3346                                 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
3347                         } else {
3348                                 sfs.f_bsize = (user32_long_t)sp->f_bsize;
3349                                 sfs.f_iosize = (user32_long_t)sp->f_iosize;
3350                                 sfs.f_blocks = (user32_long_t)sp->f_blocks;
3351                                 sfs.f_bfree = (user32_long_t)sp->f_bfree;
3352                                 sfs.f_bavail = (user32_long_t)sp->f_bavail;
3353                         }
3354                         sfs.f_files = (user32_long_t)sp->f_files;
3355                         sfs.f_ffree = (user32_long_t)sp->f_ffree;
3356                         sfs.f_fsid = sp->f_fsid;
3357                         sfs.f_owner = sp->f_owner;
3358
3359                         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3360                                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3361                         } else {
3362                                 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3363                         }
3364                         strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3365                         strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3366
3367                         error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3368                 }
3369                 break;
3370         default:
3371                 error = ENOTSUP;
3372                 goto out;
3373         }
3374 out:
3375         if(gotref != 0)
3376                 mount_iterdrop(mp);
3377         return (error);
3378 }
3379
/*
 * Filter operations for filesystem events.  The three handlers attach a
 * knote to fs_klist, detach it again, and record a posted event in it.
 */
static int      filt_fsattach(struct knote *kn);
static void     filt_fsdetach(struct knote *kn);
static int      filt_fsevent(struct knote *kn, long hint);
struct filterops fs_filtops = {
        .f_attach = filt_fsattach,
        .f_detach = filt_fsdetach,
        .f_event = filt_fsevent,
};
3388
3389 static int
3390 filt_fsattach(struct knote *kn)
3391 {
3392
3393         lck_mtx_lock(fs_klist_lock);
3394         kn->kn_flags |= EV_CLEAR;
3395         KNOTE_ATTACH(&fs_klist, kn);
3396         lck_mtx_unlock(fs_klist_lock);
3397         return (0);
3398 }
3399
3400 static void
3401 filt_fsdetach(struct knote *kn)
3402 {
3403         lck_mtx_lock(fs_klist_lock);
3404         KNOTE_DETACH(&fs_klist, kn);
3405         lck_mtx_unlock(fs_klist_lock);
3406 }
3407
3408 static int
3409 filt_fsevent(struct knote *kn, long hint)
3410 {
3411         /*
3412          * Backwards compatibility:
3413          * Other filters would do nothing if kn->kn_sfflags == 0
3414          */
3415
3416         if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
3417                 kn->kn_fflags |= hint;
3418         }
3419
3420         return (kn->kn_fflags != 0);
3421 }
3422
3423 static int
3424 sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
3425                 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3426 {
3427         int out, error;
3428         pid_t pid;
3429         proc_t p;
3430
3431         /* We need a pid. */
3432         if (req->newptr == USER_ADDR_NULL)
3433                 return (EINVAL);
3434
3435         error = SYSCTL_IN(req, &pid, sizeof(pid));
3436         if (error)
3437                 return (error);
3438
3439         p = proc_find(pid < 0 ? -pid : pid);
3440         if (p == NULL)
3441                 return (ESRCH);
3442
3443         /*
3444          * Fetching the value is ok, but we only fetch if the old
3445          * pointer is given.
3446          */
3447         if (req->oldptr != USER_ADDR_NULL) {
3448                 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
3449                 proc_rele(p);
3450                 error = SYSCTL_OUT(req, &out, sizeof(out));
3451                 return (error);
3452         }
3453
3454         /* cansignal offers us enough security. */
3455         if (p != req->p && proc_suser(req->p) != 0) {
3456                 proc_rele(p);
3457                 return (EPERM);
3458         }
3459
3460         if (pid < 0)
3461                 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
3462         else
3463                 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
3464         proc_rele(p);
3465
3466         return (0);
3467 }
3468
/* the vfs.generic. branch. */
SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
/* retrieve a list of mounted filesystem fsid_t */
SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD | CTLFLAG_LOCKED,
    NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
/* perform operations on filesystem via fsid_t */
SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
    sysctl_vfs_ctlbyfsid, "ctlbyfsid");
/*
 * query, set or clear P_NOREMOTEHANG on a process by pid; registered with
 * CTLFLAG_ANYBODY, the handler performs its own permission check
 */
SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
    NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
3479
3480
/* running count of vnodes that new_vnode() recycled from a list rather than freshly allocating */
long num_reusedvnodes = 0;
3482
/*
 * new_vnode: hand the caller a vnode, either freshly allocated or
 * recycled from the dead, rage or free list.
 *
 * While the number of live vnodes (numvnodes - deadvnodes) is below
 * desiredvnodes, or when an allocation is being forced, a dead vnode is
 * reused if one exists and otherwise a new vnode is allocated.  Above that
 * limit a candidate is taken from the rage list (subject to the rage
 * limit/time check) or the free list, validated under its vnode lock and
 * reclaimed before being handed out.
 *
 * On success 0 is returned and *vpp refers to a vnode holding a single
 * iocount.  If no vnode could be obtained after the retries are exhausted,
 * *vpp is set to NULL and ENFILE is returned (when CONFIG_EMBEDDED is set
 * that path instead asks jetsam to kill a process and retries, or panics).
 */
static int
new_vnode(vnode_t *vpp)
{
        vnode_t vp;
        int retries = 0;                                /* retry count in case of tablefull */
        int force_alloc = 0, walk_count = 0;
        unsigned int  vpid;
        struct timespec ts;
        struct timeval current_tv;
#ifndef __LP64__
        struct unsafe_fsnode *l_unsafefs = 0;
#endif /* __LP64__ */
        proc_t  curproc = current_proc();

retry:
        microuptime(&current_tv);

        vp = NULLVP;

        vnode_list_lock();

        if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) {
                if ( !TAILQ_EMPTY(&vnode_dead_list)) {
                        /*
                         * Can always reuse a dead one
                         */
                        vp = TAILQ_FIRST(&vnode_dead_list);
                        goto steal_this_vp;
                }
                /*
                 * no dead vnodes available... if we're under
                 * the limit, we'll create a new vnode
                 */
                /* account for the new vnode before the list lock is dropped */
                numvnodes++;
                vnode_list_unlock();

                MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK);
                bzero((char *)vp, sizeof(*vp));
                VLISTNONE(vp);          /* avoid double queue removal */
                lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);

                klist_init(&vp->v_knotes);
                /* seed the vnode id from the nanosecond part of the current uptime */
                nanouptime(&ts);
                vp->v_id = ts.tv_nsec;
                vp->v_flag = VSTANDARD;

#if CONFIG_MACF
                if (mac_vnode_label_init_needed(vp))
                        mac_vnode_label_init(vp);
#endif /* MAC */

                /* a freshly allocated vnode is returned holding one iocount */
                vp->v_iocount = 1;
                goto done;
        }

#define MAX_WALK_COUNT 1000

        /*
         * Look at the rage list first, but only once it holds at least
         * rage_limit vnodes or RAGE_TIME_LIMIT seconds have passed since
         * the time recorded in rage_tv.
         */
        if ( !TAILQ_EMPTY(&vnode_rage_list) &&
             (ragevnodes >= rage_limit ||
              (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {

                TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
                    if ( !(vp->v_listflag & VLIST_RAGE))
                        panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);

                    // if we're a dependency-capable process, skip vnodes that can
                        // cause recycling deadlocks. (i.e. this process is diskimages
                        // helper and the vnode is in a disk image).  Querying the
                        // mnt_kern_flag for the mount's virtual device status
                        // is safer than checking the mnt_dependent_process, which
                        // may not be updated if there are multiple devnode layers
                        // in between the disk image and the final consumer.

                    if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
                                        (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
                                break;
                    }

                    // don't iterate more than MAX_WALK_COUNT vnodes to
                    // avoid keeping the vnode list lock held for too long.
                    if (walk_count++ > MAX_WALK_COUNT) {
                                vp = NULL;
                        break;
                    }
                }

        }

        if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
                /*
                 * Pick the first vp for possible reuse
                 */
                walk_count = 0;
                TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {

                        // if we're a dependency-capable process, skip vnodes that can
                        // cause recycling deadlocks. (i.e. this process is diskimages
                        // helper and the vnode is in a disk image).  Querying the
                        // mnt_kern_flag for the mount's virtual device status
                        // is safer than checking the mnt_dependent_process, which
                        // may not be updated if there are multiple devnode layers
                        // in between the disk image and the final consumer.

                    if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
                                        (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
                                break;
                    }

                    // don't iterate more than MAX_WALK_COUNT vnodes to
                    // avoid keeping the vnode list lock held for too long.
                    if (walk_count++ > MAX_WALK_COUNT) {
                        vp = NULL;
                        break;
                    }
                }

        }

        //
        // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
        // then we're trying to create a vnode on behalf of a
        // process like diskimages-helper that has file systems
        // mounted on top of itself (and thus we can't reclaim
        // vnodes in the file systems on top of us).  if we can't
        // find a vnode to reclaim then we'll just have to force
        // the allocation.
        //
        if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
            force_alloc = 1;
            vnode_list_unlock();
            goto retry;
        }

        if (vp == NULL) {
                /*
                 * we've reached the system imposed maximum number of vnodes
                 * but there isn't a single one available
                 * wait a bit and then retry... if we can't get a vnode
                 * after 100 retries, then log a complaint
                 */
                if (++retries <= 100) {
                        vnode_list_unlock();
                        delay_for_interval(1, 1000 * 1000);
                        goto retry;
                }

                vnode_list_unlock();
                tablefull("vnode");
                log(LOG_EMERG, "%d desired, %d numvnodes, "
                        "%d free, %d dead, %d rage\n",
                        desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes);
#if CONFIG_EMBEDDED
                /*
                 * Running out of vnodes tends to make a system unusable. Start killing
                 * processes that jetsam knows are killable.
                 */
                if (jetsam_kill_top_proc(TRUE, kJetsamFlagsKilledVnodes) < 0) {
                        /*
                         * If jetsam can't find any more processes to kill and there
                         * still aren't any free vnodes, panic. Hopefully we'll get a
                         * panic log to tell us why we ran out.
                         */
                        panic("vnode table is full\n");
                }

                delay_for_interval(1, 1000 * 1000);
                goto retry;
#endif

                /* not reached when CONFIG_EMBEDDED is set: that path retries or panics above */
                *vpp = NULL;
                return (ENFILE);
        }
steal_this_vp:
        /*
         * snapshot the id while the list lock is still held so that a
         * reclaim racing with us can be detected once we hold the vnode lock
         */
        vpid = vp->v_id;

        vnode_list_remove_locked(vp);

        vnode_list_unlock();

        vnode_lock_spin(vp);

        /*
         * We could wait for the vnode_lock after removing the vp from the freelist
         * and the vid is bumped only at the very end of reclaim. So it is  possible
         * that we are looking at a vnode that is being terminated. If so skip it.
         */
        if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
                        VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
                /*
                 * we lost the race between dropping the list lock
                 * and picking up the vnode_lock... someone else
                 * used this vnode and it is now in a new state
                 * so we need to go back and try again
                 */
                vnode_unlock(vp);
                goto retry;
        }
        if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
                /*
                 * we did a vnode_rele_ext that asked for
                 * us not to reenter the filesystem during
                 * the release even though VL_NEEDINACTIVE was
                 * set... we'll do it here by doing a
                 * vnode_get/vnode_put
                 *
                 * pick up an iocount so that we can call
                 * vnode_put and drive the VNOP_INACTIVE...
                 * vnode_put will either leave us off
                 * the freelist if a new ref comes in,
                 * or put us back on the end of the freelist
                 * or recycle us if we were marked for termination...
                 * so we'll just go grab a new candidate
                 */
                vp->v_iocount++;
#ifdef JOE_DEBUG
                record_vp(vp, 1);
#endif
                vnode_put_locked(vp);
                vnode_unlock(vp);
                goto retry;
        }
        OSAddAtomicLong(1, &num_reusedvnodes);

        /* Checks for anyone racing us for recycle */
        if (vp->v_type != VBAD) {
                if (vp->v_lflag & VL_DEAD)
                        panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
                vnode_lock_convert(vp);
                (void)vnode_reclaim_internal(vp, 1, 1, 0);

                /* sanity checks: the reclaimed vnode must be fully detached */
                if ((VONLIST(vp)))
                        panic("new_vnode(%p): vp on list", vp);
                if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
                    (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
                        panic("new_vnode(%p): free vnode still referenced", vp);
                if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
                        panic("new_vnode(%p): vnode seems to be on mount list", vp);
                if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
                        panic("new_vnode(%p): vnode still hooked into the name cache", vp);
        }

#ifndef __LP64__
        /* detach the unsafe_fsnode now; it is freed below after the vnode lock is dropped */
        if (vp->v_unsafefs) {
                l_unsafefs = vp->v_unsafefs;
                vp->v_unsafefs = (struct unsafe_fsnode *)NULL;
        }
#endif /* __LP64__ */

#if CONFIG_MACF
        /*
         * We should never see VL_LABELWAIT or VL_LABEL here.
         * as those operations hold a reference.
         */
        assert ((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
        assert ((vp->v_lflag & VL_LABEL) != VL_LABEL);
        if (vp->v_lflag & VL_LABELED) {
                vnode_lock_convert(vp);
                mac_vnode_label_recycle(vp);
        } else if (mac_vnode_label_init_needed(vp)) {
                vnode_lock_convert(vp);
                mac_vnode_label_init(vp);
        }

#endif /* MAC */

        /* reset per-use state; the recycled vnode is returned holding one iocount */
        vp->v_iocount = 1;
        vp->v_lflag = 0;
        vp->v_writecount = 0;
        vp->v_references = 0;
        vp->v_iterblkflags = 0;
        vp->v_flag = VSTANDARD;
        /* vbad vnodes can point to dead_mountp */
        vp->v_mount = NULL;
        vp->v_defer_reclaimlist = (vnode_t)0;

        vnode_unlock(vp);

#ifndef __LP64__
        if (l_unsafefs) {
                lck_mtx_destroy(&l_unsafefs->fsnodelock, vnode_lck_grp);
                FREE_ZONE((void *)l_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
        }
#endif /* __LP64__ */

done:
        *vpp = vp;

        return (0);
}
3772
3773 void
3774 vnode_lock(vnode_t vp)
3775 {
3776         lck_mtx_lock(&vp->v_lock);
3777 }
3778
3779 void
3780 vnode_lock_spin(vnode_t vp)
3781 {
3782         lck_mtx_lock_spin(&vp->v_lock);
3783 }
3784
3785 void
3786 vnode_unlock(vnode_t vp)
3787 {
3788         lck_mtx_unlock(&vp->v_lock);
3789 }
3790
3791
3792
/*
 * Take an iocount on a vnode.
 *
 * Wraps vnode_get_locked() with acquisition and release of the vnode
 * lock and returns its result (0 on success, ENOENT on failure).
 */
int
vnode_get(struct vnode *vp)
{
        int error;

        vnode_lock_spin(vp);
        error = vnode_get_locked(vp);
        vnode_unlock(vp);

        return error;
}
3804
3805 int
3806 vnode_get_locked(struct vnode *vp)
3807 {
3808 #if DIAGNOSTIC
3809         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
3810 #endif
3811         if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
3812                 return(ENOENT);
3813         }
3814         vp->v_iocount++;
3815 #ifdef JOE_DEBUG
3816         record_vp(vp, 1);
3817 #endif
3818         return (0);
3819 }
3820
3821 /*
3822  * vnode_getwithvid() cuts in line in front of a vnode drain (that is,
3823  * while the vnode is draining, but at no point after that) to prevent
3824  * deadlocks when getting vnodes from filesystem hashes while holding
3825  * resources that may prevent other iocounts from being released.
3826  */
3827 int
3828 vnode_getwithvid(vnode_t vp, uint32_t vid)
3829 {
3830         return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO )));
3831 }
3832
3833 /*
3834  * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode
3835  * drain; it exists for use in the VFS name cache, where we really do want to block behind
3836  * vnode drain to prevent holding off an unmount.
3837  */
3838 int
3839 vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
3840 {
3841         return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID )));
3842 }
3843
3844 int
3845 vnode_getwithref(vnode_t vp)
3846 {
3847         return(vget_internal(vp, 0, 0));
3848 }
3849
3850
3851 __private_extern__ int
3852 vnode_getalways(vnode_t vp)
3853 {
3854         return(vget_internal(vp, 0, VNODE_ALWAYS));
3855 }
3856
3857 int
3858 vnode_put(vnode_t vp)
3859 {
3860         int retval;
3861
3862         vnode_lock_spin(vp);
3863         retval = vnode_put_locked(vp);
3864         vnode_unlock(vp);
3865
3866         return(retval);
3867 }
3868
3869 int
3870 vnode_put_locked(vnode_t vp)
3871 {
3872         vfs_context_t ctx = vfs_context_current();      /* hoist outside loop */
3873
3874 #if DIAGNOSTIC
3875         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
3876 #endif
3877 retry:
3878         if (vp->v_iocount < 1)
3879                 panic("vnode_put(%p): iocount < 1", vp);
3880
3881         if ((vp->v_usecount > 0) || (vp->v_iocount > 1))  {
3882                 vnode_dropiocount(vp);
3883                 return(0);
3884         }
3885         if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
3886
3887                 vp->v_lflag &= ~VL_NEEDINACTIVE;
3888                 vnode_unlock(vp);
3889
3890                 VNOP_INACTIVE(vp, ctx);
3891
3892                 vnode_lock_spin(vp);
3893                 /*
3894                  * because we had to drop the vnode lock before calling
3895                  * VNOP_INACTIVE, the state of this vnode may have changed...
3896                  * we may pick up both VL_MARTERM and either
3897                  * an iocount or a usecount while in the VNOP_INACTIVE call
3898                  * we don't want to call vnode_reclaim_internal on a vnode
3899                  * that has active references on it... so loop back around
3900                  * and reevaluate the state
3901                  */
3902                 goto retry;
3903         }
3904         vp->v_lflag &= ~VL_NEEDINACTIVE;
3905
3906         if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
3907                 vnode_lock_convert(vp);
3908                 vnode_reclaim_internal(vp, 1, 1, 0);
3909         }
3910         vnode_dropiocount(vp);
3911         vnode_list_add(vp);
3912
3913         return(0);
3914 }
3915
3916 /* is vnode_t in use by others?  */
3917 int
3918 vnode_isinuse(vnode_t vp, int refcnt)
3919 {
3920         return(vnode_isinuse_locked(vp, refcnt, 0));
3921 }
3922
3923
3924 static int
3925 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
3926 {
3927         int retval = 0;
3928
3929         if (!locked)
3930                 vnode_lock_spin(vp);
3931         if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) >  refcnt)) {
3932                 retval = 1;
3933                 goto out;
3934         }
3935         if (vp->v_type == VREG)  {
3936                 retval = ubc_isinuse_locked(vp, refcnt, 1);
3937         }
3938
3939 out:
3940         if (!locked)
3941                 vnode_unlock(vp);
3942         return(retval);
3943 }
3944
3945
3946 /* resume vnode_t */
3947 errno_t
3948 vnode_resume(vnode_t vp)
3949 {
3950         if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
3951
3952                 vnode_lock_spin(vp);
3953                 vp->v_lflag &= ~VL_SUSPENDED;
3954                 vp->v_owner = NULL;
3955                 vnode_unlock(vp);
3956
3957                 wakeup(&vp->v_iocount);
3958         }
3959         return(0);
3960 }
3961
3962 /* suspend vnode_t
3963  * Please do not use on more than one vnode at a time as it may
3964  * cause deadlocks.
3965  * xxx should we explicity prevent this from happening?
3966  */
3967
3968 errno_t
3969 vnode_suspend(vnode_t vp)
3970 {
3971         if (vp->v_lflag & VL_SUSPENDED) {
3972                 return(EBUSY);
3973         }
3974
3975         vnode_lock_spin(vp);
3976
3977         /*
3978          * xxx is this sufficient to check if a vnode_drain is
3979          * progress?
3980          */
3981
3982         if (vp->v_owner == NULL) {
3983                 vp->v_lflag |= VL_SUSPENDED;
3984                 vp->v_owner = current_thread();
3985         }
3986         vnode_unlock(vp);
3987
3988         return(0);
3989 }
3990
3991
3992
3993 static errno_t
3994 vnode_drain(vnode_t vp)
3995 {
3996
3997         if (vp->v_lflag & VL_DRAIN) {
3998                 panic("vnode_drain: recursive drain");
3999                 return(ENOENT);
4000         }
4001         vp->v_lflag |= VL_DRAIN;
4002         vp->v_owner = current_thread();
4003
4004         while (vp->v_iocount > 1)
4005                 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
4006
4007         vp->v_lflag &= ~VL_DRAIN;
4008
4009         return(0);
4010 }
4011
4012
/*
 * if the number of recent references via vnode_getwithvid or vnode_getwithref
 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
 * the LRU list if it's currently on it... once the iocount and usecount both drop
 * to 0, it will get put back on the end of the list, effectively making it younger.
 * this allows us to keep actively referenced vnodes in the list without having
 * to constantly remove and add to the list each time a vnode w/o a usecount is
 * referenced, which costs us taking and dropping a global lock twice.
 */
#define UNAGE_THRESHHOLD        25
4023
4024 errno_t
4025 vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
4026 {
4027         int nodead = vflags & VNODE_NODEAD;
4028         int nosusp = vflags & VNODE_NOSUSPEND;
4029         int always = vflags & VNODE_ALWAYS;
4030         int beatdrain = vflags & VNODE_DRAINO;
4031
4032         for (;;) {
4033                 /*
4034                  * if it is a dead vnode with deadfs
4035                  */
4036                 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
4037                         return(ENOENT);
4038                 }
4039                 /*
4040                  * will return VL_DEAD ones
4041                  */
4042                 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
4043                         break;
4044                 }
4045                 /*
4046                  * if suspended vnodes are to be failed
4047                  */
4048                 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
4049                         return(ENOENT);
4050                 }
4051                 /*
4052                  * if you are the owner of drain/suspend/termination , can acquire iocount
4053                  * check for VL_TERMINATE; it does not set owner
4054                  */
4055                 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
4056                     (vp->v_owner == current_thread())) {
4057                         break;
4058                 }
4059
4060                 if (always != 0)
4061                         break;
4062
4063                 /*
4064                  * In some situations, we want to get an iocount
4065                  * even if the vnode is draining to prevent deadlock,
4066                  * e.g. if we're in the filesystem, potentially holding
4067                  * resources that could prevent other iocounts from
4068                  * being released.
4069                  */
4070                 if (beatdrain && (vp->v_lflag & VL_DRAIN)) {
4071                         break;
4072                 }
4073
4074                 vnode_lock_convert(vp);
4075
4076                 if (vp->v_lflag & VL_TERMINATE) {
4077                         vp->v_lflag |= VL_TERMWANT;
4078
4079                         msleep(&vp->v_lflag,   &vp->v_lock, PVFS, "vnode getiocount", NULL);
4080                 } else
4081                         msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
4082         }
4083         if (((vflags & VNODE_WITHID) != 0) && vid != vp->v_id) {
4084                 return(ENOENT);
4085         }
4086         if (++vp->v_references >= UNAGE_THRESHHOLD) {
4087                 vp->v_references = 0;
4088                 vnode_list_remove(vp);
4089         }
4090         vp->v_iocount++;
4091 #ifdef JOE_DEBUG
4092         record_vp(vp, 1);
4093 #endif
4094         return(0);
4095 }
4096
4097 static void
4098 vnode_dropiocount (vnode_t vp)
4099 {
4100         if (vp->v_iocount < 1)
4101                 panic("vnode_dropiocount(%p): v_iocount < 1", vp);
4102
4103         vp->v_iocount--;
4104 #ifdef JOE_DEBUG
4105         record_vp(vp, -1);
4106 #endif
4107         if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1))
4108                 wakeup(&vp->v_iocount);
4109 }
4110
4111
/*
 * Reclaim a vnode: calls vnode_reclaim_internal() with locked, reuse
 * and flags all zero.
 */
void
vnode_reclaim(struct vnode * vp)
{
        const int locked = 0;
        const int reuse = 0;
        const int flags = 0;

        vnode_reclaim_internal(vp, locked, reuse, flags);
}
4117
/*
 * vnode_reclaim_internal: terminate a vnode and strip it of its identity.
 *
 *   locked  non-zero if the caller already holds the vnode lock; when zero
 *           the lock is taken here and released before returning
 *   reuse   non-zero if the caller will dispose of the vnode itself; when
 *           zero the vnode is handed to vnode_list_add() at the end
 *   flags   passed through to vgone()
 *
 * The vnode is marked VL_TERMINATE, drained of iocounts, cleaned via
 * vgone() (unless it is already VBAD), removed from its list and given a
 * new v_id, and finally turned into a VBAD vnode.  Panics if a reclaim is
 * already in progress or if the vnode turns out not to be fully cleaned.
 */
__private_extern__
void
vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
{
        int isfifo = 0;

        if (!locked)
                vnode_lock(vp);

        if (vp->v_lflag & VL_TERMINATE) {
                panic("vnode reclaim in progress");
        }
        vp->v_lflag |= VL_TERMINATE;

        vn_clearunionwait(vp, 1);

        /* wait until ours is the only iocount left */
        vnode_drain(vp);

        /* remember the type now; it is overwritten with VBAD below */
        isfifo = (vp->v_type == VFIFO);

        if (vp->v_type != VBAD)
                vgone(vp, flags);               /* clean and reclaim the vnode */

        /*
         * give the vnode a new identity so that vnode_getwithvid will fail
         * on any stale cache accesses...
         * grab the list_lock so that if we're in "new_vnode"
         * behind the list_lock trying to steal this vnode, the v_id is stable...
         * once new_vnode drops the list_lock, it will block trying to take
         * the vnode lock until we release it... at that point it will evaluate
         * whether the v_id has changed
         * also need to make sure that the vnode isn't on a list where "new_vnode"
         * can find it after the v_id has been bumped until we are completely done
         * with the vnode (i.e. putting it back on a list has to be the very last
         * thing we do to this vnode... many of the callers of vnode_reclaim_internal
         * are holding an io_count on the vnode... they need to drop the io_count
         * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
         * they are completely done with the vnode
         */
        vnode_list_lock();

        vnode_list_remove_locked(vp);
        vp->v_id++;

        vnode_list_unlock();

        if (isfifo) {
                struct fifoinfo * fip;

                /* detach and free the fifo state */
                fip = vp->v_fifoinfo;
                vp->v_fifoinfo = NULL;
                FREE(fip, M_TEMP);
        }
        vp->v_type = VBAD;

        /* sanity checks: everything below must already have been torn down */
        if (vp->v_data)
                panic("vnode_reclaim_internal: cleaned vnode isn't");
        if (vp->v_numoutput)
                panic("vnode_reclaim_internal: clean vnode has pending I/O's");
        if (UBCINFOEXISTS(vp))
                panic("vnode_reclaim_internal: ubcinfo not cleaned");
        if (vp->v_parent)
                panic("vnode_reclaim_internal: vparent not removed");
        if (vp->v_name)
                panic("vnode_reclaim_internal: vname not removed");

        vp->v_socket = NULL;

        /* termination is complete; give up ownership */
        vp->v_lflag &= ~VL_TERMINATE;
        vp->v_owner = NULL;

        KNOTE(&vp->v_knotes, NOTE_REVOKE);

        /* Make sure that when we reuse the vnode, no knotes left over */
        klist_init(&vp->v_knotes);

        /* wake anyone who set VL_TERMWANT and is sleeping on v_lflag */
        if (vp->v_lflag & VL_TERMWANT) {
                vp->v_lflag &= ~VL_TERMWANT;
                wakeup(&vp->v_lflag);
        }
        if (!reuse) {
                /*
                 * make sure we get on the
                 * dead list if appropriate
                 */
                vnode_list_add(vp);
        }
        if (!locked)
                vnode_unlock(vp);
}
4208
4209 /* USAGE:
4210  * The following api creates a vnode and associates all the parameter specified in vnode_fsparam
4211  * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
4212  * is obsoleted by this.
4213  */
4214 int
4215 vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
4216 {
4217         int error;
4218         int insert = 1;
4219         vnode_t vp;
4220         vnode_t nvp;
4221         vnode_t dvp;
4222         struct  uthread *ut;
4223         struct componentname *cnp;
4224         struct vnode_fsparam *param = (struct vnode_fsparam *)data;
4225 #if CONFIG_TRIGGERS
4226         struct vnode_trigger_param *tinfo = NULL;
4227 #endif
4228         if (param == NULL)
4229                 return (EINVAL);
4230
4231 #if CONFIG_TRIGGERS
4232         if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
4233                 tinfo = (struct vnode_trigger_param *)data;
4234
4235                 /* Validate trigger vnode input */
4236                 if ((param->vnfs_vtype != VDIR) ||
4237                     (tinfo->vnt_resolve_func == NULL) ||
4238                     (tinfo->vnt_flags & ~VNT_VALID_MASK)) {
4239                         return (EINVAL);
4240                 }
4241                 /* Fall through a normal create (params will be the same) */
4242                 flavor = VNCREATE_FLAVOR;
4243                 size = VCREATESIZE;
4244         }
4245 #endif
4246         if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE))
4247                 return (EINVAL);
4248
4249         if ( (error = new_vnode(&vp)) )
4250                 return(error);
4251
4252         dvp = param->vnfs_dvp;
4253         cnp = param->vnfs_cnp;
4254
4255         vp->v_op = param->vnfs_vops;
4256         vp->v_type = param->vnfs_vtype;
4257         vp->v_data = param->vnfs_fsnode;
4258
4259         if (param->vnfs_markroot)
4260                 vp->v_flag |= VROOT;
4261         if (param->vnfs_marksystem)
4262                 vp->v_flag |= VSYSTEM;
4263         if (vp->v_type == VREG) {
4264                 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
4265                 if (error) {
4266 #ifdef JOE_DEBUG
4267                         record_vp(vp, 1);
4268 #endif
4269                         vp->v_mount = NULL;
4270                         vp->v_op = dead_vnodeop_p;
4271                         vp->v_tag = VT_NON;
4272                         vp->v_data = NULL;
4273                         vp->v_type = VBAD;
4274                         vp->v_lflag |= VL_DEAD;
4275
4276                         vnode_put(vp);
4277                         return(error);
4278                 }
4279         }
4280 #ifdef JOE_DEBUG
4281         record_vp(vp, 1);
4282 #endif
4283
4284 #if CONFIG_TRIGGERS
4285         /*
4286          * For trigger vnodes, attach trigger info to vnode
4287          */
4288         if ((vp->v_type == VDIR) && (tinfo != NULL)) {
4289                 /*
4290                  * Note: has a side effect of incrementing trigger count on the
4291                  * mount if successful, which we would need to undo on a
4292                  * subsequent failure.
4293                  */
4294 #ifdef JOE_DEBUG
4295                 record_vp(vp, -1);
4296 #endif
4297                 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
4298                 if (error) {
4299                         printf("vnode_create: vnode_resolver_create() err %d\n", error);
4300                         vp->v_mount = NULL;
4301                         vp->v_op = dead_vnodeop_p;
4302                         vp->v_tag = VT_NON;
4303                         vp->v_data = NULL;
4304                         vp->v_type = VBAD;
4305                         vp->v_lflag |= VL_DEAD;
4306 #ifdef JOE_DEBUG
4307                         record_vp(vp, 1);
4308 #endif
4309                         vnode_put(vp);
4310                         return (error);
4311                 }
4312         }
4313 #endif
4314         if (vp->v_type == VCHR || vp->v_type == VBLK) {
4315
4316                 vp->v_tag = VT_DEVFS;           /* callers will reset if needed (bdevvp) */
4317
4318                 if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
4319                         /*
4320                          * if checkalias returns a vnode, it will be locked
4321                          *
4322                          * first get rid of the unneeded vnode we acquired
4323                          */
4324                         vp->v_data = NULL;
4325                         vp->v_op = spec_vnodeop_p;
4326                         vp->v_type = VBAD;
4327                         vp->v_lflag = VL_DEAD;
4328                         vp->v_data = NULL;
4329                         vp->v_tag = VT_NON;
4330                         vnode_put(vp);
4331
4332                         /*
4333                          * switch to aliased vnode and finish
4334                          * preparing it
4335                          */
4336                         vp = nvp;
4337
4338                         vclean(vp, 0);
4339                         vp->v_op = param->vnfs_vops;
4340                         vp->v_type = param->vnfs_vtype;
4341                         vp->v_data = param->vnfs_fsnode;
4342                         vp->v_lflag = 0;
4343                         vp->v_mount = NULL;
4344                         insmntque(vp, param->vnfs_mp);
4345                         insert = 0;
4346                         vnode_unlock(vp);
4347                 }
4348         }
4349
4350         if (vp->v_type == VFIFO) {
4351                 struct fifoinfo *fip;
4352
4353                 MALLOC(fip, struct fifoinfo *,
4354                         sizeof(*fip), M_TEMP, M_WAITOK);
4355                 bzero(fip, sizeof(struct fifoinfo ));
4356                 vp->v_fifoinfo = fip;
4357         }
4358         /* The file systems must pass the address of the location where
4359          * they store the vnode pointer. When we add the vnode into the mount
4360          * list and name cache they become discoverable. So the file system node
4361          * must have the connection to vnode setup by then
4362          */
4363         *vpp = vp;
4364
4365         /* Add fs named reference. */
4366         if (param->vnfs_flags & VNFS_ADDFSREF) {
4367                 vp->v_lflag |= VNAMED_FSHASH;
4368         }
4369         if (param->vnfs_mp) {
4370                         if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
4371                                 vp->v_flag |= VLOCKLOCAL;
4372                 if (insert) {
4373                         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
4374                                 panic("insmntque: vp on the free list\n");
4375
4376                         /*
4377                          * enter in mount vnode list
4378                          */
4379                         insmntque(vp, param->vnfs_mp);
4380                 }
4381 #ifndef __LP64__
4382                 if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) {
4383                         MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
4384                                     sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
4385                         vp->v_unsafefs->fsnode_count = 0;
4386                         vp->v_unsafefs->fsnodeowner  = (void *)NULL;
4387                         lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
4388                 }
4389 #endif /* __LP64__ */
4390         }
4391         if (dvp && vnode_ref(dvp) == 0) {
4392                 vp->v_parent = dvp;
4393         }
4394         if (cnp) {
4395                 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
4396                         /*
4397                          * enter into name cache
4398                          * we've got the info to enter it into the name cache now
4399                          * cache_enter_create will pick up an extra reference on
4400                          * the name entered into the string cache
4401                          */
4402                         vp->v_name = cache_enter_create(dvp, vp, cnp);
4403                 } else
4404                         vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
4405
4406                 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED)
4407                         vp->v_flag |= VISUNION;
4408         }
4409         if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
4410                 /*
4411                  * this vnode is being created as cacheable in the name cache
4412                  * this allows us to re-enter it in the cache
4413                  */
4414                 vp->v_flag |= VNCACHEABLE;
4415         }
4416         ut = get_bsdthread_info(current_thread());
4417
4418         if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
4419             (ut->uu_flag & UT_RAGE_VNODES)) {
4420                 /*
4421                  * process has indicated that it wants any
4422                  * vnodes created on its behalf to be rapidly
4423                  * aged to reduce the impact on the cached set
4424                  * of vnodes
4425                  */
4426                 vp->v_flag |= VRAGE;
4427         }
4428         return (0);
4429 }
4430
4431 int
4432 vnode_addfsref(vnode_t vp)
4433 {
4434         vnode_lock_spin(vp);
4435         if (vp->v_lflag & VNAMED_FSHASH)
4436                 panic("add_fsref: vp already has named reference");
4437         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
4438                 panic("addfsref: vp on the free list\n");
4439         vp->v_lflag |= VNAMED_FSHASH;
4440         vnode_unlock(vp);
4441         return(0);
4442
4443 }
4444 int
4445 vnode_removefsref(vnode_t vp)
4446 {
4447         vnode_lock_spin(vp);
4448         if ((vp->v_lflag & VNAMED_FSHASH) == 0)
4449                 panic("remove_fsref: no named reference");
4450         vp->v_lflag &= ~VNAMED_FSHASH;
4451         vnode_unlock(vp);
4452         return(0);
4453
4454 }
4455
4456
4457 int
4458 vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
4459 {
4460         mount_t mp;
4461         int ret = 0;
4462         fsid_t * fsid_list;
4463         int count, actualcount,  i;
4464         void * allocmem;
4465         int indx_start, indx_stop, indx_incr;
4466
4467         count = mount_getvfscnt();
4468         count += 10;
4469
4470         fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
4471         allocmem = (void *)fsid_list;
4472
4473         actualcount = mount_fillfsids(fsid_list, count);
4474
4475         /*
4476          * Establish the iteration direction
4477          * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
4478          */
4479         if (flags & VFS_ITERATE_TAIL_FIRST) {
4480                 indx_start = actualcount - 1;
4481                 indx_stop = -1;
4482                 indx_incr = -1;
4483         } else /* Head first by default */ {
4484                 indx_start = 0;
4485                 indx_stop = actualcount;
4486                 indx_incr = 1;
4487         }
4488
4489         for (i=indx_start; i != indx_stop; i += indx_incr) {
4490
4491                 /* obtain the mount point with iteration reference */
4492                 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
4493
4494                 if(mp == (struct mount *)0)
4495                         continue;
4496                 mount_lock(mp);
4497                 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
4498                         mount_unlock(mp);
4499                         mount_iterdrop(mp);
4500                         continue;
4501
4502                 }
4503                 mount_unlock(mp);
4504
4505                 /* iterate over all the vnodes */
4506                 ret = callout(mp, arg);
4507
4508                 mount_iterdrop(mp);
4509
4510                 switch (ret) {
4511                 case VFS_RETURNED:
4512                 case VFS_RETURNED_DONE:
4513                         if (ret == VFS_RETURNED_DONE) {
4514                                 ret = 0;
4515                                 goto out;
4516                         }
4517                         break;
4518
4519                 case VFS_CLAIMED_DONE:
4520                         ret = 0;
4521                         goto out;
4522                 case VFS_CLAIMED:
4523                 default:
4524                         break;
4525                 }
4526                 ret = 0;
4527         }
4528
4529 out:
4530         kfree(allocmem, (count * sizeof(fsid_t)));
4531         return (ret);
4532 }
4533
4534 /*
4535  * Update the vfsstatfs structure in the mountpoint.
4536  * MAC: Parameter eventtype added, indicating whether the event that
4537  * triggered this update came from user space, via a system call
4538  * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
4539  */
4540 int
4541 vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
4542 {
4543         struct vfs_attr va;
4544         int             error;
4545
4546         /*
4547          * Request the attributes we want to propagate into
4548          * the per-mount vfsstat structure.
4549          */
4550         VFSATTR_INIT(&va);
4551         VFSATTR_WANTED(&va, f_iosize);
4552         VFSATTR_WANTED(&va, f_blocks);
4553         VFSATTR_WANTED(&va, f_bfree);
4554         VFSATTR_WANTED(&va, f_bavail);
4555         VFSATTR_WANTED(&va, f_bused);
4556         VFSATTR_WANTED(&va, f_files);
4557         VFSATTR_WANTED(&va, f_ffree);
4558         VFSATTR_WANTED(&va, f_bsize);
4559         VFSATTR_WANTED(&va, f_fssubtype);
4560 #if CONFIG_MACF
4561         if (eventtype == VFS_USER_EVENT) {
4562                 error = mac_mount_check_getattr(ctx, mp, &va);
4563                 if (error != 0)
4564                         return (error);
4565         }
4566 #endif
4567
4568         if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
4569                 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
4570                 return(error);
4571         }
4572
4573         /*
4574          * Unpack into the per-mount structure.
4575          *
4576          * We only overwrite these fields, which are likely to change:
4577          *      f_blocks
4578          *      f_bfree
4579          *      f_bavail
4580          *      f_bused
4581          *      f_files
4582          *      f_ffree
4583          *
4584          * And these which are not, but which the FS has no other way
4585          * of providing to us:
4586          *      f_bsize
4587          *      f_iosize
4588          *      f_fssubtype
4589          *
4590          */
4591         if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
4592                 /* 4822056 - protect against malformed server mount */
4593                 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
4594         } else {
4595                 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
4596         }
4597         if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
4598                 mp->mnt_vfsstat.f_iosize = va.f_iosize;
4599         } else {
4600                 mp->mnt_vfsstat.f_iosize = 1024 * 1024;         /* 1MB sensible I/O size */
4601         }
4602         if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
4603                 mp->mnt_vfsstat.f_blocks = va.f_blocks;
4604         if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
4605                 mp->mnt_vfsstat.f_bfree = va.f_bfree;
4606         if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
4607                 mp->mnt_vfsstat.f_bavail = va.f_bavail;
4608         if (VFSATTR_IS_SUPPORTED(&va, f_bused))
4609                 mp->mnt_vfsstat.f_bused = va.f_bused;
4610         if (VFSATTR_IS_SUPPORTED(&va, f_files))
4611                 mp->mnt_vfsstat.f_files = va.f_files;
4612         if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
4613                 mp->mnt_vfsstat.f_ffree = va.f_ffree;
4614
4615         /* this is unlikely to change, but has to be queried for */
4616         if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
4617                 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
4618
4619         return(0);
4620 }
4621
4622 int
4623 mount_list_add(mount_t mp)
4624 {
4625         int res;
4626
4627         mount_list_lock();
4628         if (system_inshutdown != 0) {
4629                 res = -1;
4630         } else {
4631                 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
4632                 nummounts++;
4633                 res = 0;
4634         }
4635         mount_list_unlock();
4636
4637         return res;
4638 }
4639
4640 void
4641 mount_list_remove(mount_t mp)
4642 {
4643         mount_list_lock();
4644         TAILQ_REMOVE(&mountlist, mp, mnt_list);
4645         nummounts--;
4646         mp->mnt_list.tqe_next = NULL;
4647         mp->mnt_list.tqe_prev = NULL;
4648         mount_list_unlock();
4649 }
4650
4651 mount_t
4652 mount_lookupby_volfsid(int volfs_id, int withref)
4653 {
4654         mount_t cur_mount = (mount_t)0;
4655         mount_t mp;
4656
4657         mount_list_lock();
4658         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4659                 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
4660                     (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
4661                     (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
4662                         cur_mount = mp;
4663                         if (withref) {
4664                                 if (mount_iterref(cur_mount, 1))  {
4665                                         cur_mount = (mount_t)0;
4666                                         mount_list_unlock();
4667                                         goto out;
4668                                 }
4669                         }
4670                         break;
4671                 }
4672         }
4673         mount_list_unlock();
4674         if (withref && (cur_mount != (mount_t)0)) {
4675                 mp = cur_mount;
4676                 if (vfs_busy(mp, LK_NOWAIT) != 0) {
4677                         cur_mount = (mount_t)0;
4678                 }
4679                 mount_iterdrop(mp);
4680         }
4681 out:
4682         return(cur_mount);
4683 }
4684
4685 mount_t
4686 mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
4687 {
4688         mount_t retmp = (mount_t)0;
4689         mount_t mp;
4690
4691         if (!locked)
4692                 mount_list_lock();
4693         TAILQ_FOREACH(mp, &mountlist, mnt_list)
4694                 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
4695                     mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
4696                         retmp = mp;
4697                         if (withref) {
4698                                 if (mount_iterref(retmp, 1))
4699                                         retmp = (mount_t)0;
4700                         }
4701                         goto out;
4702                 }
4703 out:
4704         if (!locked)
4705                 mount_list_unlock();
4706         return (retmp);
4707 }
4708
4709 errno_t
4710 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
4711 {
4712         struct nameidata nd;
4713         int error;
4714         u_int32_t ndflags = 0;
4715
4716         if (ctx == NULL) {              /* XXX technically an error */
4717                 ctx = vfs_context_current();
4718         }
4719
4720         if (flags & VNODE_LOOKUP_NOFOLLOW)
4721                 ndflags = NOFOLLOW;
4722         else
4723                 ndflags = FOLLOW;
4724
4725         if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
4726                 ndflags |= NOCROSSMOUNT;
4727         if (flags & VNODE_LOOKUP_DOWHITEOUT)
4728                 ndflags |= DOWHITEOUT;
4729
4730         /* XXX AUDITVNPATH1 needed ? */
4731         NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
4732                CAST_USER_ADDR_T(path), ctx);
4733
4734         if ((error = namei(&nd)))
4735                 return (error);
4736         *vpp = nd.ni_vp;
4737         nameidone(&nd);
4738
4739         return (0);
4740 }
4741
4742 errno_t
4743 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
4744 {
4745         struct nameidata nd;
4746         int error;
4747         u_int32_t ndflags = 0;
4748         int lflags = flags;
4749
4750         if (ctx == NULL) {              /* XXX technically an error */
4751                 ctx = vfs_context_current();
4752         }
4753
4754         if (fmode & O_NOFOLLOW)
4755                 lflags |= VNODE_LOOKUP_NOFOLLOW;
4756
4757         if (lflags & VNODE_LOOKUP_NOFOLLOW)
4758                 ndflags = NOFOLLOW;
4759         else
4760                 ndflags = FOLLOW;
4761
4762         if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
4763                 ndflags |= NOCROSSMOUNT;
4764         if (lflags & VNODE_LOOKUP_DOWHITEOUT)
4765                 ndflags |= DOWHITEOUT;
4766
4767         /* XXX AUDITVNPATH1 needed ? */
4768         NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
4769                CAST_USER_ADDR_T(path), ctx);
4770
4771         if ((error = vn_open(&nd, fmode, cmode)))
4772                 *vpp = NULL;
4773         else
4774                 *vpp = nd.ni_vp;
4775
4776         return (error);
4777 }
4778
4779 errno_t
4780 vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
4781 {
4782         int error;
4783
4784         if (ctx == NULL) {
4785                 ctx = vfs_context_current();
4786         }
4787
4788         error = vn_close(vp, flags, ctx);
4789         vnode_put(vp);
4790         return (error);
4791 }
4792
4793 /*
4794  * Returns:     0                       Success
4795  *      vnode_getattr:???
4796  */
4797 errno_t
4798 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
4799 {
4800         struct vnode_attr       va;
4801         int                     error;
4802
4803         VATTR_INIT(&va);
4804         VATTR_WANTED(&va, va_data_size);
4805         error = vnode_getattr(vp, &va, ctx);
4806         if (!error)
4807                 *sizep = va.va_data_size;
4808         return(error);
4809 }
4810
4811 errno_t
4812 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
4813 {
4814         struct vnode_attr       va;
4815
4816         VATTR_INIT(&va);
4817         VATTR_SET(&va, va_data_size, size);
4818         va.va_vaflags = ioflag & 0xffff;
4819         return(vnode_setattr(vp, &va, ctx));
4820 }
4821
4822 static int
4823 vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
4824 {
4825         /* Only use compound VNOP for compound operation */
4826         if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
4827                 *vpp = NULLVP;
4828                 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, VNOP_COMPOUND_OPEN_DO_CREATE, fmode, statusp, vap, ctx);
4829         } else {
4830                 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
4831         }
4832 }
4833
4834 /*
4835  * Create a filesystem object of arbitrary type with arbitrary attributes in
4836  * the spevied directory with the specified name.
4837  *
4838  * Parameters:  dvp                     Pointer to the vnode of the directory
4839  *                                      in which to create the object.
4840  *              vpp                     Pointer to the area into which to
4841  *                                      return the vnode of the created object.
4842  *              cnp                     Component name pointer from the namei
4843  *                                      data structure, containing the name to
4844  *                                      use for the create object.
4845  *              vap                     Pointer to the vnode_attr structure
4846  *                                      describing the object to be created,
4847  *                                      including the type of object.
4848  *              flags                   VN_* flags controlling ACL inheritance
4849  *                                      and whether or not authorization is to
4850  *                                      be required for the operation.
4851  *
4852  * Returns:     0                       Success
4853  *              !0                      errno value
4854  *
4855  * Implicit:    *vpp                    Contains the vnode of the object that
4856  *                                      was created, if successful.
4857  *              *cnp                    May be modified by the underlying VFS.
4858  *              *vap                    May be modified by the underlying VFS.
4859  *                                      modified by either ACL inheritance or
4860  *
4861  *
4862  *                                      be modified, even if the operation is
4863  *
4864  *
4865  * Notes:       The kauth_filesec_t in 'vap', if any, is in host byte order.
4866  *
4867  *              Modification of '*cnp' and '*vap' by the underlying VFS is
4868  *              strongly discouraged.
4869  *
4870  * XXX:         This function is a 'vn_*' function; it belongs in vfs_vnops.c
4871  *
4872  * XXX:         We should enummerate the possible errno values here, and where
4873  *              in the code they originated.
4874  */
4875 errno_t
4876 vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
4877 {
4878         errno_t error, old_error;
4879         vnode_t vp = (vnode_t)0;
4880         boolean_t batched;
4881         struct componentname *cnp;
4882         uint32_t defaulted;
4883
4884         cnp = &ndp->ni_cnd;
4885         error = 0;
4886         batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
4887
4888         KAUTH_DEBUG("%p    CREATE - '%s'", dvp, cnp->cn_nameptr);
4889
4890         if (flags & VN_CREATE_NOINHERIT)
4891                 vap->va_vaflags |= VA_NOINHERIT;
4892         if (flags & VN_CREATE_NOAUTH)
4893                 vap->va_vaflags |= VA_NOAUTH;
4894         /*
4895          * Handle ACL inheritance, initialize vap.
4896          */
4897         error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
4898         if (error) {
4899                 return error;
4900         }
4901
4902         if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
4903                 panic("Open parameters, but not a regular file.");
4904         }
4905         if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
4906                 panic("Mode for open, but not trying to open...");
4907         }
4908
4909         /*
4910          * Create the requested node.
4911          */
4912         switch(vap->va_type) {
4913         case VREG:
4914                 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
4915                 break;
4916         case VDIR:
4917                 error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
4918                 break;
4919         case VSOCK:
4920         case VFIFO:
4921         case VBLK:
4922         case VCHR:
4923                 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
4924                 break;
4925         default:
4926                 panic("vnode_create: unknown vtype %d", vap->va_type);
4927         }
4928         if (error != 0) {
4929                 KAUTH_DEBUG("%p    CREATE - error %d returned by filesystem", dvp, error);
4930                 goto out;
4931         }
4932
4933         vp = *vpp;
4934         old_error = error;
4935
4936 #if CONFIG_MACF
4937         if (!(flags & VN_CREATE_NOLABEL)) {
4938                 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
4939                 if (error)
4940                         goto error;
4941         }
4942 #endif
4943
4944         /*
4945          * If some of the requested attributes weren't handled by the VNOP,
4946          * use our fallback code.
4947          */
4948         if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
4949                 KAUTH_DEBUG("     CREATE - doing fallback with ACL %p", vap->va_acl);
4950                 error = vnode_setattr_fallback(*vpp, vap, ctx);
4951         }
4952 #if CONFIG_MACF
4953 error:
4954 #endif
4955         if ((error != 0) && (vp != (vnode_t)0)) {
4956
4957                 /* If we've done a compound open, close */
4958                 if (batched && (old_error == 0) && (vap->va_type == VREG)) {
4959                         VNOP_CLOSE(vp, fmode, ctx);
4960                 }
4961
4962                 /* Need to provide notifications if a create succeeded */
4963                 if (!batched) {
4964                         *vpp = (vnode_t) 0;
4965                         vnode_put(vp);
4966                 }
4967         }
4968
4969 out:
4970         vn_attribute_cleanup(vap, defaulted);
4971
4972         return(error);
4973 }
4974
4975 static kauth_scope_t    vnode_scope;
4976 static int      vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
4977     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
4978 static int      vnode_authorize_callback_int(__unused kauth_cred_t credential, __unused void *idata, kauth_action_t action,
4979     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
4980
4981 typedef struct _vnode_authorize_context {
4982         vnode_t         vp;
4983         struct vnode_attr *vap;
4984         vnode_t         dvp;
4985         struct vnode_attr *dvap;
4986         vfs_context_t   ctx;
4987         int             flags;
4988         int             flags_valid;
4989 #define _VAC_IS_OWNER           (1<<0)
4990 #define _VAC_IN_GROUP           (1<<1)
4991 #define _VAC_IS_DIR_OWNER       (1<<2)
4992 #define _VAC_IN_DIR_GROUP       (1<<3)
4993 } *vauth_ctx;
4994
4995 void
4996 vnode_authorize_init(void)
4997 {
4998         vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
4999 }
5000
5001 #define VATTR_PREPARE_DEFAULTED_UID             0x1
5002 #define VATTR_PREPARE_DEFAULTED_GID             0x2
5003 #define VATTR_PREPARE_DEFAULTED_MODE            0x4
5004
5005 int
5006 vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
5007 {
5008         kauth_acl_t nacl = NULL, oacl = NULL;
5009         int error;
5010
5011         /*
5012          * Handle ACL inheritance.
5013          */
5014         if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
5015                 /* save the original filesec */
5016                 if (VATTR_IS_ACTIVE(vap, va_acl)) {
5017                         oacl = vap->va_acl;
5018                 }
5019
5020                 vap->va_acl = NULL;
5021                 if ((error = kauth_acl_inherit(dvp,
5022                          oacl,
5023                          &nacl,
5024                          vap->va_type == VDIR,
5025                          ctx)) != 0) {
5026                         KAUTH_DEBUG("%p    CREATE - error %d processing inheritance", dvp, error);
5027                         return(error);
5028                 }
5029
5030                 /*
5031                  * If the generated ACL is NULL, then we can save ourselves some effort
5032                  * by clearing the active bit.
5033                  */
5034                 if (nacl == NULL) {
5035                         VATTR_CLEAR_ACTIVE(vap, va_acl);
5036                 } else {
5037                         vap->va_base_acl = oacl;
5038                         VATTR_SET(vap, va_acl, nacl);
5039                 }
5040         }
5041
5042         error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
5043         if (error) {
5044                 vn_attribute_cleanup(vap, *defaulted_fieldsp);
5045         }
5046
5047         return error;
5048 }
5049
5050 void
5051 vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
5052 {
5053         /*
5054          * If the caller supplied a filesec in vap, it has been replaced
5055          * now by the post-inheritance copy.  We need to put the original back
5056          * and free the inherited product.
5057          */
5058         kauth_acl_t nacl, oacl;
5059
5060         if (VATTR_IS_ACTIVE(vap, va_acl)) {
5061                 nacl = vap->va_acl;
5062                 oacl = vap->va_base_acl;
5063
5064                 if (oacl)  {
5065                         VATTR_SET(vap, va_acl, oacl);
5066                         vap->va_base_acl = NULL;
5067                 } else {
5068                         VATTR_CLEAR_ACTIVE(vap, va_acl);
5069                 }
5070
5071                 if (nacl != NULL) {
5072                         kauth_acl_free(nacl);
5073                 }
5074         }
5075
5076         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
5077                 VATTR_CLEAR_ACTIVE(vap, va_mode);
5078         }
5079         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
5080                 VATTR_CLEAR_ACTIVE(vap, va_gid);
5081         }
5082         if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
5083                 VATTR_CLEAR_ACTIVE(vap, va_uid);
5084         }
5085
5086         return;
5087 }
5088
5089 int
5090 vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved)
5091 {
5092         int error = 0;
5093
5094         /*
5095          * Normally, unlinking of directories is not supported.
5096          * However, some file systems may have limited support.
5097          */
5098         if ((vp->v_type == VDIR) &&
5099                         !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
5100                 return (EPERM); /* POSIX */
5101         }
5102
5103         /* authorize the delete operation */
5104 #if CONFIG_MACF
5105         if (!error)
5106                 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
5107 #endif /* MAC */
5108         if (!error)
5109                 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
5110
5111         return error;
5112 }
5113
5114 int
5115 vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved)
5116 {
5117         /* Open of existing case */
5118         kauth_action_t action;
5119         int error = 0;
5120
5121         if (cnp->cn_ndp == NULL) {
5122                 panic("NULL ndp");
5123         }
5124         if (reserved != NULL) {
5125                 panic("reserved not NULL.");
5126         }
5127
5128 #if CONFIG_MACF
5129         /* XXX may do duplicate work here, but ignore that for now (idempotent) */
5130         if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
5131                 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
5132                 if (error)
5133                         return (error);
5134         }
5135 #endif
5136
5137         if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) {
5138                 return (ENOTDIR);
5139         }
5140
5141         if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
5142                 return (EOPNOTSUPP);    /* Operation not supported on socket */
5143         }
5144
5145         if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
5146                 return (ELOOP);         /* O_NOFOLLOW was specified and the target is a symbolic link */
5147         }
5148
5149         /* disallow write operations on directories */
5150         if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
5151                 return (EISDIR);
5152         }
5153
5154         if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
5155                 if (vp->v_type != VDIR) {
5156                         return (ENOTDIR);
5157                 }
5158         }
5159
5160 #if CONFIG_MACF
5161         /* If a file being opened is a shadow file containing
5162          * namedstream data, ignore the macf checks because it
5163          * is a kernel internal file and access should always
5164          * be allowed.
5165          */
5166         if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
5167                 error = mac_vnode_check_open(ctx, vp, fmode);
5168                 if (error) {
5169                         return (error);
5170                 }
5171         }
5172 #endif
5173
5174         /* compute action to be authorized */
5175         action = 0;
5176         if (fmode & FREAD) {
5177                 action |= KAUTH_VNODE_READ_DATA;
5178         }
5179         if (fmode & (FWRITE | O_TRUNC)) {
5180                 /*
5181                  * If we are writing, appending, and not truncating,
5182                  * indicate that we are appending so that if the
5183                  * UF_APPEND or SF_APPEND bits are set, we do not deny
5184                  * the open.
5185                  */
5186                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
5187                         action |= KAUTH_VNODE_APPEND_DATA;
5188                 } else {
5189                         action |= KAUTH_VNODE_WRITE_DATA;
5190                 }
5191         }
5192         return (vnode_authorize(vp, NULL, action, ctx));
5193 }
5194
5195 int
5196 vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
5197 {
5198         /* Creation case */
5199         int error;
5200
5201         if (cnp->cn_ndp == NULL) {
5202                 panic("NULL cn_ndp");
5203         }
5204         if (reserved != NULL) {
5205                 panic("reserved not NULL.");
5206         }
5207
5208         /* Only validate path for creation if we didn't do a complete lookup */
5209         if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
5210                 error = lookup_validate_creation_path(cnp->cn_ndp);
5211                 if (error)
5212                         return (error);
5213         }
5214
5215 #if CONFIG_MACF
5216         error = mac_vnode_check_create(ctx, dvp, cnp, vap);
5217         if (error)
5218                 return (error);
5219 #endif /* CONFIG_MACF */
5220
5221         return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx));
5222 }
5223
5224 int
5225 vn_authorize_rename(struct vnode *fdvp,  struct vnode *fvp,  struct componentname *fcnp,
5226              struct vnode *tdvp,  struct vnode *tvp,  struct componentname *tcnp,
5227              vfs_context_t ctx, void *reserved)
5228 {
5229         int error = 0;
5230         int moving = 0;
5231
5232         if (reserved != NULL) {
5233                 panic("Passed something other than NULL as reserved field!");
5234         }
5235
5236         /*
5237          * Avoid renaming "." and "..".
5238          *
5239          * XXX No need to check for this in the FS.  We should always have the leaves
5240          * in VFS in this case.
5241          */
5242         if (fvp->v_type == VDIR &&
5243             ((fdvp == fvp) ||
5244              (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
5245              ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)) ) {
5246                 error = EINVAL;
5247                 goto out;
5248         }
5249
5250         if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
5251                 error = lookup_validate_creation_path(tcnp->cn_ndp);
5252                 if (error)
5253                         goto out;
5254         }
5255
5256         /***** <MACF> *****/
5257 #if CONFIG_MACF
5258         error = mac_vnode_check_rename_from(ctx, fdvp, fvp, fcnp);
5259         if (error)
5260                 goto out;
5261 #endif
5262
5263 #if CONFIG_MACF
5264         error = mac_vnode_check_rename_to(ctx,
5265                         tdvp, tvp, fdvp == tdvp, tcnp);
5266         if (error)
5267                 goto out;
5268 #endif
5269         /***** </MACF> *****/
5270
5271         /***** <MiscChecks> *****/
5272         if (tvp != NULL) {
5273                 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
5274                         error = ENOTDIR;
5275                         goto out;
5276                 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
5277                         error = EISDIR;
5278                         goto out;
5279                 }
5280         }
5281
5282         if (fvp == tdvp) {
5283                 error = EINVAL;
5284                 goto out;
5285         }
5286
5287         /*
5288          * The following edge case is caught here:
5289          * (to cannot be a descendent of from)
5290          *
5291          *       o fdvp
5292          *      /
5293          *     /
5294          *    o fvp
5295          *     \
5296          *      \
5297          *       o tdvp
5298          *      /
5299          *     /
5300          *    o tvp
5301          */
5302         if (tdvp->v_parent == fvp) {
5303                 error = EINVAL;
5304                 goto out;
5305         }
5306         /***** </MiscChecks> *****/
5307
5308         /***** <Kauth> *****/
5309
5310         error = 0;
5311         if ((tvp != NULL) && vnode_isdir(tvp)) {
5312                 if (tvp != fdvp)
5313                         moving = 1;
5314         } else if (tdvp != fdvp) {
5315                 moving = 1;
5316         }
5317
5318
5319         /*
5320          * must have delete rights to remove the old name even in
5321          * the simple case of fdvp == tdvp.
5322          *
5323          * If fvp is a directory, and we are changing it's parent,
5324          * then we also need rights to rewrite its ".." entry as well.
5325          */
5326         if (vnode_isdir(fvp)) {
5327                 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0)
5328                         goto out;
5329         } else {
5330                 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0)
5331                         goto out;
5332         }
5333         if (moving) {
5334                 /* moving into tdvp or tvp, must have rights to add */
5335                 if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
5336                                                 NULL,
5337                                                 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
5338                                                 ctx)) != 0) {
5339                         goto out;
5340                 }
5341         } else {
5342                 /* node staying in same directory, must be allowed to add new name */
5343                 if ((error = vnode_authorize(fdvp, NULL,
5344                                                 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0)
5345                         goto out;
5346         }
5347         /* overwriting tvp */
5348         if ((tvp != NULL) && !vnode_isdir(tvp) &&
5349                         ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
5350                 goto out;
5351         }
5352
5353         /***** </Kauth> *****/
5354
5355         /* XXX more checks? */
5356 out:
5357         return error;
5358 }
5359
5360 int
5361 vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
5362 {
5363         int error;
5364
5365         if (reserved != NULL) {
5366                 panic("reserved not NULL in vn_authorize_mkdir()");
5367         }
5368
5369         /* XXX A hack for now, to make shadow files work */
5370         if (cnp->cn_ndp == NULL) {
5371                 return 0;
5372         }
5373
5374         if (vnode_compound_mkdir_available(dvp)) {
5375                 error = lookup_validate_creation_path(cnp->cn_ndp);
5376                 if (error)
5377                         goto out;
5378         }
5379
5380 #if CONFIG_MACF
5381         error = mac_vnode_check_create(ctx,
5382             dvp, cnp, vap);
5383         if (error)
5384                 goto out;
5385 #endif
5386
5387         /* authorize addition of a directory to the parent */
5388         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0)
5389                 goto out;
5390
5391 out:
5392         return error;
5393 }
5394
5395 int
5396 vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved)
5397 {
5398         int error;
5399
5400         if (reserved != NULL) {
5401                 panic("Non-NULL reserved argument to vn_authorize_rmdir()");
5402         }
5403
5404         if (vp->v_type != VDIR) {
5405                 /*
5406                  * rmdir only deals with directories
5407                  */
5408                 return ENOTDIR;
5409         }
5410
5411         if (dvp == vp) {
5412                 /*
5413                  * No rmdir "." please.
5414                  */
5415                 return EINVAL;
5416         }
5417
5418 #if CONFIG_MACF
5419         error = mac_vnode_check_unlink(ctx, dvp,
5420                         vp, cnp);
5421         if (error)
5422                 return error;
5423 #endif
5424
5425         return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
5426 }
5427
5428 /*
5429  * Authorize an operation on a vnode.
5430  *
5431  * This is KPI, but here because it needs vnode_scope.
5432  *
5433  * Returns:     0                       Success
5434  *      kauth_authorize_action:EPERM    ...
5435  *      xlate => EACCES                 Permission denied
5436  *      kauth_authorize_action:0        Success
5437  *      kauth_authorize_action:         Depends on callback return; this is
5438  *                                      usually only vnode_authorize_callback(),
5439  *                                      but may include other listerners, if any
5440  *                                      exist.
5441  *              EROFS
5442  *              EACCES
5443  *              EPERM
5444  *              ???
5445  */
5446 int
5447 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
5448 {
5449         int     error, result;
5450
5451         /*
5452          * We can't authorize against a dead vnode; allow all operations through so that
5453          * the correct error can be returned.
5454          */
5455         if (vp->v_type == VBAD)
5456                 return(0);
5457
5458         error = 0;
5459         result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
5460                    (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
5461         if (result == EPERM)            /* traditional behaviour */
5462                 result = EACCES;
5463         /* did the lower layers give a better error return? */
5464         if ((result != 0) && (error != 0))
5465                 return(error);
5466         return(result);
5467 }
5468
5469 /*
5470  * Test for vnode immutability.
5471  *
5472  * The 'append' flag is set when the authorization request is constrained
5473  * to operations which only request the right to append to a file.
5474  *
5475  * The 'ignore' flag is set when an operation modifying the immutability flags
5476  * is being authorized.  We check the system securelevel to determine which
5477  * immutability flags we can ignore.
5478  */
5479 static int
5480 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
5481 {
5482         int     mask;
5483
5484         /* start with all bits precluding the operation */
5485         mask = IMMUTABLE | APPEND;
5486
5487         /* if appending only, remove the append-only bits */
5488         if (append)
5489                 mask &= ~APPEND;
5490
5491         /* ignore only set when authorizing flags changes */
5492         if (ignore) {
5493                 if (securelevel <= 0) {
5494                         /* in insecure state, flags do not inhibit changes */
5495                         mask = 0;
5496                 } else {
5497                         /* in secure state, user flags don't inhibit */
5498                         mask &= ~(UF_IMMUTABLE | UF_APPEND);
5499                 }
5500         }
5501         KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
5502         if ((vap->va_flags & mask) != 0)
5503                 return(EPERM);
5504         return(0);
5505 }
5506
5507 static int
5508 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
5509 {
5510         int result;
5511
5512         /* default assumption is not-owner */
5513         result = 0;
5514
5515         /*
5516          * If the filesystem has given us a UID, we treat this as authoritative.
5517          */
5518         if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
5519                 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
5520         }
5521         /* we could test the owner UUID here if we had a policy for it */
5522
5523         return(result);
5524 }
5525
5526 /*
5527  * vauth_node_group
5528  *
5529  * Description: Ask if a cred is a member of the group owning the vnode object
5530  *
5531  * Parameters:          vap             vnode attribute
5532  *                              vap->va_gid     group owner of vnode object
5533  *                      cred            credential to check
5534  *                      ismember        pointer to where to put the answer
5535  *                      idontknow       Return this if we can't get an answer
5536  *
5537  * Returns:             0               Success
5538  *                      idontknow       Can't get information
5539  *      kauth_cred_ismember_gid:?       Error from kauth subsystem
5540  *      kauth_cred_ismember_gid:?       Error from kauth subsystem
5541  */
5542 static int
5543 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow)
5544 {
5545         int     error;
5546         int     result;
5547
5548         error = 0;
5549         result = 0;
5550
5551         /*
5552          * The caller is expected to have asked the filesystem for a group
5553          * at some point prior to calling this function.  The answer may
5554          * have been that there is no group ownership supported for the
5555          * vnode object, in which case we return
5556          */
5557         if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
5558                 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
5559                 /*
5560                  * Credentials which are opted into external group membership
5561                  * resolution which are not known to the external resolver
5562                  * will result in an ENOENT error.  We translate this into
5563                  * the appropriate 'idontknow' response for our caller.
5564                  *
5565                  * XXX We do not make a distinction here between an ENOENT
5566                  * XXX arising from a response from the external resolver,
5567                  * XXX and an ENOENT which is internally generated.  This is
5568                  * XXX a deficiency of the published kauth_cred_ismember_gid()
5569                  * XXX KPI which can not be overcome without new KPI.  For
5570                  * XXX all currently known cases, however, this wil result
5571                  * XXX in correct behaviour.
5572                  */
5573                 if (error == ENOENT)
5574                         error = idontknow;
5575         }
5576         /*
5577          * XXX We could test the group UUID here if we had a policy for it,
5578          * XXX but this is problematic from the perspective of synchronizing
5579          * XXX group UUID and POSIX GID ownership of a file and keeping the
5580          * XXX values coherent over time.  The problem is that the local
5581          * XXX system will vend transient group UUIDs for unknown POSIX GID
5582          * XXX values, and these are not persistent, whereas storage of values
5583          * XXX is persistent.  One potential solution to this is a local
5584          * XXX (persistent) replica of remote directory entries and vended
5585          * XXX local ids in a local directory server (think in terms of a
5586          * XXX caching DNS server).
5587          */
5588
5589         if (!error)
5590                 *ismember = result;
5591         return(error);
5592 }
5593
5594 static int
5595 vauth_file_owner(vauth_ctx vcp)
5596 {
5597         int result;
5598
5599         if (vcp->flags_valid & _VAC_IS_OWNER) {
5600                 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
5601         } else {
5602                 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
5603
5604                 /* cache our result */
5605                 vcp->flags_valid |= _VAC_IS_OWNER;
5606                 if (result) {
5607                         vcp->flags |= _VAC_IS_OWNER;
5608                 } else {
5609                         vcp->flags &= ~_VAC_IS_OWNER;
5610                 }
5611         }
5612         return(result);
5613 }
5614
5615
5616 /*
5617  * vauth_file_ingroup
5618  *
5619  * Description: Ask if a user is a member of the group owning the directory
5620  *
5621  * Parameters:          vcp             The vnode authorization context that
5622  *                                      contains the user and directory info
5623  *                              vcp->flags_valid        Valid flags
5624  *                              vcp->flags              Flags values
5625  *                              vcp->vap                File vnode attributes
5626  *                              vcp->ctx                VFS Context (for user)
5627  *                      ismember        pointer to where to put the answer
5628  *                      idontknow       Return this if we can't get an answer
5629  *
5630  * Returns:             0               Success
5631  *              vauth_node_group:?      Error from vauth_node_group()
5632  *
5633  * Implicit returns:    *ismember       0       The user is not a group member
5634  *                                      1       The user is a group member
5635  */
5636 static int
5637 vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
5638 {
5639         int     error;
5640
5641         /* Check for a cached answer first, to avoid the check if possible */
5642         if (vcp->flags_valid & _VAC_IN_GROUP) {
5643                 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
5644                 error = 0;
5645         } else {
5646                 /* Otherwise, go look for it */
5647                 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
5648
5649                 if (!error) {
5650                         /* cache our result */
5651                         vcp->flags_valid |= _VAC_IN_GROUP;
5652                         if (*ismember) {
5653                                 vcp->flags |= _VAC_IN_GROUP;
5654                         } else {
5655                                 vcp->flags &= ~_VAC_IN_GROUP;
5656                         }
5657                 }
5658
5659         }
5660         return(error);
5661 }
5662
5663 static int
5664 vauth_dir_owner(vauth_ctx vcp)
5665 {
5666         int result;
5667
5668         if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
5669                 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
5670         } else {
5671                 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
5672
5673                 /* cache our result */
5674                 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
5675                 if (result) {
5676                         vcp->flags |= _VAC_IS_DIR_OWNER;
5677                 } else {
5678                         vcp->flags &= ~_VAC_IS_DIR_OWNER;
5679                 }
5680         }
5681         return(result);
5682 }
5683
5684 /*
5685  * vauth_dir_ingroup
5686  *
5687  * Description: Ask if a user is a member of the group owning the directory
5688  *
5689  * Parameters:          vcp             The vnode authorization context that
5690  *                                      contains the user and directory info
5691  *                              vcp->flags_valid        Valid flags
5692  *                              vcp->flags              Flags values
5693  *                              vcp->dvap               Dir vnode attributes
5694  *                              vcp->ctx                VFS Context (for user)
5695  *                      ismember        pointer to where to put the answer
5696  *                      idontknow       Return this if we can't get an answer
5697  *
5698  * Returns:             0               Success
5699  *              vauth_node_group:?      Error from vauth_node_group()
5700  *
5701  * Implicit returns:    *ismember       0       The user is not a group member
5702  *                                      1       The user is a group member
5703  */
5704 static int
5705 vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
5706 {
5707         int     error;
5708
5709         /* Check for a cached answer first, to avoid the check if possible */
5710         if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
5711                 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
5712                 error = 0;
5713         } else {
5714                 /* Otherwise, go look for it */
5715                 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
5716
5717                 if (!error) {
5718                         /* cache our result */
5719                         vcp->flags_valid |= _VAC_IN_DIR_GROUP;
5720                         if (*ismember) {
5721                                 vcp->flags |= _VAC_IN_DIR_GROUP;
5722                         } else {
5723                                 vcp->flags &= ~_VAC_IN_DIR_GROUP;
5724                         }
5725                 }
5726         }
5727         return(error);
5728 }
5729
5730 /*
5731  * Test the posix permissions in (vap) to determine whether (credential)
5732  * may perform (action)
5733  */
5734 static int
5735 vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
5736 {
5737         struct vnode_attr *vap;
5738         int needed, error, owner_ok, group_ok, world_ok, ismember;
5739 #ifdef KAUTH_DEBUG_ENABLE
5740         const char *where = "uninitialized";
5741 # define _SETWHERE(c)   where = c;
5742 #else
5743 # define _SETWHERE(c)
5744 #endif
5745
5746         /* checking file or directory? */
5747         if (on_dir) {
5748                 vap = vcp->dvap;
5749         } else {
5750                 vap = vcp->vap;
5751         }
5752
5753         error = 0;
5754
5755         /*
5756          * We want to do as little work here as possible.  So first we check
5757          * which sets of permissions grant us the access we need, and avoid checking
5758          * whether specific permissions grant access when more generic ones would.
5759          */
5760
5761         /* owner permissions */
5762         needed = 0;
5763         if (action & VREAD)
5764                 needed |= S_IRUSR;
5765         if (action & VWRITE)
5766                 needed |= S_IWUSR;
5767         if (action & VEXEC)
5768                 needed |= S_IXUSR;
5769         owner_ok = (needed & vap->va_mode) == needed;
5770
5771         /* group permissions */
5772         needed = 0;
5773         if (action & VREAD)
5774                 needed |= S_IRGRP;
5775         if (action & VWRITE)
5776                 needed |= S_IWGRP;
5777         if (action & VEXEC)
5778                 needed |= S_IXGRP;
5779         group_ok = (needed & vap->va_mode) == needed;
5780
5781         /* world permissions */
5782         needed = 0;
5783         if (action & VREAD)
5784                 needed |= S_IROTH;
5785         if (action & VWRITE)
5786                 needed |= S_IWOTH;
5787         if (action & VEXEC)
5788                 needed |= S_IXOTH;
5789         world_ok = (needed & vap->va_mode) == needed;
5790
5791         /* If granted/denied by all three, we're done */
5792         if (owner_ok && group_ok && world_ok) {
5793                 _SETWHERE("all");
5794                 goto out;
5795         }
5796         if (!owner_ok && !group_ok && !world_ok) {
5797                 _SETWHERE("all");
5798                 error = EACCES;
5799                 goto out;
5800         }
5801
5802         /* Check ownership (relatively cheap) */
5803         if ((on_dir && vauth_dir_owner(vcp)) ||
5804             (!on_dir && vauth_file_owner(vcp))) {
5805                 _SETWHERE("user");
5806                 if (!owner_ok)
5807                         error = EACCES;
5808                 goto out;
5809         }
5810
5811         /* Not owner; if group and world both grant it we're done */
5812         if (group_ok && world_ok) {
5813                 _SETWHERE("group/world");
5814                 goto out;
5815         }
5816         if (!group_ok && !world_ok) {
5817                 _SETWHERE("group/world");
5818                 error = EACCES;
5819                 goto out;
5820         }
5821
5822         /* Check group membership (most expensive) */
5823         ismember = 0;   /* Default to allow, if the target has no group owner */
5824
5825         /*
5826          * In the case we can't get an answer about the user from the call to
5827          * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
5828          * the side of caution, rather than simply granting access, or we will
5829          * fail to correctly implement exclusion groups, so we set the third
5830          * parameter on the basis of the state of 'group_ok'.
5831          */
5832         if (on_dir) {
5833                 error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
5834         } else {
5835                 error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
5836         }
5837         if (error) {
5838                 if (!group_ok)
5839                         ismember = 1;
5840                 error = 0;
5841         }
5842         if (ismember) {
5843                 _SETWHERE("group");
5844                 if (!group_ok)
5845                         error = EACCES;
5846                 goto out;
5847         }
5848
5849         /* Not owner, not in group, use world result */
5850         _SETWHERE("world");
5851         if (!world_ok)
5852                 error = EACCES;
5853
5854         /* FALLTHROUGH */
5855
5856 out:
5857         KAUTH_DEBUG("%p    %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
5858             vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
5859             (action & VREAD)  ? "r" : "-",
5860             (action & VWRITE) ? "w" : "-",
5861             (action & VEXEC)  ? "x" : "-",
5862             needed,
5863             (vap->va_mode & S_IRUSR) ? "r" : "-",
5864             (vap->va_mode & S_IWUSR) ? "w" : "-",
5865             (vap->va_mode & S_IXUSR) ? "x" : "-",
5866             (vap->va_mode & S_IRGRP) ? "r" : "-",
5867             (vap->va_mode & S_IWGRP) ? "w" : "-",
5868             (vap->va_mode & S_IXGRP) ? "x" : "-",
5869             (vap->va_mode & S_IROTH) ? "r" : "-",
5870             (vap->va_mode & S_IWOTH) ? "w" : "-",
5871             (vap->va_mode & S_IXOTH) ? "x" : "-",
5872             kauth_cred_getuid(vcp->ctx->vc_ucred),
5873             on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
5874             on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
5875         return(error);
5876 }
5877
5878 /*
5879  * Authorize the deletion of the node vp from the directory dvp.
5880  *
5881  * We assume that:
5882  * - Neither the node nor the directory are immutable.
5883  * - The user is not the superuser.
5884  *
5885  * Deletion is not permitted if the directory is sticky and the caller is
5886  * not owner of the node or directory.
5887  *
5888  * If either the node grants DELETE, or the directory grants DELETE_CHILD,
5889  * the node may be deleted.  If neither denies the permission, and the
5890  * caller has Posix write access to the directory, then the node may be
5891  * deleted.
5892  *
5893  * As an optimization, we cache whether or not delete child is permitted
5894  * on directories without the sticky bit set.
5895  */
5896 int
5897 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child);
5898 /*static*/ int
5899 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
5900 {
5901         struct vnode_attr       *vap = vcp->vap;
5902         struct vnode_attr       *dvap = vcp->dvap;
5903         kauth_cred_t            cred = vcp->ctx->vc_ucred;
5904         struct kauth_acl_eval   eval;
5905         int                     error, delete_denied, delete_child_denied, ismember;
5906
5907         /* check the ACL on the directory */
5908         delete_child_denied = 0;
5909         if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) {
5910                 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
5911                 eval.ae_acl = &dvap->va_acl->acl_ace[0];
5912                 eval.ae_count = dvap->va_acl->acl_entrycount;
5913                 eval.ae_options = 0;
5914                 if (vauth_dir_owner(vcp))
5915                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
5916                 /*
5917                  * We use ENOENT as a marker to indicate we could not get
5918                  * information in order to delay evaluation until after we
5919                  * have the ACL evaluation answer.  Previously, we would
5920                  * always deny the operation at this point.
5921                  */
5922                 if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
5923                         return(error);
5924                 if (error == ENOENT)
5925                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
5926                 else if (ismember)
5927                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
5928                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
5929                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
5930                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
5931                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
5932
5933                 /*
5934                  * If there is no entry, we are going to defer to other
5935                  * authorization mechanisms.
5936                  */
5937                 error = kauth_acl_evaluate(cred, &eval);
5938
5939                 if (error != 0) {
5940                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
5941                         return(error);
5942                 }
5943                 switch(eval.ae_result) {
5944                 case KAUTH_RESULT_DENY:
5945                         delete_child_denied = 1;
5946                         break;
5947                         /* FALLSTHROUGH */
5948                 case KAUTH_RESULT_ALLOW:
5949                         KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL", vcp->vp);
5950                         return(0);
5951                 case KAUTH_RESULT_DEFER:
5952                 default:
5953                         /* Effectively the same as !delete_child_denied */
5954                         KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
5955                         break;
5956                 }
5957         }
5958
5959         /* check the ACL on the node */
5960         delete_denied = 0;
5961         if (VATTR_IS_NOT(vap, va_acl, NULL)) {
5962                 eval.ae_requested = KAUTH_VNODE_DELETE;
5963                 eval.ae_acl = &vap->va_acl->acl_ace[0];
5964                 eval.ae_count = vap->va_acl->acl_entrycount;
5965                 eval.ae_options = 0;
5966                 if (vauth_file_owner(vcp))
5967                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
5968                 /*
5969                  * We use ENOENT as a marker to indicate we could not get
5970                  * information in order to delay evaluation until after we
5971                  * have the ACL evaluation answer.  Previously, we would
5972                  * always deny the operation at this point.
5973                  */
5974                 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
5975                         return(error);
5976                 if (error == ENOENT)
5977                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
5978                 else if (ismember)
5979                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
5980                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
5981                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
5982                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
5983                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
5984
5985                 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
5986                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
5987                         return(error);
5988                 }
5989
5990                 switch(eval.ae_result) {
5991                 case KAUTH_RESULT_DENY:
5992                         delete_denied = 1;
5993                         break;
5994                 case KAUTH_RESULT_ALLOW:
5995                         KAUTH_DEBUG("%p    ALLOWED - granted by file ACL", vcp->vp);
5996                         return(0);
5997                 case KAUTH_RESULT_DEFER:
5998                 default:
5999                         /* Effectively the same as !delete_child_denied */
6000                         KAUTH_DEBUG("%p    DEFERRED%s - by file ACL", vcp->vp, delete_denied ? "(DENY)" : "");
6001                         break;
6002                 }
6003         }
6004
6005         /* if denied by ACL on directory or node, return denial */
6006         if (delete_denied || delete_child_denied) {
6007                 KAUTH_DEBUG("%p    DENIED - denied by ACL", vcp->vp);
6008                 return(EACCES);
6009         }
6010
6011         /* enforce sticky bit behaviour */
6012         if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
6013                 KAUTH_DEBUG("%p    DENIED - sticky bit rules (user %d  file %d  dir %d)",
6014                     vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
6015                 return(EACCES);
6016         }
6017
6018         /* check the directory */
6019         if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) {
6020                 KAUTH_DEBUG("%p    DENIED - denied by posix permisssions", vcp->vp);
6021                 return(error);
6022         }
6023
6024         /* not denied, must be OK */
6025         return(0);
6026 }
6027
6028
6029 /*
6030  * Authorize an operation based on the node's attributes.
6031  */
6032 static int
6033 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
6034 {
6035         struct vnode_attr       *vap = vcp->vap;
6036         kauth_cred_t            cred = vcp->ctx->vc_ucred;
6037         struct kauth_acl_eval   eval;
6038         int                     error, ismember;
6039         mode_t                  posix_action;
6040
6041         /*
6042          * If we are the file owner, we automatically have some rights.
6043          *
6044          * Do we need to expand this to support group ownership?
6045          */
6046         if (vauth_file_owner(vcp))
6047                 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
6048
6049         /*
6050          * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
6051          * mask the latter.  If TAKE_OWNERSHIP is requested the caller is about to
6052          * change ownership to themselves, and WRITE_SECURITY is implicitly
6053          * granted to the owner.  We need to do this because at this point
6054          * WRITE_SECURITY may not be granted as the caller is not currently
6055          * the owner.
6056          */
6057         if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
6058             (acl_rights & KAUTH_VNODE_WRITE_SECURITY))
6059                 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
6060
6061         if (acl_rights == 0) {
6062                 KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
6063                 return(0);
6064         }
6065
6066         /* if we have an ACL, evaluate it */
6067         if (VATTR_IS_NOT(vap, va_acl, NULL)) {
6068                 eval.ae_requested = acl_rights;
6069                 eval.ae_acl = &vap->va_acl->acl_ace[0];
6070                 eval.ae_count = vap->va_acl->acl_entrycount;
6071                 eval.ae_options = 0;
6072                 if (vauth_file_owner(vcp))
6073                         eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
6074                 /*
6075                  * We use ENOENT as a marker to indicate we could not get
6076                  * information in order to delay evaluation until after we
6077                  * have the ACL evaluation answer.  Previously, we would
6078                  * always deny the operation at this point.
6079                  */
6080                 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
6081                         return(error);
6082                 if (error == ENOENT)
6083                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
6084                 else if (ismember)
6085                         eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
6086                 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
6087                 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
6088                 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
6089                 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
6090
6091                 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
6092                         KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
6093                         return(error);
6094                 }
6095
6096                 switch(eval.ae_result) {
6097                 case KAUTH_RESULT_DENY:
6098                         KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
6099                         return(EACCES);         /* deny, deny, counter-allege */
6100                 case KAUTH_RESULT_ALLOW:
6101                         KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
6102                         return(0);
6103                 case KAUTH_RESULT_DEFER:
6104                 default:
6105                         /* Effectively the same as !delete_child_denied */
6106                         KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
6107                         break;
6108                 }
6109
6110                 *found_deny = eval.ae_found_deny;
6111
6112                 /* fall through and evaluate residual rights */
6113         } else {
6114                 /* no ACL, everything is residual */
6115                 eval.ae_residual = acl_rights;
6116         }
6117
6118         /*
6119          * Grant residual rights that have been pre-authorized.
6120          */
6121         eval.ae_residual &= ~preauth_rights;
6122
6123         /*
6124          * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
6125          */
6126         if (vauth_file_owner(vcp))
6127                 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
6128
6129         if (eval.ae_residual == 0) {
6130                 KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
6131                 return(0);
6132         }
6133
6134         /*
6135          * Bail if we have residual rights that can't be granted by posix permissions,
6136          * or aren't presumed granted at this point.
6137          *
6138          * XXX these can be collapsed for performance
6139          */
6140         if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
6141                 KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
6142                 return(EACCES);
6143         }
6144         if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
6145                 KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
6146                 return(EACCES);
6147         }
6148
6149 #if DIAGNOSTIC
6150         if (eval.ae_residual & KAUTH_VNODE_DELETE)
6151                 panic("vnode_authorize: can't be checking delete permission here");
6152 #endif
6153
6154         /*
6155          * Compute the fallback posix permissions that will satisfy the remaining
6156          * rights.
6157          */
6158         posix_action = 0;
6159         if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
6160                 KAUTH_VNODE_LIST_DIRECTORY |
6161                 KAUTH_VNODE_READ_EXTATTRIBUTES))
6162                 posix_action |= VREAD;
6163         if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
6164                 KAUTH_VNODE_ADD_FILE |
6165                 KAUTH_VNODE_ADD_SUBDIRECTORY |
6166                 KAUTH_VNODE_DELETE_CHILD |
6167                 KAUTH_VNODE_WRITE_ATTRIBUTES |
6168                 KAUTH_VNODE_WRITE_EXTATTRIBUTES))
6169                 posix_action |= VWRITE;
6170         if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
6171                 KAUTH_VNODE_SEARCH))
6172                 posix_action |= VEXEC;
6173
6174         if (posix_action != 0) {
6175                 return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */));
6176         } else {
6177                 KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
6178                     vcp->vp,
6179                     (eval.ae_residual & KAUTH_VNODE_READ_DATA)
6180                     ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
6181                     (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
6182                     ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
6183                     (eval.ae_residual & KAUTH_VNODE_EXECUTE)
6184                     ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
6185                     (eval.ae_residual & KAUTH_VNODE_DELETE)
6186                     ? " DELETE" : "",
6187                     (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
6188                     ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
6189                     (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
6190                     ? " DELETE_CHILD" : "",
6191                     (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
6192                     ? " READ_ATTRIBUTES" : "",
6193                     (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
6194                     ? " WRITE_ATTRIBUTES" : "",
6195                     (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
6196                     ? " READ_EXTATTRIBUTES" : "",
6197                     (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
6198                     ? " WRITE_EXTATTRIBUTES" : "",
6199                     (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
6200                     ? " READ_SECURITY" : "",
6201                     (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
6202                     ? " WRITE_SECURITY" : "",
6203                     (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
6204                     ? " CHECKIMMUTABLE" : "",
6205                     (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
6206                     ? " CHANGE_OWNER" : "");
6207         }
6208
6209         /*
6210          * Lack of required Posix permissions implies no reason to deny access.
6211          */
6212         return(0);
6213 }
6214
6215 /*
6216  * Check for file immutability.
6217  */
6218 static int
6219 vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore)
6220 {
6221         mount_t mp;
6222         int error;
6223         int append;
6224
6225         /*
6226          * Perform immutability checks for operations that change data.
6227          *
6228          * Sockets, fifos and devices require special handling.
6229          */
6230         switch(vp->v_type) {
6231         case VSOCK:
6232         case VFIFO:
6233         case VBLK:
6234         case VCHR:
6235                 /*
6236                  * Writing to these nodes does not change the filesystem data,
6237                  * so forget that it's being tried.
6238                  */
6239                 rights &= ~KAUTH_VNODE_WRITE_DATA;
6240                 break;
6241         default:
6242                 break;
6243         }
6244
6245         error = 0;
6246         if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
6247
6248                 /* check per-filesystem options if possible */
6249                 mp = vp->v_mount;
6250                 if (mp != NULL) {
6251
6252                         /* check for no-EA filesystems */
6253                         if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
6254                             (vfs_flags(mp) & MNT_NOUSERXATTR)) {
6255                                 KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vp);
6256                                 error = EACCES;  /* User attributes disabled */
6257                                 goto out;
6258                         }
6259                 }
6260
6261                 /*
6262                  * check for file immutability. first, check if the requested rights are
6263                  * allowable for a UF_APPEND file.
6264                  */
6265                 append = 0;
6266                 if (vp->v_type == VDIR) {
6267                         if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights)
6268                                 append = 1;
6269                 } else {
6270                         if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights)
6271                                 append = 1;
6272                 }
6273                 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
6274                         KAUTH_DEBUG("%p    DENIED - file is immutable", vp);
6275                         goto out;
6276                 }
6277         }
6278 out:
6279         return(error);
6280 }
6281
6282 /*
6283  * Handle authorization actions for filesystems that advertise that the
6284  * server will be enforcing.
6285  *
6286  * Returns:     0                       Authorization should be handled locally
6287  *              1                       Authorization was handled by the FS
6288  *
6289  * Note:        Imputed returns will only occur if the authorization request
6290  *              was handled by the FS.
6291  *
6292  * Imputed:     *resultp, modified      Return code from FS when the request is
6293  *                                      handled by the FS.
6294  *              VNOP_ACCESS:???
6295  *              VNOP_OPEN:???
6296  */
6297 static int
6298 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
6299 {
6300         int     error;
6301
6302         /*
6303          * If the vp is a device node, socket or FIFO it actually represents a local
6304          * endpoint, so we need to handle it locally.
6305          */
6306         switch(vp->v_type) {
6307         case VBLK:
6308         case VCHR:
6309         case VSOCK:
6310         case VFIFO:
6311                 return(0);
6312         default:
6313                 break;
6314         }
6315
6316         /*
6317          * In the advisory request case, if the filesystem doesn't think it's reliable
6318          * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
6319          */
6320         if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount))
6321                 return(0);
6322
6323         /*
6324          * Let the filesystem have a say in the matter.  It's OK for it to not implemnent
6325          * VNOP_ACCESS, as most will authorise inline with the actual request.
6326          */
6327         if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
6328                 *resultp = error;
6329                 KAUTH_DEBUG("%p    DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
6330                 return(1);
6331         }
6332
6333         /*
6334          * Typically opaque filesystems do authorisation in-line, but exec is a special case.  In
6335          * order to be reasonably sure that exec will be permitted, we try a bit harder here.
6336          */
6337         if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
6338                 /* try a VNOP_OPEN for readonly access */
6339                 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
6340                         *resultp = error;
6341                         KAUTH_DEBUG("%p    DENIED - EXECUTE denied because file could not be opened readonly", vp);
6342                         return(1);
6343                 }
6344                 VNOP_CLOSE(vp, FREAD, ctx);
6345         }
6346
6347         /*
6348          * We don't have any reason to believe that the request has to be denied at this point,
6349          * so go ahead and allow it.
6350          */
6351         *resultp = 0;
6352         KAUTH_DEBUG("%p    ALLOWED - bypassing access check for non-local filesystem", vp);
6353         return(1);
6354 }
6355
6356
6357
6358
6359 /*
6360  * Returns:     KAUTH_RESULT_ALLOW
6361  *              KAUTH_RESULT_DENY
6362  *
6363  * Imputed:     *arg3, modified         Error code in the deny case
6364  *              EROFS                   Read-only file system
6365  *              EACCES                  Permission denied
6366  *              EPERM                   Operation not permitted [no execute]
6367  *      vnode_getattr:ENOMEM            Not enough space [only if has filesec]
6368  *      vnode_getattr:???
6369  *      vnode_authorize_opaque:*arg2    ???
6370  *      vnode_authorize_checkimmutable:???
6371  *      vnode_authorize_delete:???
6372  *      vnode_authorize_simple:???
6373  */
6374
6375
6376 static int
6377 vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action,
6378                          uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
6379 {
6380         vfs_context_t   ctx;
6381         vnode_t         cvp = NULLVP;
6382         vnode_t         vp, dvp;
6383         int             result = KAUTH_RESULT_DENY;
6384         int             parent_iocount = 0;
6385         int             parent_action; /* In case we need to use namedstream's data fork for cached rights*/
6386
6387         ctx = (vfs_context_t)arg0;
6388         vp = (vnode_t)arg1;
6389         dvp = (vnode_t)arg2;
6390
6391         /*
6392          * if there are 2 vnodes passed in, we don't know at
6393          * this point which rights to look at based on the
6394          * combined action being passed in... defer until later...
6395          * otherwise check the kauth 'rights' cache hung
6396          * off of the vnode we're interested in... if we've already
6397          * been granted the right we're currently interested in,
6398          * we can just return success... otherwise we'll go through
6399          * the process of authorizing the requested right(s)... if that
6400          * succeeds, we'll add the right(s) to the cache.
6401          * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
6402          */
6403         if (dvp && vp)
6404                 goto defer;
6405         if (dvp) {
6406                 cvp = dvp;
6407         } else {
6408                 /*
6409                  * For named streams on local-authorization volumes, rights are cached on the parent;
6410                  * authorization is determined by looking at the parent's properties anyway, so storing
6411                  * on the parent means that we don't recompute for the named stream and that if
6412                  * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
6413                  * stream to flush its cache separately.  If we miss in the cache, then we authorize
6414                  * as if there were no cached rights (passing the named stream vnode and desired rights to
6415                  * vnode_authorize_callback_int()).
6416                  *
6417                  * On an opaquely authorized volume, we don't know the relationship between the
6418                  * data fork's properties and the rights granted on a stream.  Thus, named stream vnodes
6419                  * on such a volume are authorized directly (rather than using the parent) and have their
6420                  * own caches.  When a named stream vnode is created, we mark the parent as having a named
6421                  * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
6422                  * find the stream and flush its cache.
6423                  */
6424                 if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
6425                         cvp = vnode_getparent(vp);
6426                         if (cvp != NULLVP) {
6427                                 parent_iocount = 1;
6428                         } else {
6429                                 cvp = NULL;
6430                                 goto defer; /* If we can't use the parent, take the slow path */
6431                         }
6432
6433                         /* Have to translate some actions */
6434                         parent_action = action;
6435                         if (parent_action & KAUTH_VNODE_READ_DATA) {
6436                                 parent_action &= ~KAUTH_VNODE_READ_DATA;
6437                                 parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
6438                         }
6439                         if (parent_action & KAUTH_VNODE_WRITE_DATA) {
6440                                 parent_action &= ~KAUTH_VNODE_WRITE_DATA;
6441                                 parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
6442                         }
6443
6444                 } else {
6445                         cvp = vp;
6446                 }
6447         }
6448
6449         if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
6450                 result = KAUTH_RESULT_ALLOW;
6451                 goto out;
6452         }
6453 defer:
6454         result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3);
6455
6456         if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
6457                 KAUTH_DEBUG("%p - caching action = %x", cvp, action);
6458                 vnode_cache_authorized_action(cvp, ctx, action);
6459         }
6460
6461 out:
6462         if (parent_iocount) {
6463                 vnode_put(cvp);
6464         }
6465
6466         return result;
6467 }
6468
6469
6470 static int
6471 vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action,
6472     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
6473 {
6474         struct _vnode_authorize_context auth_context;
6475         vauth_ctx               vcp;
6476         vfs_context_t           ctx;
6477         vnode_t                 vp, dvp;
6478         kauth_cred_t            cred;
6479         kauth_ace_rights_t      rights;
6480         struct vnode_attr       va, dva;
6481         int                     result;
6482         int                     *errorp;
6483         int                     noimmutable;
6484         boolean_t               parent_authorized_for_delete_child = FALSE;
6485         boolean_t               found_deny = FALSE;
6486         boolean_t               parent_ref= FALSE;
6487
6488         vcp = &auth_context;
6489         ctx = vcp->ctx = (vfs_context_t)arg0;
6490         vp = vcp->vp = (vnode_t)arg1;
6491         dvp = vcp->dvp = (vnode_t)arg2;
6492         errorp = (int *)arg3;
6493         /*
6494          * Note that we authorize against the context, not the passed cred
6495          * (the same thing anyway)
6496          */
6497         cred = ctx->vc_ucred;
6498
6499         VATTR_INIT(&va);
6500         vcp->vap = &va;
6501         VATTR_INIT(&dva);
6502         vcp->dvap = &dva;
6503
6504         vcp->flags = vcp->flags_valid = 0;
6505
6506 #if DIAGNOSTIC
6507         if ((ctx == NULL) || (vp == NULL) || (cred == NULL))
6508                 panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
6509 #endif
6510
6511         KAUTH_DEBUG("%p  AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
6512             vp, vfs_context_proc(ctx)->p_comm,
6513             (action & KAUTH_VNODE_ACCESS)               ? "access" : "auth",
6514             (action & KAUTH_VNODE_READ_DATA)            ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
6515             (action & KAUTH_VNODE_WRITE_DATA)           ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
6516             (action & KAUTH_VNODE_EXECUTE)              ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
6517             (action & KAUTH_VNODE_DELETE)               ? " DELETE" : "",
6518             (action & KAUTH_VNODE_APPEND_DATA)          ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
6519             (action & KAUTH_VNODE_DELETE_CHILD)         ? " DELETE_CHILD" : "",
6520             (action & KAUTH_VNODE_READ_ATTRIBUTES)      ? " READ_ATTRIBUTES" : "",
6521             (action & KAUTH_VNODE_WRITE_ATTRIBUTES)     ? " WRITE_ATTRIBUTES" : "",
6522             (action & KAUTH_VNODE_READ_EXTATTRIBUTES)   ? " READ_EXTATTRIBUTES" : "",
6523             (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES)  ? " WRITE_EXTATTRIBUTES" : "",
6524             (action & KAUTH_VNODE_READ_SECURITY)        ? " READ_SECURITY" : "",
6525             (action & KAUTH_VNODE_WRITE_SECURITY)       ? " WRITE_SECURITY" : "",
6526             (action & KAUTH_VNODE_CHANGE_OWNER)         ? " CHANGE_OWNER" : "",
6527             (action & KAUTH_VNODE_NOIMMUTABLE)          ? " (noimmutable)" : "",
6528             vnode_isdir(vp) ? "directory" : "file",
6529             vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
6530
6531         /*
6532          * Extract the control bits from the action, everything else is
6533          * requested rights.
6534          */
6535         noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
6536         rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
6537
6538         if (rights & KAUTH_VNODE_DELETE) {
6539 #if DIAGNOSTIC
6540                 if (dvp == NULL)
6541                         panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
6542 #endif
6543                 /*
6544                  * check to see if we've already authorized the parent
6545                  * directory for deletion of its children... if so, we
6546                  * can skip a whole bunch of work... we will still have to
6547                  * authorize that this specific child can be removed
6548                  */
6549                 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE)
6550                         parent_authorized_for_delete_child = TRUE;
6551         } else {
6552                 dvp = NULL;
6553         }
6554
6555         /*
6556          * Check for read-only filesystems.
6557          */
6558         if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
6559             (vp->v_mount->mnt_flag & MNT_RDONLY) &&
6560             ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
6561              (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
6562              (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
6563                 result = EROFS;
6564                 goto out;
6565         }
6566
6567         /*
6568          * Check for noexec filesystems.
6569          */
6570         if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
6571                 result = EACCES;
6572                 goto out;
6573         }
6574
6575         /*
6576          * Handle cases related to filesystems with non-local enforcement.
6577          * This call can return 0, in which case we will fall through to perform a
6578          * check based on VNOP_GETATTR data.  Otherwise it returns 1 and sets
6579          * an appropriate result, at which point we can return immediately.
6580          */
6581         if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx))
6582                 goto out;
6583
6584         /*
6585          * Get vnode attributes and extended security information for the vnode
6586          * and directory if required.
6587          */
6588         VATTR_WANTED(&va, va_mode);
6589         VATTR_WANTED(&va, va_uid);
6590         VATTR_WANTED(&va, va_gid);
6591         VATTR_WANTED(&va, va_flags);
6592         VATTR_WANTED(&va, va_acl);
6593         if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
6594                 KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
6595                 goto out;
6596         }
6597         if (dvp) {
6598                 VATTR_WANTED(&dva, va_mode);
6599                 VATTR_WANTED(&dva, va_uid);
6600                 VATTR_WANTED(&dva, va_gid);
6601                 VATTR_WANTED(&dva, va_flags);
6602                 VATTR_WANTED(&dva, va_acl);
6603                 if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) {
6604                         KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
6605                         goto out;
6606                 }
6607         }
6608
6609         /*
6610          * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes
6611          * *_EXTATTRIBUTES.
6612          */
6613         if (vnode_isnamedstream(vp)) {
6614                 if (rights & KAUTH_VNODE_READ_DATA) {
6615                         rights &= ~KAUTH_VNODE_READ_DATA;
6616                         rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
6617                 }
6618                 if (rights & KAUTH_VNODE_WRITE_DATA) {
6619                         rights &= ~KAUTH_VNODE_WRITE_DATA;
6620                         rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
6621                 }
6622         }
6623
6624         /*
6625          * Point 'vp' to the resource fork's parent for ACL checking
6626          */
6627         if (vnode_isnamedstream(vp) &&
6628             (vp->v_parent != NULL) &&
6629             (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
6630                 parent_ref = TRUE;
6631                 vcp->vp = vp = vp->v_parent;
6632                 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
6633                         kauth_acl_free(va.va_acl);
6634                 VATTR_INIT(&va);
6635                 VATTR_WANTED(&va, va_mode);
6636                 VATTR_WANTED(&va, va_uid);
6637                 VATTR_WANTED(&va, va_gid);
6638                 VATTR_WANTED(&va, va_flags);
6639                 VATTR_WANTED(&va, va_acl);
6640                 if ((result = vnode_getattr(vp, &va, ctx)) != 0)
6641                         goto out;
6642         }
6643
6644         /*
6645          * Check for immutability.
6646          *
6647          * In the deletion case, parent directory immutability vetoes specific
6648          * file rights.
6649          */
6650         if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0)
6651                 goto out;
6652         if ((rights & KAUTH_VNODE_DELETE) &&
6653             parent_authorized_for_delete_child == FALSE &&
6654             ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0))
6655                 goto out;
6656
6657         /*
6658          * Clear rights that have been authorized by reaching this point, bail if nothing left to
6659          * check.
6660          */
6661         rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
6662         if (rights == 0)
6663                 goto out;
6664
6665         /*
6666          * If we're not the superuser, authorize based on file properties;
6667          * note that even if parent_authorized_for_delete_child is TRUE, we
6668          * need to check on the node itself.
6669          */
6670         if (!vfs_context_issuser(ctx)) {
6671                 /* process delete rights */
6672                 if ((rights & KAUTH_VNODE_DELETE) &&
6673                     ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
6674                     goto out;
6675
6676                 /* process remaining rights */
6677                 if ((rights & ~KAUTH_VNODE_DELETE) &&
6678                     (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, &found_deny)) != 0)
6679                         goto out;
6680         } else {
6681
6682                 /*
6683                  * Execute is only granted to root if one of the x bits is set.  This check only
6684                  * makes sense if the posix mode bits are actually supported.
6685                  */
6686                 if ((rights & KAUTH_VNODE_EXECUTE) &&
6687                     (vp->v_type == VREG) &&
6688                     VATTR_IS_SUPPORTED(&va, va_mode) &&
6689                     !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
6690                         result = EPERM;
6691                         KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
6692                         goto out;
6693                 }
6694
6695                 KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vp);
6696         }
6697 out:
6698         if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
6699                 kauth_acl_free(va.va_acl);
6700         if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL))
6701                 kauth_acl_free(dva.va_acl);
6702
6703         if (result) {
6704                 if (parent_ref)
6705                         vnode_put(vp);
6706                 *errorp = result;
6707                 KAUTH_DEBUG("%p    DENIED - auth denied", vp);
6708                 return(KAUTH_RESULT_DENY);
6709         }
6710         if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
6711                 /*
6712                  * if we were successfully granted the right to search this directory
6713                  * and there were NO ACL DENYs for search and the posix permissions also don't
6714                  * deny execute, we can synthesize a global right that allows anyone to
6715                  * traverse this directory during a pathname lookup without having to
6716                  * match the credential associated with this cache of rights.
6717                  */
6718                 if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
6719                     ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
6720                      (S_IXUSR | S_IXGRP | S_IXOTH))) {
6721                         vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
6722                 }
6723         }
6724         if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) {
6725                 /*
6726                  * parent was successfully and newly authorized for content deletions
6727                  * add it to the cache, but only if it doesn't have the sticky
6728                  * bit set on it.  This same  check is done earlier guarding
6729                  * fetching of dva, and if we jumped to out without having done
6730                  * this, we will have returned already because of a non-zero
6731                  * 'result' value.
6732                  */
6733                 if (VATTR_IS_SUPPORTED(&dva, va_mode) &&
6734                     !(dva.va_mode & (S_ISVTX))) {
6735                         /* OK to cache delete rights */
6736                         KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp);
6737                         vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD);
6738                 }
6739         }
6740         if (parent_ref)
6741                 vnode_put(vp);
6742         /*
6743          * Note that this implies that we will allow requests for no rights, as well as
6744          * for rights that we do not recognise.  There should be none of these.
6745          */
6746         KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
6747         return(KAUTH_RESULT_ALLOW);
6748 }
6749
6750 int
6751 vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
6752 {
             /*
              * Validate (and fill in defaults for) the attributes in vap for a
              * new object being created in directory dvp.  All of the work is
              * done by vnode_authattr_new_internal(); NULL is passed for its
              * defaulted_fieldsp argument, so this entry point does not report
              * which fields were defaulted.  That routine's result (0 or an
              * errno) is returned unchanged.
              */
6753         return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
6754 }
6755
6756 /*
6757  * Check that the attribute information in vap can be legally applied to
6758  * a new file by the context.
      *
      * This routine both fills in defaults and validates; vap is modified in
      * place.  Fields the caller left inactive (uid, gid, flags, mode, create
      * time) are given default values before any permission check is made.
      *
      * Parameters:
      *   dvp                directory vnode the new object is created in; its
      *                      mount supplies the policy bits consulted here and
      *                      it is queried for the default gid.
      *   vap                requested attributes for the new object (in/out).
      *   noauth             non-zero: treat the caller as privileged regardless
      *                      of the credential in ctx.
      *   defaulted_fieldsp  optional, may be NULL.  Zeroed on entry and, at
      *                      exit, OR-ed with VATTR_PREPARE_DEFAULTED_MODE/GID/UID
      *                      for each field this routine defaulted.  It is
      *                      updated on error paths as well as on success.
      *   ctx                supplies the credential and the process whose umask
      *                      is used for the default mode.
      *
      * Returns 0 if the attributes are acceptable, otherwise an errno:
      *   EINVAL  ACL/UUID supplied but the filesystem lacks extended security,
      *           or an attribute outside VNODE_ATTR_NEWOBJ is active.
      *   EPERM   illegal flags, or a non-privileged caller requested an
      *           owner, group, setuid/setgid bit or owner/group UUID it is
      *           not entitled to.
      *   other   passed through from vnode_getattr() or the kauth_cred_*
      *           membership/GUID lookups.
6759  */
6760 static int
6761 vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
6762 {
6763         int             error;
6764         int             has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
6765         kauth_cred_t    cred;
6766         guid_t          changer;
6767         mount_t         dmp;
6768
6769         error = 0;
6770
6771         if (defaulted_fieldsp) {
6772                 *defaulted_fieldsp = 0;
6773         }
6774
6775         defaulted_owner = defaulted_group = defaulted_mode = 0;
6776
6777         /*
6778          * An ACL or owner/group UUID may only be supplied if the filesystem supports extended security.
6779          */
6780         if (!vfs_extendedsecurity(dvp->v_mount) &&
6781             (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
6782                 error = EINVAL;
6783                 goto out;
6784         }
6785
6786         /*
6787          * Default some fields.
6788          */
6789         dmp = dvp->v_mount;
6790
6791         /*
6792          * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
6793          * owner takes ownership of all new files.
              * This overrides any va_uid the caller supplied and is recorded as a defaulted owner.
6794          */
6795         if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
6796                 VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
6797                 defaulted_owner = 1;
6798         } else {
6799                 if (!VATTR_IS_ACTIVE(vap, va_uid)) {
6800                         /* default owner is current user */
6801                         VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
6802                         defaulted_owner = 1;
6803                 }
6804         }
6805
6806         /*
6807          * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
6808          * group takes ownership of all new files.
              * This overrides any va_gid the caller supplied and is recorded as a defaulted group.
6809          */
6810         if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
6811                 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
6812                 defaulted_group = 1;
6813         } else {
6814                 if (!VATTR_IS_ACTIVE(vap, va_gid)) {
6815                         /* default group comes from the parent directory; fall back to the caller's gid if it reports none */
6816                         struct vnode_attr dva;
6817                         VATTR_INIT(&dva);
6818                         VATTR_WANTED(&dva, va_gid);
6819                         if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
6820                                 goto out;
6821                         if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
6822                                 VATTR_SET(vap, va_gid, dva.va_gid);
6823                         } else {
6824                                 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
6825                         }
6826                         defaulted_group = 1;
6827                 }
6828         }
6829
             /* no flags requested: default to none (not reported via defaulted_fieldsp) */
6830         if (!VATTR_IS_ACTIVE(vap, va_flags))
6831                 VATTR_SET(vap, va_flags, 0);
6832
6833         /* default mode is everything, masked with current umask */
6834         if (!VATTR_IS_ACTIVE(vap, va_mode)) {
6835                 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
6836                 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
6837                 defaulted_mode = 1;
6838         }
6839         /* default the creation time to now if the caller did not supply one */
6840         if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
6841                 nanotime(&vap->va_create_time);
6842                 VATTR_SET_ACTIVE(vap, va_create_time);
6843         }
6844
6845         /*
6846          * Check for attempts to set nonsensical fields.
              * Only attributes in the VNODE_ATTR_NEWOBJ mask may be active for a new object.
6847          */
6848         if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
6849                 error = EINVAL;
6850                 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
6851                     vap->va_active & ~VNODE_ATTR_NEWOBJ);
6852                 goto out;
6853         }
6854
6855         /*
6856          * Quickly check for the applicability of any enforcement here.
6857          * Tests below maintain the integrity of the local security model.
              * On an auth-opaque mount they are skipped: error is still 0 here,
              * so we return success with the defaults above already applied.
6858          */
6859         if (vfs_authopaque(dvp->v_mount))
6860             goto out;
6861
6862         /*
6863          * We need to know if the caller is the superuser, or if the work is
6864          * otherwise already authorised.
6865          */
6866         cred = vfs_context_ucred(ctx);
6867         if (noauth) {
6868                 /* doing work for the kernel */
6869                 has_priv_suser = 1;
6870         } else {
6871                 has_priv_suser = vfs_context_issuser(ctx);
6872         }
6873
6874
             /*
              * Flag legality: a privileged caller may use any bit in
              * UF_SETTABLE | SF_SETTABLE, anyone else only bits in UF_SETTABLE.
              * NOTE(review): va_flags was defaulted above, so this test is
              * presumably always true by now - confirm against VATTR_SET.
              */
6875         if (VATTR_IS_ACTIVE(vap, va_flags)) {
6876                 if (has_priv_suser) {
6877                         if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
6878                                 error = EPERM;
6879                                 KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
6880                                 goto out;
6881                         }
6882                 } else {
6883                         if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
6884                                 error = EPERM;
6885                                 KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
6886                                 goto out;
6887                         }
6888                 }
6889         }
6890
6891         /* if not superuser, validate legality of new-item attributes */
6892         if (!has_priv_suser) {
                     /* only a mode explicitly supplied by the caller is checked for setuid/setgid */
6893                 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
6894                         /* setgid? caller must be a member of the new object's group */
6895                         if (vap->va_mode & S_ISGID) {
6896                                 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
6897                                         KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
6898                                         goto out;
6899                                 }
6900                                 if (!ismember) {
6901                                         KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
6902                                         error = EPERM;
6903                                         goto out;
6904                                 }
6905                         }
6906
6907                         /* setuid? caller must be the new object's owner */
6908                         if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
6909                                 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
6910                                 error = EPERM;
6911                                 goto out;
6912                         }
6913                 }
                     /* an explicitly requested owner must be the caller */
6914                 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
6915                         KAUTH_DEBUG("  DENIED - cannot create new item owned by %d", vap->va_uid);
6916                         error = EPERM;
6917                         goto out;
6918                 }
                     /* an explicitly requested group must be one the caller belongs to */
6919                 if (!defaulted_group) {
6920                         if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
6921                                 KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
6922                                 goto out;
6923                         }
6924                         if (!ismember) {
6925                                 KAUTH_DEBUG("  DENIED - cannot create new item with group %d - not a member", vap->va_gid);
6926                                 error = EPERM;
6927                                 goto out;
6928                         }
6929                 }
6930
6931                 /* initialising owner/group UUID: owner UUID must be the caller's, group UUID one the caller is a member of */
6932                 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
6933                         if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
6934                                 KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
6935                                 /* XXX ENOENT here - no GUID - should perhaps become EPERM */
6936                                 goto out;
6937                         }
6938                         if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
6939                                 KAUTH_DEBUG("  ERROR - cannot create item with supplied owner UUID - not us");
6940                                 error = EPERM;
6941                                 goto out;
6942                         }
6943                 }
6944                 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
6945                         if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
6946                                 KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
6947                                 goto out;
6948                         }
6949                         if (!ismember) {
6950                                 KAUTH_DEBUG("  ERROR - cannot create item with supplied group UUID - not a member");
6951                                 error = EPERM;
6952                                 goto out;
6953                         }
6954                 }
6955         }
6956 out:
             /* report which fields were defaulted; reached on both success and error paths */
6957         if (defaulted_fieldsp) {
6958                 if (defaulted_mode) {
6959                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
6960                 }
6961                 if (defaulted_group) {
6962                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID;
6963                 }
6964                 if (defaulted_owner) {
6965                         *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID;
6966                 }
6967         }
6968         return(error);
6969 }
6970
6971 /*
6972  * Check that the attribute information in vap can be legally written by the
6973  * context.
6974  *
6975  * Call this when you're not sure about the vnode_attr; either its contents
6976  * have come from an unknown source, or when they are variable.
6977  *
6978  * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
6979  * must be authorized to be permitted to write the vattr.
6980  */
6981 int
6982 vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
6983 {
6984         struct vnode_attr ova;
6985         kauth_action_t  required_action;
6986         int             error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
6987         guid_t          changer;
6988         gid_t           group;
6989         uid_t           owner;
6990         mode_t          newmode;
6991         kauth_cred_t    cred;
6992         uint32_t        fdelta;
6993
6994         VATTR_INIT(&ova);
6995         required_action = 0;
6996         error = 0;
6997
6998         /*
6999          * Quickly check for enforcement applicability.
7000          */
7001         if (vfs_authopaque(vp->v_mount))
7002                 goto out;
7003
7004         /*
7005          * Check for attempts to set nonsensical fields.
7006          */
7007         if (vap->va_active & VNODE_ATTR_RDONLY) {
7008                 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
7009                 error = EINVAL;
7010                 goto out;
7011         }
7012
7013         /*
7014          * We need to know if the caller is the superuser.
7015          */
7016         cred = vfs_context_ucred(ctx);
7017         has_priv_suser = kauth_cred_issuser(cred);
7018
7019         /*
7020          * If any of the following are changing, we need information from the old file:
7021          * va_uid
7022          * va_gid
7023          * va_mode
7024          * va_uuuid
7025          * va_guuid
7026          */
7027         if (VATTR_IS_ACTIVE(vap, va_uid) ||
7028             VATTR_IS_ACTIVE(vap, va_gid) ||
7029             VATTR_IS_ACTIVE(vap, va_mode) ||
7030             VATTR_IS_ACTIVE(vap, va_uuuid) ||
7031             VATTR_IS_ACTIVE(vap, va_guuid)) {
7032                 VATTR_WANTED(&ova, va_mode);
7033                 VATTR_WANTED(&ova, va_uid);
7034                 VATTR_WANTED(&ova, va_gid);
7035                 VATTR_WANTED(&ova, va_uuuid);
7036                 VATTR_WANTED(&ova, va_guuid);
7037                 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
7038         }
7039
7040         /*
7041          * If timestamps are being changed, we need to know who the file is owned
7042          * by.
7043          */
7044         if (VATTR_IS_ACTIVE(vap, va_create_time) ||
7045             VATTR_IS_ACTIVE(vap, va_change_time) ||
7046             VATTR_IS_ACTIVE(vap, va_modify_time) ||
7047             VATTR_IS_ACTIVE(vap, va_access_time) ||
7048             VATTR_IS_ACTIVE(vap, va_backup_time)) {
7049
7050                 VATTR_WANTED(&ova, va_uid);
7051 #if 0   /* enable this when we support UUIDs as official owners */
7052                 VATTR_WANTED(&ova, va_uuuid);
7053 #endif
7054                 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
7055         }
7056
7057         /*
7058          * If flags are being changed, we need the old flags.
7059          */
7060         if (VATTR_IS_ACTIVE(vap, va_flags)) {
7061                 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
7062                 VATTR_WANTED(&ova, va_flags);
7063         }
7064
7065         /*
7066          * If ACLs are being changed, we need the old ACLs.
7067          */
7068         if (VATTR_IS_ACTIVE(vap, va_acl)) {
7069                 KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
7070                 VATTR_WANTED(&ova, va_acl);
7071         }
7072
7073         /*
7074          * If the size is being set, make sure it's not a directory.
7075          */
7076         if (VATTR_IS_ACTIVE(vap, va_data_size)) {
7077                 /* size is meaningless on a directory, don't permit this */
7078                 if (vnode_isdir(vp)) {
7079                         KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory");
7080                         error = EISDIR;
7081                         goto out;
7082                 }
7083         }
7084
7085         /*
7086          * Get old data.
7087          */
7088         KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
7089         if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
7090                 KAUTH_DEBUG("  ERROR - got %d trying to get attributes", error);
7091                 goto out;
7092         }
7093
7094         /*
7095          * Size changes require write access to the file data.
7096          */
7097         if (VATTR_IS_ACTIVE(vap, va_data_size)) {
7098                 /* if we can't get the size, or it's different, we need write access */
7099                         KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
7100                         required_action |= KAUTH_VNODE_WRITE_DATA;
7101         }
7102
7103         /*
7104          * Changing timestamps?
7105          *
7106          * Note that we are only called to authorize user-requested time changes;
7107          * side-effect time changes are not authorized.  Authorisation is only
7108          * required for existing files.
7109          *
7110          * Non-owners are not permitted to change the time on an existing
7111          * file to anything other than the current time.
7112          */
7113         if (VATTR_IS_ACTIVE(vap, va_create_time) ||
7114             VATTR_IS_ACTIVE(vap, va_change_time) ||
7115             VATTR_IS_ACTIVE(vap, va_modify_time) ||
7116             VATTR_IS_ACTIVE(vap, va_access_time) ||
7117             VATTR_IS_ACTIVE(vap, va_backup_time)) {
7118                 /*
7119                  * The owner and root may set any timestamps they like,
7120                  * provided that the file is not immutable.  The owner still needs
7121                  * WRITE_ATTRIBUTES (implied by ownership but still deniable).
7122                  */
7123                 if (has_priv_suser || vauth_node_owner(&ova, cred)) {
7124                         KAUTH_DEBUG("ATTR - root or owner changing timestamps");
7125                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
7126                 } else {
7127                         /* just setting the current time? */
7128                         if (vap->va_vaflags & VA_UTIMES_NULL) {
7129                                 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
7130                                 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
7131                         } else {
7132                                 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
7133                                 error = EACCES;
7134                                 goto out;
7135                         }
7136                 }
7137         }
7138
7139         /*
7140          * Changing file mode?
7141          */
7142         if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
7143                 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
7144
7145                 /*
7146                  * Mode changes always have the same basic auth requirements.
7147                  */
7148                 if (has_priv_suser) {
7149                         KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
7150                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
7151                 } else {
7152                         /* need WRITE_SECURITY */
7153                         KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
7154                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
7155                 }
7156
7157                 /*
7158                  * Can't set the setgid bit if you're not in the group and not root.  Have to have
7159                  * existing group information in the case we're not setting it right now.
7160                  */
7161                 if (vap->va_mode & S_ISGID) {
7162                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
7163                         if (!has_priv_suser) {
7164                                 if (VATTR_IS_ACTIVE(vap, va_gid)) {
7165                                         group = vap->va_gid;
7166                                 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
7167                                         group = ova.va_gid;
7168                                 } else {
7169                                         KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
7170                                         error = EINVAL;
7171                                         goto out;
7172                                 }
7173                                 /*
7174                                  * This might be too restrictive; WRITE_SECURITY might be implied by
7175                                  * membership in this case, rather than being an additional requirement.
7176                                  */
7177                                 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
7178                                         KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
7179                                         goto out;
7180                                 }
7181                                 if (!ismember) {
7182                                         KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", group);
7183                                         error = EPERM;
7184                                         goto out;
7185                                 }
7186                         }
7187                 }
7188
7189                 /*
7190                  * Can't set the setuid bit unless you're root or the file's owner.
7191                  */
7192                 if (vap->va_mode & S_ISUID) {
7193                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
7194                         if (!has_priv_suser) {
7195                                 if (VATTR_IS_ACTIVE(vap, va_uid)) {
7196                                         owner = vap->va_uid;
7197                                 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
7198                                         owner = ova.va_uid;
7199                                 } else {
7200                                         KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
7201                                         error = EINVAL;
7202                                         goto out;
7203                                 }
7204                                 if (owner != kauth_cred_getuid(cred)) {
7205                                         /*
7206                                          * We could allow this if WRITE_SECURITY is permitted, perhaps.
7207                                          */
7208                                         KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
7209                                         error = EPERM;
7210                                         goto out;
7211                                 }
7212                         }
7213                 }
7214         }
7215
7216         /*
7217          * Validate/mask flags changes.  This checks that only the flags in
7218          * the UF_SETTABLE mask are being set, and preserves the flags in
7219          * the SF_SETTABLE case.
7220          *
7221          * Since flags changes may be made in conjunction with other changes,
7222          * we will ask the auth code to ignore immutability in the case that
7223          * the SF_* flags are not set and we are only manipulating the file flags.
7224          *
7225          */
7226         if (VATTR_IS_ACTIVE(vap, va_flags)) {
7227                 /* compute changing flags bits */
7228                 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
7229                         fdelta = vap->va_flags ^ ova.va_flags;
7230                 } else {
7231                         fdelta = vap->va_flags;
7232                 }
7233
7234                 if (fdelta != 0) {
7235                         KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
7236                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
7237
7238                         /* check that changing bits are legal */
7239                         if (has_priv_suser) {
7240                                 /*
7241                                  * The immutability check will prevent us from clearing the SF_*
7242                                  * flags unless the system securelevel permits it, so just check
7243                                  * for legal flags here.
7244                                  */
7245                                 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
7246                                         error = EPERM;
7247                                         KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
7248                                         goto out;
7249                                 }
7250                         } else {
7251                                 if (fdelta & ~UF_SETTABLE) {
7252                                         error = EPERM;
7253                                         KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
7254                                         goto out;
7255                                 }
7256                         }
7257                         /*
7258                          * If the caller has the ability to manipulate file flags,
7259                          * security is not reduced by ignoring them for this operation.
7260                          *
7261                          * A more complete test here would consider the 'after' states of the flags
7262                          * to determine whether it would permit the operation, but this becomes
7263                          * very complex.
7264                          *
7265                          * Ignoring immutability is conditional on securelevel; this does not bypass
7266                          * the SF_* flags if securelevel > 0.
7267                          */
7268                         required_action |= KAUTH_VNODE_NOIMMUTABLE;
7269                 }
7270         }
7271
7272         /*
7273          * Validate ownership information.
7274          */
7275         chowner = 0;
7276         chgroup = 0;
7277         clear_suid = 0;
7278         clear_sgid = 0;
7279
7280         /*
7281          * uid changing
7282          * Note that if the filesystem didn't give us a UID, we expect that it doesn't
7283          * support them in general, and will ignore it if/when we try to set it.
7284          * We might want to clear the uid out of vap completely here.
7285          */
7286         if (VATTR_IS_ACTIVE(vap, va_uid)) {
7287                 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
7288                 if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
7289                         KAUTH_DEBUG("  DENIED - non-superuser cannot change ownershipt to a third party");
7290                         error = EPERM;
7291                         goto out;
7292                 }
7293                 chowner = 1;
7294         }
7295                 clear_suid = 1;
7296         }
7297
7298         /*
7299          * gid changing
7300          * Note that if the filesystem didn't give us a GID, we expect that it doesn't
7301          * support them in general, and will ignore it if/when we try to set it.
7302          * We might want to clear the gid out of vap completely here.
7303          */
7304         if (VATTR_IS_ACTIVE(vap, va_gid)) {
7305                 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
7306                 if (!has_priv_suser) {
7307                         if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
7308                                 KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
7309                                 goto out;
7310                         }
7311                         if (!ismember) {
7312                                 KAUTH_DEBUG("  DENIED - group change from %d to %d but not a member of target group",
7313                                     ova.va_gid, vap->va_gid);
7314                                 error = EPERM;
7315                                 goto out;
7316                         }
7317                 }
7318                 chgroup = 1;
7319         }
7320                 clear_sgid = 1;
7321         }
7322
7323         /*
7324          * Owner UUID being set or changed.
7325          */
7326         if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
7327                 /* if the owner UUID is not actually changing ... */
7328                 if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
7329                         if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
7330                                 goto no_uuuid_change;
7331
7332                         /*
7333                          * If the current owner UUID is a null GUID, check
7334                          * it against the UUID corresponding to the owner UID.
7335                          */
7336                         if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
7337                             VATTR_IS_SUPPORTED(&ova, va_uid)) {
7338                                 guid_t uid_guid;
7339
7340                                 if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
7341                                     kauth_guid_equal(&vap->va_uuuid, &uid_guid))
7342                                         goto no_uuuid_change;
7343                         }
7344                 }
7345
7346                 /*
7347                  * The owner UUID cannot be set by a non-superuser to anything other than
7348                  * their own or a null GUID (to "unset" the owner UUID).
7349                  * Note that file systems must be prepared to handle the
7350                  * null UUID case in a manner appropriate for that file
7351                  * system.
7352                  */
7353                 if (!has_priv_suser) {
7354                         if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
7355                                 KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
7356                                 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
7357                                 goto out;
7358                         }
7359                         if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
7360                             !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
7361                                 KAUTH_DEBUG("  ERROR - cannot set supplied owner UUID - not us / null");
7362                                 error = EPERM;
7363                                 goto out;
7364                         }
7365                 }
7366                 chowner = 1;
7367                 clear_suid = 1;
7368         }
7369 no_uuuid_change:
7370         /*
7371          * Group UUID being set or changed.
7372          */
7373         if (VATTR_IS_ACTIVE(vap, va_guuid)) {
7374                 /* if the group UUID is not actually changing ... */
7375                 if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
7376                         if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
7377                                 goto no_guuid_change;
7378
7379                         /*
7380                          * If the current group UUID is a null UUID, check
7381                          * it against the UUID corresponding to the group GID.
7382                          */
7383                         if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
7384                             VATTR_IS_SUPPORTED(&ova, va_gid)) {
7385                                 guid_t gid_guid;
7386
7387                                 if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
7388                                     kauth_guid_equal(&vap->va_guuid, &gid_guid))
7389                                         goto no_guuid_change;
7390                         }
7391                 }
7392
7393                 /*
7394                  * The group UUID cannot be set by a non-superuser to anything other than
7395                  * one of which they are a member or a null GUID (to "unset"
7396                  * the group UUID).
7397                  * Note that file systems must be prepared to handle the
7398                  * null UUID case in a manner appropriate for that file
7399                  * system.
7400                  */
7401                 if (!has_priv_suser) {
7402                         if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid))
7403                                 ismember = 1;
7404                         else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
7405                                 KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
7406                                 goto out;
7407                         }
7408                         if (!ismember) {
7409                                 KAUTH_DEBUG("  ERROR - cannot set supplied group UUID - not a member / null");
7410                                 error = EPERM;
7411                                 goto out;
7412                         }
7413                 }
7414                 chgroup = 1;
7415         }
7416 no_guuid_change:
7417
7418         /*
7419          * Compute authorisation for group/ownership changes.
7420          */
7421         if (chowner || chgroup || clear_suid || clear_sgid) {
7422                 if (has_priv_suser) {
7423                         KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
7424                         required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
7425                 } else {
7426                         if (chowner) {
7427                                 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
7428                                 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
7429                         }
7430                         if (chgroup && !chowner) {
7431                                 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
7432                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
7433                         }
7434
7435                         /* clear set-uid and set-gid bits as required by Posix */
7436                         if (VATTR_IS_ACTIVE(vap, va_mode)) {
7437                                 newmode = vap->va_mode;
7438                         } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
7439                                 newmode = ova.va_mode;
7440                         } else {
7441                                 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
7442                                 newmode = 0;
7443                         }
7444                         if (newmode & (S_ISUID | S_ISGID)) {
7445                                 VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
7446                                 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
7447                         }
7448                 }
7449         }
7450
7451         /*
7452          * Authorise changes in the ACL.
7453          */
7454         if (VATTR_IS_ACTIVE(vap, va_acl)) {
7455
7456                 /* no existing ACL */
7457                 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
7458
7459                         /* adding an ACL */
7460                         if (vap->va_acl != NULL) {
7461                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
7462                                 KAUTH_DEBUG("CHMOD - adding ACL");
7463                         }
7464
7465                         /* removing an existing ACL */
7466                 } else if (vap->va_acl == NULL) {
7467                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
7468                         KAUTH_DEBUG("CHMOD - removing ACL");
7469
7470                         /* updating an existing ACL */
7471                 } else {
7472                         if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
7473                                 /* entry count changed, must be different */
7474                                 required_action |= KAUTH_VNODE_WRITE_SECURITY;
7475                                 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
7476                         } else if (vap->va_acl->acl_entrycount > 0) {
7477                                 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
7478                                 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
7479                                         sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
7480                                         required_action |= KAUTH_VNODE_WRITE_SECURITY;
7481                                         KAUTH_DEBUG("CHMOD - changing ACL entries");
7482                                 }
7483                         }
7484                 }
7485         }
7486
7487         /*
7488          * Other attributes that require authorisation.
7489          */
7490         if (VATTR_IS_ACTIVE(vap, va_encoding))
7491                 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
7492
7493 out:
7494         if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL))
7495                 kauth_acl_free(ova.va_acl);
7496         if (error == 0)
7497                 *actionp = required_action;
7498         return(error);
7499 }
7500
7501 static int
7502 setlocklocal_callback(struct vnode *vp, __unused void *cargs)
7503 {
7504         vnode_lock_spin(vp);
7505         vp->v_flag |= VLOCKLOCAL;
7506         vnode_unlock(vp);
7507
7508         return (VNODE_RETURNED);
7509 }
7510
7511 void
7512 vfs_setlocklocal(mount_t mp)
7513 {
7514         mount_lock_spin(mp);
7515         mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
7516         mount_unlock(mp);
7517
7518         /*
7519          * The number of active vnodes is expected to be
7520          * very small when vfs_setlocklocal is invoked.
7521          */
7522         vnode_iterate(mp, 0, setlocklocal_callback, NULL);
7523 }
7524
7525 void
7526 vfs_setunmountpreflight(mount_t mp)
7527 {
7528         mount_lock_spin(mp);
7529         mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
7530         mount_unlock(mp);
7531 }
7532
7533 void
7534 vfs_setcompoundopen(mount_t mp)
7535 {
7536         mount_lock_spin(mp);
7537         mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN;
7538         mount_unlock(mp);
7539 }
7540
7541 void
7542 vn_setunionwait(vnode_t vp)
7543 {
7544         vnode_lock_spin(vp);
7545         vp->v_flag |= VISUNION;
7546         vnode_unlock(vp);
7547 }
7548
7549
7550 void
7551 vn_checkunionwait(vnode_t vp)
7552 {
7553         vnode_lock_spin(vp);
7554         while ((vp->v_flag & VISUNION) == VISUNION)
7555                 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
7556         vnode_unlock(vp);
7557 }
7558
7559 void
7560 vn_clearunionwait(vnode_t vp, int locked)
7561 {
7562         if (!locked)
7563                 vnode_lock_spin(vp);
7564         if((vp->v_flag & VISUNION) == VISUNION) {
7565                 vp->v_flag &= ~VISUNION;
7566                 wakeup((caddr_t)&vp->v_flag);
7567         }
7568         if (!locked)
7569                 vnode_unlock(vp);
7570 }
7571
7572 /*
7573  * XXX - get "don't trigger mounts" flag for thread; used by autofs.
7574  */
7575 extern int thread_notrigger(void);
7576
7577 int
7578 thread_notrigger(void)
7579 {
7580         struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread());
7581         return (uth->uu_notrigger);
7582 }
7583
7584 /*
7585  * Removes orphaned apple double files during a rmdir
7586  * Works by:
7587  * 1. vnode_suspend().
7588  * 2. Call VNOP_READDIR() till the end of directory is reached.
7589  * 3. Check if the directory entries returned are regular files with name starting with "._".  If not, return ENOTEMPTY.
7590  * 4. Continue (2) and (3) till end of directory is reached.
7591  * 5. If all the entries in the directory were files with "._" name, delete all the files.
7592  * 6. vnode_resume()
7593  * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
7594  */
7595
7596 errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * restart_flag)
7597 {
7598
7599 #define UIO_BUFF_SIZE 2048
7600         uio_t auio = NULL;
7601         int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
7602         int open_flag = 0, full_erase_flag = 0;
7603         char uio_buf[ UIO_SIZEOF(1) ];
7604         char *rbuf = NULL, *cpos, *cend;
7605         struct nameidata nd_temp;
7606         struct dirent *dp;
7607         errno_t error;
7608
7609         error = vnode_suspend(vp);
7610
7611         /*
7612          * restart_flag is set so that the calling rmdir sleeps and resets
7613          */
7614         if (error == EBUSY)
7615                 *restart_flag = 1;
7616         if (error != 0)
7617                 goto outsc;
7618
7619         /*
7620          * set up UIO
7621          */
7622         MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
7623         if (rbuf)
7624                 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
7625                                 &uio_buf[0], sizeof(uio_buf));
7626         if (!rbuf || !auio) {
7627                 error = ENOMEM;
7628                 goto outsc;
7629         }
7630
7631         uio_setoffset(auio,0);
7632
7633         eofflag = 0;
7634
7635         if ((error = VNOP_OPEN(vp, FREAD, ctx)))
7636                 goto outsc;
7637         else
7638                 open_flag = 1;
7639
7640         /*
7641          * First pass checks if all files are appleDouble files.
7642          */
7643
7644         do {
7645                 siz = UIO_BUFF_SIZE;
7646                 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
7647                 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
7648
7649                 if((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx)))
7650                         goto outsc;
7651
7652                 if (uio_resid(auio) != 0)
7653                         siz -= uio_resid(auio);
7654
7655                 /*
7656                  * Iterate through directory
7657                  */
7658                 cpos = rbuf;
7659                 cend = rbuf + siz;
7660                 dp = (struct dirent*) cpos;
7661
7662                 if (cpos == cend)
7663                         eofflag = 1;
7664
7665                 while ((cpos < cend)) {
7666                         /*
7667                          * Check for . and .. as well as directories
7668                          */
7669                         if (dp->d_ino != 0 &&
7670                                         !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
7671                                             (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
7672                                 /*
7673                                  * Check for irregular files and ._ files
7674                                  * If there is a ._._ file abort the op
7675                                  */
7676                                 if ( dp->d_namlen < 2 ||
7677                                                 strncmp(dp->d_name,"._",2) ||
7678                                                 (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._",2))) {
7679                                         error = ENOTEMPTY;
7680                                         goto outsc;
7681                                 }
7682                         }
7683                         cpos += dp->d_reclen;
7684                         dp = (struct dirent*)cpos;
7685                 }
7686
7687                 /*
7688                  * workaround for HFS/NFS setting eofflag before end of file
7689                  */
7690                 if (vp->v_tag == VT_HFS && nentries > 2)
7691                         eofflag=0;
7692
7693                 if (vp->v_tag == VT_NFS) {
7694                         if (eofflag && !full_erase_flag) {
7695                                 full_erase_flag = 1;
7696                                 eofflag = 0;
7697                                 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
7698                         }
7699                         else if (!eofflag && full_erase_flag)
7700                                 full_erase_flag = 0;
7701                 }
7702
7703         } while (!eofflag);
7704         /*
7705          * If we've made it here all the files in the dir are ._ files.
7706          * We can delete the files even though the node is suspended
7707          * because we are the owner of the file.
7708          */
7709
7710         uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
7711         eofflag = 0;
7712         full_erase_flag = 0;
7713
7714         do {
7715                 siz = UIO_BUFF_SIZE;
7716                 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
7717                 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
7718
7719                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);
7720
7721                 if (error != 0)
7722                         goto outsc;
7723
7724                 if (uio_resid(auio) != 0)
7725                         siz -= uio_resid(auio);
7726
7727                 /*
7728                  * Iterate through directory
7729                  */
7730                 cpos = rbuf;
7731                 cend = rbuf + siz;
7732                 dp = (struct dirent*) cpos;
7733
7734                 if (cpos == cend)
7735                         eofflag = 1;
7736
7737                 while ((cpos < cend)) {
7738                         /*
7739                          * Check for . and .. as well as directories
7740                          */
7741                         if (dp->d_ino != 0 &&
7742                                         !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
7743                                             (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
7744                                           ) {
7745
7746                                 NDINIT(&nd_temp, DELETE, OP_UNLINK, USEDVP,
7747                                        UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name),
7748                                        ctx);
7749                                 nd_temp.ni_dvp = vp;
7750                                 error = unlink1(ctx, &nd_temp, 0);
7751
7752                                 if (error &&  error != ENOENT) {
7753                                         goto outsc;
7754                                 }
7755
7756                         }
7757                         cpos += dp->d_reclen;
7758                         dp = (struct dirent*)cpos;
7759                 }
7760
7761                 /*
7762                  * workaround for HFS/NFS setting eofflag before end of file
7763                  */
7764                 if (vp->v_tag == VT_HFS && nentries > 2)
7765                         eofflag=0;
7766
7767                 if (vp->v_tag == VT_NFS) {
7768                         if (eofflag && !full_erase_flag) {
7769                                 full_erase_flag = 1;
7770                                 eofflag = 0;
7771                                 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
7772                         }
7773                         else if (!eofflag && full_erase_flag)
7774                                 full_erase_flag = 0;
7775                 }
7776
7777         } while (!eofflag);
7778
7779
7780         error = 0;
7781
7782 outsc:
7783         if (open_flag)
7784                 VNOP_CLOSE(vp, FREAD, ctx);
7785
7786         uio_free(auio);
7787         FREE(rbuf, M_TEMP);
7788
7789         vnode_resume(vp);
7790
7791
7792         return(error);
7793
7794 }
7795
7796
7797 void
7798 lock_vnode_and_post(vnode_t vp, int kevent_num)
7799 {
7800         /* Only take the lock if there's something there! */
7801         if (vp->v_knotes.slh_first != NULL) {
7802                 vnode_lock(vp);
7803                 KNOTE(&vp->v_knotes, kevent_num);
7804                 vnode_unlock(vp);
7805         }
7806 }
7807
#ifdef JOE_DEBUG
/*
 * Debug-only (JOE_DEBUG) bookkeeping on the current thread's uthread.
 *
 * Adds count to uu_iocount.  When count is exactly 1 and fewer than 32
 * slots are in use, it also stores a 10-frame backtrace and the vnode in
 * the next uu_pcs / uu_vps slot and advances uu_vpindex.
 *
 * Vnodes with a resolver attached (CONFIG_TRIGGERS builds) and vnodes
 * flagged VSYSTEM are ignored entirely.
 */
static void
record_vp(vnode_t vp, int count)
{
        struct uthread *ut;

#if CONFIG_TRIGGERS
        if (vp->v_resolve)
                return;
#endif
        if ((vp->v_flag & VSYSTEM))
                return;

        ut = get_bsdthread_info(current_thread());
        ut->uu_iocount += count;

        /* only a single-count call is traced, and only while slots remain */
        if (count != 1 || !(ut->uu_vpindex < 32))
                return;

        OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);
        ut->uu_vps[ut->uu_vpindex] = vp;
        ut->uu_vpindex++;
}
#endif
7832
7833
7834 #if CONFIG_TRIGGERS
7835
/*
 * Trigger debug logging.  With TRIG_DEBUG non-zero, TRIG_LOG() prints the
 * calling function's name followed by the printf-style message; otherwise
 * it expands to nothing.
 */
#define TRIG_DEBUG 0

#if !TRIG_DEBUG
#define TRIG_LOG(...)
#else
#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
#endif
7843
7844 /*
7845  * Resolver result functions
7846  */
7847
7848 resolver_result_t
7849 vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
7850 {
7851         /*
7852          * |<---   32   --->|<---  28  --->|<- 4 ->|
7853          *      sequence        auxiliary    status
7854          */
7855         return (((uint64_t)seq) << 32) |
7856                (((uint64_t)(aux & 0x0fffffff)) << 4) |
7857                (uint64_t)(stat & 0x0000000F);
7858 }
7859
7860 enum resolver_status
7861 vfs_resolver_status(resolver_result_t result)
7862 {
7863         /* lower 4 bits is status */
7864         return (result & 0x0000000F);
7865 }
7866
7867 uint32_t
7868 vfs_resolver_sequence(resolver_result_t result)
7869 {
7870         /* upper 32 bits is sequence */
7871         return (uint32_t)(result >> 32);
7872 }
7873
7874 int
7875 vfs_resolver_auxiliary(resolver_result_t result)
7876 {
7877         /* 28 bits of auxiliary */
7878         return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
7879 }
7880
7881 /*
7882  * SPI
7883  * Call in for resolvers to update vnode trigger state
7884  */
7885 int
7886 vnode_trigger_update(vnode_t vp, resolver_result_t result)
7887 {
7888         vnode_resolve_t rp;
7889         uint32_t seq;
7890         enum resolver_status stat;
7891
7892         if (vp->v_resolve == NULL) {
7893                 return (EINVAL);
7894         }
7895
7896         stat = vfs_resolver_status(result);
7897         seq = vfs_resolver_sequence(result);
7898
7899         if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
7900                 return (EINVAL);
7901         }
7902
7903         rp = vp->v_resolve;
7904         lck_mtx_lock(&rp->vr_lock);
7905
7906         if (seq > rp->vr_lastseq) {
7907                 if (stat == RESOLVER_RESOLVED)
7908                         rp->vr_flags |= VNT_RESOLVED;
7909                 else
7910                         rp->vr_flags &= ~VNT_RESOLVED;
7911
7912                 rp->vr_lastseq = seq;
7913         }
7914
7915         lck_mtx_unlock(&rp->vr_lock);
7916
7917         return (0);
7918 }
7919
7920 static int
7921 vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
7922 {
7923         int error;
7924
7925         vnode_lock_spin(vp);
7926         if (vp->v_resolve != NULL) {
7927                 vnode_unlock(vp);
7928                 return EINVAL;
7929         } else {
7930                 vp->v_resolve = rp;
7931         }
7932         vnode_unlock(vp);
7933
7934         if (ref) {
7935                 error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
7936                 if (error != 0) {
7937                         panic("VNODE_REF_FORCE didn't help...");
7938                 }
7939         }
7940
7941         return 0;
7942 }
7943
7944 /*
7945  * VFS internal interfaces for vnode triggers
7946  *
7947  * vnode must already have an io count on entry
7948  * v_resolve is stable when io count is non-zero
7949  */
7950 static int
7951 vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
7952 {
7953         vnode_resolve_t rp;
7954         int result;
7955         char byte;
7956
7957 #if 1
7958         /* minimum pointer test (debugging) */
7959         if (tinfo->vnt_data)
7960                 byte = *((char *)tinfo->vnt_data);
7961 #endif
7962         MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK);
7963         if (rp == NULL)
7964                 return (ENOMEM);
7965
7966         lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);
7967
7968         rp->vr_resolve_func = tinfo->vnt_resolve_func;
7969         rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
7970         rp->vr_rearm_func = tinfo->vnt_rearm_func;
7971         rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
7972         rp->vr_data = tinfo->vnt_data;
7973         rp->vr_lastseq = 0;
7974         rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
7975         if (external) {
7976                 rp->vr_flags |= VNT_EXTERNAL;
7977         }
7978
7979         result = vnode_resolver_attach(vp, rp, external);
7980         if (result != 0) {
7981                 goto out;
7982         }
7983
7984         if (mp) {
7985                 OSAddAtomic(1, &mp->mnt_numtriggers);
7986         }
7987
7988         return (result);
7989
7990 out:
7991         FREE(rp, M_TEMP);
7992         return result;
7993 }
7994
7995 static void
7996 vnode_resolver_release(vnode_resolve_t rp)
7997 {
7998         /*
7999          * Give them a chance to free any private data
8000          */
8001         if (rp->vr_data && rp->vr_reclaim_func) {
8002                 rp->vr_reclaim_func(NULLVP, rp->vr_data);
8003         }
8004
8005         lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
8006         FREE(rp, M_TEMP);
8007
8008 }
8009
8010 /* Called after the vnode has been drained */
8011 static void
8012 vnode_resolver_detach(vnode_t vp)
8013 {
8014         vnode_resolve_t rp;
8015         mount_t mp;
8016
8017         mp = vnode_mount(vp);
8018
8019         vnode_lock(vp);
8020         rp = vp->v_resolve;
8021         vp->v_resolve = NULL;
8022         vnode_unlock(vp);
8023
8024         if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
8025                 vnode_rele_ext(vp, O_EVTONLY, 1);
8026         }
8027
8028         vnode_resolver_release(rp);
8029
8030         /* Keep count of active trigger vnodes per mount */
8031         OSAddAtomic(-1, &mp->mnt_numtriggers);
8032 }
8033
8034 /*
8035  * Pathname operations that don't trigger a mount for trigger vnodes
8036  */
8037 static const u_int64_t ignorable_pathops_mask =
8038         1LL << OP_MOUNT |
8039         1LL << OP_UNMOUNT |
8040         1LL << OP_STATFS |
8041         1LL << OP_ACCESS |
8042         1LL << OP_GETATTR |
8043         1LL << OP_LISTXATTR;
8044
8045 int
8046 vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp)
8047 {
8048         if (cnp->cn_flags & ISLASTCN)
8049                 return ((1LL << op) & ignorable_pathops_mask) == 0;
8050         else
8051                 return (1);
8052 }
8053
8054 __private_extern__
8055 void
8056 vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
8057 {
8058         vnode_resolve_t rp;
8059         resolver_result_t result;
8060         enum resolver_status status;
8061         uint32_t seq;
8062
8063         if ((vp->v_resolve == NULL) ||
8064             (vp->v_resolve->vr_rearm_func == NULL) ||
8065             (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
8066                 return;
8067         }
8068
8069         rp = vp->v_resolve;
8070         lck_mtx_lock(&rp->vr_lock);
8071
8072         /*
8073          * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
8074          */
8075         if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
8076                 lck_mtx_unlock(&rp->vr_lock);
8077                 return;
8078         }
8079
8080         /* Check if this vnode is already armed */
8081         if ((rp->vr_flags & VNT_RESOLVED) == 0) {
8082                 lck_mtx_unlock(&rp->vr_lock);
8083                 return;
8084         }
8085
8086         lck_mtx_unlock(&rp->vr_lock);
8087
8088         result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
8089         status = vfs_resolver_status(result);
8090         seq = vfs_resolver_sequence(result);
8091
8092         lck_mtx_lock(&rp->vr_lock);
8093         if (seq > rp->vr_lastseq) {
8094                 if (status == RESOLVER_UNRESOLVED)
8095                         rp->vr_flags &= ~VNT_RESOLVED;
8096                 rp->vr_lastseq = seq;
8097         }
8098         lck_mtx_unlock(&rp->vr_lock);
8099 }
8100
8101 __private_extern__
8102 int
8103 vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
8104 {
8105         vnode_resolve_t rp;
8106         enum path_operation op;
8107         resolver_result_t result;
8108         enum resolver_status status;
8109         uint32_t seq;
8110
8111         /* Only trigger on topmost vnodes */
8112         if ((vp->v_resolve == NULL) ||
8113             (vp->v_resolve->vr_resolve_func == NULL) ||
8114             (vp->v_mountedhere != NULL)) {
8115                 return (0);
8116         }
8117
8118         rp = vp->v_resolve;
8119         lck_mtx_lock(&rp->vr_lock);
8120
8121         /* Check if this vnode is already resolved */
8122         if (rp->vr_flags & VNT_RESOLVED) {
8123                 lck_mtx_unlock(&rp->vr_lock);
8124                 return (0);
8125         }
8126
8127         lck_mtx_unlock(&rp->vr_lock);
8128
8129         /*
8130          * XXX
8131          * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
8132          * is there anyway to know this???
8133          * there can also be other legitimate lookups in parallel
8134          *
8135          * XXX - should we call this on a separate thread with a timeout?
8136          *
8137          * XXX - should we use ISLASTCN to pick the op value???  Perhaps only leafs should
8138          * get the richer set and non-leafs should get generic OP_LOOKUP?  TBD
8139          */
8140         op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;
8141
8142         result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
8143         status = vfs_resolver_status(result);
8144         seq = vfs_resolver_sequence(result);
8145
8146         lck_mtx_lock(&rp->vr_lock);
8147         if (seq > rp->vr_lastseq) {
8148                 if (status == RESOLVER_RESOLVED)
8149                         rp->vr_flags |= VNT_RESOLVED;
8150                 rp->vr_lastseq = seq;
8151         }
8152         lck_mtx_unlock(&rp->vr_lock);
8153
8154         /* On resolver errors, propagate the error back up */
8155         return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0);
8156 }
8157
8158 static int
8159 vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
8160 {
8161         vnode_resolve_t rp;
8162         resolver_result_t result;
8163         enum resolver_status status;
8164         uint32_t seq;
8165
8166         if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) {
8167                 return (0);
8168         }
8169
8170         rp = vp->v_resolve;
8171         lck_mtx_lock(&rp->vr_lock);
8172
8173         /* Check if this vnode is already resolved */
8174         if ((rp->vr_flags & VNT_RESOLVED) == 0) {
8175                 printf("vnode_trigger_unresolve: not currently resolved\n");
8176                 lck_mtx_unlock(&rp->vr_lock);
8177                 return (0);
8178         }
8179
8180         rp->vr_flags |= VNT_VFS_UNMOUNTED;
8181
8182         lck_mtx_unlock(&rp->vr_lock);
8183
8184         /*
8185          * XXX
8186          * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
8187          * there can also be other legitimate lookups in parallel
8188          *
8189          * XXX - should we call this on a separate thread with a timeout?
8190          */
8191
8192         result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
8193         status = vfs_resolver_status(result);
8194         seq = vfs_resolver_sequence(result);
8195
8196         lck_mtx_lock(&rp->vr_lock);
8197         if (seq > rp->vr_lastseq) {
8198                 if (status == RESOLVER_UNRESOLVED)
8199                         rp->vr_flags &= ~VNT_RESOLVED;
8200                 rp->vr_lastseq = seq;
8201         }
8202         rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
8203         lck_mtx_unlock(&rp->vr_lock);
8204
8205         /* On resolver errors, propagate the error back up */
8206         return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0);
8207 }
8208
8209 static int
8210 triggerisdescendant(mount_t mp, mount_t rmp)
8211 {
8212         int match = FALSE;
8213
8214         /*
8215          * walk up vnode covered chain looking for a match
8216          */
8217         name_cache_lock_shared();
8218
8219         while (1) {
8220                 vnode_t vp;
8221
8222                 /* did we encounter "/" ? */
8223                 if (mp->mnt_flag & MNT_ROOTFS)
8224                         break;
8225
8226                 vp = mp->mnt_vnodecovered;
8227                 if (vp == NULLVP)
8228                         break;
8229
8230                 mp = vp->v_mount;
8231                 if (mp == rmp) {
8232                         match = TRUE;
8233                         break;
8234                 }
8235         }
8236
8237         name_cache_unlock();
8238
8239         return (match);
8240 }
8241
8242 struct trigger_unmount_info {
8243         vfs_context_t   ctx;
8244         mount_t         top_mp;
8245         vnode_t         trigger_vp;
8246         mount_t         trigger_mp;
8247         uint32_t        trigger_vid;
8248         int             flags;
8249 };
8250
8251 static int
8252 trigger_unmount_callback(mount_t mp, void * arg)
8253 {
8254         struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
8255         boolean_t mountedtrigger = FALSE;
8256
8257         /*
8258          * When we encounter the top level mount we're done
8259          */
8260         if (mp == infop->top_mp)
8261                 return (VFS_RETURNED_DONE);
8262
8263         if ((mp->mnt_vnodecovered == NULL) ||
8264             (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
8265                 return (VFS_RETURNED);
8266         }
8267
8268         if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
8269             (mp->mnt_vnodecovered->v_resolve != NULL) &&
8270             (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
8271                 mountedtrigger = TRUE;
8272         }
8273         vnode_put(mp->mnt_vnodecovered);
8274
8275         /*
8276          * When we encounter a mounted trigger, check if its under the top level mount
8277          */
8278         if ( !mountedtrigger || !triggerisdescendant(mp, infop->top_mp) )
8279                 return (VFS_RETURNED);
8280
8281         /*
8282          * Process any pending nested mount (now that its not referenced)
8283          */
8284         if ((infop->trigger_vp != NULLVP) &&
8285             (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
8286                 vnode_t vp = infop->trigger_vp;
8287                 int error;
8288
8289                 infop->trigger_vp = NULLVP;
8290
8291                 if (mp == vp->v_mountedhere) {
8292                         vnode_put(vp);
8293                         printf("trigger_unmount_callback: unexpected match '%s'\n",
8294                                 mp->mnt_vfsstat.f_mntonname);
8295                         return (VFS_RETURNED);
8296                 }
8297                 if (infop->trigger_mp != vp->v_mountedhere) {
8298                         vnode_put(vp);
8299                         printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
8300                                 infop->trigger_mp, vp->v_mountedhere);
8301                         goto savenext;
8302                 }
8303
8304                 error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
8305                 vnode_put(vp);
8306                 if (error) {
8307                         printf("unresolving: '%s', err %d\n",
8308                                 vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
8309                                 "???", error);
8310                         return (VFS_RETURNED_DONE); /* stop iteration on errors */
8311                 }
8312         }
8313 savenext:
8314         /*
8315          * We can't call resolver here since we hold a mount iter
8316          * ref on mp so save its covered vp for later processing
8317          */
8318         infop->trigger_vp = mp->mnt_vnodecovered;
8319         if ((infop->trigger_vp != NULLVP) &&
8320             (vnode_getwithref(infop->trigger_vp) == 0)) {
8321                 if (infop->trigger_vp->v_mountedhere == mp) {
8322                         infop->trigger_vid = infop->trigger_vp->v_id;
8323                         infop->trigger_mp = mp;
8324                 }
8325                 vnode_put(infop->trigger_vp);
8326         }
8327
8328         return (VFS_RETURNED);
8329 }
8330
8331 /*
8332  * Attempt to unmount any trigger mounts nested underneath a mount.
8333  * This is a best effort attempt and no retries are performed here.
8334  *
8335  * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull)
8336  */
8337 __private_extern__
8338 void
8339 vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
8340 {
8341         struct trigger_unmount_info info;
8342
8343         /* Must have trigger vnodes */
8344         if (mp->mnt_numtriggers == 0) {
8345                 return;
8346         }
8347         /* Avoid recursive requests (by checking covered vnode) */
8348         if ((mp->mnt_vnodecovered != NULL) &&
8349             (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
8350                 boolean_t recursive = FALSE;
8351
8352                 if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
8353                     (mp->mnt_vnodecovered->v_resolve != NULL) &&
8354                     (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
8355                         recursive = TRUE;
8356                 }
8357                 vnode_put(mp->mnt_vnodecovered);
8358                 if (recursive)
8359                         return;
8360         }
8361
8362         /*
8363          * Attempt to unmount any nested trigger mounts (best effort)
8364          */
8365         info.ctx = ctx;
8366         info.top_mp = mp;
8367         info.trigger_vp = NULLVP;
8368         info.trigger_vid = 0;
8369         info.trigger_mp = NULL;
8370         info.flags = flags;
8371
8372         (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
8373
8374         /*
8375          * Process remaining nested mount (now that its not referenced)
8376          */
8377         if ((info.trigger_vp != NULLVP) &&
8378             (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
8379                 vnode_t vp = info.trigger_vp;
8380
8381                 if (info.trigger_mp == vp->v_mountedhere) {
8382                         (void) vnode_trigger_unresolve(vp, flags, ctx);
8383                 }
8384                 vnode_put(vp);
8385         }
8386 }
8387
8388 int
8389 vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx)
8390 {
8391         struct nameidata nd;
8392         int res;
8393         vnode_t rvp, vp;
8394         struct vnode_trigger_param vtp;
8395
8396         /*
8397          * Must be called for trigger callback, wherein rwlock is held
8398          */
8399         lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
8400
8401         TRIG_LOG("Adding trigger at %s\n", relpath);
8402         TRIG_LOG("Trying VFS_ROOT\n");
8403
8404         /*
8405          * We do a lookup starting at the root of the mountpoint, unwilling
8406          * to cross into other mountpoints.
8407          */
8408         res = VFS_ROOT(mp, &rvp, ctx);
8409         if (res != 0) {
8410                 goto out;
8411         }
8412
8413         TRIG_LOG("Trying namei\n");
8414
8415         NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE,
8416                 CAST_USER_ADDR_T(relpath), ctx);
8417         nd.ni_dvp = rvp;
8418         res = namei(&nd);
8419         if (res != 0) {
8420                 vnode_put(rvp);
8421                 goto out;
8422         }
8423
8424         vp = nd.ni_vp;
8425         nameidone(&nd);
8426         vnode_put(rvp);
8427
8428         TRIG_LOG("Trying vnode_resolver_create()\n");
8429
8430         /*
8431          * Set up blob.  vnode_create() takes a larger structure
8432          * with creation info, and we needed something different
8433          * for this case.  One needs to win, or we need to munge both;
8434          * vnode_create() wins.
8435          */
8436         bzero(&vtp, sizeof(vtp));
8437         vtp.vnt_resolve_func = vtip->vti_resolve_func;
8438         vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
8439         vtp.vnt_rearm_func = vtip->vti_rearm_func;
8440         vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
8441         vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
8442         vtp.vnt_data = vtip->vti_data;
8443         vtp.vnt_flags = vtip->vti_flags;
8444
8445         res = vnode_resolver_create(mp, vp, &vtp, TRUE);
8446         vnode_put(vp);
8447 out:
8448         TRIG_LOG("Returning %d\n", res);
8449         return res;
8450 }
8451
8452 #endif /* CONFIG_TRIGGERS */