+ if (retval == 0) {
+ vnode_iterate_clear(mp);
+ mount_unlock(mp);
+ return(retval);
+ }
+
+ /* iterate over all the vnodes */
+ while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
+ vp = TAILQ_FIRST(&mp->mnt_workerqueue);
+ TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
+ TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
+ if ( (vp->v_mount != mp) || (vp == skipvp)) {
+ continue;
+ }
+ vid = vp->v_id;
+ mount_unlock(mp);
+ vnode_lock(vp);
+
+ if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) {
+ vnode_unlock(vp);
+ mount_lock(mp);
+ continue;
+ }
+
+ /*
+ * If requested, skip over vnodes marked VSYSTEM
+ * or VNOFLUSH.
+ */
+ if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
+ (vp->v_flag & VNOFLUSH))) {
+ vnode_unlock(vp);
+ mount_lock(mp);
+ continue;
+ }
+ /*
+ * If requested, skip over vnodes marked VSWAP.
+ */
+ if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
+ vnode_unlock(vp);
+ mount_lock(mp);
+ continue;
+ }
+ /*
+ * If requested, skip over vnodes marked VROOT.
+ */
+ if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
+ vnode_unlock(vp);
+ mount_lock(mp);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, only flush out regular file
+ * vnodes open for writing.
+ */
+ if ((flags & WRITECLOSE) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ vnode_unlock(vp);
+ mount_lock(mp);
+ continue;
+ }
+ /*
+ * If the real usecount is 0, all we need to do is clear
+ * out the vnode data structures and we are done.
+ */
+ if (((vp->v_usecount == 0) ||
+ ((vp->v_usecount - vp->v_kusecount) == 0))) {
+ vp->v_iocount++; /* so that drain waits for other iocounts */
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vnode_reclaim_internal(vp, 1, 0);
+ vnode_dropiocount(vp, 1);
+ vnode_list_add(vp);
+
+ vnode_unlock(vp);
+ reclaimed++;
+ mount_lock(mp);
+ continue;
+ }
+ /*
+ * If FORCECLOSE is set, forcibly close the vnode.
+ * For block or character devices, revert to an
+ * anonymous device. For all other files, just kill them.
+ */
+ if (flags & FORCECLOSE) {
+ if (vp->v_type != VBLK && vp->v_type != VCHR) {
+ vp->v_iocount++; /* so that drain waits for other iocounts */
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vnode_reclaim_internal(vp, 1, 0);
+ vnode_dropiocount(vp, 1);
+ vnode_list_add(vp);
+ vnode_unlock(vp);
+ } else {
+ vclean(vp, 0, p);
+ vp->v_lflag &= ~VL_DEAD;
+ vp->v_op = spec_vnodeop_p;
+ vnode_unlock(vp);
+ }
+ mount_lock(mp);
+ continue;
+ }
+#if DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ vnode_unlock(vp);
+ mount_lock(mp);
+ busy++;
+ }
+
+ /* At this point the worker queue has been fully processed */
+ if (busy && ((flags & FORCECLOSE)==0) && reclaimed) {
+ busy = 0;
+ reclaimed = 0;
+ (void)vnode_iterate_reloadq(mp);
+ /* returned with mount lock held */
+ goto loop;
+ }
+
+ /* if new vnodes were created in between retry the reclaim */
+ if ( vnode_iterate_reloadq(mp) != 0) {
+ if (!(busy && ((flags & FORCECLOSE)==0)))
+ goto loop;
+ }
+ vnode_iterate_clear(mp);
+ mount_unlock(mp);
+
+ if (busy && ((flags & FORCECLOSE)==0))
+ return (EBUSY);
+ return (0);
+}
+
+int num_recycledvnodes=0;
+/*
+ * Disassociate the underlying file system from a vnode.
+ * The vnode lock is held on entry.
+ */
+static void
+vclean(vnode_t vp, int flags, proc_t p)
+{
+ struct vfs_context context;
+ int active;
+ int need_inactive;
+ int already_terminating;
+ kauth_cred_t ucred = NULL;
+
+ context.vc_proc = p;
+ context.vc_ucred = kauth_cred_get();
+
+ /*
+ * Check to see if the vnode is in use.
+ * If so, we need to send it a VNOP_CLOSE and a
+ * VNOP_INACTIVE below as part of cleaning it out.
+ */
+ active = vp->v_usecount;
+
+ /*
+ * just in case we missed sending a needed
+ * VNOP_INACTIVE, we'll do it now
+ */
+ need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
+
+ vp->v_lflag &= ~VL_NEEDINACTIVE;
+
+ /*
+ * Prevent the vnode from being recycled or
+ * brought into use while we clean it out.
+ */
+ already_terminating = (vp->v_lflag & VL_TERMINATE);
+
+ vp->v_lflag |= VL_TERMINATE;
+
+ /*
+ * remove the vnode from any mount list
+ * it might be on...
+ */
+ insmntque(vp, (struct mount *)0);
+
+ ucred = vp->v_cred;
+ vp->v_cred = NULL;
+
+ vnode_unlock(vp);
+
+ if (ucred)
+ kauth_cred_rele(ucred);
+
+ OSAddAtomic(1, &num_recycledvnodes);
+ /*
+ * purge from the name cache as early as possible...
+ */
+ cache_purge(vp);
+
+ if (active && (flags & DOCLOSE))
+ VNOP_CLOSE(vp, IO_NDELAY, &context);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ */
+ if (flags & DOCLOSE) {
+#if NFSCLIENT
+ if (vp->v_tag == VT_NFS)
+ nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
+ else
+#endif
+ {
+ VNOP_FSYNC(vp, MNT_WAIT, &context);
+ buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
+ }
+ if (UBCINFOEXISTS(vp))
+ /*
+ * Clean the pages in VM.
+ */
+ (void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL);
+ }
+ if (UBCINFOEXISTS(vp))
+ cluster_release(vp->v_ubcinfo);
+
+ if (active || need_inactive)
+ VNOP_INACTIVE(vp, &context);
+
+ /* Destroy ubc named reference */
+ ubc_destroy_named(vp);
+
+ /*
+ * Reclaim the vnode.
+ */
+ if (VNOP_RECLAIM(vp, &context))
+ panic("vclean: cannot reclaim");
+
+ // make sure the name & parent ptrs get cleaned out!
+ vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);
+
+ vnode_lock(vp);
+
+ vp->v_mount = dead_mountp;
+ vp->v_op = dead_vnodeop_p;
+ vp->v_tag = VT_NON;
+ vp->v_data = NULL;
+
+ vp->v_lflag |= VL_DEAD;
+
+ if (already_terminating == 0) {
+ vp->v_lflag &= ~VL_TERMINATE;
+ /*
+ * Done with purge, notify sleepers of the grim news.
+ */
+ if (vp->v_lflag & VL_TERMWANT) {
+ vp->v_lflag &= ~VL_TERMWANT;
+ wakeup(&vp->v_lflag);
+ }
+ }
+}
+
+/*
+ * Eliminate all activity associated with the requested vnode
+ * and with all vnodes aliased to the requested vnode.
+ */
+int
+vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
+{
+ struct vnode *vq;
+ int vid;
+
+#if DIAGNOSTIC
+ if ((flags & REVOKEALL) == 0)
+ panic("vnop_revoke");
+#endif
+
+ if (vp->v_flag & VALIASED) {
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ vnode_lock(vp);
+ if (vp->v_lflag & VL_TERMINATE) {
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ vnode_unlock(vp);
+ /*
+ * Ensure that vp will not be vgone'd while we
+ * are eliminating its aliases.
+ */
+ SPECHASH_LOCK();
+ while (vp->v_flag & VALIASED) {
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type || vp == vq)
+ continue;
+ vid = vq->v_id;
+ SPECHASH_UNLOCK();
+ if (vnode_getwithvid(vq,vid)){
+ SPECHASH_LOCK();
+ break;
+ }
+ vnode_reclaim_internal(vq, 0, 0);
+ vnode_put(vq);
+ SPECHASH_LOCK();
+ break;
+ }
+ }
+ SPECHASH_UNLOCK();
+ }
+ vnode_reclaim_internal(vp, 0, 0);
+
+ return (0);
+}
+
+/*
+ * Recycle an unused vnode: if it is busy, just mark it for
+ * termination when its counts drop and return 0; otherwise
+ * reclaim it immediately and return 1.
+ */
+int
+vnode_recycle(struct vnode *vp)
+{
+ vnode_lock(vp);
+
+ if (vp->v_iocount || vp->v_usecount) {
+ vp->v_lflag |= VL_MARKTERM;
+ vnode_unlock(vp);
+ return(0);
+ }
+ vnode_reclaim_internal(vp, 1, 0);
+ vnode_unlock(vp);
+
+ return (1);
+}
+
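+/*
+ * Mark a vnode for termination when its iocount is dropped.
+ * If the vnode is in use (other iocounts or a usecount), nothing
+ * is done and 0 is returned; otherwise VL_MARKTERM is set and
+ * 1 is returned.
+ */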
+static int
+vnode_reload(vnode_t vp)
+{
+ vnode_lock(vp);
+
+ if ((vp->v_iocount > 1) || vp->v_usecount) {
+ vnode_unlock(vp);
+ return(0);
+ }
+ if (vp->v_iocount <= 0)
+ panic("vnode_reload with no iocount %d", vp->v_iocount);
+
+ /* mark for release when the iocount is dropped */
+ vp->v_lflag |= VL_MARKTERM;
+ vnode_unlock(vp);
+
+ return (1);
+}
+
+
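+/*
+ * Completely disassociate a vnode from its underlying filesystem
+ * and, for special devices, remove it from the device alias list
+ * and free its specinfo.
+ */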
+static void
+vgone(vnode_t vp)
+{
+ struct vnode *vq;
+ struct vnode *vx;
+
+ /*
+ * Clean out the filesystem specific data.
+ * vclean also takes care of removing the
+ * vnode from any mount list it might be on
+ */
+ vclean(vp, DOCLOSE, current_proc());
+
+ /*
+ * If special device, remove it from special device alias list
+ * if it is on one.
+ */
+ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
+ SPECHASH_LOCK();
+ if (*vp->v_hashchain == vp) {
+ *vp->v_hashchain = vp->v_specnext;
+ } else {
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_specnext != vp)
+ continue;
+ vq->v_specnext = vp->v_specnext;
+ break;
+ }
+ if (vq == NULL)
+ panic("missing bdev");
+ }
+ if (vp->v_flag & VALIASED) {
+ vx = NULL;
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vx)
+ break;
+ vx = vq;
+ }
+ if (vx == NULL)
+ panic("missing alias");
+ if (vq == NULL)
+ vx->v_flag &= ~VALIASED;
+ vp->v_flag &= ~VALIASED;
+ }
+ SPECHASH_UNLOCK();
+ {
+ struct specinfo *tmp = vp->v_specinfo;
+ vp->v_specinfo = NULL;
+ FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
+ }
+ }
+}
+
+/*
+ * Look up the special device vnode for (dev, type) and check
+ * whether it is in use with a filesystem mounted on it.
+ */
+int
+check_mountedon(dev_t dev, enum vtype type, int *errorp)
+{
+ vnode_t vp;
+ int rc = 0;
+ int vid;
+
+loop:
+ SPECHASH_LOCK();
+ for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
+ if (dev != vp->v_rdev || type != vp->v_type)
+ continue;
+ vid = vp->v_id;
+ SPECHASH_UNLOCK();
+ if (vnode_getwithvid(vp,vid))
+ goto loop;
+ vnode_lock(vp);
+ if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
+ vnode_unlock(vp);
+ if ((*errorp = vfs_mountedon(vp)) != 0)
+ rc = 1;
+ } else
+ vnode_unlock(vp);
+ vnode_put(vp);
+ return(rc);
+ }
+ SPECHASH_UNLOCK();
+ return (0);
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(vnode_t vp)
+{
+ vnode_t vq, vnext;
+ int count;
+ int vid;
+
+loop:
+ if ((vp->v_flag & VALIASED) == 0)
+ return (vp->v_usecount - vp->v_kusecount);
+
+ SPECHASH_LOCK();
+ for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
+ vnext = vq->v_specnext;
+ if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
+ continue;
+ vid = vq->v_id;
+ SPECHASH_UNLOCK();
+
+ if (vnode_getwithvid(vq, vid)) {
+ goto loop;
+ }
+ /*
+ * Alias, but not in use, so flush it out.
+ */
+ vnode_lock(vq);
+ if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
+ vnode_reclaim_internal(vq, 1, 0);
+ vnode_unlock(vq);
+ vnode_put(vq);
+ goto loop;
+ }
+ count += (vq->v_usecount - vq->v_kusecount);
+ vnode_unlock(vq);
+ vnode_put(vq);
+
+ SPECHASH_LOCK();
+ }
+ SPECHASH_UNLOCK();
+
+ return (count);
+}
+
+int prtactive = 0; /* 1 => print out reclaim of active vnodes */
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+ { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
+
+void
+vprint(const char *label, struct vnode *vp)
+{
+ char sbuf[64];
+
+ if (label != NULL)
+ printf("%s: ", label);
+ printf("type %s, usecount %d, writecount %d",
+ typename[vp->v_type], vp->v_usecount, vp->v_writecount);
+ sbuf[0] = '\0';
+ if (vp->v_flag & VROOT)
+ strcat(sbuf, "|VROOT");
+ if (vp->v_flag & VTEXT)
+ strcat(sbuf, "|VTEXT");
+ if (vp->v_flag & VSYSTEM)
+ strcat(sbuf, "|VSYSTEM");
+ if (vp->v_flag & VNOFLUSH)
+ strcat(sbuf, "|VNOFLUSH");
+ if (vp->v_flag & VBWAIT)
+ strcat(sbuf, "|VBWAIT");
+ if (vp->v_flag & VALIASED)
+ strcat(sbuf, "|VALIASED");
+ if (sbuf[0] != '\0')
+ printf(" flags (%s)", &sbuf[1]);
+}
+
+
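+/*
+ * Build the path of a vnode into pathbuf.  On entry *len is the
+ * size of the buffer; on return it is the length of the path
+ * produced by build_path().
+ */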
+int
+vn_getpath(struct vnode *vp, char *pathbuf, int *len)
+{
+ return build_path(vp, pathbuf, *len, len);
+}
+
+
+static char *extension_table=NULL;
+static int nexts;
+static int max_ext_width;
+
+static int
+extension_cmp(void *a, void *b)
+{
+ return (strlen((char *)a) - strlen((char *)b));
+}
+
+
+//
+// This is the api LaunchServices uses to inform the kernel of
+// the list of package extensions to ignore.
+//
+// Internally we keep the list sorted by the length of
+// the extension. We sort the
+// list of extensions so that we can speed up our searches
+// when comparing file names -- we only compare extensions
+// that could possibly fit into the file name, not all of
+// them (i.e. a short 8 character name can't have an 8
+// character extension).
+//
+__private_extern__ int
+set_package_extensions_table(void *data, int nentries, int maxwidth)
+{
+ char *new_exts;
+ int error;
+
+ if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
+ return EINVAL;
+ }
+
+ MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK);
+
+ error = copyin(CAST_USER_ADDR_T(data), new_exts, nentries * maxwidth);
+ if (error) {
+ FREE(new_exts, M_TEMP);
+ return error;
+ }
+
+ if (extension_table) {
+ FREE(extension_table, M_TEMP);
+ }
+ extension_table = new_exts;
+ nexts = nentries;
+ max_ext_width = maxwidth;
+
+ qsort(extension_table, nexts, maxwidth, extension_cmp);
+
+ return 0;
+}
+
+
+__private_extern__ int
+is_package_name(char *name, int len)
+{
+ int i, extlen;
+ char *ptr, *name_ext;
+
+ if (len <= 3) {
+ return 0;
+ }
+
+ name_ext = NULL;
+ for(ptr=name; *ptr != '\0'; ptr++) {
+ if (*ptr == '.') {
+ name_ext = ptr;
+ }
+ }
+
+ // if there is no "." extension, it can't match
+ if (name_ext == NULL) {
+ return 0;
+ }
+
+ // advance over the "."
+ name_ext++;
+
+ // now iterate over all the extensions to see if any match
+ ptr = &extension_table[0];
+ for(i=0; i < nexts; i++, ptr+=max_ext_width) {
+ extlen = strlen(ptr);
+ if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
+ // aha, a match!
+ return 1;
+ }
+ }
+
+ // if we get here, no extension matched
+ return 0;
+}
+
+int
+vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
+{
+ char *ptr, *end;
+ int comp=0;
+
+ *component = -1;
+ if (*path != '/') {
+ return EINVAL;
+ }
+
+ end = path + 1;
+ while(end < path + pathlen && *end != '\0') {
+ while(end < path + pathlen && *end == '/' && *end != '\0') {
+ end++;
+ }
+
+ ptr = end;
+
+ while(end < path + pathlen && *end != '/' && *end != '\0') {
+ end++;
+ }
+
+ if (end > path + pathlen) {
+ // hmm, string wasn't null terminated
+ return EINVAL;
+ }
+
+ *end = '\0';
+ if (is_package_name(ptr, end - ptr)) {
+ *component = comp;
+ break;
+ }
+
+ end++;
+ comp++;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Top level filesystem related information gathering.
+ */
+extern unsigned int vfs_nummntops;
+
+int
+vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
+ user_addr_t newp, size_t newlen, struct proc *p)
+{
+ struct vfstable *vfsp;
+ int *username;
+ u_int usernamelen;
+ int error;
+ struct vfsconf *vfsc;
+
+ /*
+ * The VFS_NUMMNTOPS shouldn't be at name[0] since it
+ * is a VFS generic variable. So now we must check
+ * namelen so we don't end up covering any UFS
+ * variables (since the UFS vfc_typenum is 1).
+ *
+ * It should have been:
+ * name[0]: VFS_GENERIC
+ * name[1]: VFS_NUMMNTOPS
+ */
+ if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
+ return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
+ }
+
+ /* all sysctl names at this level are at least name and field */
+ if (namelen < 2)
+ return (EISDIR); /* overloaded */
+ if (name[0] != VFS_GENERIC) {
+ struct vfs_context context;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[0])
+ break;
+ if (vfsp == NULL)
+ return (ENOTSUP);
+ context.vc_proc = p;
+ context.vc_ucred = kauth_cred_get();
+
+ return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
+ oldp, oldlenp, newp, newlen, &context));
+ }
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
+ case VFS_CONF:
+ if (namelen < 3)
+ return (ENOTDIR); /* overloaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (ENOTSUP);
+ vfsc = (struct vfsconf *)vfsp;
+ if (proc_is64bit(p)) {
+ struct user_vfsconf usr_vfsc;
+ usr_vfsc.vfc_vfsops = CAST_USER_ADDR_T(vfsc->vfc_vfsops);
+ bcopy(vfsc->vfc_name, usr_vfsc.vfc_name, sizeof(usr_vfsc.vfc_name));
+ usr_vfsc.vfc_typenum = vfsc->vfc_typenum;
+ usr_vfsc.vfc_refcount = vfsc->vfc_refcount;
+ usr_vfsc.vfc_flags = vfsc->vfc_flags;
+ usr_vfsc.vfc_mountroot = CAST_USER_ADDR_T(vfsc->vfc_mountroot);
+ usr_vfsc.vfc_next = CAST_USER_ADDR_T(vfsc->vfc_next);
+ return (sysctl_rdstruct(oldp, oldlenp, newp, &usr_vfsc,
+ sizeof(usr_vfsc)));
+ }
+ else {
+ return (sysctl_rdstruct(oldp, oldlenp, newp, vfsc,
+ sizeof(struct vfsconf)));
+ }
+
+ case VFS_SET_PACKAGE_EXTS:
+ return set_package_extensions_table((void *)name[1], name[2], name[3]);
+ }
+ /*
+ * We need to get back into the general MIB, so we need to re-prepend
+ * CTL_VFS to our name and try userland_sysctl().
+ */
+ usernamelen = namelen + 1;
+ MALLOC(username, int *, usernamelen * sizeof(*username),
+ M_TEMP, M_WAITOK);
+ bcopy(name, username + 1, namelen * sizeof(*name));
+ username[0] = CTL_VFS;
+ error = userland_sysctl(p, username, usernamelen, oldp,
+ oldlenp, 1, newp, newlen, oldlenp);
+ FREE(username, M_TEMP);
+ return (error);
+}
+
+int kinfo_vdebug = 1;
+#define KINFO_VNODESLOP 10
+/*
+ * Dump vnode list (via sysctl).
+ * Copyout address of vnode followed by vnode.
+ */
+/* ARGSUSED */
+int
+sysctl_vnode(__unused user_addr_t where, __unused size_t *sizep)
+{
+#if 0
+ struct mount *mp, *nmp;
+ struct vnode *nvp, *vp;
+ char *bp = where, *savebp;
+ char *ewhere;
+ int error;
+
+#define VPTRSZ sizeof (struct vnode *)
+#define VNODESZ sizeof (struct vnode)
+ if (where == NULL) {
+ *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
+ return (0);
+ }
+ ewhere = where + *sizep;
+
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ savebp = bp;
+again:
+ TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
+ /*
+ * Check that the vp is still associated with
+ * this filesystem. RACE: could have been
+ * recycled onto the same filesystem.
+ */
+ if (vp->v_mount != mp) {
+ if (kinfo_vdebug)
+ printf("kinfo: vp changed\n");
+ bp = savebp;
+ goto again;
+ }
+ if (bp + VPTRSZ + VNODESZ > ewhere) {
+ vfs_unbusy(mp);
+ *sizep = bp - where;
+ return (ENOMEM);
+ }
+ if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
+ (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) {
+ vfs_unbusy(mp);
+ return (error);
+ }
+ bp += VPTRSZ + VNODESZ;
+ }
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp);
+ }
+
+ *sizep = bp - where;
+ return (0);
+#else
+ return(EINVAL);
+#endif
+}
+
+/*
+ * Check to see if a filesystem is mounted on a block device.
+ */
+int
+vfs_mountedon(struct vnode *vp)
+{
+ struct vnode *vq;
+ int error = 0;
+
+ SPECHASH_LOCK();
+ if (vp->v_specflags & SI_MOUNTEDON) {
+ error = EBUSY;
+ goto out;
+ }
+ if (vp->v_flag & VALIASED) {
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vq->v_specflags & SI_MOUNTEDON) {
+ error = EBUSY;
+ break;
+ }
+ }
+ }
+out:
+ SPECHASH_UNLOCK();
+ return (error);
+}
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+__private_extern__ void
+vfs_unmountall(void)
+{
+ struct mount *mp;
+ struct proc *p = current_proc();
+ int error;
+
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ mount_list_lock();
+ while(!TAILQ_EMPTY(&mountlist)) {
+ mp = TAILQ_LAST(&mountlist, mntlist);
+ mount_list_unlock();
+ error = dounmount(mp, MNT_FORCE, p);
+ if (error) {
+ mount_list_lock();
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ continue;
+ }
+ mount_list_lock();
+ }
+ mount_list_unlock();
+}
+
+
+/*
+ * This routine is called from vnode_pager_no_senders()
+ * which in turn can be called with vnode locked by vnode_uncache()
+ * But it could also get called as a result of vm_object_cache_trim().
+ * In that case lock state is unknown.
+ * AGE the vnode so that it gets recycled quickly.
+ */
+__private_extern__ void
+vnode_pager_vrele(struct vnode *vp)
+{
+ vnode_lock(vp);
+
+ if (!ISSET(vp->v_lflag, VL_TERMINATE))
+ panic("vnode_pager_vrele: vp not in termination");
+ vp->v_lflag &= ~VNAMED_UBC;
+
+ if (UBCINFOEXISTS(vp)) {
+ struct ubc_info *uip = vp->v_ubcinfo;
+
+ if (ISSET(uip->ui_flags, UI_WASMAPPED))
+ SET(vp->v_flag, VWASMAPPED);
+ vp->v_ubcinfo = UBC_INFO_NULL;
+
+ ubc_info_deallocate(uip);
+ } else {
+ panic("NO ubcinfo in vnode_pager_vrele");
+ }
+ vnode_unlock(vp);
+
+ wakeup(&vp->v_lflag);
+}
+
+
+#include <sys/disk.h>
+
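+/*
+ * Query the underlying block device for its I/O characteristics
+ * (block size, maximum transfer and segment sizes for reads and
+ * writes, whether it is virtual, whether it shares the root device)
+ * and record them in the mount structure.
+ */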
+errno_t
+vfs_init_io_attributes(vnode_t devvp, mount_t mp)
+{
+ int error;
+ off_t readblockcnt;
+ off_t writeblockcnt;
+ off_t readmaxcnt;
+ off_t writemaxcnt;
+ off_t readsegcnt;
+ off_t writesegcnt;
+ off_t readsegsize;
+ off_t writesegsize;
+ u_long blksize;
+ u_int64_t temp;
+ struct vfs_context context;
+
+ proc_t p = current_proc();
+
+ context.vc_proc = p;
+ context.vc_ucred = kauth_cred_get();
+
+ int isvirtual = 0;
+ /*
+ * determine if this mount point exists on the same device as the root
+ * partition... if so, then it comes under the hard throttle control
+ */
+ int thisunit = -1;
+ static int rootunit = -1;
+
+ if (rootunit == -1) {
+ if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, &context))
+ rootunit = -1;
+ else if (rootvp == devvp)
+ mp->mnt_kern_flag |= MNTK_ROOTDEV;
+ }
+ if (devvp != rootvp && rootunit != -1) {
+ if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, &context) == 0) {
+ if (thisunit == rootunit)
+ mp->mnt_kern_flag |= MNTK_ROOTDEV;
+ }
+ }
+ /*
+ * force the spec device to re-cache
+ * the underlying block size in case
+ * the filesystem overrode the initial value
+ */
+ set_fsblocksize(devvp);
+
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
+ (caddr_t)&blksize, 0, &context)))
+ return (error);
+
+ mp->mnt_devblocksize = blksize;
+
+ if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, &context) == 0) {
+ if (isvirtual)
+ mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
+ }
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
+ (caddr_t)&readblockcnt, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
+ (caddr_t)&writeblockcnt, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
+ (caddr_t)&readmaxcnt, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
+ (caddr_t)&writemaxcnt, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
+ (caddr_t)&readsegcnt, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
+ (caddr_t)&writesegcnt, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
+ (caddr_t)&readsegsize, 0, &context)))
+ return (error);
+
+ if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
+ (caddr_t)&writesegsize, 0, &context)))
+ return (error);
+
+ if (readmaxcnt)
+ temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
+ else {
+ if (readblockcnt) {
+ temp = readblockcnt * blksize;
+ temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
+ } else
+ temp = MAXPHYS;
+ }
+ mp->mnt_maxreadcnt = (u_int32_t)temp;
+
+ if (writemaxcnt)
+ temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
+ else {
+ if (writeblockcnt) {
+ temp = writeblockcnt * blksize;
+ temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
+ } else
+ temp = MAXPHYS;
+ }
+ mp->mnt_maxwritecnt = (u_int32_t)temp;
+
+ if (readsegcnt) {
+ temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
+ mp->mnt_segreadcnt = (u_int16_t)temp;
+ }
+ if (writesegcnt) {
+ temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
+ mp->mnt_segwritecnt = (u_int16_t)temp;
+ }
+ if (readsegsize)
+ temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
+ else
+ temp = mp->mnt_maxreadcnt;
+ mp->mnt_maxsegreadsize = (u_int32_t)temp;
+
+ if (writesegsize)
+ temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
+ else
+ temp = mp->mnt_maxwritecnt;
+ mp->mnt_maxsegwritesize = (u_int32_t)temp;
+
+ return (error);
+}
+
+static struct klist fs_klist;
+
+void
+vfs_event_init(void)
+{
+
+ klist_init(&fs_klist);
+}
+
+void
+vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data)
+{
+
+ KNOTE(&fs_klist, event);
+}
+
+/*
+ * return the number of mounted filesystems.
+ */
+static int
+sysctl_vfs_getvfscnt(void)
+{
+ return(mount_getvfscnt());
+}
+
+
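+/*
+ * Return the number of entries currently on the mount list,
+ * taking the mount list lock around the read.
+ */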
+static int
+mount_getvfscnt(void)
+{
+ int ret;
+
+ mount_list_lock();
+ ret = nummounts;
+ mount_list_unlock();
+ return (ret);
+
+}
+
+
+
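+/*
+ * Copy the fsids of mounted filesystems into fsidlst, up to a
+ * maximum of 'count' entries; returns the number actually copied.
+ */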
+static int
+mount_fillfsids(fsid_t *fsidlst, int count)
+{
+ struct mount *mp;
+ int actual=0;
+
+ actual = 0;
+ mount_list_lock();
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (actual < count) {
+ fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
+ actual++;
+ }
+ }
+ mount_list_unlock();
+ return (actual);
+
+}
+
+/*
+ * fill in the array of fsid_t's up to a max of 'count', the actual
+ * number filled in will be set in '*actual'. If there are more fsid_t's
+ * than room in fsidlst then ENOMEM will be returned and '*actual' will
+ * have the actual count.
+ * Callers depend on *actual being filled out even in the error case.
+ */
+static int
+sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
+{
+ struct mount *mp;
+
+ *actual = 0;
+ mount_list_lock();
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ (*actual)++;
+ if (*actual <= count)
+ fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
+ }
+ mount_list_unlock();
+ return (*actual <= count ? 0 : ENOMEM);
+}
+
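+/*
+ * sysctl handler that copies the list of mounted filesystem fsids
+ * out to user space; if a mount is added while we are allocating
+ * the temporary buffer, the copy is retried.
+ */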
+static int
+sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS
+{
+ int actual, error;
+ size_t space;
+ fsid_t *fsidlst;
+
+ /* This is a readonly node. */
+ if (req->newptr != USER_ADDR_NULL)
+ return (EPERM);
+
+ /* they are querying us so just return the space required. */
+ if (req->oldptr == USER_ADDR_NULL) {
+ req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
+ return 0;
+ }
+again:
+ /*
+ * Retrieve an accurate count of the amount of space required to copy
+ * out all the fsids in the system.
+ */
+ space = req->oldlen;
+ req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
+
+ /* they didn't give us enough space. */
+ if (space < req->oldlen)
+ return (ENOMEM);
+
+ MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
+ error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
+ &actual);
+ /*
+ * If we get back ENOMEM, then another mount has been added while we
+ * slept in malloc above. If this is the case then try again.
+ */
+ if (error == ENOMEM) {
+ FREE(fsidlst, M_TEMP);
+ req->oldlen = space;
+ goto again;
+ }
+ if (error == 0) {
+ error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
+ }
+ FREE(fsidlst, M_TEMP);
+ return (error);
+}
+
+/*
+ * Do a sysctl by fsid.
+ */
+static int
+sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS
+{
+ struct vfsidctl vc;
+ struct user_vfsidctl user_vc;
+ struct mount *mp;
+ struct vfsstatfs *sp;
+ struct proc *p;
+ int *name;
+ int error, flags, namelen;
+ struct vfs_context context;
+ boolean_t is_64_bit;
+
+ name = arg1;
+ namelen = arg2;
+ p = req->p;
+ context.vc_proc = p;
+ context.vc_ucred = kauth_cred_get();
+ is_64_bit = proc_is64bit(p);
+
+ if (is_64_bit) {
+ error = SYSCTL_IN(req, &user_vc, sizeof(user_vc));
+ if (error)
+ return (error);
+ if (user_vc.vc_vers != VFS_CTL_VERS1)
+ return (EINVAL);
+ mp = mount_list_lookupby_fsid(&user_vc.vc_fsid, 0, 0);
+ }
+ else {
+ error = SYSCTL_IN(req, &vc, sizeof(vc));
+ if (error)
+ return (error);
+ if (vc.vc_vers != VFS_CTL_VERS1)
+ return (EINVAL);
+ mp = mount_list_lookupby_fsid(&vc.vc_fsid, 0, 0);
+ }
+ if (mp == NULL)
+ return (ENOENT);
+ /* reset so that the fs specific code can fetch it. */
+ req->newidx = 0;
+ /*
+ * Note if this is a VFS_CTL then we pass the actual sysctl req
+ * in for "oldp" so that the lower layer can DTRT and use the
+ * SYSCTL_IN/OUT routines.
+ */
+ if (mp->mnt_op->vfs_sysctl != NULL) {
+ if (is_64_bit) {
+ if (vfs_64bitready(mp)) {
+ error = mp->mnt_op->vfs_sysctl(name, namelen,
+ CAST_USER_ADDR_T(req),
+ NULL, USER_ADDR_NULL, 0,
+ &context);
+ }
+ else {
+ error = ENOTSUP;
+ }
+ }
+ else {
+ error = mp->mnt_op->vfs_sysctl(name, namelen,
+ CAST_USER_ADDR_T(req),
+ NULL, USER_ADDR_NULL, 0,
+ &context);
+ }
+ if (error != ENOTSUP)
+ return (error);
+ }
+ switch (name[0]) {
+ case VFS_CTL_UMOUNT:
+ req->newidx = 0;
+ if (is_64_bit) {
+ req->newptr = user_vc.vc_ptr;
+ req->newlen = (size_t)user_vc.vc_len;
+ }
+ else {
+ req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
+ req->newlen = vc.vc_len;
+ }
+ error = SYSCTL_IN(req, &flags, sizeof(flags));
+ if (error)
+ break;
+ error = safedounmount(mp, flags, p);
+ break;
+ case VFS_CTL_STATFS:
+ req->newidx = 0;
+ if (is_64_bit) {
+ req->newptr = user_vc.vc_ptr;
+ req->newlen = (size_t)user_vc.vc_len;
+ }
+ else {
+ req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
+ req->newlen = vc.vc_len;
+ }
+ error = SYSCTL_IN(req, &flags, sizeof(flags));
+ if (error)
+ break;
+ sp = &mp->mnt_vfsstat;
+ if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) &&
+ (error = vfs_update_vfsstat(mp, &context)))
+ return (error);
+ if (is_64_bit) {
+ struct user_statfs sfs;
+ bzero(&sfs, sizeof(sfs));
+ sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ sfs.f_type = mp->mnt_vtable->vfc_typenum;
+ sfs.f_bsize = (user_long_t)sp->f_bsize;
+ sfs.f_iosize = (user_long_t)sp->f_iosize;
+ sfs.f_blocks = (user_long_t)sp->f_blocks;
+ sfs.f_bfree = (user_long_t)sp->f_bfree;
+ sfs.f_bavail = (user_long_t)sp->f_bavail;
+ sfs.f_files = (user_long_t)sp->f_files;
+ sfs.f_ffree = (user_long_t)sp->f_ffree;
+ sfs.f_fsid = sp->f_fsid;
+ sfs.f_owner = sp->f_owner;
+
+ strncpy(&sfs.f_fstypename, &sp->f_fstypename, MFSNAMELEN-1);
+ strncpy(&sfs.f_mntonname, &sp->f_mntonname, MNAMELEN-1);
+ strncpy(&sfs.f_mntfromname, &sp->f_mntfromname, MNAMELEN-1);
+
+ error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
+ }
+ else {
+ struct statfs sfs;
+ bzero(&sfs, sizeof(struct statfs));
+ sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ sfs.f_type = mp->mnt_vtable->vfc_typenum;
+
+ /*
+ * It's possible for there to be more than 2^31 blocks in the filesystem, so we
+ * have to fudge the numbers here in that case. We inflate the blocksize in order
+ * to reflect the filesystem size as best we can.
+ */
+ if (sp->f_blocks > LONG_MAX) {
+ int shift;
+
+ /*
+ * Work out how far we have to shift the block count down to make it fit.
+ * Note that it's possible to have to shift so far that the resulting
+ * blocksize would be unreportably large. At that point, we will clip
+ * any values that don't fit.
+ *
+ * For safety's sake, we also ensure that f_iosize is never reported as
+ * being smaller than f_bsize.
+ */
+ for (shift = 0; shift < 32; shift++) {
+ if ((sp->f_blocks >> shift) <= LONG_MAX)
+ break;
+ if ((sp->f_bsize << (shift + 1)) > LONG_MAX)
+ break;
+ }
+#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > LONG_MAX) ? LONG_MAX : ((x) >> (s)))
+ sfs.f_blocks = (long)__SHIFT_OR_CLIP(sp->f_blocks, shift);
+ sfs.f_bfree = (long)__SHIFT_OR_CLIP(sp->f_bfree, shift);
+ sfs.f_bavail = (long)__SHIFT_OR_CLIP(sp->f_bavail, shift);
+#undef __SHIFT_OR_CLIP
+ sfs.f_bsize = (long)(sp->f_bsize << shift);
+ sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
+ } else {
+ sfs.f_bsize = (long)sp->f_bsize;
+ sfs.f_iosize = (long)sp->f_iosize;
+ sfs.f_blocks = (long)sp->f_blocks;
+ sfs.f_bfree = (long)sp->f_bfree;
+ sfs.f_bavail = (long)sp->f_bavail;
+ }
+ sfs.f_files = (long)sp->f_files;
+ sfs.f_ffree = (long)sp->f_ffree;
+ sfs.f_fsid = sp->f_fsid;
+ sfs.f_owner = sp->f_owner;
+
+ strncpy(&sfs.f_fstypename, &sp->f_fstypename, MFSNAMELEN-1);
+ strncpy(&sfs.f_mntonname, &sp->f_mntonname, MNAMELEN-1);
+ strncpy(&sfs.f_mntfromname, &sp->f_mntfromname, MNAMELEN-1);
+
+ error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
+ }
+ break;
+ default:
+ return (ENOTSUP);
+ }
+ return (error);
+}
+
+static int filt_fsattach(struct knote *kn);
+static void filt_fsdetach(struct knote *kn);
+static int filt_fsevent(struct knote *kn, long hint);
+
+struct filterops fs_filtops =
+ { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
+
+static int
+filt_fsattach(struct knote *kn)
+{
+
+ kn->kn_flags |= EV_CLEAR;
+ KNOTE_ATTACH(&fs_klist, kn);
+ return (0);
+}
+
+static void
+filt_fsdetach(struct knote *kn)
+{
+
+ KNOTE_DETACH(&fs_klist, kn);
+}
+
+static int
+filt_fsevent(struct knote *kn, long hint)
+{
+
+ kn->kn_fflags |= hint;
+ return (kn->kn_fflags != 0);
+}
+
+static int
+sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
+{
+ int out, error;
+ pid_t pid;
+ size_t space;
+ struct proc *p;
+
+ /* We need a pid. */
+ if (req->newptr == USER_ADDR_NULL)
+ return (EINVAL);
+
+ error = SYSCTL_IN(req, &pid, sizeof(pid));
+ if (error)
+ return (error);
+
+ p = pfind(pid < 0 ? -pid : pid);
+ if (p == NULL)
+ return (ESRCH);
+
+ /*
+ * Fetching the value is ok, but we only fetch if the old
+ * pointer is given.
+ */
+ if (req->oldptr != USER_ADDR_NULL) {
+ out = !((p->p_flag & P_NOREMOTEHANG) == 0);
+ error = SYSCTL_OUT(req, &out, sizeof(out));
+ return (error);
+ }
+
+ /* XXX req->p->p_ucred -> kauth_cred_get() ??? */
+ /* cansignal offers us enough security. */
+ if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0)
+ return (EPERM);
+
+ if (pid < 0)
+ p->p_flag &= ~P_NOREMOTEHANG;
+ else
+ p->p_flag |= P_NOREMOTEHANG;
+
+ return (0);
+}
+/* the vfs.generic. branch. */
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW, 0, "vfs generic hinge");
+/* retrieve a list of mounted filesystem fsid_t */
+SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD,
+ 0, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
+/* perform operations on filesystem via fsid_t */
+SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW,
+ sysctl_vfs_ctlbyfsid, "ctlbyfsid");
+SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW,
+ 0, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
+
+
+int num_reusedvnodes=0;
+
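+/*
+ * Obtain a vnode for use by vnode_create(): allocate a fresh one
+ * if we are below desiredvnodes, otherwise steal the first
+ * reusable vnode from the free list (retrying if it is raced
+ * away or still referenced).
+ */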
+static int
+new_vnode(vnode_t *vpp)
+{
+ vnode_t vp;
+ int retries = 0; /* retry in case the table is full */
+ int vpid;
+ struct timespec ts;
+
+retry:
+ vnode_list_lock();
+
+ if ( !TAILQ_EMPTY(&vnode_free_list)) {
+ /*
+ * Pick the first vp for possible reuse
+ */
+ vp = TAILQ_FIRST(&vnode_free_list);
+
+ if (vp->v_lflag & VL_DEAD)
+ goto steal_this_vp;
+ } else
+ vp = NULL;
+
+ /*
+ * we're either empty, or the next guy on the
+ * list is a valid vnode... if we're under the
+ * limit, we'll create a new vnode
+ */
+ if (numvnodes < desiredvnodes) {
+ numvnodes++;
+ vnode_list_unlock();
+ MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
+ bzero((char *)vp, sizeof *vp);
+ VLISTNONE(vp); /* avoid double queue removal */
+ lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
+
+ nanouptime(&ts);
+ vp->v_id = ts.tv_nsec;
+ vp->v_flag = VSTANDARD;
+
+ goto done;
+ }
+ if (vp == NULL) {
+ /*
+ * we've reached the system-imposed maximum number of vnodes
+ * but there isn't a single one available...
+ * wait a bit and then retry... if we can't get a vnode
+ * after 100 retries, then log a complaint
+ */
+ if (++retries <= 100) {
+ vnode_list_unlock();
+ IOSleep(1);
+ goto retry;
+ }
+
+ vnode_list_unlock();
+ tablefull("vnode");
+ log(LOG_EMERG, "%d desired, %d numvnodes, "
+ "%d free, %d inactive\n",
+ desiredvnodes, numvnodes, freevnodes, inactivevnodes);
+ *vpp = 0;
+ return (ENFILE);
+ }
+steal_this_vp:
+ vpid = vp->v_id;
+
+ VREMFREE("new_vnode", vp);
+ VLISTNONE(vp);
+
+ vnode_list_unlock();
+ vnode_lock(vp);
+
+ /*
+ * We may have had to wait for the vnode_lock after taking the vp off the freelist,
+ * and the vid is bumped only at the very end of reclaim. So it is possible
+ * that we are looking at a vnode that is being terminated. If so, skip it.
+ */
+ if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
+ VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
+ /*
+ * we lost the race between dropping the list lock
+ * and picking up the vnode_lock... someone else
+ * used this vnode and it is now in a new state
+ * so we need to go back and try again
+ */
+ vnode_unlock(vp);
+ goto retry;
+ }
+ if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
+ /*
+ * we did a vnode_rele_ext that asked for
+ * us not to reenter the filesystem during
+ * the release even though VL_NEEDINACTIVE was
+ * set... we'll do it here by doing a
+ * vnode_get/vnode_put
+ *
+ * pick up an iocount so that we can call
+ * vnode_put and drive the VNOP_INACTIVE...
+ * vnode_put will either leave us off
+ * the freelist if a new ref comes in,
+ * or put us back on the end of the freelist
+ * or recycle us if we were marked for termination...
+ * so we'll just go grab a new candidate
+ */
+ vp->v_iocount++;
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vnode_put_locked(vp);
+ vnode_unlock(vp);
+ goto retry;
+ }
+ OSAddAtomic(1, &num_reusedvnodes);
+
+ /* Checks for anyone racing us for recycle */
+ if (vp->v_type != VBAD) {
+ if (vp->v_lflag & VL_DEAD)
+ panic("new_vnode: the vnode is VL_DEAD but not VBAD");
+
+ (void)vnode_reclaim_internal(vp, 1, 1);
+
+ if ((VONLIST(vp)))
+ panic("new_vnode: vp on list ");
+ if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
+ (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
+ panic("new_vnode: free vnode still referenced\n");
+ if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
+ panic("new_vnode: vnode seems to be on mount list ");
+ if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
+ panic("new_vnode: vnode still hooked into the name cache");
+ }
+ if (vp->v_unsafefs) {
+ lck_mtx_destroy(&vp->v_unsafefs->fsnodelock, vnode_lck_grp);
+ FREE_ZONE((void *)vp->v_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
+ vp->v_unsafefs = (struct unsafe_fsnode *)NULL;
+ }
+ vp->v_lflag = 0;
+ vp->v_writecount = 0;
+ vp->v_references = 0;
+ vp->v_iterblkflags = 0;
+ vp->v_flag = VSTANDARD;
+ /* vbad vnodes can point to dead_mountp */
+ vp->v_mount = 0;
+ vp->v_defer_reclaimlist = (vnode_t)0;
+
+ vnode_unlock(vp);
+done:
+ *vpp = vp;
+
+ return (0);
+}
+
+void
+vnode_lock(vnode_t vp)
+{
+ lck_mtx_lock(&vp->v_lock);
+}
+
+void
+vnode_unlock(vnode_t vp)
+{
+ lck_mtx_unlock(&vp->v_lock);
+}
+
+
+
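+/*
+ * Take an iocount on a vnode.  Fails with ENOENT if the vnode is
+ * dead or being terminated and no other iocounts are outstanding.
+ */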
+int
+vnode_get(struct vnode *vp)
+{
+ vnode_lock(vp);
+
+ if ( (vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) ) {
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ vp->v_iocount++;
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vnode_unlock(vp);
+
+ return(0);
+}
+
+int
+vnode_getwithvid(vnode_t vp, int vid)
+{
+ return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID)));
+}
+
+int
+vnode_getwithref(vnode_t vp)
+{
+ return(vget_internal(vp, 0, 0));
+}
+
+
+int
+vnode_put(vnode_t vp)
+{
+ int retval;
+
+ vnode_lock(vp);
+ retval = vnode_put_locked(vp);
+ vnode_unlock(vp);
+
+ return(retval);
+}
+
+int
+vnode_put_locked(vnode_t vp)
+{
+ struct vfs_context context;
+
+retry:
+ if (vp->v_iocount < 1)
+ panic("vnode_put(%x): iocount < 1", vp);
+
+ if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
+ vnode_dropiocount(vp, 1);
+ return(0);
+ }
+ if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
+
+ vp->v_lflag &= ~VL_NEEDINACTIVE;
+ vnode_unlock(vp);
+
+ context.vc_proc = current_proc();
+ context.vc_ucred = kauth_cred_get();
+ VNOP_INACTIVE(vp, &context);
+
+ vnode_lock(vp);
+ /*
+ * because we had to drop the vnode lock before calling
+ * VNOP_INACTIVE, the state of this vnode may have changed...
+ * we may pick up both VL_MARKTERM and either
+ * an iocount or a usecount while in the VNOP_INACTIVE call
+ * we don't want to call vnode_reclaim_internal on a vnode
+ * that has active references on it... so loop back around
+ * and reevaluate the state
+ */
+ goto retry;
+ }
+ vp->v_lflag &= ~VL_NEEDINACTIVE;
+
+ if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)
+ vnode_reclaim_internal(vp, 1, 0);
+
+ vnode_dropiocount(vp, 1);
+ vnode_list_add(vp);
+
+ return(0);
+}
+
+/* is vnode_t in use by others? */
+int
+vnode_isinuse(vnode_t vp, int refcnt)
+{
+ return(vnode_isinuse_locked(vp, refcnt, 0));
+}
+
+
+static int
+vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
+{
+ int retval = 0;
+
+ if (!locked)
+ vnode_lock(vp);
+ if ((vp->v_type != VREG) && (vp->v_usecount > refcnt)) {
+ retval = 1;
+ goto out;
+ }
+ if (vp->v_type == VREG) {
+ retval = ubc_isinuse_locked(vp, refcnt, 1);
+ }
+
+out:
+ if (!locked)
+ vnode_unlock(vp);
+ return(retval);
+}
+
+
+/* resume vnode_t */
+errno_t
+vnode_resume(vnode_t vp)
+{
+
+ vnode_lock(vp);
+
+ if (vp->v_owner == current_thread()) {
+ vp->v_lflag &= ~VL_SUSPENDED;
+ vp->v_owner = 0;
+ vnode_unlock(vp);
+ wakeup(&vp->v_iocount);
+ } else
+ vnode_unlock(vp);
+
+ return(0);
+}
+
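+/*
+ * Wait, with the vnode lock held, until the caller holds the only
+ * remaining iocount on the vnode; used while reclaiming a vnode.
+ */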
+static errno_t
+vnode_drain(vnode_t vp)
+{
+
+ if (vp->v_lflag & VL_DRAIN) {
+ panic("vnode_drain: recursuve drain");
+ return(ENOENT);
+ }
+ vp->v_lflag |= VL_DRAIN;
+ vp->v_owner = current_thread();
+
+ while (vp->v_iocount > 1)
+ msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", 0);
+ return(0);
+}
+
+
+/*
+ * if the number of recent references via vnode_getwithvid or vnode_getwithref
+ * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
+ * the LRU list if it's currently on it... once the iocount and usecount both drop
+ * to 0, it will get put back on the end of the list, effectively making it younger
+ * this allows us to keep actively referenced vnodes in the list without having
+ * to constantly remove and add to the list each time a vnode w/o a usecount is
+ * referenced which costs us taking and dropping a global lock twice.
+ */
+#define UNAGE_THRESHHOLD 10
+
+errno_t
+vnode_getiocount(vnode_t vp, int locked, int vid, int vflags)
+{
+ int nodead = vflags & VNODE_NODEAD;
+ int nosusp = vflags & VNODE_NOSUSPEND;
+
+ if (!locked)
+ vnode_lock(vp);
+
+ for (;;) {
+ /*
+ * if it is a dead vnode with deadfs
+ */
+ if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
+ if (!locked)
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ /*
+ * will return VL_DEAD ones
+ */
+ if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
+ break;
+ }
+ /*
+ * if suspended vnodes are to be failed
+ */
+ if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
+ if (!locked)
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ /*
+ * if you are the owner of drain/suspend/termination , can acquire iocount
+ * check for VL_TERMINATE; it does not set owner
+ */
+ if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
+ (vp->v_owner == current_thread())) {
+ break;
+ }
+ if (vp->v_lflag & VL_TERMINATE) {
+ vp->v_lflag |= VL_TERMWANT;
+
+ msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", 0);
+ } else
+ msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", 0);
+ }
+ if (vid != vp->v_id) {
+ if (!locked)
+ vnode_unlock(vp);
+ return(ENOENT);
+ }
+ if (++vp->v_references >= UNAGE_THRESHHOLD) {
+ vp->v_references = 0;
+ vnode_list_remove(vp);
+ }
+ vp->v_iocount++;
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ if (!locked)
+ vnode_unlock(vp);
+ return(0);
+}
+
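+/*
+ * Release an iocount and wake up any thread draining or suspending
+ * the vnode once at most one iocount remains.
+ */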
+static void
+vnode_dropiocount (vnode_t vp, int locked)
+{
+ if (!locked)
+ vnode_lock(vp);
+ if (vp->v_iocount < 1)
+ panic("vnode_dropiocount(%x): v_iocount < 1", vp);
+
+ vp->v_iocount--;
+#ifdef JOE_DEBUG
+ record_vp(vp, -1);
+#endif
+ if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1))
+ wakeup(&vp->v_iocount);
+
+ if (!locked)
+ vnode_unlock(vp);
+}
+
+
+void
+vnode_reclaim(struct vnode * vp)
+{
+ vnode_reclaim_internal(vp, 0, 0);
+}
+
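+/*
+ * Reclaim a vnode: drain outstanding iocounts, clean it out via
+ * vgone(), bump v_id so stale vnode_getwithvid() callers fail, and
+ * reset the vnode to VBAD.  With 'reuse' set the vnode is not put
+ * back on the free list (the caller will reuse it directly).
+ */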
+__private_extern__
+void
+vnode_reclaim_internal(struct vnode * vp, int locked, int reuse)
+{
+ int isfifo = 0;
+
+ if (!locked)
+ vnode_lock(vp);
+
+ if (vp->v_lflag & VL_TERMINATE) {
+ panic("vnode reclaim in progress");
+ }
+ vp->v_lflag |= VL_TERMINATE;
+
+ if (vnode_drain(vp)) {
+ panic("vnode drain failed");
+ vnode_unlock(vp);
+ return;
+ }
+ isfifo = (vp->v_type == VFIFO);
+
+ if (vp->v_type != VBAD)
+ vgone(vp); /* clean and reclaim the vnode */
+
+ /*
+ * give the vnode a new identity so
+ * that vnode_getwithvid will fail
+ * on any stale cache accesses
+ */
+ vp->v_id++;
+ if (isfifo) {
+ struct fifoinfo * fip;
+
+ fip = vp->v_fifoinfo;
+ vp->v_fifoinfo = NULL;
+ FREE(fip, M_TEMP);
+ }
+
+ vp->v_type = VBAD;
+
+ if (vp->v_data)
+ panic("vnode_reclaim_internal: cleaned vnode isn't");
+ if (vp->v_numoutput)
+ panic("vnode_reclaim_internal: Clean vnode has pending I/O's");
+ if (UBCINFOEXISTS(vp))
+ panic("vnode_reclaim_internal: ubcinfo not cleaned");
+ if (vp->v_parent)
+ panic("vnode_reclaim_internal: vparent not removed");
+ if (vp->v_name)
+ panic("vnode_reclaim_internal: vname not removed");
+
+ vp->v_socket = 0;
+
+ vp->v_lflag &= ~VL_TERMINATE;
+ vp->v_lflag &= ~VL_DRAIN;
+ vp->v_owner = 0;
+
+ if (vp->v_lflag & VL_TERMWANT) {
+ vp->v_lflag &= ~VL_TERMWANT;
+ wakeup(&vp->v_lflag);
+ }
+ if (!reuse && vp->v_usecount == 0)
+ vnode_list_add(vp);
+ if (!locked)
+ vnode_unlock(vp);
+}
+
+/* USAGE:
+ * The following api creates a vnode, associates all the parameters specified in the
+ * vnode_fsparam structure with it, and returns a vnode handle with an iocount reference.
+ * Device aliasing is handled here, so checkalias is obsoleted by this.
+ * vnode_create(int flavor, size_t size, void * param, vnode_t *vp)
+ */
+int
+vnode_create(int flavor, size_t size, void *data, vnode_t *vpp)
+{
+ int error;
+ int insert = 1;
+ vnode_t vp;
+ vnode_t nvp;
+ vnode_t dvp;
+ struct componentname *cnp;
+ struct vnode_fsparam *param = (struct vnode_fsparam *)data;
+
+ if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) {
+ if ( (error = new_vnode(&vp)) ) {
+ return(error);
+ } else {
+ dvp = param->vnfs_dvp;
+ cnp = param->vnfs_cnp;
+
+ vp->v_op = param->vnfs_vops;
+ vp->v_type = param->vnfs_vtype;
+ vp->v_data = param->vnfs_fsnode;
+ vp->v_iocount = 1;
+
+ if (param->vnfs_markroot)
+ vp->v_flag |= VROOT;
+ if (param->vnfs_marksystem)
+ vp->v_flag |= VSYSTEM;
+ else if (vp->v_type == VREG) {
+ /*
+ * only non SYSTEM vp
+ */
+ error = ubc_info_init_withsize(vp, param->vnfs_filesize);
+ if (error) {
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ vp->v_mount = 0;
+ vp->v_op = dead_vnodeop_p;
+ vp->v_tag = VT_NON;
+ vp->v_data = NULL;
+ vp->v_type = VBAD;
+ vp->v_lflag |= VL_DEAD;
+
+ vnode_put(vp);
+ return(error);
+ }
+ }
+#ifdef JOE_DEBUG
+ record_vp(vp, 1);
+#endif
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+
+ if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
+ /*
+ * if checkalias returns a vnode, it will be locked
+ *
+ * first get rid of the unneeded vnode we acquired
+ */
+ vp->v_data = NULL;
+ vp->v_op = spec_vnodeop_p;
+ vp->v_type = VBAD;
+ vp->v_lflag = VL_DEAD;
+ vp->v_data = NULL;
+ vp->v_tag = VT_NON;
+ vnode_put(vp);
+
+ /*
+ * switch to aliased vnode and finish
+ * preparing it
+ */
+ vp = nvp;
+
+ vclean(vp, 0, current_proc());
+ vp->v_op = param->vnfs_vops;
+ vp->v_type = param->vnfs_vtype;
+ vp->v_data = param->vnfs_fsnode;
+ vp->v_lflag = 0;
+ vp->v_mount = NULL;
+ insmntque(vp, param->vnfs_mp);
+ insert = 0;
+ vnode_unlock(vp);
+ }
+ }
+
+ if (vp->v_type == VFIFO) {
+ struct fifoinfo *fip;
+
+ MALLOC(fip, struct fifoinfo *,
+ sizeof(*fip), M_TEMP, M_WAITOK);
+ bzero(fip, sizeof(struct fifoinfo ));
+ vp->v_fifoinfo = fip;
+ }
+ /* The file systems usually pass the address of the location where
+ * they store the vnode pointer. Once we add the vnode to the mount
+ * point list and the name cache it becomes discoverable, so the file
+ * system node must have its connection to the vnode set up by then.
+ */
+ *vpp = vp;
+
+ if (param->vnfs_mp) {
+ if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
+ vp->v_flag |= VLOCKLOCAL;
+ if (insert) {
+ /*
+ * enter in mount vnode list
+ */
+ insmntque(vp, param->vnfs_mp);
+ }
+#ifdef INTERIM_FSNODE_LOCK
+ if (param->vnfs_mp->mnt_vtable->vfc_threadsafe == 0) {
+ MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
+ sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
+ vp->v_unsafefs->fsnode_count = 0;
+ vp->v_unsafefs->fsnodeowner = (void *)NULL;
+ lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
+ }
+#endif /* INTERIM_FSNODE_LOCK */
+ }
+ if (dvp && vnode_ref(dvp) == 0) {
+ vp->v_parent = dvp;
+ }
+ if (cnp) {
+ if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
+ /*
+ * enter into name cache
+ * we've got the info to enter it into the name cache now
+ */
+ cache_enter(dvp, vp, cnp);
+ }
+ vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
+ }
+ if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
+ /*
+ * this vnode is being created as cacheable in the name cache
+ * this allows us to re-enter it in the cache
+ */
+ vp->v_flag |= VNCACHEABLE;
+ }
+ if ((vp->v_flag & VSYSTEM) && (vp->v_type != VREG))
+ panic("incorrect vnode setup");
+
+ return(0);
+ }
+ }
+ return (EINVAL);
+}
+
+int
+vnode_addfsref(vnode_t vp)
+{
+ vnode_lock(vp);
+ if (vp->v_lflag & VNAMED_FSHASH)
+ panic("add_fsref: vp already has named reference");
+ if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
+ panic("addfsref: vp on the free list\n");
+ vp->v_lflag |= VNAMED_FSHASH;
+ vnode_unlock(vp);
+ return(0);
+
+}
+int
+vnode_removefsref(vnode_t vp)
+{
+ vnode_lock(vp);
+ if ((vp->v_lflag & VNAMED_FSHASH) == 0)
+ panic("remove_fsref: no named reference");
+ vp->v_lflag &= ~VNAMED_FSHASH;
+ vnode_unlock(vp);
+ return(0);
+
+}
+
+
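+/*
+ * Invoke 'callout' for each mounted filesystem, skipping mounts
+ * that are dead or being unmounted.  A return of VFS_RETURNED_DONE
+ * or VFS_CLAIMED_DONE from the callout terminates the iteration.
+ */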
+int
+vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg)
+{
+ mount_t mp;
+ int ret = 0;
+ fsid_t * fsid_list;
+ int count, actualcount, i;
+ void * allocmem;
+
+ count = mount_getvfscnt();
+ count += 10;
+
+ fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
+ allocmem = (void *)fsid_list;
+
+ actualcount = mount_fillfsids(fsid_list, count);
+
+ for (i=0; i< actualcount; i++) {
+
+ /* obtain the mount point with iteration reference */
+ mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
+
+ if(mp == (struct mount *)0)
+ continue;
+ mount_lock(mp);
+ if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
+ mount_unlock(mp);
+ mount_iterdrop(mp);
+ continue;
+
+ }
+ mount_unlock(mp);
+
+ /* iterate over all the vnodes */
+ ret = callout(mp, arg);
+
+ mount_iterdrop(mp);
+
+ switch (ret) {
+ case VFS_RETURNED:
+ case VFS_RETURNED_DONE:
+ if (ret == VFS_RETURNED_DONE) {
+ ret = 0;
+ goto out;
+ }
+ break;
+
+ case VFS_CLAIMED_DONE:
+ ret = 0;
+ goto out;
+ case VFS_CLAIMED:
+ default:
+ break;
+ }
+ ret = 0;
+ }
+
+out:
+ kfree(allocmem, (count * sizeof(fsid_t)));
+ return (ret);
+}
+
+/*
+ * Update the vfsstatfs structure in the mountpoint.
+ */
+int
+vfs_update_vfsstat(mount_t mp, vfs_context_t ctx)
+{
+ struct vfs_attr va;
+ int error;
+
+ /*
+ * Request the attributes we want to propagate into
+ * the per-mount vfsstat structure.
+ */
+ VFSATTR_INIT(&va);
+ VFSATTR_WANTED(&va, f_iosize);
+ VFSATTR_WANTED(&va, f_blocks);
+ VFSATTR_WANTED(&va, f_bfree);
+ VFSATTR_WANTED(&va, f_bavail);
+ VFSATTR_WANTED(&va, f_bused);
+ VFSATTR_WANTED(&va, f_files);
+ VFSATTR_WANTED(&va, f_ffree);
+ VFSATTR_WANTED(&va, f_bsize);
+ VFSATTR_WANTED(&va, f_fssubtype);
+ if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
+ KAUTH_DEBUG("STAT - filesystem returned error %d", error);
+ return(error);
+ }
+
+ /*
+ * Unpack into the per-mount structure.
+ *
+ * We only overwrite these fields, which are likely to change:
+ * f_blocks
+ * f_bfree
+ * f_bavail
+ * f_bused
+ * f_files
+ * f_ffree
+ *
+ * And these which are not, but which the FS has no other way
+ * of providing to us:
+ * f_bsize
+ * f_iosize
+ * f_fssubtype
+ *
+ */
+ if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
+ mp->mnt_vfsstat.f_bsize = va.f_bsize;
+ } else {
+ mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
+ }
+ if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
+ mp->mnt_vfsstat.f_iosize = va.f_iosize;
+ } else {
+ mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */
+ }
+ if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
+ mp->mnt_vfsstat.f_blocks = va.f_blocks;
+ if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
+ mp->mnt_vfsstat.f_bfree = va.f_bfree;
+ if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
+ mp->mnt_vfsstat.f_bavail = va.f_bavail;
+ if (VFSATTR_IS_SUPPORTED(&va, f_bused))
+ mp->mnt_vfsstat.f_bused = va.f_bused;
+ if (VFSATTR_IS_SUPPORTED(&va, f_files))
+ mp->mnt_vfsstat.f_files = va.f_files;
+ if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
+ mp->mnt_vfsstat.f_ffree = va.f_ffree;
+
+ /* this is unlikely to change, but has to be queried for */
+ if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
+ mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
+
+ return(0);
+}
+
+void
+mount_list_add(mount_t mp)
+{
+ mount_list_lock();
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ nummounts++;
+ mount_list_unlock();
+}
+
+void
+mount_list_remove(mount_t mp)
+{
+ mount_list_lock();
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ nummounts--;
+ mp->mnt_list.tqe_next = 0;
+ mp->mnt_list.tqe_prev = 0;
+ mount_list_unlock();
+}
+
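+/*
+ * Look up a mount point by its volfs id (the first word of its
+ * fsid).  If 'withref' is set, the mount is returned vfs_busy()'d,
+ * or (mount_t)0 if it cannot be busied.
+ */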
+mount_t
+mount_lookupby_volfsid(int volfs_id, int withref)
+{
+ mount_t cur_mount = (mount_t)0;
+ mount_t mp ;
+
+ mount_list_lock();
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (validfsnode(mp) && mp->mnt_vfsstat.f_fsid.val[0] == volfs_id) {
+ cur_mount = mp;
+ if (withref) {
+ if (mount_iterref(cur_mount, 1)) {
+ cur_mount = (mount_t)0;
+ mount_list_unlock();
+ goto out;
+ }
+ }
+ break;
+ }
+ }
+ mount_list_unlock();
+ if (withref && (cur_mount != (mount_t)0)) {
+ mp = cur_mount;
+ if (vfs_busy(mp, LK_NOWAIT) != 0) {
+ cur_mount = (mount_t)0;
+ }
+ mount_iterdrop(mp);
+ }
+out:
+ return(cur_mount);
+}
+
+
+mount_t
+mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
+{
+ mount_t retmp = (mount_t)0;
+ mount_t mp;
+
+ if (!locked)
+ mount_list_lock();
+ TAILQ_FOREACH(mp, &mountlist, mnt_list)
+ if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
+ retmp = mp;
+ if (withref) {
+ if (mount_iterref(retmp, 1))
+ retmp = (mount_t)0;
+ }
+ goto out;
+ }
+out:
+ if (!locked)
+ mount_list_unlock();
+ return (retmp);
+}
+
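+/*
+ * Resolve a kernel path string to a vnode.  On success the vnode
+ * is returned with an iocount held, which the caller must release
+ * with vnode_put().
+ */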
+errno_t
+vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t context)
+{
+ struct nameidata nd;
+ int error;
+ struct vfs_context context2;
+ vfs_context_t ctx = context;
+ u_long ndflags = 0;
+
+ if (context == NULL) { /* XXX technically an error */
+ context2.vc_proc = current_proc();
+ context2.vc_ucred = kauth_cred_get();
+ ctx = &context2;
+ }
+
+ if (flags & VNODE_LOOKUP_NOFOLLOW)
+ ndflags = NOFOLLOW;
+ else
+ ndflags = FOLLOW;
+
+ if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
+ ndflags |= NOCROSSMOUNT;
+ if (flags & VNODE_LOOKUP_DOWHITEOUT)
+ ndflags |= DOWHITEOUT;
+
+ /* XXX AUDITVNPATH1 needed ? */
+ NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
+
+ if ((error = namei(&nd)))
+ return (error);
+ *vpp = nd.ni_vp;
+ nameidone(&nd);
+
+ return (0);
+}
+
+errno_t
+vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t context)
+{
+ struct nameidata nd;
+ int error;
+ struct vfs_context context2;
+ vfs_context_t ctx = context;
+ u_long ndflags = 0;
+ int lflags = flags;
+
+ if (context == NULL) { /* XXX technically an error */
+ context2.vc_proc = current_proc();
+ context2.vc_ucred = kauth_cred_get();
+ ctx = &context2;
+ }
+
+ if (fmode & O_NOFOLLOW)
+ lflags |= VNODE_LOOKUP_NOFOLLOW;
+
+ if (lflags & VNODE_LOOKUP_NOFOLLOW)
+ ndflags = NOFOLLOW;
+ else
+ ndflags = FOLLOW;
+
+ if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
+ ndflags |= NOCROSSMOUNT;
+ if (lflags & VNODE_LOOKUP_DOWHITEOUT)
+ ndflags |= DOWHITEOUT;
+
+ /* XXX AUDITVNPATH1 needed ? */
+ NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
+
+ if ((error = vn_open(&nd, fmode, cmode)))
+ *vpp = NULL;
+ else
+ *vpp = nd.ni_vp;
+
+ return (error);
+}
+
+errno_t
+vnode_close(vnode_t vp, int flags, vfs_context_t context)
+{
+ kauth_cred_t cred;
+ struct proc *p;
+ int error;
+
+ if (context) {
+ p = context->vc_proc;
+ cred = context->vc_ucred;
+ } else {
+ p = current_proc();
+ cred = kauth_cred_get();
+ }
+
+ error = vn_close(vp, flags, cred, p);
+ vnode_put(vp);
+ return (error);
+}
+
+errno_t
+vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
+{
+ struct vnode_attr va;
+ int error;
+
+ VATTR_INIT(&va);
+ VATTR_WANTED(&va, va_data_size);
+ error = vnode_getattr(vp, &va, ctx);
+ if (!error)
+ *sizep = va.va_data_size;
+ return(error);
+}
+
+errno_t
+vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
+{
+ struct vnode_attr va;
+
+ VATTR_INIT(&va);
+ VATTR_SET(&va, va_data_size, size);
+ va.va_vaflags = ioflag & 0xffff;
+ return(vnode_setattr(vp, &va, ctx));
+}
+
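+/*
+ * Create a filesystem object of the type given in vap->va_type
+ * under dvp: handle ACL inheritance, default any missing new-node
+ * attributes, call the appropriate VNOP (CREATE, MKDIR or MKNOD),
+ * and apply any attributes the filesystem did not handle itself.
+ */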
+errno_t
+vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx)
+{
+ kauth_acl_t oacl, nacl;
+ int initial_acl;
+ errno_t error;
+ vnode_t vp = (vnode_t)0;
+
+ error = 0;
+ oacl = nacl = NULL;
+ initial_acl = 0;
+
+ KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
+
+ /*
+ * Handle ACL inheritance.
+ */
+ if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
+ /* save the original filesec */
+ if (VATTR_IS_ACTIVE(vap, va_acl)) {
+ initial_acl = 1;
+ oacl = vap->va_acl;
+ }
+
+ vap->va_acl = NULL;
+ if ((error = kauth_acl_inherit(dvp,
+ oacl,
+ &nacl,
+ vap->va_type == VDIR,
+ ctx)) != 0) {
+ KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
+ return(error);
+ }
+
+ /*
+ * If the generated ACL is NULL, then we can save ourselves some effort
+ * by clearing the active bit.
+ */
+ if (nacl == NULL) {
+ VATTR_CLEAR_ACTIVE(vap, va_acl);
+ } else {
+ VATTR_SET(vap, va_acl, nacl);
+ }
+ }
+
+ /*
+ * Check and default new attributes.
+ * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller
+ * hasn't supplied them.
+ */
+ if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) {
+ KAUTH_DEBUG("%p CREATE - error %d handing/defaulting attributes", dvp, error);
+ goto out;
+ }
+
+
+ /*
+ * Create the requested node.
+ */
+ switch(vap->va_type) {
+ case VREG:
+ error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
+ break;
+ case VDIR:
+ error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx);
+ break;
+ case VSOCK:
+ case VFIFO:
+ case VBLK:
+ case VCHR:
+ error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
+ break;
+ default:
+ panic("vnode_create: unknown vtype %d", vap->va_type);
+ }
+ if (error != 0) {
+ KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
+ goto out;
+ }
+
+ vp = *vpp;
+ /*
+ * If some of the requested attributes weren't handled by the VNOP,
+ * use our fallback code.
+ */
+ if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
+ KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
+ error = vnode_setattr_fallback(*vpp, vap, ctx);
+ }
+ if ((error != 0) && (vp != (vnode_t)0)) {
+ *vpp = (vnode_t) 0;
+ vnode_put(vp);
+ }
+
+out:
+ /*
+ * If the caller supplied a filesec in vap, it has been replaced
+ * now by the post-inheritance copy. We need to put the original back
+ * and free the inherited product.
+ */
+ if (initial_acl) {
+ VATTR_SET(vap, va_acl, oacl);
+ } else {
+ VATTR_CLEAR_ACTIVE(vap, va_acl);
+ }
+ if (nacl != NULL)
+ kauth_acl_free(nacl);
+
+ return(error);
+}
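+
+/*
+ * Illustrative usage sketch (editor's note, not part of the original change):
+ * a caller of vn_create() already holds the parent vnode and the
+ * componentname from a lookup, and describes the new node through a
+ * vnode_attr; va_type selects which VNOP is issued above.  The values
+ * shown are hypothetical.
+ *
+ *	struct vnode_attr va;
+ *
+ *	VATTR_INIT(&va);
+ *	VATTR_SET(&va, va_type, VREG);
+ *	VATTR_SET(&va, va_mode, 0644);
+ *	error = vn_create(dvp, &vp, cnp, &va, 0, ctx);
+ */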
+
+static kauth_scope_t vnode_scope;
+static int vnode_authorize_callback(kauth_cred_t credential, __unused void *idata, kauth_action_t action,
+ uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
+
+typedef struct _vnode_authorize_context {
+ vnode_t vp;
+ struct vnode_attr *vap;
+ vnode_t dvp;
+ struct vnode_attr *dvap;
+ vfs_context_t ctx;
+ int flags;
+ int flags_valid;
+#define _VAC_IS_OWNER (1<<0)
+#define _VAC_IN_GROUP (1<<1)
+#define _VAC_IS_DIR_OWNER (1<<2)
+#define _VAC_IN_DIR_GROUP (1<<3)
+} *vauth_ctx;
+
+void
+vnode_authorize_init(void)
+{
+ vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
+}
+
+/*
+ * Authorize an operation on a vnode.
+ *
+ * This is KPI, but here because it needs vnode_scope.
+ */
+int
+vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t context)
+{
+ int error, result;
+
+ /*
+ * We can't authorize against a dead vnode; allow all operations through so that
+ * the correct error can be returned.
+ */
+ if (vp->v_type == VBAD)
+ return(0);
+
+ error = 0;
+ result = kauth_authorize_action(vnode_scope, vfs_context_ucred(context), action,
+ (uintptr_t)context, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
+ if (result == EPERM) /* traditional behaviour */
+ result = EACCES;
+ /* did the lower layers give a better error return? */
+ if ((result != 0) && (error != 0))
+ return(error);
+ return(result);
+}
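+
+/*
+ * Illustrative usage sketch (editor's note, not part of the original change):
+ * checking for read access before performing an operation.  dvp may be
+ * NULLVP when no directory is involved in the check; KAUTH_VNODE_READ_DATA
+ * is the kauth action bit assumed for a plain data read.
+ *
+ *	if ((error = vnode_authorize(vp, NULLVP, KAUTH_VNODE_READ_DATA, ctx)) != 0)
+ *		return (error);
+ */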
+
+/*
+ * Test for vnode immutability.
+ *
+ * The 'append' flag is set when the authorization request is constrained
+ * to operations which only request the right to append to a file.
+ *
+ * The 'ignore' flag is set when an operation modifying the immutability flags
+ * is being authorized. We check the system securelevel to determine which
+ * immutability flags we can ignore.
+ */
+static int
+vnode_immutable(struct vnode_attr *vap, int append, int ignore)
+{
+ int mask;
+
+ /* start with all bits precluding the operation */
+ mask = IMMUTABLE | APPEND;
+
+ /* if appending only, remove the append-only bits */
+ if (append)
+ mask &= ~APPEND;
+
+ /* ignore only set when authorizing flags changes */
+ if (ignore) {
+ if (securelevel <= 0) {
+ /* in insecure state, flags do not inhibit changes */
+ mask = 0;
+ } else {
+ /* in secure state, user flags don't inhibit */
+ mask &= ~(UF_IMMUTABLE | UF_APPEND);
+ }
+ }
+ KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
+ if ((vap->va_flags & mask) != 0)
+ return(EPERM);
+ return(0);
+}
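+
+/*
+ * Worked example (editor's note, not part of the original change), assuming
+ * the usual definitions IMMUTABLE == (SF_IMMUTABLE | UF_IMMUTABLE) and
+ * APPEND == (SF_APPEND | UF_APPEND): for a file with UF_IMMUTABLE set and
+ * append == 0, ignore == 0, the mask is IMMUTABLE | APPEND, va_flags & mask
+ * is non-zero and the operation is denied with EPERM.  With ignore == 1 and
+ * securelevel <= 0 the mask collapses to 0 and a flags change is permitted;
+ * with securelevel > 0 only the system immutable/append bits keep inhibiting.
+ */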
+
+static int
+vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
+{
+ int result;
+
+ /* default assumption is not-owner */
+ result = 0;
+
+ /*
+ * If the filesystem has given us a UID, we treat this as authoritative.
+ */
+ if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
+ result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
+ }
+ /* we could test the owner UUID here if we had a policy for it */
+
+ return(result);
+}
+
+static int
+vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember)
+{
+ int error;
+ int result;
+
+ error = 0;
+ result = 0;
+
+ /* the caller is expected to have asked the filesystem for a group at some point */
+ if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
+ error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
+ }
+ /* we could test the group UUID here if we had a policy for it */
+
+ if (!error)
+ *ismember = result;
+ return(error);
+}
+
+static int
+vauth_file_owner(vauth_ctx vcp)
+{
+ int result;
+
+ if (vcp->flags_valid & _VAC_IS_OWNER) {
+ result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
+ } else {
+ result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
+
+ /* cache our result */
+ vcp->flags_valid |= _VAC_IS_OWNER;
+ if (result) {
+ vcp->flags |= _VAC_IS_OWNER;
+ } else {
+ vcp->flags &= ~_VAC_IS_OWNER;
+ }
+ }
+ return(result);
+}
+
+static int
+vauth_file_ingroup(vauth_ctx vcp, int *ismember)
+{
+ int error;
+
+ if (vcp->flags_valid & _VAC_IN_GROUP) {
+ *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
+ error = 0;
+ } else {
+ error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember);
+
+ if (!error) {
+ /* cache our result */
+ vcp->flags_valid |= _VAC_IN_GROUP;
+ if (*ismember) {
+ vcp->flags |= _VAC_IN_GROUP;
+ } else {
+ vcp->flags &= ~_VAC_IN_GROUP;
+ }
+ }
+ }
+ return(error);
+}
+
+static int
+vauth_dir_owner(vauth_ctx vcp)
+{
+ int result;
+
+ if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
+ result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
+ } else {
+ result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
+
+ /* cache our result */
+ vcp->flags_valid |= _VAC_IS_DIR_OWNER;
+ if (result) {
+ vcp->flags |= _VAC_IS_DIR_OWNER;
+ } else {
+ vcp->flags &= ~_VAC_IS_DIR_OWNER;
+ }
+ }
+ return(result);
+}
+
+static int
+vauth_dir_ingroup(vauth_ctx vcp, int *ismember)
+{
+ int error;
+
+ if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
+ *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
+ error = 0;
+ } else {
+ error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember);
+
+ if (!error) {
+ /* cache our result */
+ vcp->flags_valid |= _VAC_IN_DIR_GROUP;
+ if (*ismember) {
+ vcp->flags |= _VAC_IN_DIR_GROUP;
+ } else {
+ vcp->flags &= ~_VAC_IN_DIR_GROUP;
+ }
+ }
+ }
+ return(error);
+}
+
+/*
+ * Test the posix permissions in (vap) to determine whether (credential)
+ * may perform (action)
+ */
+static int
+vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
+{
+ struct vnode_attr *vap;
+ int needed, error, owner_ok, group_ok, world_ok, ismember;
+#ifdef KAUTH_DEBUG_ENABLE
+ const char *where;
+# define _SETWHERE(c) where = c;
+#else
+# define _SETWHERE(c)
+#endif
+
+ /* checking file or directory? */
+ if (on_dir) {
+ vap = vcp->dvap;
+ } else {
+ vap = vcp->vap;
+ }
+
+ error = 0;
+
+ /*
+ * We want to do as little work here as possible. Compute which permission
+ * classes (owner, group, other) would grant the access we need first, and
+ * only fall back to the more expensive ownership and group-membership
+ * checks when those classes disagree.
+ */
+
+ /* owner permissions */
+ needed = 0;
+ if (action & VREAD)
+ needed |= S_IRUSR;
+ if (action & VWRITE)
+ needed |= S_IWUSR;
+ if (action & VEXEC)
+ needed |= S_IXUSR;
+ owner_ok = (needed & vap->va_mode) == needed;
+
+ /* group permissions */
+ needed = 0;
+ if (action & VREAD)
+ needed |= S_IRGRP;
+ if (action & VWRITE)
+ needed |= S_IWGRP;
+ if (action & VEXEC)
+ needed |= S_IXGRP;
+ group_ok = (needed & vap->va_mode) == needed;
+
+ /* world permissions */
+ needed = 0;
+ if (action & VREAD)
+ needed |= S_IROTH;
+ if (action & VWRITE)
+ needed |= S_IWOTH;
+ if (action & VEXEC)
+ needed |= S_IXOTH;
+ world_ok = (needed & vap->va_mode) == needed;
+
+ /* If granted/denied by all three, we're done */
+ if (owner_ok && group_ok && world_ok) {
+ _SETWHERE("all");
+ goto out;
+ }
+ if (!owner_ok && !group_ok && !world_ok) {
+ _SETWHERE("all");
+ error = EACCES;
+ goto out;
+ }
+
+ /* Check ownership (relatively cheap) */
+ if ((on_dir && vauth_dir_owner(vcp)) ||
+ (!on_dir && vauth_file_owner(vcp))) {
+ _SETWHERE("user");
+ if (!owner_ok)
+ error = EACCES;
+ goto out;
+ }
+
+ /* Not owner; if group and world both grant it we're done */
+ if (group_ok && world_ok) {
+ _SETWHERE("group/world");
+ goto out;
+ }
+ if (!group_ok && !world_ok) {
+ _SETWHERE("group/world");
+ error = EACCES;
+ goto out;
+ }
+
+ /* Check group membership (most expensive) */
+ ismember = 0;
+ if (on_dir) {
+ error = vauth_dir_ingroup(vcp, &ismember);
+ } else {
+ error = vauth_file_ingroup(vcp, &ismember);
+ }
+ if (error)
+ goto out;
+ if (ismember) {
+ _SETWHERE("group");
+ if (!group_ok)
+ error = EACCES;
+ goto out;
+ }
+
+ /* Not owner, not in group, use world result */
+ _SETWHERE("world");
+ if (!world_ok)
+ error = EACCES;
+
+ /* FALLTHROUGH */
+
+out:
+ KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
+ vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
+ (action & VREAD) ? "r" : "-",
+ (action & VWRITE) ? "w" : "-",
+ (action & VEXEC) ? "x" : "-",
+ needed,
+ (vap->va_mode & S_IRUSR) ? "r" : "-",
+ (vap->va_mode & S_IWUSR) ? "w" : "-",
+ (vap->va_mode & S_IXUSR) ? "x" : "-",
+ (vap->va_mode & S_IRGRP) ? "r" : "-",
+ (vap->va_mode & S_IWGRP) ? "w" : "-",
+ (vap->va_mode & S_IXGRP) ? "x" : "-",
+ (vap->va_mode & S_IROTH) ? "r" : "-",
+ (vap->va_mode & S_IWOTH) ? "w" : "-",
+ (vap->va_mode & S_IXOTH) ? "x" : "-",
+ kauth_cred_getuid(vcp->ctx->vc_ucred),
+ on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
+ on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
+ return(error);
+}
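+
+/*
+ * Worked example (editor's note, not part of the original change):
+ * a non-owner requests VWRITE on a file with mode 0644.  owner_ok is 1,
+ * group_ok and world_ok are both 0; the three classes disagree, so the
+ * ownership check runs (and the requester is not the owner), and since
+ * group and world both deny, the routine returns EACCES at "group/world"
+ * without paying for a group-membership lookup.  Only when group and world
+ * disagree does the (most expensive) kauth_cred_ismember_gid() path run.
+ */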