X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/91447636331957f3d9b5ca5b508f07c526b0074d..3903760236c30e3b5ace7a4eefac3a269d68957c:/bsd/miscfs/nullfs/null_vnops.c?ds=sidebyside diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c index ae4e8db1b..389adb7e4 100644 --- a/bsd/miscfs/nullfs/null_vnops.c +++ b/bsd/miscfs/nullfs/null_vnops.c @@ -1,28 +1,29 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2016 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. + +/*- + * Portions Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. @@ -35,10 +36,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -55,291 +52,396 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 + * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 * * Ancestors: - * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 - * ...and... - * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project - */ - -/* - * Null Layer - * - * (See mount_null(8) for more information.) - * - * The null layer duplicates a portion of the file system - * name space under a new name. In this respect, it is - * similar to the loopback file system. It differs from - * the loopback fs in two respects: it is implemented using - * a stackable layers techniques, and it's "null-node"s stack above - * all lower-layer vnodes, not just over directory vnodes. - * - * The null layer has two purposes. First, it serves as a demonstration - * of layering by proving a layer which does nothing. (It actually - * does everything the loopback file system does, which is slightly - * more than nothing.) Second, the null layer can serve as a prototype - * layer. Since it provides all necessary layer framework, - * new file system layers can be created very easily be starting - * with a null layer. - * - * The remainder of this man page examines the null layer as a basis - * for constructing new layers. - * - * - * INSTANTIATING NEW NULL LAYERS - * - * New null layers are created with mount_null(8). - * Mount_null(8) takes two arguments, the pathname - * of the lower vfs (target-pn) and the pathname where the null - * layer will appear in the namespace (alias-pn). After - * the null layer is put into place, the contents - * of target-pn subtree will be aliased under alias-pn. - * - * - * OPERATION OF A NULL LAYER - * - * The null layer is the minimum file system layer, - * simply bypassing all possible operations to the lower layer - * for processing there. The majority of its activity centers - * on the bypass routine, though which nearly all vnode operations - * pass. - * - * The bypass routine accepts arbitrary vnode operations for - * handling by the lower layer. It begins by examing vnode - * operation arguments and replacing any null-nodes by their - * lower-layer equivlants. It then invokes the operation - * on the lower layer. Finally, it replaces the null-nodes - * in the arguments and, if a vnode is return by the operation, - * stacks a null-node on top of the returned vnode. - * - * Although bypass handles most operations, vnop_getattr, vnop_lock, - * vnop_unlock, vnop_inactive, vnop_reclaim, and vnop_print are not - * bypassed. Vop_getattr must change the fsid being returned. - * Vop_lock and vnop_unlock must handle any locking for the - * current vnode as well as pass the lock request down. - * Vop_inactive and vnop_reclaim are not bypassed so that - * they can handle freeing null-layer specific data. Vop_print - * is not bypassed to avoid excessive debugging information. - * Also, certain vnode operations change the locking state within - * the operation (create, mknod, remove, link, rename, mkdir, rmdir, - * and symlink). Ideally these operations should not change the - * lock state, but should be changed to let the caller of the - * function unlock them. Otherwise all intermediate vnode layers - * (such as union, umapfs, etc) must catch these functions to do - * the necessary locking at their layer. - * - * - * INSTANTIATING VNODE STACKS - * - * Mounting associates the null layer with a lower layer, - * effect stacking two VFSes. Vnode stacks are instead - * created on demand as files are accessed. - * - * The initial mount creates a single vnode stack for the - * root of the new null layer. All other vnode stacks - * are created as a result of vnode operations on - * this or other null vnode stacks. - * - * New vnode stacks come into existance as a result of - * an operation which returns a vnode. - * The bypass routine stacks a null-node above the new - * vnode before returning it to the caller. - * - * For example, imagine mounting a null layer with - * "mount_null /usr/include /dev/layer/null". - * Changing directory to /dev/layer/null will assign - * the root null-node (which was created when the null layer was mounted). - * Now consider opening "sys". A vnop_lookup would be - * done on the root null-node. This operation would bypass through - * to the lower layer which would return a vnode representing - * the UFS "sys". Null_bypass then builds a null-node - * aliasing the UFS "sys" and returns this to the caller. - * Later operations on the null-node "sys" will repeat this - * process when constructing other vnode stacks. - * - * - * CREATING OTHER FILE SYSTEM LAYERS - * - * One of the easiest ways to construct new file system layers is to make - * a copy of the null layer, rename all files and variables, and - * then begin modifing the copy. Sed can be used to easily rename - * all variables. - * - * The umap layer is an example of a layer descended from the - * null layer. - * - * - * INVOKING OPERATIONS ON LOWER LAYERS - * - * There are two techniques to invoke operations on a lower layer - * when the operation cannot be completely bypassed. Each method - * is appropriate in different situations. In both cases, - * it is the responsibility of the aliasing layer to make - * the operation arguments "correct" for the lower layer - * by mapping an vnode arguments to the lower layer. - * - * The first approach is to call the aliasing layer's bypass routine. - * This method is most suitable when you wish to invoke the operation - * currently being hanldled on the lower layer. It has the advantage - * that the bypass routine already must do argument mapping. - * An example of this is null_getattrs in the null layer. - * - * A second approach is to directly invoked vnode operations on - * the lower layer with the VOP_OPERATIONNAME interface. - * The advantage of this method is that it is easy to invoke - * arbitrary operations on the lower layer. The disadvantage - * is that vnodes arguments must be manualy mapped. + * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 + * ...and... + * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project * + * $FreeBSD$ */ #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include "nullfs.h" -int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ +#define NULL_ROOT_INO 2 +#define NULL_SECOND_INO 3 +#define NULL_THIRD_INO 4 -/* - * This is the 10-Apr-92 bypass routine. - * This version has been optimized for speed, throwing away some - * safety checks. It should still always work, but it's not as - * robust to programmer errors. - * Define SAFETY to include some error checking code. - * - * In general, we map all vnodes going down and unmap them on the way back. - * As an exception to this, vnodes can be marked "unmapped" by setting - * the Nth bit in operation's vdesc_flags. - * - * Also, some BSD vnode operations have the side effect of node_put'ing - * their arguments. With stacking, the reference counts are held - * by the upper node, not the lower one, so we must handle these - * side-effects here. This is not of concern in Sun-derived systems - * since there are no such side-effects. - * - * This makes the following assumptions: - * - only one returned vpp - * - no INOUT vpp's (Sun's vnop_open has one of these) - * - the vnode operation vector of the first vnode should be used - * to determine what implementation of the op should be invoked - * - all mapped vnodes are of our vnode-type (NEEDSWORK: - * problems on rmdir'ing mount points and renaming?) - */ -int -null_bypass(ap) - struct vnop_generic_args /* { - struct vnodeop_desc *a_desc; - - } */ *ap; +vop_t * nullfs_vnodeop_p = NULL; + +/* the mountpoint lock should be held going into this function */ +static int +nullfs_isspecialvp(struct vnode * vp) { - extern int (**null_vnodeop_p)(void *); /* not extern, really "forward" */ - register struct vnode **this_vp_p; - int error; - struct vnode *old_vps[VDESC_MAX_VPS]; - struct vnode **vps_p[VDESC_MAX_VPS]; - struct vnode ***vppp; - struct vnodeop_desc *descp = ap->a_desc; - int reles, i; + struct null_mount * null_mp; - if (null_bug_bypass) - printf ("null_bypass: %s\n", descp->vdesc_name); + null_mp = MOUNTTONULLMOUNT(vnode_mount(vp)); -#ifdef SAFETY - /* - * We require at least one vp. - */ - if (descp->vdesc_vp_offsets == NULL || - descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) - panic ("null_bypass: no vp's in map.\n"); -#endif + /* only check for root and second here, third is special in a different way, + * related only to lookup and readdir */ + if (vp && (vp == null_mp->nullm_rootvp || vp == null_mp->nullm_secondvp)) { + return 1; + } + return 0; +} - /* - * Map the vnodes going in. - * Later, we'll invoke the operation based on - * the first mapped vnode's operation vector. - */ - reles = descp->vdesc_flags; - for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { - if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) - break; /* bail out at end of list */ - vps_p[i] = this_vp_p = - VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap); - /* - * We're not guaranteed that any but the first vnode - * are of our type. Check for and don't map any - * that aren't. (We must always map first vp or vclean fails.) - */ - if (i && (*this_vp_p == NULL || - (*this_vp_p)->v_op != null_vnodeop_p)) { - old_vps[i] = NULL; - } else { - old_vps[i] = *this_vp_p; - *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); - /* - * XXX - Several operations have the side effect - * of vnode_put'ing their vp's. We must account for - * that. (This should go away in the future.) - */ - if (reles & 1) - vnode_get(*this_vp_p); +/* helper function to handle locking where possible */ +static int +nullfs_checkspecialvp(struct vnode* vp) +{ + int result = 0; + struct null_mount * null_mp; + + null_mp = MOUNTTONULLMOUNT(vnode_mount(vp)); + + lck_mtx_lock(&null_mp->nullm_lock); + result = (nullfs_isspecialvp(vp)); + lck_mtx_unlock(&null_mp->nullm_lock); + + return result; +} + +static int +nullfs_default(__unused struct vnop_generic_args * args) +{ + NULLFSDEBUG("%s (default)\n", ((struct vnodeop_desc_fake *)args->a_desc)->vdesc_name); + return ENOTSUP; +} + +static int +nullfs_special_getattr(struct vnop_getattr_args * args) +{ + mount_t mp = vnode_mount(args->a_vp); + struct null_mount * null_mp = MOUNTTONULLMOUNT(mp); + + ino_t ino = NULL_ROOT_INO; + struct vnode_attr covered_rootattr; + vnode_t checkvp = null_mp->nullm_lowerrootvp; + + VATTR_INIT(&covered_rootattr); + VATTR_WANTED(&covered_rootattr, va_uid); + VATTR_WANTED(&covered_rootattr, va_gid); + VATTR_WANTED(&covered_rootattr, va_create_time); + VATTR_WANTED(&covered_rootattr, va_modify_time); + VATTR_WANTED(&covered_rootattr, va_access_time); + + /* prefer to get this from the lower root vp, but if not (i.e. forced unmount + * of lower fs) try the mount point covered vnode */ + if (vnode_getwithvid(checkvp, null_mp->nullm_lowerrootvid)) { + checkvp = vfs_vnodecovered(mp); + if (checkvp == NULL) { + return EIO; } - } - /* - * Call the operation on the lower layer - * with the modified argument structure. - */ - error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); + int error = vnode_getattr(checkvp, &covered_rootattr, args->a_context); - /* - * Maintain the illusion of call-by-value - * by restoring vnodes in the argument structure - * to their original value. - */ - reles = descp->vdesc_flags; - for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { - if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) - break; /* bail out at end of list */ - if (old_vps[i]) { - *(vps_p[i]) = old_vps[i]; - if (reles & 1) - vnode_put(*(vps_p[i])); + vnode_put(checkvp); + if (error) { + /* we should have been able to get attributes fore one of the two choices so + * fail if we didn't */ + return error; + } + + /* we got the attributes of the vnode we cover so plow ahead */ + if (args->a_vp == null_mp->nullm_secondvp) { + ino = NULL_SECOND_INO; + } + + VATTR_RETURN(args->a_vap, va_type, vnode_vtype(args->a_vp)); + VATTR_RETURN(args->a_vap, va_rdev, 0); + VATTR_RETURN(args->a_vap, va_nlink, 3); /* always just ., .., and the child */ + VATTR_RETURN(args->a_vap, va_total_size, 0); // hoping this is ok + + VATTR_RETURN(args->a_vap, va_data_size, 0); // hoping this is ok + VATTR_RETURN(args->a_vap, va_data_alloc, 0); + VATTR_RETURN(args->a_vap, va_iosize, vfs_statfs(mp)->f_iosize); + VATTR_RETURN(args->a_vap, va_fileid, ino); + VATTR_RETURN(args->a_vap, va_linkid, ino); + VATTR_RETURN(args->a_vap, va_fsid, vfs_statfs(mp)->f_fsid.val[0]); // return the fsid of the mount point + VATTR_RETURN(args->a_vap, va_filerev, 0); + VATTR_RETURN(args->a_vap, va_gen, 0); + VATTR_RETURN(args->a_vap, va_flags, UF_HIDDEN); /* mark our fake directories as hidden. People + shouldn't be enocouraged to poke around in them */ + + if (ino == NULL_SECOND_INO) { + VATTR_RETURN(args->a_vap, va_parentid, NULL_ROOT_INO); /* no parent at the root, so + the only other vnode that + goes through this path is + second and its parent is + 1.*/ + } + + if (VATTR_IS_ACTIVE(args->a_vap, va_mode)) { + /* force dr_xr_xr_x */ + VATTR_RETURN(args->a_vap, va_mode, S_IFDIR | S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + } + if (VATTR_IS_ACTIVE(args->a_vap, va_uid)) { + VATTR_RETURN(args->a_vap, va_uid, covered_rootattr.va_uid); + } + if (VATTR_IS_ACTIVE(args->a_vap, va_gid)) { + VATTR_RETURN(args->a_vap, va_gid, covered_rootattr.va_gid); + } + + if (VATTR_IS_ACTIVE(args->a_vap, va_create_time)) { + VATTR_SET_SUPPORTED(args->a_vap, va_create_time); + args->a_vap->va_create_time.tv_sec = covered_rootattr.va_create_time.tv_sec; + args->a_vap->va_create_time.tv_nsec = covered_rootattr.va_create_time.tv_nsec; + } + if (VATTR_IS_ACTIVE(args->a_vap, va_modify_time)) { + VATTR_SET_SUPPORTED(args->a_vap, va_modify_time); + args->a_vap->va_modify_time.tv_sec = covered_rootattr.va_modify_time.tv_sec; + args->a_vap->va_modify_time.tv_nsec = covered_rootattr.va_modify_time.tv_nsec; + } + if (VATTR_IS_ACTIVE(args->a_vap, va_access_time)) { + VATTR_SET_SUPPORTED(args->a_vap, va_access_time); + args->a_vap->va_modify_time.tv_sec = covered_rootattr.va_access_time.tv_sec; + args->a_vap->va_modify_time.tv_nsec = covered_rootattr.va_access_time.tv_nsec; + } + + return 0; +} + +static int +nullfs_getattr(struct vnop_getattr_args * args) +{ + int error; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(args->a_vp)); + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + lck_mtx_lock(&null_mp->nullm_lock); + if (nullfs_isspecialvp(args->a_vp)) { + error = nullfs_special_getattr(args); + lck_mtx_unlock(&null_mp->nullm_lock); + return error; + } + lck_mtx_unlock(&null_mp->nullm_lock); + + /* this will return a different inode for third than read dir will */ + struct vnode * lowervp = NULLVPTOLOWERVP(args->a_vp); + + error = vnode_getwithref(lowervp); + if (error == 0) { + error = VNOP_GETATTR(lowervp, args->a_vap, args->a_context); + vnode_put(lowervp); + + if (error == 0) { + /* fix up fsid so it doesn't say the underlying fs*/ + VATTR_RETURN(args->a_vap, va_fsid, vfs_statfs(vnode_mount(args->a_vp))->f_fsid.val[0]); } } - /* - * Map the possible out-going vpp - * (Assumes that the lower layer always returns - * a vnode_get'ed vpp unless it gets an error.) - */ - if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && - !(descp->vdesc_flags & VDESC_NOMAP_VPP) && - !error) { - /* - * XXX - even though some ops have vpp returned vp's, - * several ops actually vnode_put this before returning. - * We must avoid these ops. - * (This should go away when these ops are regularized.) - */ - if (descp->vdesc_flags & VDESC_VPP_WILLRELE) - goto out; - vppp = VOPARG_OFFSETTO(struct vnode***, - descp->vdesc_vpp_offset,ap); - error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp); + return error; +} + +static int +nullfs_open(struct vnop_open_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ } - out: - return (error); + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_OPEN(lvp, args->a_mode, args->a_context); + vnode_put(lvp); + } + + return error; +} + +static int +nullfs_close(struct vnop_close_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_CLOSE(lvp, args->a_fflag, args->a_context); + vnode_put(lvp); + } + return error; +} + +/* get lvp's parent, if possible, even if it isn't set. + + lvp is expected to have an iocount before and after this call. + + if a dvpp is populated the returned vnode has an iocount. */ +static int +null_get_lowerparent(vnode_t lvp, vnode_t * dvpp, vfs_context_t ctx) +{ + int error = 0; + struct vnode_attr va; + mount_t mp = vnode_mount(lvp); + vnode_t dvp = vnode_parent(lvp); + + if (dvp) { + error = vnode_get(dvp); + goto end; + } + + error = ENOENT; + if (!(mp->mnt_kern_flag & MNTK_PATH_FROM_ID)) { + goto end; + } + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_parentid); + + error = vnode_getattr(lvp, &va, ctx); + + if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) { + goto end; + } + + error = VFS_VGET(mp, (ino64_t)va.va_parentid, &dvp, ctx); + +end: + if (error == 0) { + *dvpp = dvp; + } + return error; +} + +/* the mountpoint lock should be held going into this function */ +static int +null_special_lookup(struct vnop_lookup_args * ap) +{ + struct componentname * cnp = ap->a_cnp; + struct vnode * dvp = ap->a_dvp; + struct vnode * ldvp = NULL; + struct vnode * lvp = NULL; + struct vnode * vp = NULL; + struct mount * mp = vnode_mount(dvp); + struct null_mount * null_mp = MOUNTTONULLMOUNT(mp); + int error = ENOENT; + + if (dvp == null_mp->nullm_rootvp) { + /* handle . and .. */ + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1 || (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')) { + /* this is the root so both . and .. give back the root */ + vp = dvp; + error = vnode_get(vp); + goto end; + } + } + + /* our virtual wrapper directory should be d but D is acceptable if the + * lower file system is case insensitive */ + if (cnp->cn_namelen == 1 && + (cnp->cn_nameptr[0] == 'd' || (null_mp->nullm_flags & NULLM_CASEINSENSITIVE ? cnp->cn_nameptr[0] == 'D' : 0))) { + error = 0; + if (null_mp->nullm_secondvp == NULL) { + error = null_getnewvnode(mp, NULL, dvp, &vp, cnp, 0); + if (error) { + goto end; + } + + null_mp->nullm_secondvp = vp; + } else { + vp = null_mp->nullm_secondvp; + error = vnode_get(vp); + } + } + + } else if (dvp == null_mp->nullm_secondvp) { + /* handle . and .. */ + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + vp = dvp; + error = vnode_get(vp); + goto end; + } else if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + /* parent here is the root vp */ + vp = null_mp->nullm_rootvp; + error = vnode_get(vp); + goto end; + } + } + /* nullmp->nullm_lowerrootvp was set at mount time so don't need to lock to + * access it */ + /* v_name should be null terminated but cn_nameptr is not necessarily. + cn_namelen is the number of characters before the null in either case */ + error = vnode_getwithvid(null_mp->nullm_lowerrootvp, null_mp->nullm_lowerrootvid); + if (error) { + goto end; + } + + /* We don't want to mess with case insensitivity and unicode, so the plan to + check here is + 1. try to get the lower root's parent + 2. If we get a parent, then perform a lookup on the lower file system + using the parent and the passed in cnp + 3. If that worked and we got a vp, then see if the vp is lowerrootvp. If + so we got a match + 4. Anything else results in ENOENT. + */ + error = null_get_lowerparent(null_mp->nullm_lowerrootvp, &ldvp, ap->a_context); + + if (error == 0) { + error = VNOP_LOOKUP(ldvp, &lvp, cnp, ap->a_context); + vnode_put(ldvp); + + if (error == 0) { + if (lvp == null_mp->nullm_lowerrootvp) { + /* always check the hashmap for a vnode for this, the root of the + * mirrored system */ + error = null_nodeget(mp, lvp, dvp, &vp, cnp, 0); + + if (error == 0 && null_mp->nullm_thirdcovervp == NULL) { + /* if nodeget succeeded then vp has an iocount*/ + null_mp->nullm_thirdcovervp = vp; + } + } else { + error = ENOENT; + } + vnode_put(lvp); + } + } + vnode_put(null_mp->nullm_lowerrootvp); + } + +end: + if (error == 0) { + *ap->a_vpp = vp; + } + return error; } /* @@ -347,218 +449,589 @@ null_bypass(ap) * as we progress through the tree. We also have to enforce read-only * if this layer is mounted read-only. */ -null_lookup(ap) - struct vnop_lookup_args /* { - struct vnode * a_dvp; - struct vnode ** a_vpp; - struct componentname * a_cnp; - vfs_context_t a_context; - } */ *ap; +static int +null_lookup(struct vnop_lookup_args * ap) { - struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; - int flags = cnp->cn_flags; - struct vnode *dvp, *vp; + struct componentname * cnp = ap->a_cnp; + struct vnode * dvp = ap->a_dvp; + struct vnode *vp, *ldvp, *lvp; + struct mount * mp; + struct null_mount * null_mp; int error; - error = null_bypass(ap); + NULLFSDEBUG("%s parent: %p component: %.*s\n", __FUNCTION__, ap->a_dvp, cnp->cn_namelen, cnp->cn_nameptr); + + mp = vnode_mount(dvp); + /* rename and delete are not allowed. this is a read only file system */ + if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME || cnp->cn_nameiop == CREATE) { + return (EROFS); + } + null_mp = MOUNTTONULLMOUNT(mp); + + lck_mtx_lock(&null_mp->nullm_lock); + if (nullfs_isspecialvp(dvp)) { + error = null_special_lookup(ap); + lck_mtx_unlock(&null_mp->nullm_lock); + return error; + } + lck_mtx_unlock(&null_mp->nullm_lock); + + // . and .. handling + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + vp = dvp; + } else if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + /* mount point crossing is handled in null_special_lookup */ + vp = vnode_parent(dvp); + } else { + goto notdot; + } + + error = vp ? vnode_get(vp) : ENOENT; + + if (error == 0) { + *ap->a_vpp = vp; + } + + return error; + } + +notdot: + ldvp = NULLVPTOLOWERVP(dvp); + vp = lvp = NULL; /* - * We must do the same locking and unlocking at this layer as - * is done in the layers below us. We could figure this out - * based on the error return and the LASTCN, LOCKPARENT, and - * LOCKLEAF flags. However, it is more expidient to just find - * out the state of the lower level vnodes and set ours to the - * same state. + * Hold ldvp. The reference on it, owned by dvp, is lost in + * case of dvp reclamation. */ - dvp = ap->a_dvp; - vp = *ap->a_vpp; - if (dvp == vp) - return (error); + error = vnode_getwithref(ldvp); + if (error) { + return error; + } + + error = VNOP_LOOKUP(ldvp, &lvp, cnp, ap->a_context); + + vnode_put(ldvp); + + if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) { + if (ldvp == lvp) { + vp = dvp; + error = vnode_get(vp); + } else { + error = null_nodeget(mp, lvp, dvp, &vp, cnp, 0); + } + if (error == 0) { + *ap->a_vpp = vp; + } + } + + /* if we got lvp, drop the iocount from VNOP_LOOKUP */ + if (lvp != NULL) { + vnode_put(lvp); + } + return (error); } /* - * Setattr call. + * Don't think this needs to do anything */ -int -null_setattr( - struct vnop_setattr_args /* { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vnode_attr *a_vap; - kauth_cred_t a_cred; - struct proc *a_p; - } */ *ap) +static int +null_inactive(__unused struct vnop_inactive_args * ap) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + return (0); +} + +static int +null_reclaim(struct vnop_reclaim_args * ap) { - struct vnode *vp = ap->a_vp; - struct vnode_attr *vap = ap->a_vap; - - if (VATTR_IS_ACTIVE(vap, va_data_size)) { - switch (vp->v_type) { - case VDIR: - return (EISDIR); - case VCHR: - case VBLK: - case VSOCK: - case VFIFO: - return (0); - case VREG: - case VLNK: - default: + struct vnode * vp; + struct null_node * xp; + struct vnode * lowervp; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(ap->a_vp)); + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + vp = ap->a_vp; + + xp = VTONULL(vp); + lowervp = xp->null_lowervp; + + lck_mtx_lock(&null_mp->nullm_lock); + + vnode_removefsref(vp); + + if (lowervp != NULL) { + /* root and second don't have a lowervp, so nothing to release and nothing + * got hashed */ + if (xp->null_flags & NULL_FLAG_HASHED) { + /* only call this if we actually made it into the hash list. reclaim gets + called also to + clean up a vnode that got created when it didn't need to under race + conditions */ + null_hashrem(xp); } + vnode_getwithref(lowervp); + vnode_rele(lowervp); + vnode_put(lowervp); + } + + if (vp == null_mp->nullm_rootvp) { + null_mp->nullm_rootvp = NULL; + } else if (vp == null_mp->nullm_secondvp) { + null_mp->nullm_secondvp = NULL; + } else if (vp == null_mp->nullm_thirdcovervp) { + null_mp->nullm_thirdcovervp = NULL; } - return (null_bypass(ap)); + + lck_mtx_unlock(&null_mp->nullm_lock); + + cache_purge(vp); + vnode_clearfsnode(vp); + + FREE(xp, M_TEMP); + + return 0; } -/* - * We handle getattr only to change the fsid. - */ -int -null_getattr(ap) - struct vnop_getattr_args /* { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } */ *ap; +#define DIRENT_SZ(dp) ((sizeof(struct dirent) - NAME_MAX) + (((dp)->d_namlen + 1 + 3) & ~3)) + +static int +store_entry_special(ino_t ino, const char * name, struct uio * uio) { - int error; + struct dirent e; + size_t namelen = strlen(name); + int error = EINVAL; - if (error = null_bypass(ap)) - return (error); - /* Requires that arguments be restored. */ - VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); - return (0); + if (namelen + 1 <= NAME_MAX) { + memset(&e, 0, sizeof(e)); + + e.d_ino = ino; + e.d_type = DT_DIR; + + e.d_namlen = namelen; /* don't include NUL */ + e.d_reclen = DIRENT_SZ(&e); + if (uio_resid(uio) >= e.d_reclen) { + strlcpy(e.d_name, name, NAME_MAX); + error = uiomove((caddr_t)&e, e.d_reclen, uio); + } else { + error = EMSGSIZE; + } + } + return error; } -int -null_access(ap) - struct vnop_access_args /* { - struct vnode *a_vp; - int a_action; - vfs_context_t a_context; - } */ *ap; +static int +nullfs_special_readdir(struct vnop_readdir_args * ap) { - return (null_bypass(ap)); + struct vnode * vp = ap->a_vp; + struct uio * uio = ap->a_uio; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(vp)); + off_t offset = uio_offset(uio); + int error = ERANGE; + int items = 0; + ino_t ino = 0; + const char * name = NULL; + + if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) + return (EINVAL); + + if (offset == 0) { + /* . case */ + if (vp == null_mp->nullm_rootvp) { + ino = NULL_ROOT_INO; + } else /* only get here if vp matches nullm_rootvp or nullm_secondvp */ + { + ino = NULL_SECOND_INO; + } + error = store_entry_special(ino, ".", uio); + if (error) { + goto out; + } + offset++; + items++; + } + if (offset == 1) { + /* .. case */ + /* only get here if vp matches nullm_rootvp or nullm_secondvp */ + ino = NULL_ROOT_INO; + + error = store_entry_special(ino, "..", uio); + if (error) { + goto out; + } + offset++; + items++; + } + if (offset == 2) { + /* the directory case */ + if (vp == null_mp->nullm_rootvp) { + ino = NULL_SECOND_INO; + name = "d"; + } else /* only get here if vp matches nullm_rootvp or nullm_secondvp */ + { + ino = NULL_THIRD_INO; + if (vnode_getwithvid(null_mp->nullm_lowerrootvp, null_mp->nullm_lowerrootvid)) { + /* In this case the lower file system has been ripped out from under us, + but we don't want to error out + Instead we just want d to look empty. */ + error = 0; + goto out; + } + name = vnode_getname_printable(null_mp->nullm_lowerrootvp); + } + error = store_entry_special(ino, name, uio); + + if (ino == NULL_THIRD_INO) { + vnode_putname_printable(name); + vnode_put(null_mp->nullm_lowerrootvp); + } + + if (error) { + goto out; + } + offset++; + items++; + } + +out: + if (error == EMSGSIZE) { + error = 0; /* return success if we ran out of space, but we wanted to make + sure that we didn't update offset and items incorrectly */ + } + uio_setoffset(uio, offset); + if (ap->a_numdirent) { + *ap->a_numdirent = items; + } + return error; } -int -null_inactive(ap) - struct vnop_inactive_args /* { - struct vnode *a_vp; - vfs_context_t a_context; - } */ *ap; +static int +nullfs_readdir(struct vnop_readdir_args * ap) { - /* - * Do nothing (and _don't_ bypass). - * Wait to vnode_put lowervp until reclaim, - * so that until then our null_node is in the - * cache and reusable. - * - * NEEDSWORK: Someday, consider inactive'ing - * the lowervp and then trying to reactivate it - * with capabilities (v_id) - * like they do in the name lookup cache code. - * That's too much work for now. + struct vnode *vp, *lvp; + int error; + struct null_mount * null_mp = MOUNTTONULLMOUNT(vnode_mount(ap->a_vp)); + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + /* assumption is that any vp that comes through here had to go through lookup */ - return (0); + + lck_mtx_lock(&null_mp->nullm_lock); + if (nullfs_isspecialvp(ap->a_vp)) { + error = nullfs_special_readdir(ap); + lck_mtx_unlock(&null_mp->nullm_lock); + return error; + } + lck_mtx_unlock(&null_mp->nullm_lock); + + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_READDIR(lvp, ap->a_uio, ap->a_flags, ap->a_eofflag, ap->a_numdirent, ap->a_context); + vnode_put(lvp); + } + + return error; } -int -null_reclaim(ap) - struct vnop_reclaim_args /* { - struct vnode *a_vp; - vfs_context_t a_context; - } */ *ap; +static int +nullfs_readlink(struct vnop_readlink_args * ap) { - struct vnode *vp = ap->a_vp; - struct null_node *xp = VTONULL(vp); - struct vnode *lowervp = xp->null_lowervp; + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + int error; + struct vnode *vp, *lvp; - /* - * Note: in vnop_reclaim, vp->v_op == dead_vnodeop_p, - * so we can't call VOPs on ourself. - */ - /* After this assignment, this node will not be re-used. */ - xp->null_lowervp = NULL; - LIST_REMOVE(xp, null_hash); - FREE(vp->v_data, M_TEMP); - vp->v_data = NULL; - vnode_put (lowervp); - return (0); + if (nullfs_checkspecialvp(ap->a_vp)) { + return ENOTSUP; /* the special vnodes aren't links */ + } + + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_READLINK(lvp, ap->a_uio, ap->a_context); + vnode_put(lvp); + + if (error) { + NULLFSDEBUG("readlink failed: %d\n", error); + } + } + + return error; } -/* - * XXX - vnop_strategy must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. - */ -int -null_strategy(ap) - struct vnop_strategy_args /* { - struct buf *a_bp; - } */ *ap; +static int +nullfs_pathconf(__unused struct vnop_pathconf_args * args) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + return EINVAL; +} + +static int +nullfs_fsync(__unused struct vnop_fsync_args * args) +{ + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + return 0; +} + +static int +nullfs_mmap(struct vnop_mmap_args * args) { - struct buf *bp = ap->a_bp; int error; - struct vnode *savedvp; + struct vnode *vp, *lvp; - savedvp = vnode(bp); - buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); - error = VNOP_STRATEGY(bp); + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } - buf_setvnode(bp, savedvp); + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_MMAP(lvp, args->a_fflags, args->a_context); + vnode_put(lvp); + } - return (error); + return error; } -/* - * XXX - like vnop_strategy, vnop_bwrite must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. - */ -int -null_bwrite(ap) - struct vnop_bwrite_args /* { - struct buf *a_bp; - } */ *ap; +static int +nullfs_mnomap(struct vnop_mnomap_args * args) { - struct buf *bp = ap->a_bp; int error; - struct vnode *savedvp; + struct vnode *vp, *lvp; - savedvp = buf_vnode(bp); - buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); - error = VNOP_BWRITE(bp); + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } - buf_setvnode(bp, savedvp); + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_MNOMAP(lvp, args->a_context); + vnode_put(lvp); + } - return (error); + return error; } -/* - * Global vfs data structures - */ +static int +nullfs_getxattr(struct vnop_getxattr_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_GETXATTR(lvp, args->a_name, args->a_uio, args->a_size, args->a_options, args->a_context); + vnode_put(lvp); + } + + return error; +} + +static int +nullfs_listxattr(struct vnop_listxattr_args * args) +{ + int error; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, args->a_vp); + + if (nullfs_checkspecialvp(args->a_vp)) { + return 0; /* nothing extra needed */ + } + + vp = args->a_vp; + lvp = NULLVPTOLOWERVP(vp); + error = vnode_getwithref(lvp); + if (error == 0) { + error = VNOP_LISTXATTR(lvp, args->a_uio, args->a_size, args->a_options, args->a_context); + vnode_put(lvp); + } + + return error; +} + +/* relies on v1 paging */ +static int +nullfs_pagein(struct vnop_pagein_args * ap) +{ + int error = EIO; + struct vnode *vp, *lvp; + + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); + + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); + + if (vnode_vtype(vp) != VREG) { + return ENOTSUP; + } + + /* + * Ask VM/UBC/VFS to do our bidding + */ + if (vnode_getwithvid(lvp, NULLVPTOLOWERVID(vp)) == 0) { + vm_offset_t ioaddr; + uio_t auio; + kern_return_t kret; + off_t bytes_to_commit; + off_t lowersize; + upl_t upl = ap->a_pl; + user_ssize_t bytes_remaining = 0; -#define VOPFUNC int (*)(void *) + auio = uio_create(1, ap->a_f_offset, UIO_SYSSPACE, UIO_READ); + if (auio == NULL) { + error = EIO; + goto exit_no_unmap; + } + + kret = ubc_upl_map(upl, &ioaddr); + if (KERN_SUCCESS != kret) { + panic("nullfs_pagein: ubc_upl_map() failed with (%d)", kret); + } + + ioaddr += ap->a_pl_offset; + + error = uio_addiov(auio, (user_addr_t)ioaddr, ap->a_size); + if (error) { + goto exit; + } + + lowersize = ubc_getsize(lvp); + if (lowersize != ubc_getsize(vp)) { + (void)ubc_setsize(vp, lowersize); /* ignore failures, nothing can be done */ + } + + error = VNOP_READ(lvp, auio, ((ap->a_flags & UPL_IOSYNC) ? IO_SYNC : 0), ap->a_context); + + bytes_remaining = uio_resid(auio); + if (bytes_remaining > 0 && bytes_remaining <= (user_ssize_t)ap->a_size) + { + /* zero bytes that weren't read in to the upl */ + bzero((void*)((uintptr_t)(ioaddr + ap->a_size - bytes_remaining)), (size_t) bytes_remaining); + } + + exit: + kret = ubc_upl_unmap(upl); + if (KERN_SUCCESS != kret) { + panic("nullfs_pagein: ubc_upl_unmap() failed with (%d)", kret); + } + + if (auio != NULL) { + uio_free(auio); + } + + exit_no_unmap: + if ((ap->a_flags & UPL_NOCOMMIT) == 0) { + if (!error && (bytes_remaining >= 0) && (bytes_remaining <= (user_ssize_t)ap->a_size)) { + /* only commit what was read in (page aligned)*/ + bytes_to_commit = ap->a_size - bytes_remaining; + if (bytes_to_commit) + { + /* need to make sure bytes_to_commit and byte_remaining are page aligned before calling ubc_upl_commit_range*/ + if (bytes_to_commit & PAGE_MASK) + { + bytes_to_commit = (bytes_to_commit & (~PAGE_MASK)) + (PAGE_MASK + 1); + assert(bytes_to_commit <= (off_t)ap->a_size); + + bytes_remaining = ap->a_size - bytes_to_commit; + } + ubc_upl_commit_range(upl, ap->a_pl_offset, (upl_size_t)bytes_to_commit, UPL_COMMIT_FREE_ON_EMPTY); + } + + /* abort anything thats left */ + if (bytes_remaining) { + ubc_upl_abort_range(upl, ap->a_pl_offset + bytes_to_commit, (upl_size_t)bytes_remaining, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + } + } else { + ubc_upl_abort_range(upl, ap->a_pl_offset, (upl_size_t)ap->a_size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + } + } + vnode_put(lvp); + } else if((ap->a_flags & UPL_NOCOMMIT) == 0) { + ubc_upl_abort_range(ap->a_pl, ap->a_pl_offset, (upl_size_t)ap->a_size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + } + return error; +} + +static int +nullfs_read(struct vnop_read_args * ap) +{ + int error = EIO; + + struct vnode *vp, *lvp; -int (**null_vnodeop_p)(void *); -struct vnodeopv_entry_desc null_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)null_bypass }, + NULLFSDEBUG("%s %p\n", __FUNCTION__, ap->a_vp); - { &vnop_lookup_desc, (VOPFUNC)null_lookup }, - { &vnop_setattr_desc, (VOPFUNC)null_setattr }, - { &vnop_getattr_desc, (VOPFUNC)null_getattr }, - { &vnop_access_desc, (VOPFUNC)null_access }, - { &vnop_inactive_desc, (VOPFUNC)null_inactive }, - { &vnop_reclaim_desc, (VOPFUNC)null_reclaim }, + if (nullfs_checkspecialvp(ap->a_vp)) { + return ENOTSUP; /* the special vnodes can't be read */ + } - { &vnop_strategy_desc, (VOPFUNC)null_strategy }, - { &vnop_bwrite_desc, (VOPFUNC)null_bwrite }, + vp = ap->a_vp; + lvp = NULLVPTOLOWERVP(vp); - { (struct vnodeop_desc*)NULL, (int(*)())NULL } + /* + * First some house keeping + */ + if (vnode_getwithvid(lvp, NULLVPTOLOWERVID(vp)) == 0) { + if (!vnode_isreg(lvp) && !vnode_islnk(lvp)) { + error = EPERM; + goto end; + } + + if (uio_resid(ap->a_uio) == 0) { + error = 0; + goto end; + } + + /* + * Now ask VM/UBC/VFS to do our bidding + */ + + error = VNOP_READ(lvp, ap->a_uio, ap->a_ioflag, ap->a_context); + if (error) { + NULLFSDEBUG("VNOP_READ failed: %d\n", error); + } + end: + vnode_put(lvp); + } + return error; +} + +/* + * Global vfs data structures + */ + +static struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = { + {&vnop_default_desc, (vop_t)nullfs_default}, {&vnop_getattr_desc, (vop_t)nullfs_getattr}, + {&vnop_open_desc, (vop_t)nullfs_open}, {&vnop_close_desc, (vop_t)nullfs_close}, + {&vnop_inactive_desc, (vop_t)null_inactive}, {&vnop_reclaim_desc, (vop_t)null_reclaim}, + {&vnop_lookup_desc, (vop_t)null_lookup}, {&vnop_readdir_desc, (vop_t)nullfs_readdir}, + {&vnop_readlink_desc, (vop_t)nullfs_readlink}, {&vnop_pathconf_desc, (vop_t)nullfs_pathconf}, + {&vnop_fsync_desc, (vop_t)nullfs_fsync}, {&vnop_mmap_desc, (vop_t)nullfs_mmap}, + {&vnop_mnomap_desc, (vop_t)nullfs_mnomap}, {&vnop_getxattr_desc, (vop_t)nullfs_getxattr}, + {&vnop_pagein_desc, (vop_t)nullfs_pagein}, {&vnop_read_desc, (vop_t)nullfs_read}, + {&vnop_listxattr_desc, (vop_t)nullfs_listxattr}, {NULL, NULL}, }; -struct vnodeopv_desc null_vnodeop_opv_desc = - { &null_vnodeop_p, null_vnodeop_entries }; + +struct vnodeopv_desc nullfs_vnodeop_opv_desc = {&nullfs_vnodeop_p, nullfs_vnodeop_entries};