X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/c910b4d9d2451126ae3917b931cd4390c11e1d52..c18c124eaa464aaaa5549e99e5a70fc9cbb50944:/bsd/vfs/vfs_vnops.c?ds=inline diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index 0eb1036ad..9b431080f 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,11 +105,17 @@ int ubc_setcred(struct vnode *, struct proc *); #include #include +#include #if CONFIG_MACF #include #endif +#if CONFIG_PROTECT +#include +#endif + +extern void sigpup_attach_vnode(vnode_t); /* XXX */ static int vn_closefile(struct fileglob *fp, vfs_context_t ctx); static int vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, @@ -122,13 +128,31 @@ static int vn_select( struct fileproc *fp, int which, void * wql, vfs_context_t ctx); static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); +static void filt_vndetach(struct knote *kn); +static int filt_vnode(struct knote *kn, long hint); +static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx); #if 0 static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident, vfs_context_t ctx); #endif -struct fileops vnops = - { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile, vn_kqfilt_add, NULL }; +const struct fileops vnops = { + DTYPE_VNODE, + vn_read, + vn_write, + vn_ioctl, + vn_select, + vn_closefile, + vn_kqfilt_add, + NULL +}; + +struct filterops vnode_filtops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = filt_vndetach, + .f_event = filt_vnode +}; /* * Common code for vnode open operations. 
@@ -153,6 +177,137 @@ vn_open_modflags(struct nameidata *ndp, int *fmodep, int cmode) return(vn_open_auth(ndp, fmodep, &va)); } +static int +vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx) +{ + int error; + + if ((error = vnode_ref_ext(vp, fmode, 0)) != 0) { + goto bad; + } + + /* Call out to allow 3rd party notification of open. + * Ignore result of kauth_authorize_fileop call. + */ +#if CONFIG_MACF + mac_vnode_notify_open(ctx, vp, fmode); +#endif + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, + (uintptr_t)vp, 0); + + sigpup_attach_vnode(vp); + + return 0; + +bad: + return error; + +} + +/* + * May do nameidone() to allow safely adding an FSEvent. Cue off of ni_dvp to + * determine whether that has happened. + */ +static int +vn_open_auth_do_create(struct nameidata *ndp, struct vnode_attr *vap, int fmode, boolean_t *did_create, boolean_t *did_open, vfs_context_t ctx) +{ + uint32_t status = 0; + vnode_t dvp = ndp->ni_dvp; + int batched; + int error; + vnode_t vp; + + batched = vnode_compound_open_available(ndp->ni_dvp); + *did_open = FALSE; + + VATTR_SET(vap, va_type, VREG); + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + +#if NAMEDRSRCFORK + if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) { + if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0) + goto out; + if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0) + goto out; + *did_create = TRUE; + } else { +#endif + if (!batched) { + if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0) + goto out; + } + + error = vn_create(dvp, &ndp->ni_vp, ndp, vap, VN_CREATE_DOOPEN, fmode, &status, ctx); + if (error != 0) { + if (batched) { + *did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? 
TRUE : FALSE; + } else { + *did_create = FALSE; + } + + if (error == EKEEPLOOKING) { + if (*did_create) { + panic("EKEEPLOOKING, but we did a create?"); + } + if (!batched) { + panic("EKEEPLOOKING from filesystem that doesn't support compound vnops?"); + } + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + + /* + * Do NOT drop the dvp: we need everything to continue the lookup. + */ + return error; + } + } else { + if (batched) { + *did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? 1 : 0; + *did_open = TRUE; + } else { + *did_create = TRUE; + } + } +#if NAMEDRSRCFORK + } +#endif + + vp = ndp->ni_vp; + + if (*did_create) { + int update_flags = 0; + + // Make sure the name & parent pointers are hooked up + if (vp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (vp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) + vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags); + + vnode_put(dvp); + ndp->ni_dvp = NULLVP; + +#if CONFIG_FSE + if (need_fsevent(FSE_CREATE_FILE, vp)) { + add_fsevent(FSE_CREATE_FILE, ctx, + FSE_ARG_VNODE, vp, + FSE_ARG_DONE); + } +#endif + } +out: + if (ndp->ni_dvp != NULLVP) { + vnode_put(dvp); + ndp->ni_dvp = NULLVP; + } + + return error; +} + /* * Open a file with authorization, updating the contents of the structures * pointed to by ndp, fmodep, and vap as necessary to perform the requested @@ -206,228 +361,283 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) vfs_context_t ctx = ndp->ni_cnd.cn_context; int error; int fmode; - kauth_action_t action; + uint32_t origcnflags; + boolean_t did_create; + boolean_t did_open; + boolean_t need_vnop_open; + boolean_t batched; + boolean_t ref_failed; again: vp = NULL; dvp = NULL; + batched = FALSE; + did_create = FALSE; + need_vnop_open = TRUE; + ref_failed = FALSE; fmode = *fmodep; + origcnflags = 
ndp->ni_cnd.cn_flags; + + /* + * O_CREAT + */ if (fmode & O_CREAT) { if ( (fmode & O_DIRECTORY) ) { error = EINVAL; goto out; } ndp->ni_cnd.cn_nameiop = CREATE; - /* Inherit USEDVP flag only */ - ndp->ni_cnd.cn_flags &= USEDVP; +#if CONFIG_TRIGGERS + ndp->ni_op = OP_LINK; +#endif + /* Inherit USEDVP, vnode_open() supported flags only */ + ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT); ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF | AUDITVNPATH1; + ndp->ni_flag = NAMEI_COMPOUNDOPEN; #if NAMEDRSRCFORK /* open calls are allowed for resource forks. */ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif - if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0 && (origcnflags & FOLLOW) != 0) ndp->ni_cnd.cn_flags |= FOLLOW; + +continue_create_lookup: if ( (error = namei(ndp)) ) goto out; + dvp = ndp->ni_dvp; vp = ndp->ni_vp; - /* not found, create */ - if (vp == NULL) { - /* must have attributes for a new file */ - if (vap == NULL) { - error = EINVAL; - goto badcreate; - } - - VATTR_SET(vap, va_type, VREG); -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, - dvp, &ndp->ni_cnd, vap); - if (error) - goto badcreate; -#endif /* MAC */ + batched = vnode_compound_open_available(dvp); - /* authorize before creating */ - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto badcreate; + /* not found, create */ + if (vp == NULL) { + /* must have attributes for a new file */ + if (vap == NULL) { + vnode_put(dvp); + error = EINVAL; + goto out; + } + /* + * Attempt a create. For a system supporting compound VNOPs, we may + * find an existing file or create one; in either case, we will already + * have the file open and no VNOP_OPEN() will be needed. 
+ */ + error = vn_open_auth_do_create(ndp, vap, fmode, &did_create, &did_open, ctx); - if (fmode & O_EXCL) - vap->va_vaflags |= VA_EXCLUSIVE; -#if NAMEDRSRCFORK - if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) { - if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0) - goto badcreate; - } else -#endif - if ((error = vn_create(dvp, &ndp->ni_vp, &ndp->ni_cnd, vap, 0, ctx)) != 0) - goto badcreate; - + dvp = ndp->ni_dvp; vp = ndp->ni_vp; - if (vp) { - int update_flags = 0; - - // Make sure the name & parent pointers are hooked up - if (vp->v_name == NULL) - update_flags |= VNODE_UPDATE_NAME; - if (vp->v_parent == NULLVP) - update_flags |= VNODE_UPDATE_PARENT; - - if (update_flags) - vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags); - -#if CONFIG_FSE - if (need_fsevent(FSE_CREATE_FILE, vp)) { - add_fsevent(FSE_CREATE_FILE, ctx, - FSE_ARG_VNODE, vp, - FSE_ARG_DONE); + /* + * Detected a node that the filesystem couldn't handle. Don't call + * nameidone() yet, because we need that path buffer. + */ + if (error == EKEEPLOOKING) { + if (!batched) { + panic("EKEEPLOOKING from a filesystem that doesn't support compound VNOPs?"); } -#endif - + goto continue_create_lookup; } - /* - * nameidone has to happen before we vnode_put(dvp) - * and clear the ni_dvp field, since it may need - * to release the fs_nodelock on the dvp - */ -badcreate: + nameidone(ndp); - ndp->ni_dvp = NULL; - vnode_put(dvp); + if (dvp) { + panic("Shouldn't have a dvp here."); + } if (error) { /* - * Check for a creation race. + * Check for a creation or unlink race. */ - if ((error == EEXIST) && !(fmode & O_EXCL)) { + if (((error == EEXIST) && !(fmode & O_EXCL)) || + ((error == ENOENT) && (fmode & O_CREAT))){ + if (vp) + vnode_put(vp); goto again; } goto bad; } - fmode &= ~O_TRUNC; - } else { + + need_vnop_open = !did_open; + } + else { + if (fmode & O_EXCL) + error = EEXIST; + + /* + * We have a vnode. 
Use compound open if available + * or else fall through to "traditional" path. Note: can't + * do a compound open for root, because the parent belongs + * to a different FS. + */ + if (error == 0 && batched && (vnode_mount(dvp) == vnode_mount(vp))) { + error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx); + + if (error == 0) { + vp = ndp->ni_vp; + need_vnop_open = FALSE; + } else if (error == EKEEPLOOKING) { + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + goto continue_create_lookup; + } + } nameidone(ndp); - ndp->ni_dvp = NULL; vnode_put(dvp); + ndp->ni_dvp = NULLVP; - if (fmode & O_EXCL) { - error = EEXIST; + if (error) { goto bad; } + fmode &= ~O_CREAT; + + /* Fall through */ } - } else { + } + else { + /* + * Not O_CREAT + */ ndp->ni_cnd.cn_nameiop = LOOKUP; - /* Inherit USEDVP flag only */ - ndp->ni_cnd.cn_flags &= USEDVP; - ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1; + /* Inherit USEDVP, vnode_open() supported flags only */ + ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT); + ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1 | WANTPARENT; #if NAMEDRSRCFORK /* open calls are allowed for resource forks. 
*/ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif - if (fmode & O_NOFOLLOW || fmode & O_SYMLINK) { - ndp->ni_cnd.cn_flags &= ~FOLLOW; + ndp->ni_flag = NAMEI_COMPOUNDOPEN; + + /* preserve NOFOLLOW from vnode_open() */ + if (fmode & O_NOFOLLOW || fmode & O_SYMLINK || (origcnflags & FOLLOW) == 0) { + ndp->ni_cnd.cn_flags &= ~FOLLOW; } - if ( (error = namei(ndp)) ) - goto out; - vp = ndp->ni_vp; + /* Do a lookup, possibly going directly to filesystem for compound operation */ + do { + if ( (error = namei(ndp)) ) + goto out; + vp = ndp->ni_vp; + dvp = ndp->ni_dvp; + + /* Check for batched lookup-open */ + batched = vnode_compound_open_available(dvp); + if (batched && ((vp == NULLVP) || (vnode_mount(dvp) == vnode_mount(vp)))) { + error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx); + vp = ndp->ni_vp; + if (error == 0) { + need_vnop_open = FALSE; + } else if (error == EKEEPLOOKING) { + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + } + } + } while (error == EKEEPLOOKING); + nameidone(ndp); - ndp->ni_dvp = NULL; + vnode_put(dvp); + ndp->ni_dvp = NULLVP; - if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { - error = ENOTDIR; + if (error) { goto bad; } } - if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { - error = EOPNOTSUPP; /* Operation not supported on socket */ - goto bad; - } - - if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { - error = ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */ - goto bad; + /* + * By this point, nameidone() is called, dvp iocount is dropped, + * and dvp pointer is cleared. 
+ */ + if (ndp->ni_dvp != NULLVP) { + panic("Haven't cleaned up adequately in vn_open_auth()"); } - /* authorize open of an existing file */ - if ((fmode & O_CREAT) == 0) { - - /* disallow write operations on directories */ - if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { - error = EISDIR; - goto bad; + /* + * Expect to use this code for filesystems without compound VNOPs, for the root + * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(), + * and for shadow files, which do not live on the same filesystems as their "parents." + */ + if (need_vnop_open) { + if (batched && !vnode_isvroot(vp) && !vnode_isnamedstream(vp)) { + panic("Why am I trying to use VNOP_OPEN() on anything other than the root or a named stream?"); } -#if CONFIG_MACF - error = mac_vnode_check_open(ctx, vp, fmode); - if (error) - goto bad; -#endif - - /* compute action to be authorized */ - action = 0; - if (fmode & FREAD) { - action |= KAUTH_VNODE_READ_DATA; + if (!did_create) { + error = vn_authorize_open_existing(vp, &ndp->ni_cnd, fmode, ctx, NULL); + if (error) { + goto bad; + } } - if (fmode & (FWRITE | O_TRUNC)) { - /* - * If we are writing, appending, and not truncating, - * indicate that we are appending so that if the - * UF_APPEND or SF_APPEND bits are set, we do not deny - * the open. - */ - if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { - action |= KAUTH_VNODE_APPEND_DATA; - } else { - action |= KAUTH_VNODE_WRITE_DATA; + +#if CONFIG_PROTECT + /* + * Perform any content protection access checks prior to calling + * into the filesystem, if the raw encrypted mode was not + * requested. + * + * If the va_dataprotect_flags are NOT active, or if they are, + * but they do not have the VA_DP_RAWENCRYPTED bit set, then we need + * to perform the checks. 
+ */ + if (!(VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) || + ((vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) == 0)) { + error = cp_handle_open (vp, fmode); + if (error) { + goto bad; } } - if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) - goto bad; - +#endif - // - // if the vnode is tagged VOPENEVT and the current process - // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY - // flag to the open mode so that this open won't count against - // the vnode when carbon delete() does a vnode_isinuse() to see - // if a file is currently in use. this allows spotlight - // importers to not interfere with carbon apps that depend on - // the no-delete-if-busy semantics of carbon delete(). - // - if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { - fmode |= O_EVTONLY; + error = VNOP_OPEN(vp, fmode, ctx); + if (error) { + goto bad; } + need_vnop_open = FALSE; + } + // if the vnode is tagged VOPENEVT and the current process + // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY + // flag to the open mode so that this open won't count against + // the vnode when carbon delete() does a vnode_isinuse() to see + // if a file is currently in use. this allows spotlight + // importers to not interfere with carbon apps that depend on + // the no-delete-if-busy semantics of carbon delete(). + // + if (!did_create && (vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { + fmode |= O_EVTONLY; } - if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) { + /* + * Grab reference, etc. + */ + error = vn_open_auth_finish(vp, fmode, ctx); + if (error) { + ref_failed = TRUE; goto bad; } - if ( (error = vnode_ref_ext(vp, fmode)) ) { - goto bad2; - } - /* call out to allow 3rd party notification of open. - * Ignore result of kauth_authorize_fileop call. 
- */ - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, - (uintptr_t)vp, 0); + /* Compound VNOP open is responsible for doing the truncate */ + if (batched || did_create) + fmode &= ~O_TRUNC; *fmodep = fmode; return (0); -bad2: - VNOP_CLOSE(vp, fmode, ctx); + bad: + /* Opened either explicitly or by a batched create */ + if (!need_vnop_open) { + VNOP_CLOSE(vp, fmode, ctx); + } + ndp->ni_vp = NULL; if (vp) { #if NAMEDRSRCFORK - if ((vnode_isnamedstream(vp)) && (vp->v_parent != NULLVP) && - (vnode_isshadow (vp))) { - vnode_recycle(vp); + /* Aggressively recycle shadow files if we error'd out during open() */ + if ((vnode_isnamedstream(vp)) && + (vp->v_parent != NULLVP) && + (vnode_isshadow(vp))) { + vnode_recycle(vp); } #endif vnode_put(vp); @@ -438,10 +648,11 @@ bad: * * EREDRIVEOPEN: means that we were hit by the tty allocation race. */ - if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN)) { + if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN) || ref_failed) { goto again; } } + out: return (error); } @@ -480,37 +691,53 @@ int vn_close(struct vnode *vp, int flags, vfs_context_t ctx) { int error; - -#if CONFIG_FSE - if (flags & FWASWRITTEN) { - if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) { - add_fsevent(FSE_CONTENT_MODIFIED, ctx, - FSE_ARG_VNODE, vp, - FSE_ARG_DONE); - } - } -#endif + int flusherror = 0; #if NAMEDRSRCFORK /* Sync data from resource fork shadow file if needed. */ if ((vp->v_flag & VISNAMEDSTREAM) && (vp->v_parent != NULLVP) && - (vnode_isshadow(vp))) { + vnode_isshadow(vp)) { if (flags & FWASWRITTEN) { - (void) vnode_flushnamedstream(vp->v_parent, vp, ctx); + flusherror = vnode_flushnamedstream(vp->v_parent, vp, ctx); } } #endif /* work around for foxhound */ - if (vp->v_type == VBLK) + if (vnode_isspec(vp)) (void)vnode_rele_ext(vp, flags, 0); + /* + * On HFS, we flush when the last writer closes. 
We do this + * because resource fork vnodes hold a reference on data fork + * vnodes and that will prevent them from getting VNOP_INACTIVE + * which will delay when we flush cached data. In future, we + * might find it beneficial to do this for all file systems. + * Note that it's OK to access v_writecount without the lock + * in this context. + */ + if (vp->v_tag == VT_HFS && (flags & FWRITE) && vp->v_writecount == 1) + VNOP_FSYNC(vp, MNT_NOWAIT, ctx); + error = VNOP_CLOSE(vp, flags, ctx); - if (vp->v_type != VBLK) +#if CONFIG_FSE + if (flags & FWASWRITTEN) { + if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) { + add_fsevent(FSE_CONTENT_MODIFIED, ctx, + FSE_ARG_VNODE, vp, + FSE_ARG_DONE); + } + } +#endif + + if (!vnode_isspec(vp)) (void)vnode_rele_ext(vp, flags, 0); + if (flusherror) { + error = flusherror; + } return (error); } @@ -519,15 +746,17 @@ vn_read_swapfile( struct vnode *vp, uio_t uio) { - static char *swap_read_zero_page = NULL; int error; off_t swap_count, this_count; off_t file_end, read_end; off_t prev_resid; + char *my_swap_page; /* - * Reading from a swap file will get you all zeroes. + * Reading from a swap file will get you zeroes. */ + + my_swap_page = NULL; error = 0; swap_count = uio_resid(uio); @@ -542,49 +771,20 @@ vn_read_swapfile( } while (swap_count > 0) { - if (swap_read_zero_page == NULL) { - char *my_zero_page; - int funnel_state; - - /* - * Take kernel funnel so that only one thread - * sets up "swap_read_zero_page". - */ - funnel_state = thread_funnel_set(kernel_flock, TRUE); - - if (swap_read_zero_page == NULL) { - MALLOC(my_zero_page, char *, PAGE_SIZE, - M_TEMP, M_WAITOK); - memset(my_zero_page, '?', PAGE_SIZE); - /* - * Adding a newline character here - * and there prevents "less(1)", for - * example, from getting too confused - * about a file with one really really - * long line. 
- */ - my_zero_page[PAGE_SIZE-1] = '\n'; - if (swap_read_zero_page == NULL) { - swap_read_zero_page = my_zero_page; - } else { - FREE(my_zero_page, M_TEMP); - } - } else { - /* - * Someone else raced us here and won; - * just use their page. - */ - } - thread_funnel_set(kernel_flock, funnel_state); + if (my_swap_page == NULL) { + MALLOC(my_swap_page, char *, PAGE_SIZE, + M_TEMP, M_WAITOK); + memset(my_swap_page, '\0', PAGE_SIZE); + /* add an end-of-line to keep line counters happy */ + my_swap_page[PAGE_SIZE-1] = '\n'; } - this_count = swap_count; if (this_count > PAGE_SIZE) { this_count = PAGE_SIZE; } prev_resid = uio_resid(uio); - error = uiomove((caddr_t) swap_read_zero_page, + error = uiomove((caddr_t) my_swap_page, this_count, uio); if (error) { @@ -592,6 +792,10 @@ vn_read_swapfile( } swap_count -= (prev_resid - uio_resid(uio)); } + if (my_swap_page != NULL) { + FREE(my_swap_page, M_TEMP); + my_swap_page = NULL; + } return error; } @@ -611,7 +815,10 @@ vn_rdwr( int *aresid, proc_t p) { - return vn_rdwr_64(rw, + int64_t resid; + int result; + + result = vn_rdwr_64(rw, vp, (uint64_t)(uintptr_t)base, (int64_t)len, @@ -619,8 +826,15 @@ vn_rdwr( segflg, ioflg, cred, - aresid, + &resid, p); + + /* "resid" should be bounded above by "len," which is an int */ + if (aresid != NULL) { + *aresid = resid; + } + + return result; } @@ -634,7 +848,7 @@ vn_rdwr_64( enum uio_seg segflg, int ioflg, kauth_cred_t cred, - int *aresid, + int64_t *aresid, proc_t p) { uio_t auio; @@ -672,7 +886,7 @@ vn_rdwr_64( if (error == 0) { if (rw == UIO_READ) { - if (vp->v_flag & VSWAP) { + if (vnode_isswap(vp) && ((ioflg & IO_SWAP_DISPATCH) == 0)) { error = vn_read_swapfile(vp, auio); } else { error = VNOP_READ(vp, auio, ioflg, &context); @@ -683,7 +897,6 @@ vn_rdwr_64( } if (aresid) - // LP64todo - fix this *aresid = uio_resid(auio); else if (uio_resid(auio) && error == 0) @@ -691,6 +904,35 @@ vn_rdwr_64( return (error); } +static inline void +vn_offset_lock(struct fileglob *fg) +{ + 
lck_mtx_lock_spin(&fg->fg_lock); + while (fg->fg_lflags & FG_OFF_LOCKED) { + fg->fg_lflags |= FG_OFF_LOCKWANT; + msleep(&fg->fg_lflags, &fg->fg_lock, PVFS | PSPIN, + "fg_offset_lock_wait", 0); + } + fg->fg_lflags |= FG_OFF_LOCKED; + lck_mtx_unlock(&fg->fg_lock); +} + +static inline void +vn_offset_unlock(struct fileglob *fg) +{ + int lock_wanted = 0; + + lck_mtx_lock_spin(&fg->fg_lock); + if (fg->fg_lflags & FG_OFF_LOCKWANT) { + lock_wanted = 1; + } + fg->fg_lflags &= ~(FG_OFF_LOCKED | FG_OFF_LOCKWANT); + lck_mtx_unlock(&fg->fg_lock); + if (lock_wanted) { + wakeup(&fg->fg_lflags); + } +} + /* * File table vnode read routine. */ @@ -698,8 +940,10 @@ static int vn_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) { struct vnode *vp; - int error, ioflag; + int error; + int ioflag; off_t count; + int offset_locked = 0; vp = (struct vnode *)fp->f_fglob->fg_data; if ( (error = vnode_getwithref(vp)) ) { @@ -714,26 +958,41 @@ vn_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) } #endif - ioflag = 0; + /* This signals to VNOP handlers that this read came from a file table read */ + ioflag = IO_SYSCALL_DISPATCH; + if (fp->f_fglob->fg_flag & FNONBLOCK) ioflag |= IO_NDELAY; if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) - ioflag |= IO_NOCACHE; + ioflag |= IO_NOCACHE; + if (fp->f_fglob->fg_flag & FENCRYPTED) { + ioflag |= IO_ENCRYPTED; + } if (fp->f_fglob->fg_flag & FNORDAHEAD) - ioflag |= IO_RAOFF; + ioflag |= IO_RAOFF; - if ((flags & FOF_OFFSET) == 0) + if ((flags & FOF_OFFSET) == 0) { + if ((vnode_vtype(vp) == VREG) && !vnode_isswap(vp)) { + vn_offset_lock(fp->f_fglob); + offset_locked = 1; + } uio->uio_offset = fp->f_fglob->fg_offset; + } count = uio_resid(uio); - if (vp->v_flag & VSWAP) { + if (vnode_isswap(vp)) { /* special case for swap files */ error = vn_read_swapfile(vp, uio); } else { error = VNOP_READ(vp, uio, ioflag, ctx); } - if ((flags & FOF_OFFSET) == 0) + if ((flags & FOF_OFFSET) == 0) { 
fp->f_fglob->fg_offset += count - uio_resid(uio); + if (offset_locked) { + vn_offset_unlock(fp->f_fglob); + offset_locked = 0; + } + } (void)vnode_put(vp); return (error); @@ -752,6 +1011,7 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) int clippedsize = 0; int partialwrite=0; int residcount, oldcount; + int offset_locked = 0; proc_t p = vfs_context_proc(ctx); count = 0; @@ -768,25 +1028,47 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) } #endif - ioflag = IO_UNIT; + /* + * IO_SYSCALL_DISPATCH signals to VNOP handlers that this write came from + * a file table write + */ + ioflag = (IO_UNIT | IO_SYSCALL_DISPATCH); + if (vp->v_type == VREG && (fp->f_fglob->fg_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_fglob->fg_flag & FNONBLOCK) ioflag |= IO_NDELAY; if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) ioflag |= IO_NOCACHE; - if ((fp->f_fglob->fg_flag & O_FSYNC) || - (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + if (fp->f_fglob->fg_flag & FNODIRECT) + ioflag |= IO_NODIRECT; + if (fp->f_fglob->fg_flag & FSINGLE_WRITER) + ioflag |= IO_SINGLE_WRITER; + + /* + * Treat synchronous mounts and O_FSYNC on the fd as equivalent. + * + * XXX We treat O_DSYNC as O_FSYNC for now, since we can not delay + * XXX the non-essential metadata without some additional VFS work; + * XXX the intent at this point is to plumb the interface for it. 
+ */ + if ((fp->f_fglob->fg_flag & (O_FSYNC|O_DSYNC)) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) { ioflag |= IO_SYNC; + } if ((flags & FOF_OFFSET) == 0) { + if ((vnode_vtype(vp) == VREG) && !vnode_isswap(vp)) { + vn_offset_lock(fp->f_fglob); + offset_locked = 1; + } uio->uio_offset = fp->f_fglob->fg_offset; count = uio_resid(uio); } if (((flags & FOF_OFFSET) == 0) && vfs_context_proc(ctx) && (vp->v_type == VREG) && - (((rlim_t)(uio->uio_offset + uio_uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) || - ((rlim_t)uio_uio_resid(uio) > (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset)))) { + (((rlim_t)(uio->uio_offset + uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) || + ((rlim_t)uio_resid(uio) > (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset)))) { /* * If the requested residual would cause us to go past the * administrative limit, then we need to adjust the residual @@ -794,16 +1076,16 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) * we can't do that (e.g. the residual is already 1 byte), * then we fail the write with EFBIG. 
*/ - residcount = uio_uio_resid(uio); - if ((rlim_t)(uio->uio_offset + uio_uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - clippedsize = (uio->uio_offset + uio_uio_resid(uio)) - p->p_rlimit[RLIMIT_FSIZE].rlim_cur; - } else if ((rlim_t)uio_uio_resid(uio) > (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset)) { + residcount = uio_resid(uio); + if ((rlim_t)(uio->uio_offset + uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + clippedsize = (uio->uio_offset + uio_resid(uio)) - p->p_rlimit[RLIMIT_FSIZE].rlim_cur; + } else if ((rlim_t)uio_resid(uio) > (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset)) { clippedsize = (p->p_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset); } if (clippedsize >= residcount) { psignal(p, SIGXFSZ); - vnode_put(vp); - return (EFBIG); + error = EFBIG; + goto error_out; } partialwrite = 1; uio_setresid(uio, residcount-clippedsize); @@ -814,14 +1096,14 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) if (p && (vp->v_type == VREG) && ((rlim_t)uio->uio_offset >= p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) { psignal(p, SIGXFSZ); - vnode_put(vp); - return (EFBIG); + error = EFBIG; + goto error_out; } if (p && (vp->v_type == VREG) && - ((rlim_t)(uio->uio_offset + uio_uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) { + ((rlim_t)(uio->uio_offset + uio_resid(uio)) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) { //Debugger("vn_bwrite:overstepping the bounds"); - residcount = uio_uio_resid(uio); - clippedsize = (uio->uio_offset + uio_uio_resid(uio)) - p->p_rlimit[RLIMIT_FSIZE].rlim_cur; + residcount = uio_resid(uio); + clippedsize = (uio->uio_offset + uio_resid(uio)) - p->p_rlimit[RLIMIT_FSIZE].rlim_cur; partialwrite = 1; uio_setresid(uio, residcount-clippedsize); } @@ -839,6 +1121,10 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) fp->f_fglob->fg_offset = uio->uio_offset; else fp->f_fglob->fg_offset += count - uio_resid(uio); + if (offset_locked) { + 
vn_offset_unlock(fp->f_fglob); + offset_locked = 0; + } } /* @@ -861,6 +1147,13 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) } (void)vnode_put(vp); return (error); + +error_out: + if (offset_locked) { + vn_offset_unlock(fp->f_fglob); + } + (void)vnode_put(vp); + return (error); } /* @@ -885,7 +1178,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 sb64 = (struct stat64 *)sbptr; else sb = (struct stat *)sbptr; - + memset(&va, 0, sizeof(va)); VATTR_INIT(&va); VATTR_WANTED(&va, va_fsid); VATTR_WANTED(&va, va_fileid); @@ -981,7 +1274,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 sb->st_blocks = roundup(va.va_total_alloc, 512) / 512; } - /* if we're interested in exended security data and we got an ACL */ + /* if we're interested in extended security data and we got an ACL */ if (xsec != NULL) { if (!VATTR_IS_SUPPORTED(&va, va_acl) && !VATTR_IS_SUPPORTED(&va, va_uuuid) && @@ -1068,7 +1361,6 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) off_t file_size; int error; struct vnode *ttyvp; - int funnel_state; struct session * sessp; if ( (error = vnode_getwithref(vp)) ) { @@ -1132,9 +1424,6 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) error = VNOP_IOCTL(vp, com, data, fp->f_fglob->fg_flag, ctx); if (error == 0 && com == TIOCSCTTY) { - vnode_ref(vp); - - funnel_state = thread_funnel_set(kernel_flock, TRUE); sessp = proc_session(vfs_context_proc(ctx)); session_lock(sessp); @@ -1143,10 +1432,6 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) sessp->s_ttyvid = vnode_vid(vp); session_unlock(sessp); session_rele(sessp); - thread_funnel_set(kernel_flock, funnel_state); - - if (ttyvp) - vnode_rele(ttyvp); } } out: @@ -1185,16 +1470,6 @@ vn_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx) } -/* - * Check that the vnode is still valid, and if so - * 
acquire requested lock. - */ -int -vn_lock(__unused vnode_t vp, __unused int flags, __unused proc_t p) -{ - return (0); -} - /* * File table vnode close routine. */ @@ -1207,13 +1482,14 @@ vn_closefile(struct fileglob *fg, vfs_context_t ctx) if ( (error = vnode_getwithref(vp)) == 0 ) { - if ((fg->fg_flag & FHASLOCK) && fg->fg_type == DTYPE_VNODE) { + if ((fg->fg_flag & FHASLOCK) && + FILEGLOB_DTYPE(fg) == DTYPE_VNODE) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - (void)VNOP_ADVLOCK(vp, (caddr_t)fg, F_UNLCK, &lf, F_FLOCK, ctx); + (void)VNOP_ADVLOCK(vp, (caddr_t)fg, F_UNLCK, &lf, F_FLOCK, ctx, NULL); } error = vn_close(vp, fg->fg_flag, ctx); @@ -1227,9 +1503,10 @@ vn_closefile(struct fileglob *fg, vfs_context_t ctx) * VNOP_PATHCONF:??? */ int -vn_pathconf(vnode_t vp, int name, register_t *retval, vfs_context_t ctx) +vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) { int error = 0; + struct vfs_attr vfa; switch(name) { case _PC_EXTENDED_SECURITY_NP: @@ -1268,6 +1545,33 @@ vn_pathconf(vnode_t vp, int name, register_t *retval, vfs_context_t ctx) case _PC_SYNC_IO: /* unistd.h: _POSIX_SYNCHRONIZED_IO */ *retval = 0; /* [SIO] option is not supported */ break; + case _PC_XATTR_SIZE_BITS: + /* The number of bits used to store maximum extended + * attribute size in bytes. For example, if the maximum + * attribute size supported by a file system is 128K, the + * value returned will be 18. However a value 18 can mean + * that the maximum attribute size can be anywhere from + * (256KB - 1) to 128KB. As a special case, the resource + * fork can have much larger size, and some file system + * specific extended attributes can have smaller and preset + * size; for example, Finder Info is always 32 bytes. 
+ */ + memset(&vfa, 0, sizeof(vfa)); + VFSATTR_INIT(&vfa); + VFSATTR_WANTED(&vfa, f_capabilities); + if (vfs_getattr(vnode_mount(vp), &vfa, ctx) == 0 && + (VFSATTR_IS_SUPPORTED(&vfa, f_capabilities)) && + (vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && + (vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { + /* Supports native extended attributes */ + error = VNOP_PATHCONF(vp, name, retval, ctx); + } else { + /* Number of bits used to represent the maximum size of + * extended attribute stored in an Apple Double file. + */ + *retval = AD_XATTR_SIZE_BITS; + } + break; default: error = VNOP_PATHCONF(vp, name, retval, ctx); break; @@ -1279,48 +1583,237 @@ vn_pathconf(vnode_t vp, int name, register_t *retval, vfs_context_t ctx) static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) { - struct vnode *vp = (struct vnode *)fp->f_fglob->fg_data; int error; - int funnel_state; + struct vnode *vp; - if ( (error = vnode_getwithref(vp)) == 0 ) { + vp = (struct vnode *)fp->f_fglob->fg_data; + + /* + * Don't attach a knote to a dead vnode. 
+ */ + if ((error = vget_internal(vp, 0, VNODE_NODEAD)) == 0) { + switch (kn->kn_filter) { + case EVFILT_READ: + case EVFILT_WRITE: + if (vnode_isfifo(vp)) { + /* We'll only watch FIFOs that use our fifofs */ + if (!(vp->v_fifoinfo && vp->v_fifoinfo->fi_readsock)) { + error = ENOTSUP; + } + + } else if (!vnode_isreg(vp)) { + if (vnode_ischr(vp) && + (error = spec_kqfilter(vp, kn)) == 0) { + /* claimed by a special device */ + vnode_put(vp); + return 0; + } + + error = EINVAL; + } + break; + case EVFILT_VNODE: + break; + default: + error = EINVAL; + } + + if (error) { + vnode_put(vp); + return error; + } #if CONFIG_MACF error = mac_vnode_check_kqfilter(ctx, fp->f_fglob->fg_cred, kn, vp); if (error) { - (void)vnode_put(vp); - return (error); + vnode_put(vp); + return error; } #endif - funnel_state = thread_funnel_set(kernel_flock, TRUE); - error = VNOP_KQFILT_ADD(vp, kn, ctx); - thread_funnel_set(kernel_flock, funnel_state); + kn->kn_hook = (void*)vp; + kn->kn_hookid = vnode_vid(vp); + kn->kn_fop = &vnode_filtops; - (void)vnode_put(vp); + vnode_lock(vp); + KNOTE_ATTACH(&vp->v_knotes, kn); + vnode_unlock(vp); + + /* Ask the filesystem to provide remove notifications, but ignore failure */ + VNOP_MONITOR(vp, 0, VNODE_MONITOR_BEGIN, (void*) kn, ctx); + + vnode_put(vp); } + return (error); } -#if 0 -/* No one calls this yet. */ -static int -vn_kqfilt_remove(vp, ident, ctx) - struct vnode *vp; - uintptr_t ident; - vfs_context_t ctx; +static void +filt_vndetach(struct knote *kn) { - int error; - int funnel_state; + vfs_context_t ctx = vfs_context_current(); + struct vnode *vp; + vp = (struct vnode *)kn->kn_hook; + if (vnode_getwithvid(vp, kn->kn_hookid)) + return; + + vnode_lock(vp); + KNOTE_DETACH(&vp->v_knotes, kn); + vnode_unlock(vp); - if ( (error = vnode_getwithref(vp)) == 0 ) { + /* + * Tell a (generally networked) filesystem that we're no longer watching + * If the FS wants to track contexts, it should still be using the one from + * the VNODE_MONITOR_BEGIN. 
+ */ + VNOP_MONITOR(vp, 0, VNODE_MONITOR_END, (void*)kn, ctx); + vnode_put(vp); +} - funnel_state = thread_funnel_set(kernel_flock, TRUE); - error = VNOP_KQFILT_REMOVE(vp, ident, ctx); - thread_funnel_set(kernel_flock, funnel_state); - (void)vnode_put(vp); +/* + * Used for EVFILT_READ + * + * Takes only VFIFO or VREG. vnode is locked. We handle the "poll" case + * differently than the regular case for VREG files. If not in poll(), + * then we need to know current fileproc offset for VREG. + */ +static intptr_t +vnode_readable_data_count(vnode_t vp, off_t current_offset, int ispoll) +{ + if (vnode_isfifo(vp)) { +#if FIFO + int cnt; + int err = fifo_charcount(vp, &cnt); + if (err == 0) { + return (intptr_t)cnt; + } else +#endif + { + return (intptr_t)0; + } + } else if (vnode_isreg(vp)) { + if (ispoll) { + return (intptr_t)1; + } + + off_t amount; + amount = vp->v_un.vu_ubcinfo->ui_size - current_offset; + if (amount > (off_t)INTPTR_MAX) { + return INTPTR_MAX; + } else if (amount < (off_t)INTPTR_MIN) { + return INTPTR_MIN; + } else { + return (intptr_t)amount; + } + } else { + panic("Should never have an EVFILT_READ except for reg or fifo."); + return 0; } - return (error); } + +/* + * Used for EVFILT_WRITE. + * + * For regular vnodes, we can always write (1). For named pipes, + * see how much space there is in the buffer. Nothing else is covered. + */ +static intptr_t +vnode_writable_space_count(vnode_t vp) +{ + if (vnode_isfifo(vp)) { +#if FIFO + long spc; + int err = fifo_freespace(vp, &spc); + if (err == 0) { + return (intptr_t)spc; + } else #endif + { + return (intptr_t)0; + } + } else if (vnode_isreg(vp)) { + return (intptr_t)1; + } else { + panic("Should never have an EVFILT_READ except for reg or fifo."); + return 0; + } +} + +/* + * Determine whether this knote should be active + * + * This is kind of subtle. 
+ * --First, notice if the vnode has been revoked: if so, override hint + * --EVFILT_READ knotes are checked no matter what the hint is + * --Other knotes activate based on hint. + * --If hint is revoke, set special flags and activate + */ +static int +filt_vnode(struct knote *kn, long hint) +{ + vnode_t vp = (struct vnode *)kn->kn_hook; + int activate = 0; + long orig_hint = hint; + + if (0 == hint) { + vnode_lock(vp); + + if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + /* Is recycled */ + hint = NOTE_REVOKE; + } + } else { + lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); + } + + /* Special handling for vnodes that are in recycle or already gone */ + if (NOTE_REVOKE == hint) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + activate = 1; + + if ((kn->kn_filter == EVFILT_VNODE) && (kn->kn_sfflags & NOTE_REVOKE)) { + kn->kn_fflags |= NOTE_REVOKE; + } + } else { + switch(kn->kn_filter) { + case EVFILT_READ: + kn->kn_data = vnode_readable_data_count(vp, kn->kn_fp->f_fglob->fg_offset, (kn->kn_flags & EV_POLL)); + + if (kn->kn_data != 0) { + activate = 1; + } + break; + case EVFILT_WRITE: + kn->kn_data = vnode_writable_space_count(vp); + + if (kn->kn_data != 0) { + activate = 1; + } + break; + case EVFILT_VNODE: + /* Check events this note matches against the hint */ + if (kn->kn_sfflags & hint) { + kn->kn_fflags |= hint; /* Set which event occurred */ + } + if (kn->kn_fflags != 0) { + activate = 1; + } + break; + default: + panic("Invalid knote filter on a vnode!\n"); + } + } + + if (orig_hint == 0) { + /* + * Definitely need to unlock, may need to put + */ + if (hint == 0) { + vnode_put_locked(vp); + } + vnode_unlock(vp); + } + + return (activate); +}