X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/1c79356b52d46aa6b508fb032f5ae709b1f2897b..55e303ae13a4cf49d70f2294092726f2fffb9ef2:/bsd/nfs/nfs_vnops.c diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 06f5961e3..882ed59fe 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -1,21 +1,24 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. * * @APPLE_LICENSE_HEADER_END@ */ @@ -63,14 +66,12 @@ /* * vnode op calls for Sun NFS version 2 and 3 */ - #include #include #include #include #include #include -#include #include #include #include @@ -81,7 +82,6 @@ #include #include -#include #include #include @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -106,14 +107,35 @@ #include #include #include -#include #include +#include +#include + #include +#define FSDBG(A, B, C, D, E) \ + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \ + (int)(B), (int)(C), (int)(D), (int)(E), 0) +#define FSDBG_TOP(A, B, C, D, E) \ + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \ + (int)(B), (int)(C), (int)(D), (int)(E), 0) +#define FSDBG_BOT(A, B, C, D, E) \ + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ + (int)(B), (int)(C), (int)(D), (int)(E), 0) + #define TRUE 1 #define FALSE 0 +#define NFS_FREE_PNBUF(CNP) \ + do { \ + char *tmp = (CNP)->cn_pnbuf; \ + (CNP)->cn_pnbuf = NULL; \ + (CNP)->cn_flags &= ~HASBUF; \ + FREE_ZONE(tmp, (CNP)->cn_pnlen, M_NAMEI); \ + } while (0) + + static int nfsspec_read __P((struct vop_read_args *)); static int nfsspec_write __P((struct vop_write_args *)); static int nfsfifo_read __P((struct vop_read_args *)); @@ -144,7 +166,6 @@ static int nfs_rmdir __P((struct vop_rmdir_args *)); static int nfs_symlink __P((struct vop_symlink_args *)); static int nfs_readdir __P((struct vop_readdir_args *)); static int nfs_bmap __P((struct vop_bmap_args *)); -static int nfs_strategy __P((struct vop_strategy_args *)); static int nfs_lookitup __P((struct vnode *,char *,int,struct ucred *,struct proc *,struct nfsnode **)); static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); static int nfsspec_access __P((struct vop_access_args *)); @@ -153,7 +174,6 @@ static int nfs_print __P((struct vop_print_args *)); 
static int nfs_pathconf __P((struct vop_pathconf_args *)); static int nfs_advlock __P((struct vop_advlock_args *)); static int nfs_blkatoff __P((struct vop_blkatoff_args *)); -static int nfs_bwrite __P((struct vop_bwrite_args *)); static int nfs_valloc __P((struct vop_valloc_args *)); static int nfs_vfree __P((struct vop_vfree_args *)); static int nfs_truncate __P((struct vop_truncate_args *)); @@ -195,13 +215,13 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_symlink_desc, (vop_t *)nfs_symlink }, /* symlink */ { &vop_readdir_desc, (vop_t *)nfs_readdir }, /* readdir */ { &vop_readlink_desc, (vop_t *)nfs_readlink }, /* readlink */ - { &vop_abortop_desc, (vop_t *)nfs_abortop }, /* abortop */ + { &vop_abortop_desc, (vop_t *)nop_abortop }, /* abortop */ { &vop_inactive_desc, (vop_t *)nfs_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *)nfs_reclaim }, /* reclaim */ { &vop_lock_desc, (vop_t *)nfs_lock }, /* lock */ { &vop_unlock_desc, (vop_t *)nfs_unlock }, /* unlock */ { &vop_bmap_desc, (vop_t *)nfs_bmap }, /* bmap */ - { &vop_strategy_desc, (vop_t *)nfs_strategy }, /* strategy */ + { &vop_strategy_desc, (vop_t *)err_strategy }, /* strategy */ { &vop_print_desc, (vop_t *)nfs_print }, /* print */ { &vop_islocked_desc, (vop_t *)nfs_islocked }, /* islocked */ { &vop_pathconf_desc, (vop_t *)nfs_pathconf }, /* pathconf */ @@ -212,7 +232,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_vfree_desc, (vop_t *)nfs_vfree }, /* vfree */ { &vop_truncate_desc, (vop_t *)nfs_truncate }, /* truncate */ { &vop_update_desc, (vop_t *)nfs_update }, /* update */ - { &vop_bwrite_desc, (vop_t *)nfs_bwrite }, /* bwrite */ + { &vop_bwrite_desc, (vop_t *)err_bwrite }, /* bwrite */ { &vop_pagein_desc, (vop_t *)nfs_pagein }, /* Pagein */ { &vop_pageout_desc, (vop_t *)nfs_pageout }, /* Pageout */ { &vop_copyfile_desc, (vop_t *)err_copyfile }, /* Copyfile */ @@ -324,7 +344,7 @@ static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { { 
&vop_lock_desc, (vop_t *)nfs_lock }, /* lock */ { &vop_unlock_desc, (vop_t *)nfs_unlock }, /* unlock */ { &vop_bmap_desc, (vop_t *)fifo_bmap }, /* bmap */ - { &vop_strategy_desc, (vop_t *)fifo_badop }, /* strategy */ + { &vop_strategy_desc, (vop_t *)fifo_strategy }, /* strategy */ { &vop_print_desc, (vop_t *)nfs_print }, /* print */ { &vop_islocked_desc, (vop_t *)nfs_islocked }, /* islocked */ { &vop_pathconf_desc, (vop_t *)fifo_pathconf }, /* pathconf */ @@ -349,8 +369,6 @@ struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); #endif -static int nfs_commit __P((struct vnode *vp, u_quad_t offset, int cnt, - struct ucred *cred, struct proc *procp)); static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap)); @@ -373,6 +391,7 @@ extern nfstype nfsv3_type[9]; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; +int nfs_ioddelwri = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; @@ -383,41 +402,166 @@ static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP) - + +/* + * the following are needed only by nfs_pageout to know how to handle errors + * see nfs_pageout comments on explanation of actions. + * the errors here are copied from errno.h and errors returned by servers + * are expected to match the same numbers here. If not, our actions maybe + * erroneous. 
+ */ +enum actiontype {NOACTION, DUMP, DUMPANDLOG, RETRY, RETRYWITHSLEEP, SEVER}; + +static int errorcount[ELAST+1]; /* better be zeros when initialized */ + +static const short errortooutcome[ELAST+1] = { + NOACTION, + DUMP, /* EPERM 1 Operation not permitted */ + DUMP, /* ENOENT 2 No such file or directory */ + DUMPANDLOG, /* ESRCH 3 No such process */ + RETRY, /* EINTR 4 Interrupted system call */ + DUMP, /* EIO 5 Input/output error */ + DUMP, /* ENXIO 6 Device not configured */ + DUMPANDLOG, /* E2BIG 7 Argument list too long */ + DUMPANDLOG, /* ENOEXEC 8 Exec format error */ + DUMPANDLOG, /* EBADF 9 Bad file descriptor */ + DUMPANDLOG, /* ECHILD 10 No child processes */ + DUMPANDLOG, /* EDEADLK 11 Resource deadlock avoided - was EAGAIN */ + RETRY, /* ENOMEM 12 Cannot allocate memory */ + DUMP, /* EACCES 13 Permission denied */ + DUMPANDLOG, /* EFAULT 14 Bad address */ + DUMPANDLOG, /* ENOTBLK 15 POSIX - Block device required */ + RETRY, /* EBUSY 16 Device busy */ + DUMP, /* EEXIST 17 File exists */ + DUMP, /* EXDEV 18 Cross-device link */ + DUMP, /* ENODEV 19 Operation not supported by device */ + DUMP, /* ENOTDIR 20 Not a directory */ + DUMP, /* EISDIR 21 Is a directory */ + DUMP, /* EINVAL 22 Invalid argument */ + DUMPANDLOG, /* ENFILE 23 Too many open files in system */ + DUMPANDLOG, /* EMFILE 24 Too many open files */ + DUMPANDLOG, /* ENOTTY 25 Inappropriate ioctl for device */ + DUMPANDLOG, /* ETXTBSY 26 Text file busy - POSIX */ + DUMP, /* EFBIG 27 File too large */ + DUMP, /* ENOSPC 28 No space left on device */ + DUMPANDLOG, /* ESPIPE 29 Illegal seek */ + DUMP, /* EROFS 30 Read-only file system */ + DUMP, /* EMLINK 31 Too many links */ + RETRY, /* EPIPE 32 Broken pipe */ + /* math software */ + DUMPANDLOG, /* EDOM 33 Numerical argument out of domain */ + DUMPANDLOG, /* ERANGE 34 Result too large */ + RETRY, /* EAGAIN/EWOULDBLOCK 35 Resource temporarily unavailable */ + DUMPANDLOG, /* EINPROGRESS 36 Operation now in progress */ + DUMPANDLOG, /* EALREADY 
37 Operation already in progress */ + /* ipc/network software -- argument errors */ + DUMPANDLOG, /* ENOTSOC 38 Socket operation on non-socket */ + DUMPANDLOG, /* EDESTADDRREQ 39 Destination address required */ + DUMPANDLOG, /* EMSGSIZE 40 Message too long */ + DUMPANDLOG, /* EPROTOTYPE 41 Protocol wrong type for socket */ + DUMPANDLOG, /* ENOPROTOOPT 42 Protocol not available */ + DUMPANDLOG, /* EPROTONOSUPPORT 43 Protocol not supported */ + DUMPANDLOG, /* ESOCKTNOSUPPORT 44 Socket type not supported */ + DUMPANDLOG, /* ENOTSUP 45 Operation not supported */ + DUMPANDLOG, /* EPFNOSUPPORT 46 Protocol family not supported */ + DUMPANDLOG, /* EAFNOSUPPORT 47 Address family not supported by protocol family */ + DUMPANDLOG, /* EADDRINUSE 48 Address already in use */ + DUMPANDLOG, /* EADDRNOTAVAIL 49 Can't assign requested address */ + /* ipc/network software -- operational errors */ + RETRY, /* ENETDOWN 50 Network is down */ + RETRY, /* ENETUNREACH 51 Network is unreachable */ + RETRY, /* ENETRESET 52 Network dropped connection on reset */ + RETRY, /* ECONNABORTED 53 Software caused connection abort */ + RETRY, /* ECONNRESET 54 Connection reset by peer */ + RETRY, /* ENOBUFS 55 No buffer space available */ + RETRY, /* EISCONN 56 Socket is already connected */ + RETRY, /* ENOTCONN 57 Socket is not connected */ + RETRY, /* ESHUTDOWN 58 Can't send after socket shutdown */ + RETRY, /* ETOOMANYREFS 59 Too many references: can't splice */ + RETRY, /* ETIMEDOUT 60 Operation timed out */ + RETRY, /* ECONNREFUSED 61 Connection refused */ + + DUMPANDLOG, /* ELOOP 62 Too many levels of symbolic links */ + DUMP, /* ENAMETOOLONG 63 File name too long */ + RETRY, /* EHOSTDOWN 64 Host is down */ + RETRY, /* EHOSTUNREACH 65 No route to host */ + DUMP, /* ENOTEMPTY 66 Directory not empty */ + /* quotas & mush */ + DUMPANDLOG, /* PROCLIM 67 Too many processes */ + DUMPANDLOG, /* EUSERS 68 Too many users */ + DUMPANDLOG, /* EDQUOT 69 Disc quota exceeded */ + /* Network File System */ + 
DUMP, /* ESTALE 70 Stale NFS file handle */ + DUMP, /* EREMOTE 71 Too many levels of remote in path */ + DUMPANDLOG, /* EBADRPC 72 RPC struct is bad */ + DUMPANDLOG, /* ERPCMISMATCH 73 RPC version wrong */ + DUMPANDLOG, /* EPROGUNAVAIL 74 RPC prog. not avail */ + DUMPANDLOG, /* EPROGMISMATCH 75 Program version wrong */ + DUMPANDLOG, /* EPROCUNAVAIL 76 Bad procedure for program */ + + DUMPANDLOG, /* ENOLCK 77 No locks available */ + DUMPANDLOG, /* ENOSYS 78 Function not implemented */ + DUMPANDLOG, /* EFTYPE 79 Inappropriate file type or format */ + DUMPANDLOG, /* EAUTH 80 Authentication error */ + DUMPANDLOG, /* ENEEDAUTH 81 Need authenticator */ + /* Intelligent device errors */ + DUMPANDLOG, /* EPWROFF 82 Device power is off */ + DUMPANDLOG, /* EDEVERR 83 Device error, e.g. paper out */ + DUMPANDLOG, /* EOVERFLOW 84 Value too large to be stored in data type */ + /* Program loading errors */ + DUMPANDLOG, /* EBADEXEC 85 Bad executable */ + DUMPANDLOG, /* EBADARCH 86 Bad CPU type in executable */ + DUMPANDLOG, /* ESHLIBVERS 87 Shared library version mismatch */ + DUMPANDLOG, /* EBADMACHO 88 Malformed Macho file */ +}; + + +static short +nfs_pageouterrorhandler(error) + int error; +{ + if (error > ELAST) + return(DUMP); + else + return(errortooutcome[error]); +} static int nfs3_access_otw(struct vnode *vp, - int wmode, - struct proc *p, - struct ucred *cred) + int wmode, + struct proc *p, + struct ucred *cred) { - const int v3 = 1; - u_int32_t *tl; - int error = 0, attrflag; - - struct mbuf *mreq, *mrep, *md, *mb, *mb2; - caddr_t bpos, dpos, cp2; - register int32_t t1, t2; - register caddr_t cp; - u_int32_t rmode; - struct nfsnode *np = VTONFS(vp); - - nfsstats.rpccnt[NFSPROC_ACCESS]++; - nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); - nfsm_fhtom(vp, v3); - nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); - *tl = txdr_unsigned(wmode); - nfsm_request(vp, NFSPROC_ACCESS, p, cred); - nfsm_postop_attr(vp, attrflag); - if (!error) { - nfsm_dissect(tl, 
u_int32_t *, NFSX_UNSIGNED); - rmode = fxdr_unsigned(u_int32_t, *tl); - np->n_mode = rmode; - np->n_modeuid = cred->cr_uid; - np->n_modestamp = time_second; - } - nfsm_reqdone; - return error; + const int v3 = 1; + u_long *tl; + int error = 0, attrflag; + + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + caddr_t bpos, dpos, cp2; + register long t1, t2; + register caddr_t cp; + u_int32_t rmode; + struct nfsnode *np = VTONFS(vp); + u_int64_t xid; + struct timeval now; + + nfsstats.rpccnt[NFSPROC_ACCESS]++; + nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); + nfsm_fhtom(vp, v3); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = txdr_unsigned(wmode); + nfsm_request(vp, NFSPROC_ACCESS, p, cred, &xid); + nfsm_postop_attr(vp, attrflag, &xid); + if (!error) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + rmode = fxdr_unsigned(u_int32_t, *tl); + np->n_mode = rmode; + np->n_modeuid = cred->cr_uid; + microuptime(&now); + np->n_modestamp = now.tv_sec; + } + nfsm_reqdone; + return error; } /* @@ -436,10 +580,11 @@ nfs_access(ap) } */ *ap; { register struct vnode *vp = ap->a_vp; - int error = 0; - u_long mode, wmode; + int error = 0; + u_long mode, wmode; int v3 = NFS_ISV3(vp); - struct nfsnode *np = VTONFS(vp); + struct nfsnode *np = VTONFS(vp); + struct timeval now; /* * For nfs v3, do an access rpc, otherwise you are stuck emulating @@ -456,63 +601,63 @@ nfs_access(ap) mode = 0; if (vp->v_type == VDIR) { if (ap->a_mode & VWRITE) - mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | - NFSV3ACCESS_DELETE); + mode |= NFSV3ACCESS_MODIFY | + NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE; if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_LOOKUP; } else { if (ap->a_mode & VWRITE) - mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); + mode |= NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND; if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_EXECUTE; } - /* XXX safety belt, only make blanket request if caching */ - if (nfsaccess_cache_timeout > 0) { - wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | - 
NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | - NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; - } else { - wmode = mode; - } + /* XXX safety belt, only make blanket request if caching */ + if (nfsaccess_cache_timeout > 0) { + wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | + NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | + NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; + } else + wmode = mode; - /* - * Does our cached result allow us to give a definite yes to - * this request? - */ - if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) && - (ap->a_cred->cr_uid == np->n_modeuid) && - ((np->n_mode & mode) == mode)) { - /* nfsstats.accesscache_hits++; */ - } else { - /* - * Either a no, or a don't know. Go to the wire. - */ - /* nfsstats.accesscache_misses++; */ - error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred); - if (!error) { - if ((np->n_mode & mode) != mode) - error = EACCES; - } - } + /* + * Does our cached result allow us to give a definite yes to + * this request? + */ + microuptime(&now); + if (now.tv_sec < np->n_modestamp + nfsaccess_cache_timeout && + ap->a_cred->cr_uid == np->n_modeuid && + (np->n_mode & mode) == mode) { + /* nfsstats.accesscache_hits++; */ + } else { + /* + * Either a no, or a don't know. Go to the wire. + */ + /* nfsstats.accesscache_misses++; */ + error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred); + if (!error) { + if ((np->n_mode & mode) != mode) + error = EACCES; + } + } } else - return (nfsspec_access(ap)); /* NFSv2 case checks for EROFS here*/ - /* CSM - moved EROFS check down per NetBSD rev 1.71. So you - * get the correct error value with layered filesystems. - * EKN - moved the return(error) below this so it does get called.*/ + return (nfsspec_access(ap)); /* NFSv2 case checks for EROFS here */ /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. + * CSM - moved EROFS check down per NetBSD rev 1.71. 
So you + * get the correct error value with layered filesystems. + * EKN - moved the return(error) below this so it does get called. */ if (!error && (ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { - case VREG: case VDIR: case VLNK: - error = EROFS; - default: - break; + case VREG: case VDIR: case VLNK: + error = EROFS; + default: + break; } } - return (error); + return (error); } /* @@ -523,6 +668,7 @@ nfs_access(ap) * if consistency is lost. */ /* ARGSUSED */ + static int nfs_open(ap) struct vop_open_args /* { @@ -538,10 +684,9 @@ nfs_open(ap) struct vattr vattr; int error; - if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) -{ printf("open eacces vtyp=%d\n",vp->v_type); + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { return (EACCES); -} + } /* * Get a valid lease. If cached data is stale, flush it. */ @@ -572,14 +717,22 @@ nfs_open(ap) error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); + /* if directory changed, purge any name cache entries */ + if ((vp->v_type == VDIR) && + (np->n_mtime != vattr.va_mtime.tv_sec)) + cache_purge(vp); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { np->n_direofoffset = 0; + nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); @@ -635,6 +788,7 @@ nfs_close(ap) { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp; int error = 0; if (vp->v_type == VREG) { @@ -645,14 +799,35 @@ nfs_close(ap) &sp->s_name[0], (unsigned)(sp->s_dvp), (unsigned)vp, (unsigned)ap, (unsigned)np, (unsigned)sp); #endif - if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && + nmp = VFSTONFS(vp->v_mount); + if 
(!nmp) + return (ENXIO); + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && (np->n_flag & NMODIFIED)) { + int getlock = !VOP_ISLOCKED(vp); + if (getlock) { + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); + if (!error && !VFSTONFS(vp->v_mount)) { + VOP_UNLOCK(vp, 0, ap->a_p); + error = ENXIO; + } + if (error) + return (error); + } if (NFS_ISV3(vp)) { - error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, 0); - np->n_flag &= ~NMODIFIED; - } else + error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, 1); + /* + * We cannot clear the NMODIFIED bit in np->n_flag due to + * potential races with other processes + * NMODIFIED is a hint + */ + /* np->n_flag &= ~NMODIFIED; */ + } else { error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + } np->n_attrstamp = 0; + if (getlock) + VOP_UNLOCK(vp, 0, ap->a_p); } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; @@ -682,66 +857,83 @@ nfs_getattr(ap) caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3; + u_int64_t xid; + int avoidfloods; + FSDBG_TOP(513, np->n_size, np, np->n_vattr.va_size, np->n_flag); /* * Update local times for special files. */ if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 513)) | DBG_FUNC_START, - (int)np->n_size, 0, (int)np->n_vattr.va_size, np->n_flag, 0); - /* * First look in the cache. 
*/ if ((error = nfs_getattrcache(vp, ap->a_vap)) == 0) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 513)) | DBG_FUNC_END, - (int)np->n_size, 0, (int)np->n_vattr.va_size, np->n_flag, 0); - + FSDBG_BOT(513, np->n_size, 0, np->n_vattr.va_size, np->n_flag); return (0); } - if (error != ENOENT) + if (error != ENOENT) { + FSDBG_BOT(513, np->n_size, error, np->n_vattr.va_size, + np->n_flag); return (error); + } + + if (!VFSTONFS(vp->v_mount)) { + FSDBG_BOT(513, np->n_size, ENXIO, np->n_vattr.va_size, np->n_flag); + return (ENXIO); + } + v3 = NFS_ISV3(vp); error = 0; - + if (v3 && nfsaccess_cache_timeout > 0) { /* nfsstats.accesscache_misses++; */ - if (error = nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, ap->a_cred)) - return (error); + if (error = nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, + ap->a_cred)) + return (error); if ((error = nfs_getattrcache(vp, ap->a_vap)) == 0) return (0); if (error != ENOENT) return (error); error = 0; } - + avoidfloods = 0; +tryagain: nfsstats.rpccnt[NFSPROC_GETATTR]++; nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3)); nfsm_fhtom(vp, v3); - nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred); + nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred, &xid); if (!error) { - nfsm_loadattr(vp, ap->a_vap); + nfsm_loadattr(vp, ap->a_vap, &xid); + if (!xid) { /* out-of-order rpc - attributes were dropped */ + m_freem(mrep); + FSDBG(513, -1, np, np->n_xid << 32, np->n_xid); + if (avoidfloods++ < 100) + goto tryagain; + /* + * avoidfloods>1 is bizarre. 
at 100 pull the plug + */ + panic("nfs_getattr: getattr flood\n"); + } if (np->n_mtime != ap->a_vap->va_mtime.tv_sec) { - NFSTRACE(NFSTRC_GA_INV, vp); - if (vp->v_type == VDIR) + FSDBG(513, -1, np, -1, vp); + if (vp->v_type == VDIR) { nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + FSDBG(513, -1, np, -2, error); if (!error) { - NFSTRACE(NFSTRC_GA_INV1, vp); np->n_mtime = ap->a_vap->va_mtime.tv_sec; - } else { - NFSTRACE(NFSTRC_GA_INV2, error); } } } nfsm_reqdone; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 513)) | DBG_FUNC_END, - (int)np->n_size, -1, (int)np->n_vattr.va_size, error, 0); - + FSDBG_BOT(513, np->n_size, -1, np->n_vattr.va_size, error); return (error); } @@ -767,6 +959,15 @@ nfs_setattr(ap) #ifndef nolint tsize = (u_quad_t)0; #endif + +#ifdef XXX /* enable this code soon! (but test it first) */ + /* + * Setting of flags is not supported. + */ + if (vap->va_flags != VNOVAL) + return (EOPNOTSUPP); +#endif + /* * Disallow write attempts if the filesystem is mounted read-only. 
*/ @@ -798,72 +999,101 @@ nfs_setattr(ap) */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); - np->n_flag |= NMODIFIED; - tsize = np->n_size; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 512)) | DBG_FUNC_START, - (int)np->n_size, (int)vap->va_size, (int)np->n_vattr.va_size, np->n_flag, 0); - - if (vap->va_size == 0) - error = nfs_vinvalbuf(vp, 0, - ap->a_cred, ap->a_p, 1); - else - error = nfs_vinvalbuf(vp, V_SAVE, - ap->a_cred, ap->a_p, 1); - - if (UBCISVALID(vp)) - ubc_setsize(vp, (off_t)vap->va_size); /* XXX check error */ - - if (error) { - printf("nfs_setattr: nfs_vinvalbuf %d\n", error); - -#if DIAGNOSTIC - kprintf("nfs_setattr: nfs_vinvalbuf %d\n", - error); -#endif /* DIAGNOSTIC */ - if (UBCISVALID(vp)) - ubc_setsize(vp, (off_t)tsize); /* XXX check error */ - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 512)) | DBG_FUNC_END, - (int)np->n_size, (int)vap->va_size, (int)np->n_vattr.va_size, -1, 0); - - return (error); + FSDBG_TOP(512, np->n_size, vap->va_size, + np->n_vattr.va_size, np->n_flag); + if (np->n_flag & NMODIFIED) { + if (vap->va_size == 0) + error = nfs_vinvalbuf(vp, 0, + ap->a_cred, ap->a_p, 1); + else + error = nfs_vinvalbuf(vp, V_SAVE, + ap->a_cred, ap->a_p, 1); + if (error) { + printf("nfs_setattr: nfs_vinvalbuf %d\n", error); + FSDBG_BOT(512, np->n_size, vap->va_size, + np->n_vattr.va_size, -1); + return (error); + } + } else if (np->n_size > vap->va_size) { /* shrinking? 
*/ + daddr_t obn, bn; + int biosize; + struct nfsbuf *bp; + + biosize = vp->v_mount->mnt_stat.f_iosize; + obn = (np->n_size - 1) / biosize; + bn = vap->va_size / biosize; + for ( ; obn >= bn; obn--) + if (nfs_buf_incore(vp, obn)) { + bp = nfs_buf_get(vp, obn, biosize, 0, BLK_READ); + if (!bp) + continue; + if (obn == bn) { + int neweofoff, mustwrite; + mustwrite = 0; + neweofoff = vap->va_size - NBOFF(bp); + /* check for any dirty data before the new EOF */ + if (bp->nb_dirtyend && bp->nb_dirtyoff < neweofoff) { + /* clip dirty range to EOF */ + if (bp->nb_dirtyend > neweofoff) + bp->nb_dirtyend = neweofoff; + mustwrite++; + } + bp->nb_dirty &= (1 << round_page_32(neweofoff)/PAGE_SIZE) - 1; + if (bp->nb_dirty) + mustwrite++; + if (mustwrite) { + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC | NB_READ)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(ap->a_cred); + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(512, bp, 0xd00dee, 0xbad, error); + np->n_error = error; + np->n_flag |= NWRITEERR; + error = 0; + } + bp = NULL; + } + } + if (bp) { + FSDBG(512, bp, bp->nb_flags, 0, obn); + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp); + } + } } + tsize = np->n_size; np->n_size = np->n_vattr.va_size = vap->va_size; - + ubc_setsize(vp, (off_t)vap->va_size); /* XXX error? 
*/ }; } else if ((vap->va_mtime.tv_sec != VNOVAL || - vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && - vp->v_type == VREG && - (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, - ap->a_p, 1)) == EINTR) - return (error); - + vap->va_atime.tv_sec != VNOVAL) && + (np->n_flag & NMODIFIED) && vp->v_type == VREG) { + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + if (error == EINTR) + return (error); + } error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 512)) | DBG_FUNC_END, - (int)np->n_size, (int)vap->va_size, (int)np->n_vattr.va_size, error, 0); - + FSDBG_BOT(512, np->n_size, vap->va_size, np->n_vattr.va_size, error); if (error && vap->va_size != VNOVAL) { /* make every effort to resync file size w/ server... */ int err = 0; /* preserve "error" for return */ printf("nfs_setattr: nfs_setattrrpc %d\n", error); -#if DIAGNOSTIC - kprintf("nfs_setattr: nfs_setattrrpc %d\n", error); -#endif /* DIAGNOSTIC */ np->n_size = np->n_vattr.va_size = tsize; - if (UBCISVALID(vp)) - ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */ + ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */ vap->va_size = tsize; err = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); - if (err) printf("nfs_setattr1: nfs_setattrrpc %d\n", err); -#if DIAGNOSTIC - if (err) - kprintf("nfs_setattr nfs_setattrrpc %d\n", err); -#endif /* DIAGNOSTIC */ } return (error); } @@ -885,7 +1115,13 @@ nfs_setattrrpc(vp, vap, cred, procp) u_long *tl; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3; + u_int64_t xid; + struct timeval now; + + if (!VFSTONFS(vp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_SETATTR]++; nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3)); @@ -923,8 +1159,9 @@ nfs_setattrrpc(vp, vap, cred, procp) nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } + microtime(&now); if (vap->va_atime.tv_sec != VNOVAL) { 
- if (vap->va_atime.tv_sec != time.tv_sec) { + if (vap->va_atime.tv_sec != now.tv_sec) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&vap->va_atime, tl); @@ -937,7 +1174,7 @@ nfs_setattrrpc(vp, vap, cred, procp) *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } if (vap->va_mtime.tv_sec != VNOVAL) { - if (vap->va_mtime.tv_sec != time.tv_sec) { + if (vap->va_mtime.tv_sec != now.tv_sec) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&vap->va_mtime, tl); @@ -969,13 +1206,13 @@ nfs_setattrrpc(vp, vap, cred, procp) txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } - nfsm_request(vp, NFSPROC_SETATTR, procp, cred); + nfsm_request(vp, NFSPROC_SETATTR, procp, cred, &xid); if (v3) { - nfsm_wcc_data(vp, wccflag); - if ((!wccflag) && (vp->v_type != VBAD)) /* EINVAL set on VBAD vnode */ - VTONFS(vp)->n_attrstamp = 0; + nfsm_wcc_data(vp, wccflag, &xid); + if (!wccflag) + VTONFS(vp)->n_attrstamp = 0; } else - nfsm_loadattr(vp, (struct vattr *)0); + nfsm_loadattr(vp, (struct vattr *)0, &xid); nfsm_reqdone; return (error); } @@ -1002,7 +1239,6 @@ nfs_lookup(ap) register u_long *tl; register caddr_t cp; register long t1, t2; - struct nfsmount *nmp; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; long len; @@ -1011,7 +1247,9 @@ nfs_lookup(ap) int lockparent, wantparent, error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct proc *p = cnp->cn_proc; - int worldbuildworkaround = 1; + int unlockdvp = 0; + u_int64_t xid; + struct vattr vattr; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) @@ -1019,110 +1257,80 @@ nfs_lookup(ap) *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); + lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); - nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); - - if 
(worldbuildworkaround) { - /* temporary workaround for world builds to not have dvp go - VBAD on during server calls in this routine. When - the real ref counting problem is found take this out. - Note if this was later and before the nfsm_request - set up, the workaround did not work (NOTE other difference - was I only put one VREF in that time. Thus it needs - to be above the cache_lookup branch or with 2 VREFS. Not - sure which. Can't play with world builds right now to see - which. VOP_ACCESS could also make it go to server. - EKN */ - VREF(dvp); /* hang on to this dvp - EKN */ - VREF(dvp); /* hang on tight - EKN */ - } + + /* if directory has changed, purge any name cache entries */ + if (!VOP_GETATTR(dvp, &vattr, cnp->cn_cred, p) && + (np->n_mtime != vattr.va_mtime.tv_sec)) + cache_purge(dvp); if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { - struct vattr vattr; int vpid; - if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { - *vpp = NULLVP; - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } - return (error); - } - - /* got to check to make sure the vnode didn't go away if access went to server */ - if ((*vpp)->v_type == VBAD) { - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } - return(EINVAL); - } - newvp = *vpp; vpid = newvp->v_id; + /* * See the comment starting `Step through' in ufs/ufs_lookup.c * for an explanation of the locking protocol */ + + /* + * Note: we need to make sure to get a lock/ref on newvp + * before we possibly go off to the server in VOP_ACCESS. 
+ */ if (dvp == newvp) { VREF(newvp); error = 0; } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = vget(newvp, LK_EXCLUSIVE, p); - if (!error && lockparent && (flags & ISLASTCN)) + if (!error) error = vn_lock(dvp, LK_EXCLUSIVE, p); } else { error = vget(newvp, LK_EXCLUSIVE, p); - if (!lockparent || error || !(flags & ISLASTCN)) + if (error) VOP_UNLOCK(dvp, 0, p); } - if (!error) { - if (vpid == newvp->v_id) { - if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) - && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { + + if (error) + goto cache_lookup_out; + + if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { + if (dvp == newvp) + vrele(newvp); + else + vput(newvp); + *vpp = NULLVP; + goto error_return; + } + + if ((dvp != newvp) && (!lockparent || !(flags & ISLASTCN))) + VOP_UNLOCK(dvp, 0, p); + + if (vpid == newvp->v_id) { + if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) + && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { nfsstats.lookupcache_hits++; - if (cnp->cn_nameiop != LOOKUP && - (flags & ISLASTCN)) + if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; - - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } - - return (0); - } - cache_purge(newvp); - } - vput(newvp); - if (lockparent && dvp != newvp && (flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); + error = 0; /* ignore any from VOP_GETATTR */ + goto error_return; + } + cache_purge(newvp); } + vput(newvp); + if ((dvp != newvp) && lockparent && (flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, p); +cache_lookup_out: error = vn_lock(dvp, LK_EXCLUSIVE, p); *vpp = NULLVP; - if (error) { - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } - return (error); - } + if (error) + goto error_return; } - - /* - * Got to check to make sure the vnode didn't go away if VOP_GETATTR went to server 
- * or callers prior to this blocked and had it go VBAD. - */ - if (dvp->v_type == VBAD) { - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } - return(EINVAL); - } error = 0; newvp = NULLVP; @@ -1133,19 +1341,11 @@ nfs_lookup(ap) NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); - nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); - - /* this two lines set dvp refcounts back to where they were - * before we took extra 2 VREFS to avoid VBAD vnode on dvp - * during server calls for world builds. Remove when real - * fix is found. - EKN */ - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } + /* nfsm_request for NFSv2 causes you to goto to nfsmout upon errors */ + nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred, &xid); if (error) { - nfsm_postop_attr(dvp, attrflag); + nfsm_postop_attr(dvp, attrflag, &xid); m_freem(mrep); goto nfsmout; } @@ -1157,56 +1357,65 @@ nfs_lookup(ap) if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) { m_freem(mrep); - return (EISDIR); + error = EISDIR; + goto error_return; } if ((error = nfs_nget(dvp->v_mount, fhp, fhsize, &np))) { m_freem(mrep); - return (error); + goto error_return; } newvp = NFSTOV(np); if (v3) { - nfsm_postop_attr(newvp, attrflag); - nfsm_postop_attr(dvp, attrflag); + u_int64_t dxid = xid; + + nfsm_postop_attr(newvp, attrflag, &xid); + nfsm_postop_attr(dvp, attrflag, &dxid); } else - nfsm_loadattr(newvp, (struct vattr *)0); + nfsm_loadattr(newvp, (struct vattr *)0, &xid); *vpp = newvp; m_freem(mrep); cnp->cn_flags |= SAVENAME; if (!lockparent) VOP_UNLOCK(dvp, 0, p); - return (0); + error = 0; + goto error_return; } - if (flags & ISDOTDOT) { + if (NFS_CMPFH(np, fhp, fhsize)) { + VREF(dvp); + 
newvp = dvp; + } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { + m_freem(mrep); vn_lock(dvp, LK_EXCLUSIVE + LK_RETRY, p); - return (error); + goto error_return; } newvp = NFSTOV(np); - if (lockparent && (flags & ISLASTCN) && - (error = vn_lock(dvp, LK_EXCLUSIVE, p))) { + if (!lockparent || !(flags & ISLASTCN)) + unlockdvp = 1; /* keep dvp locked until after postops */ + if (error = vn_lock(dvp, LK_EXCLUSIVE, p)) { + m_freem(mrep); vput(newvp); - return (error); + goto error_return; } - } else if (NFS_CMPFH(np, fhp, fhsize)) { - VREF(dvp); - newvp = dvp; } else { if ((error = nfs_nget(dvp->v_mount, fhp, fhsize, &np))) { m_freem(mrep); - return (error); + goto error_return; } if (!lockparent || !(flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); + unlockdvp = 1; /* keep dvp locked until after postops */ newvp = NFSTOV(np); } if (v3) { - nfsm_postop_attr(newvp, attrflag); - nfsm_postop_attr(dvp, attrflag); + u_int64_t dxid = xid; + + nfsm_postop_attr(newvp, attrflag, &xid); + nfsm_postop_attr(dvp, attrflag, &dxid); } else - nfsm_loadattr(newvp, (struct vattr *)0); + nfsm_loadattr(newvp, (struct vattr *)0, &xid); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && @@ -1216,23 +1425,29 @@ nfs_lookup(ap) } *vpp = newvp; nfsm_reqdone; + if (unlockdvp) + VOP_UNLOCK(dvp, 0, p); if (error) { if (newvp != NULLVP) { - vrele(newvp); + if (newvp == dvp) + vrele(newvp); + else + vput(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { - if (!lockparent) - VOP_UNLOCK(dvp, 0, p); - if (dvp->v_mount->mnt_flag & MNT_RDONLY) + if (dvp->v_mount && (dvp->v_mount->mnt_flag & MNT_RDONLY)) error = EROFS; else error = EJUSTRETURN; + if (!lockparent) + VOP_UNLOCK(dvp, 0, p); } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } +error_return: return (error); } 
@@ -1256,6 +1471,7 @@ nfs_read(ap) return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred, 0)); } + /* * nfs readlink call */ @@ -1290,18 +1506,23 @@ nfs_readlinkrpc(vp, uiop, cred) caddr_t bpos, dpos, cp2; int error = 0, len, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3; + u_int64_t xid; + + if (!VFSTONFS(vp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_READLINK]++; nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3)); nfsm_fhtom(vp, v3); - nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred); + nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred, &xid); if (v3) - nfsm_postop_attr(vp, attrflag); + nfsm_postop_attr(vp, attrflag, &xid); if (!error) { nfsm_strsiz(len, NFS_MAXPATHLEN); - if (len == NFS_MAXPATHLEN) { - struct nfsnode *np = VTONFS(vp); + if (len == NFS_MAXPATHLEN) { + struct nfsnode *np = VTONFS(vp); #if DIAGNOSTIC if (!np) panic("nfs_readlinkrpc: null np"); @@ -1331,19 +1552,25 @@ nfs_readrpc(vp, uiop, cred) caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct nfsmount *nmp; - int error = 0, len, retlen, tsiz, eof, attrflag; - int v3 = NFS_ISV3(vp); + int error = 0, len, retlen, tsiz, eof = 0, attrflag; + int v3, nmrsize; + u_int64_t xid; -#ifndef nolint - eof = 0; -#endif + FSDBG_TOP(536, vp, uiop->uio_offset, uiop->uio_resid, 0); nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + v3 = NFS_ISV3(vp); + nmrsize = nmp->nm_rsize; + tsiz = uiop->uio_resid; - if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) + if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) { + FSDBG_BOT(536, vp, uiop->uio_offset, uiop->uio_resid, EFBIG); return (EFBIG); + } while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_READ]++; - len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz; + len = (tsiz > nmrsize) ? 
nmrsize : tsiz; nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3); nfsm_fhtom(vp, v3); nfsm_build(tl, u_long *, NFSX_UNSIGNED * 3); @@ -1355,9 +1582,10 @@ nfs_readrpc(vp, uiop, cred) *tl++ = txdr_unsigned(len); *tl = 0; } - nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred); + FSDBG(536, vp, uiop->uio_offset, len, 0); + nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred, &xid); if (v3) { - nfsm_postop_attr(vp, attrflag); + nfsm_postop_attr(vp, attrflag, &xid); if (error) { m_freem(mrep); goto nfsmout; @@ -1365,8 +1593,8 @@ nfs_readrpc(vp, uiop, cred) nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else - nfsm_loadattr(vp, (struct vattr *)0); - nfsm_strsiz(retlen, nmp->nm_rsize); + nfsm_loadattr(vp, (struct vattr *)0, &xid); + nfsm_strsiz(retlen, nmrsize); nfsm_mtouio(uiop, retlen); m_freem(mrep); tsiz -= retlen; @@ -1377,6 +1605,7 @@ nfs_readrpc(vp, uiop, cred) tsiz = 0; } nfsmout: + FSDBG_BOT(536, vp, eof, uiop->uio_resid, error); return (error); } @@ -1395,19 +1624,32 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) register int t1, t2, backup; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit; - int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC; + int v3, committed = NFSV3WRITE_FILESYNC; + u_int64_t xid; #if DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs_writerpc: iovcnt > 1"); #endif + FSDBG_TOP(537, vp, uiop->uio_offset, uiop->uio_resid, *iomode); + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + v3 = NFS_ISV3(vp); *must_commit = 0; tsiz = uiop->uio_resid; - if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) + if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) { + FSDBG_BOT(537, vp, uiop->uio_offset, uiop->uio_resid, EFBIG); return (EFBIG); + } while (tsiz > 0) { + nmp = 
VFSTONFS(vp->v_mount); + if (!nmp) { + error = ENXIO; + break; + } nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz; nfsm_reqhead(vp, NFSPROC_WRITE, @@ -1425,11 +1667,15 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) tl += 2; } *tl = txdr_unsigned(len); + FSDBG(537, vp, uiop->uio_offset, len, 0); nfsm_uiotom(uiop, len); - nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred); + nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred, &xid); + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + error = ENXIO; if (v3) { wccflag = NFSV3_WCCCHK; - nfsm_wcc_data(vp, wccflag); + nfsm_wcc_data(vp, wccflag, &xid); if (!error) { nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF); @@ -1456,10 +1702,10 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) else if (committed == NFSV3WRITE_DATASYNC && commit == NFSV3WRITE_UNSTABLE) committed = commit; - if ((nmp->nm_flag & NFSMNT_HASWRITEVERF) == 0) { + if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); - nmp->nm_flag |= NFSMNT_HASWRITEVERF; + nmp->nm_state |= NFSSTA_HASWRITEVERF; } else if (bcmp((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) { *must_commit = 1; @@ -1468,27 +1714,29 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) } } } else - nfsm_loadattr(vp, (struct vattr *)0); - if ((wccflag) && (vp->v_type != VBAD)) /* EINVAL set on VBAD vnode */ + nfsm_loadattr(vp, (struct vattr *)0, &xid); + + if (wccflag) VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; m_freem(mrep); - /* - * we seem to have a case where we end up looping on shutdown and taking down nfs servers. - * For V3, error cases, there is no way to terminate loop, if the len was 0, meaning, - * nmp->nm_wsize was trashed. FreeBSD has this fix in it. Let's try it. - */ - if (error) - break; - tsiz -= len; + /* + * we seem to have a case where we end up looping on shutdown + * and taking down nfs servers. 
For V3, error cases, there is + * no way to terminate loop, if the len was 0, meaning, + * nmp->nm_wsize was trashed. FreeBSD has this fix in it. + * Let's try it. + */ + if (error) + break; + tsiz -= len; } nfsmout: - /* does it make sense to even say it was committed if we had an error? EKN */ - /* okay well just don't on bad vnodes then. EINVAL will be returned on bad vnodes */ - if ((vp->v_type != VBAD) && (vp->v_mount->mnt_flag & MNT_ASYNC)) + if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) committed = NFSV3WRITE_FILESYNC; *iomode = committed; if (error) uiop->uio_resid = tsiz; + FSDBG_BOT(537, vp, committed, uiop->uio_resid, error); return (error); } @@ -1517,6 +1765,7 @@ nfs_mknodrpc(dvp, vpp, cnp, vap) int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; u_long rdev; + u_int64_t xid; int v3 = NFS_ISV3(dvp); if (vap->va_type == VCHR || vap->va_type == VBLK) @@ -1557,9 +1806,9 @@ nfs_mknodrpc(dvp, vpp, cnp, vap) txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } - nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred); + nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred, &xid); if (!error) { - nfsm_mtofh(dvp, newvp, v3, gotvp); + nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); if (!gotvp) { if (newvp) { vput(newvp); @@ -1572,7 +1821,7 @@ nfs_mknodrpc(dvp, vpp, cnp, vap) } } if (v3) - nfsm_wcc_data(dvp, wccflag); + nfsm_wcc_data(dvp, wccflag, &xid); nfsm_reqdone; if (error) { if (newvp) @@ -1582,13 +1831,11 @@ nfs_mknodrpc(dvp, vpp, cnp, vap) cache_enter(dvp, newvp, cnp); *vpp = newvp; } - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; - } vput(dvp); + NFS_FREE_PNBUF(cnp); return (error); } @@ -1610,8 +1857,9 @@ nfs_mknod(ap) int error; error = 
nfs_mknodrpc(ap->a_dvp, &newvp, ap->a_cnp, ap->a_vap); - if (!error) + if (!error && newvp) vput(newvp); + *ap->a_vpp = 0; return (error); } @@ -1643,6 +1891,7 @@ nfs_create(ap) struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); + u_int64_t xid; /* * Oops, not for me.. @@ -1688,9 +1937,9 @@ again: txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } - nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); + nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred, &xid); if (!error) { - nfsm_mtofh(dvp, newvp, v3, gotvp); + nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); if (!gotvp) { if (newvp) { vput(newvp); @@ -1703,7 +1952,7 @@ again: } } if (v3) - nfsm_wcc_data(dvp, wccflag); + nfsm_wcc_data(dvp, wccflag, &xid); nfsm_reqdone; if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { @@ -1719,13 +1968,11 @@ again: cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; vput(dvp); + NFS_FREE_PNBUF(cnp); return (error); } @@ -1753,9 +2000,8 @@ nfs_remove(ap) register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register struct nfsnode *np = VTONFS(vp); - int error = 0; + int error = 0, gofree = 0; struct vattr vattr; - int file_deleted = 0; #if DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) @@ -1763,11 +2009,33 @@ nfs_remove(ap) if (vp->v_usecount < 1) panic("nfs_remove: bad v_usecount"); #endif - if (vp->v_usecount == 1 || - (UBCISVALID(vp)&&(vp->v_usecount==2)) || - (np->n_sillyrename && - VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 && - vattr.va_nlink > 1)) { + + if (UBCISVALID(vp)) { + /* regular files */ + if (UBCINFOEXISTS(vp)) + gofree = 
(ubc_isinuse(vp, 1)) ? 0 : 1; + else { + /* dead or dying vnode.With vnode locking panic instead of error */ + vput(dvp); + vput(vp); + NFS_FREE_PNBUF(cnp); + return (EIO); + } + } else { + /* UBC not in play */ + if (vp->v_usecount == 1) + gofree = 1; + } + if ((ap->a_cnp->cn_flags & NODELETEBUSY) && !gofree) { + /* Caller requested Carbon delete semantics, but file is busy */ + vput(dvp); + vput(vp); + NFS_FREE_PNBUF(cnp); + return (EBUSY); + } + if (gofree || (np->n_sillyrename && + VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 && + vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is @@ -1781,7 +2049,8 @@ nfs_remove(ap) * unnecessary delayed writes later. */ error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1); - ubc_setsize(vp, (off_t)0); + np->n_size = 0; + ubc_setsize(vp, (off_t)0); /* XXX check error */ /* Do the rpc */ if (error != EINTR) error = nfs_removerpc(dvp, cnp->cn_nameptr, @@ -1794,26 +2063,25 @@ nfs_remove(ap) */ if (error == ENOENT) error = 0; - file_deleted = 1; + if (!error) { + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + LIST_REMOVE(np, n_hash); + np->n_flag &= ~NHASHED; + } } else if (!np->n_sillyrename) { error = nfs_sillyrename(dvp, vp, cnp); } - - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); np->n_attrstamp = 0; vput(dvp); - - if (vp == dvp) - vrele(vp); - else - vput(vp); - - if (file_deleted && UBCINFOEXISTS(vp)) { - (void) ubc_uncache(vp); - ubc_release(vp); - /* WARNING vp may not be valid after this */ - } + VOP_UNLOCK(vp, 0, cnp->cn_proc); + NFS_FREE_PNBUF(cnp); + ubc_uncache(vp); + vrele(vp); return (error); } @@ -1847,22 +2115,25 @@ nfs_removerpc(dvp, name, namelen, cred, proc) caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int 
v3 = NFS_ISV3(dvp); + int v3; + u_int64_t xid; + + if (!VFSTONFS(dvp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_REMOVE]++; nfsm_reqhead(dvp, NFSPROC_REMOVE, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, namelen, NFS_MAXNAMLEN); - nfsm_request(dvp, NFSPROC_REMOVE, proc, cred); + nfsm_request(dvp, NFSPROC_REMOVE, proc, cred, &xid); if (v3) - nfsm_wcc_data(dvp, wccflag); + nfsm_wcc_data(dvp, wccflag, &xid); nfsm_reqdone; - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; - } return (error); } @@ -1886,7 +2157,7 @@ nfs_rename(ap) register struct vnode *tdvp = ap->a_tdvp; register struct componentname *tcnp = ap->a_tcnp; register struct componentname *fcnp = ap->a_fcnp; - int error; + int error, purged=0, inuse=0; #if DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || @@ -1897,6 +2168,8 @@ nfs_rename(ap) if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; + if (tvp) + VOP_UNLOCK(tvp, 0, tcnp->cn_proc); goto out; } @@ -1904,30 +2177,78 @@ nfs_rename(ap) * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. + * Don't sillyrename if source and target are same vnode (hard + * links or case-variants) */ - if (tvp && (tvp->v_usecount>(UBCISVALID(tvp) ? 2 : 1)) && - !VTONFS(tvp)->n_sillyrename && - tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) { - vput(tvp); - tvp = NULL; + if (tvp && tvp != fvp) { + if (UBCISVALID(tvp)) { + /* regular files */ + if (UBCINFOEXISTS(tvp)) + inuse = (ubc_isinuse(tvp, 1)) ? 
1 : 0; + else { + /* dead or dying vnode.With vnode locking panic instead of error */ + error = EIO; + VOP_UNLOCK(tvp, 0, tcnp->cn_proc); + goto out; + } + } else { + /* UBC not in play */ + if (tvp->v_usecount > 1) + inuse = 1; + } + } + if (inuse && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR) { + if (error = nfs_sillyrename(tdvp, tvp, tcnp)) { + /* sillyrename failed. Instead of pressing on, return error */ + VOP_UNLOCK(tvp, 0, tcnp->cn_proc); + goto out; /* should not be ENOENT. */ + } else { + /* sillyrename succeeded.*/ + VOP_UNLOCK(tvp, 0, tcnp->cn_proc); + ubc_uncache(tvp); /* get the nfs turd file to disappear */ + vrele(tvp); + tvp = NULL; + } } error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_proc); + if (!error && tvp && tvp != fvp && !VTONFS(tvp)->n_sillyrename) { + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + LIST_REMOVE(VTONFS(tvp), n_hash); + VTONFS(tvp)->n_flag &= ~NHASHED; + } + if (fvp->v_type == VDIR) { - if (tvp != NULL && tvp->v_type == VDIR) + if (tvp != NULL && tvp->v_type == VDIR) { cache_purge(tdvp); + if (tvp == tdvp) + purged = 1; + } cache_purge(fdvp); } + + cache_purge(fvp); + if (tvp) { + if (!purged) + cache_purge(tvp); + VOP_UNLOCK(tvp, 0, tcnp->cn_proc); + ubc_uncache(tvp); /* get the nfs turd file to disappear */ + } + out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) - vput(tvp); + vrele(tvp); /* already unlocked */ vrele(fdvp); vrele(fvp); /* @@ -1971,32 +2292,35 @@ nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc) caddr_t bpos, dpos, cp2; int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(fdvp); + int v3; + u_int64_t xid; + + if (!VFSTONFS(fdvp->v_mount)) + return (ENXIO); + v3 = 
NFS_ISV3(fdvp); nfsstats.rpccnt[NFSPROC_RENAME]++; nfsm_reqhead(fdvp, NFSPROC_RENAME, - (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + - nfsm_rndup(tnamelen)); + (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + + nfsm_rndup(tnamelen)); nfsm_fhtom(fdvp, v3); nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN); nfsm_fhtom(tdvp, v3); nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN); - nfsm_request(fdvp, NFSPROC_RENAME, proc, cred); + nfsm_request(fdvp, NFSPROC_RENAME, proc, cred, &xid); if (v3) { - nfsm_wcc_data(fdvp, fwccflag); - nfsm_wcc_data(tdvp, twccflag); + u_int64_t txid = xid; + + nfsm_wcc_data(fdvp, fwccflag, &xid); + nfsm_wcc_data(tdvp, twccflag, &txid); } nfsm_reqdone; - if (fdvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(fdvp)->n_flag |= NMODIFIED; - if (!fwccflag) + VTONFS(fdvp)->n_flag |= NMODIFIED; + if (!fwccflag) VTONFS(fdvp)->n_attrstamp = 0; - } - if (tdvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(tdvp)->n_flag |= NMODIFIED; - if (!twccflag) - VTONFS(tdvp)->n_attrstamp = 0; - } + VTONFS(tdvp)->n_flag |= NMODIFIED; + if (!twccflag) + VTONFS(tdvp)->n_attrstamp = 0; return (error); } @@ -2020,23 +2344,38 @@ nfs_link(ap) caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3, didhold; + u_int64_t xid; if (vp->v_mount != tdvp->v_mount) { VOP_ABORTOP(vp, cnp); - if (tdvp == vp) - vrele(tdvp); - else - vput(tdvp); + vput(tdvp); return (EXDEV); } + /* need to get vnode lock for vp before calling VOP_FSYNC() */ + if (error = vn_lock(vp, LK_EXCLUSIVE, cnp->cn_proc)) { + VOP_ABORTOP(vp, cnp); + vput(tdvp); + return (error); + } + + if (!VFSTONFS(vp->v_mount)) { + VOP_UNLOCK(vp, 0, cnp->cn_proc); + VOP_ABORTOP(vp, cnp); + vput(tdvp); + return (ENXIO); + } + v3 = NFS_ISV3(vp); + /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. 
* XXX There should be a better way! */ + didhold = ubc_hold(vp); VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc); + VOP_UNLOCK(vp, 0, cnp->cn_proc); nfsstats.rpccnt[NFSPROC_LINK]++; nfsm_reqhead(vp, NFSPROC_LINK, @@ -2044,20 +2383,24 @@ nfs_link(ap) nfsm_fhtom(vp, v3); nfsm_fhtom(tdvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); - nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred); + nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred, &xid); if (v3) { - nfsm_postop_attr(vp, attrflag); - nfsm_wcc_data(tdvp, wccflag); + u_int64_t txid = xid; + + nfsm_postop_attr(vp, attrflag, &xid); + nfsm_wcc_data(tdvp, wccflag, &txid); } nfsm_reqdone; - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); VTONFS(tdvp)->n_flag |= NMODIFIED; - if ((!attrflag) && (vp->v_type != VBAD)) /* EINVAL set on VBAD vnode */ + if (!attrflag) VTONFS(vp)->n_attrstamp = 0; - if ((!wccflag) && (tdvp->v_type != VBAD)) /* EINVAL set on VBAD vnode */ + if (!wccflag) VTONFS(tdvp)->n_attrstamp = 0; + if (didhold) + ubc_rele(vp); vput(tdvp); + NFS_FREE_PNBUF(cnp); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ @@ -2092,6 +2435,7 @@ nfs_symlink(ap) struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vnode *newvp = (struct vnode *)0; int v3 = NFS_ISV3(dvp); + u_int64_t xid; nfsstats.rpccnt[NFSPROC_SYMLINK]++; slen = strlen(ap->a_target); @@ -2114,22 +2458,23 @@ nfs_symlink(ap) txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } - nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred); + nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred, &xid); if (v3) { + u_int64_t dxid = xid; + if (!error) - nfsm_mtofh(dvp, newvp, v3, gotvp); - nfsm_wcc_data(dvp, wccflag); + nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); + nfsm_wcc_data(dvp, wccflag, &dxid); } nfsm_reqdone; if (newvp) vput(newvp); - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) + + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; - } vput(dvp); + NFS_FREE_PNBUF(cnp); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ @@ -2167,6 +2512,7 @@ nfs_mkdir(ap) struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); + u_int64_t xid, dxid; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc))) { VOP_ABORTOP(dvp, cnp); @@ -2191,28 +2537,27 @@ nfs_mkdir(ap) txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } - nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred); + nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred, &xid); + dxid = xid; if (!error) - nfsm_mtofh(dvp, newvp, v3, gotvp); + nfsm_mtofh(dvp, newvp, v3, gotvp, &xid); if (v3) - nfsm_wcc_data(dvp, wccflag); + nfsm_wcc_data(dvp, wccflag, &dxid); nfsm_reqdone; - if (dvp->v_type != VBAD) { /* EINVAL set on this case */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; - } /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry * if we can succeed in looking up the directory. 
*/ if (error == EEXIST || (!error && !gotvp)) { if (newvp) { - vrele(newvp); + vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, - cnp->cn_proc, &np); + cnp->cn_proc, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) @@ -2221,11 +2566,11 @@ nfs_mkdir(ap) } if (error) { if (newvp) - vrele(newvp); + vput(newvp); } else *ap->a_vpp = newvp; - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); vput(dvp); + NFS_FREE_PNBUF(cnp); return (error); } @@ -2250,26 +2595,25 @@ nfs_rmdir(ap) int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(dvp); + u_int64_t xid; nfsstats.rpccnt[NFSPROC_RMDIR]++; nfsm_reqhead(dvp, NFSPROC_RMDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); - nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred); + nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred, &xid); if (v3) - nfsm_wcc_data(dvp, wccflag); + nfsm_wcc_data(dvp, wccflag, &xid); nfsm_reqdone; - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on this case */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; - } cache_purge(dvp); cache_purge(vp); vput(vp); vput(dvp); + NFS_FREE_PNBUF(cnp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. 
*/ @@ -2307,10 +2651,13 @@ nfs_readdir(ap) nfsstats.direofcache_hits++; return (0); } - } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && - np->n_mtime == vattr.va_mtime.tv_sec) { - nfsstats.direofcache_hits++; - return (0); + } else if (!VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp)) { + if (np->n_mtime == vattr.va_mtime.tv_sec) { + nfsstats.direofcache_hits++; + return (0); + } + /* directory changed, purge any name cache entries */ + cache_purge(vp); } } @@ -2345,12 +2692,13 @@ nfs_readdirrpc(vp, uiop, cred) caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsuint64 cookie; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; struct nfsnode *dnp = VTONFS(vp); u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag; - int v3 = NFS_ISV3(vp); + int v3, nmreaddirsize; + u_int64_t xid; #ifndef nolint dp = (struct dirent *)0; @@ -2360,6 +2708,11 @@ nfs_readdirrpc(vp, uiop, cred) (uiop->uio_resid & (NFS_DIRBLKSIZ - 1))) panic("nfs_readdirrpc: bad uio"); #endif + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + v3 = NFS_ISV3(vp); + nmreaddirsize = nmp->nm_readdirsize; /* * If there is no cookie, assume directory was stale. 
@@ -2389,10 +2742,10 @@ nfs_readdirrpc(vp, uiop, cred) nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; } - *tl = txdr_unsigned(nmp->nm_readdirsize); - nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred); + *tl = txdr_unsigned(nmreaddirsize); + nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred, &xid); if (v3) { - nfsm_postop_attr(vp, attrflag); + nfsm_postop_attr(vp, attrflag, &xid); if (!error) { nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); dnp->n_cookieverf.nfsuquad[0] = *tl++; @@ -2532,12 +2885,13 @@ nfs_readdirplusrpc(vp, uiop, cred) struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; nfsuint64 cookie; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; struct nfsnode *dnp = VTONFS(vp), *np; nfsfh_t *fhp; u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i; - int attrflag, fhsize; + int attrflag, fhsize, nmreaddirsize, nmrsize; + u_int64_t xid, savexid; #ifndef nolint dp = (struct dirent *)0; @@ -2547,6 +2901,12 @@ nfs_readdirplusrpc(vp, uiop, cred) (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs_readdirplusrpc: bad uio"); #endif + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + nmreaddirsize = nmp->nm_readdirsize; + nmrsize = nmp->nm_rsize; + ndp->ni_dvp = vp; newvp = NULLVP; @@ -2573,10 +2933,12 @@ nfs_readdirplusrpc(vp, uiop, cred) *tl++ = cookie.nfsuquad[1]; *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; - *tl++ = txdr_unsigned(nmp->nm_readdirsize); - *tl = txdr_unsigned(nmp->nm_rsize); - nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred); - nfsm_postop_attr(vp, attrflag); + *tl++ = txdr_unsigned(nmreaddirsize); + *tl = txdr_unsigned(nmrsize); + nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred, + &xid); + savexid = xid; + nfsm_postop_attr(vp, attrflag, &xid); if (error) { m_freem(mrep); goto nfsmout; @@ -2660,6 +3022,20 @@ nfs_readdirplusrpc(vp, uiop, cred) VREF(vp); newvp = 
vp; np = dnp; + } else if (!bigenough || + (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && + cnp->cn_nameptr[0] == '.')) { + /* + * don't doit if we can't guarantee + * that this entry is NOT ".." because + * we would have to drop the lock on + * the directory before getting the + * (lock on) the ".." vnode... and we + * don't want to drop the dvp lock in + * the middle of a readdirplus. + */ + doit = 0; } else { if ((error = nfs_nget(vp->v_mount, fhp, fhsize, &np))) @@ -2668,12 +3044,13 @@ nfs_readdirplusrpc(vp, uiop, cred) newvp = NFSTOV(np); } } - if (doit) { + if (doit && bigenough) { dpossav2 = dpos; dpos = dpossav1; mdsav2 = md; md = mdsav1; - nfsm_loadattr(newvp, (struct vattr *)0); + xid = savexid; + nfsm_loadattr(newvp, (struct vattr *)0, &xid); dpos = dpossav2; md = mdsav2; dp->d_type = @@ -2693,7 +3070,10 @@ nfs_readdirplusrpc(vp, uiop, cred) nfsm_adv(nfsm_rndup(i)); } if (newvp != NULLVP) { - vrele(newvp); + if (newvp == vp) + vrele(newvp); + else + vput(newvp); newvp = NULLVP; } nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); @@ -2752,6 +3132,11 @@ nfsmout: * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... 
*/ + +/* format of "random" names and next name to try */ +/* (note: shouldn't exceed size of sillyrename.s_name) */ +static char sillyrename_name[] = ".nfsAAA%04x4.4"; + static int nfs_sillyrename(dvp, vp, cnp) struct vnode *dvp, *vp; @@ -2762,6 +3147,7 @@ nfs_sillyrename(dvp, vp, cnp) int error; short pid; struct ucred *cred; + int i, j, k; cache_purge(dvp); np = VTONFS(vp); @@ -2777,17 +3163,39 @@ nfs_sillyrename(dvp, vp, cnp) /* Fudge together a funny name */ pid = cnp->cn_proc->p_pid; - sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid); + sp->s_namlen = sprintf(sp->s_name, sillyrename_name, pid); /* Try lookitups until we get one that isn't there */ + i = j = k = 0; while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_proc, (struct nfsnode **)0) == 0) { - sp->s_name[4]++; - if (sp->s_name[4] > 'z') { - error = EINVAL; - goto bad; + if (sp->s_name[4]++ >= 'z') + sp->s_name[4] = 'A'; + if (++i > ('z' - 'A' + 1)) { + i = 0; + if (sp->s_name[5]++ >= 'z') + sp->s_name[5] = 'A'; + if (++j > ('z' - 'A' + 1)) { + j = 0; + if (sp->s_name[6]++ >= 'z') + sp->s_name[6] = 'A'; + if (++k > ('z' - 'A' + 1)) { + error = EINVAL; + goto bad; + } + } + } + } + /* make note of next "random" name to try */ + if ((sillyrename_name[4] = (sp->s_name[4] + 1)) > 'z') { + sillyrename_name[4] = 'A'; + if ((sillyrename_name[5] = (sp->s_name[5] + 1)) > 'z') { + sillyrename_name[5] = 'A'; + if ((sillyrename_name[6] = (sp->s_name[6] + 1)) > 'z') + sillyrename_name[6] = 'A'; } } + /* now, do the rename */ if ((error = nfs_renameit(dvp, cnp, sp))) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, @@ -2803,7 +3211,7 @@ bad: cred = sp->s_cred; sp->s_cred = NOCRED; crfree(cred); - _FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); + FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); return (error); } @@ -2833,20 +3241,25 @@ nfs_lookitup(dvp, name, len, cred, procp, npp) int error = 0, fhlen, attrflag; struct mbuf *mreq, 
*mrep, *md, *mb, *mb2; nfsfh_t *nfhp; - int v3 = NFS_ISV3(dvp); + int v3; + u_int64_t xid; + + if (!VFSTONFS(dvp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_LOOKUP]++; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, len, NFS_MAXNAMLEN); - nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred); + nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred, &xid); if (npp && !error) { nfsm_getfh(nfhp, fhlen, v3); if (*npp) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { - _FREE_ZONE((caddr_t)np->n_fhp, + FREE_ZONE((caddr_t)np->n_fhp, np->n_fhsize, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) @@ -2867,7 +3280,7 @@ nfs_lookitup(dvp, name, len, cred, procp, npp) newvp = NFSTOV(np); } if (v3) { - nfsm_postop_attr(newvp, attrflag); + nfsm_postop_attr(newvp, attrflag, &xid); if (!attrflag && *npp == NULL) { m_freem(mrep); if (newvp == dvp) @@ -2877,7 +3290,7 @@ nfs_lookitup(dvp, name, len, cred, procp, npp) return (ENOENT); } } else - nfsm_loadattr(newvp, (struct vattr *)0); + nfsm_loadattr(newvp, (struct vattr *)0, &xid); } nfsm_reqdone; if (npp && *npp == NULL) { @@ -2896,7 +3309,7 @@ nfs_lookitup(dvp, name, len, cred, procp, npp) /* * Nfs Version 3 commit rpc */ -static int +int nfs_commit(vp, offset, cnt, cred, procp) register struct vnode *vp; u_quad_t offset; @@ -2911,8 +3324,12 @@ nfs_commit(vp, offset, cnt, cred, procp) caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; + u_int64_t xid; - if ((nmp->nm_flag & NFSMNT_HASWRITEVERF) == 0) + FSDBG(521, vp, offset, cnt, nmp->nm_state); + if (!nmp) + return (ENXIO); + if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); nfsstats.rpccnt[NFSPROC_COMMIT]++; nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1)); @@ -2921,12 +3338,12 @@ nfs_commit(vp, offset, cnt, cred, procp) txdr_hyper(&offset, tl); tl += 2; *tl = 
txdr_unsigned(cnt); - nfsm_request(vp, NFSPROC_COMMIT, procp, cred); - nfsm_wcc_data(vp, wccflag); + nfsm_request(vp, NFSPROC_COMMIT, procp, cred, &xid); + nfsm_wcc_data(vp, wccflag, &xid); if (!error) { nfsm_dissect(tl, u_long *, NFSX_V3WRITEVERF); if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl, - NFSX_V3WRITEVERF)) { + NFSX_V3WRITEVERF)) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); error = NFSERR_STALEWRITEVERF; @@ -2936,15 +3353,6 @@ nfs_commit(vp, offset, cnt, cred, procp) return (error); } -/* - * Kludge City.. - * - make nfs_bmap() essentially a no-op that does no translation - * - do nfs_strategy() by doing I/O with nfs_readrpc/nfs_writerpc - * (Maybe I could use the process's page mapping, but I was concerned that - * Kernel Write might not be enabled and also figured copyout() would do - * a lot more work than bcopy() and also it currently happens in the - * context of the swapper process (2). - */ static int nfs_bmap(ap) struct vop_bmap_args /* { @@ -2961,9 +3369,12 @@ nfs_bmap(ap) if (ap->a_vpp != NULL) *ap->a_vpp = vp; - if (ap->a_bnp != NULL) + if (ap->a_bnp != NULL) { + if (!vp->v_mount) + return (ENXIO); *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize, devBlockSize); + } if (ap->a_runp != NULL) *ap->a_runp = 0; #ifdef notyet @@ -2973,41 +3384,6 @@ nfs_bmap(ap) return (0); } -/* - * Strategy routine. - * For async requests when nfsiod(s) are running, queue the request by - * calling nfs_asyncio(), otherwise just all nfs_doio() to do the - * request. 
- */ -static int -nfs_strategy(ap) - struct vop_strategy_args *ap; -{ - register struct buf *bp = ap->a_bp; - struct ucred *cr; - struct proc *p; - int error = 0; - - if (ISSET(bp->b_flags, B_PHYS)) - panic("nfs_strategy: physio"); - if (ISSET(bp->b_flags, B_ASYNC)) - p = (struct proc *)0; - else - p = current_proc(); /* XXX */ - if (ISSET(bp->b_flags, B_READ)) - cr = bp->b_rcred; - else - cr = bp->b_wcred; - /* - * If the op is asynchronous and an i/o daemon is waiting - * queue the request, wake it up and wait for completion - * otherwise just do it ourselves. - */ - if (!ISSET(bp->b_flags, B_ASYNC) || nfs_asyncio(bp, NOCRED)) - error = nfs_doio(bp, cr, p); - return (error); -} - /* * Mmap a file * @@ -3041,299 +3417,334 @@ nfs_fsync(ap) struct proc * a_p; } */ *ap; { - return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p, 1)); } - -/* - * Flush all the blocks associated with a vnode. - * Walk through the buffer pool and push any dirty pages - * associated with the vnode. - */ -static int -nfs_flush(vp, cred, waitfor, p, commit) - register struct vnode *vp; - struct ucred *cred; - int waitfor; - struct proc *p; - int commit; + +int +nfs_flushcommits(struct vnode *vp, struct proc *p) { - register struct nfsnode *np = VTONFS(vp); - register struct buf *bp; - register int i; - struct buf *nbp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos, err; - int passone = 1; + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp, *nbp; + int i, s, error = 0, retv, bvecpos, wcred_set; u_quad_t off, endoff, toff; - struct ucred* wcred = NULL; - struct buf **bvec = NULL; - void * object; - kern_return_t kret; - upl_t *upls = NULL; - - -#ifndef NFS_COMMITBVECSIZ + struct ucred* wcred; + struct nfsbuf **bvec = NULL; #define NFS_COMMITBVECSIZ 20 -#endif - struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; - struct upl_t *upls_on_stack[NFS_COMMITBVECSIZ]; - int bvecsize = 0, bveccount, buplpos; +#define 
NFS_MAXCOMMITBVECSIZ 1024 + struct nfsbuf *bvec_on_stack[NFS_COMMITBVECSIZ]; + int bvecsize = NFS_MAXCOMMITBVECSIZ; - if (nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - if (!commit) - passone = 0; + FSDBG_TOP(557, vp, np, 0, 0); /* - * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the + * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server - * yet. On the first pass, the byte range is worked out and the commit - * rpc is done. On the second pass, nfs_writebp() is called to do the - * job. + * yet. The byte range is worked out for as many nfsbufs as we can handle + * and the commit rpc is done. */ -again: - if (vp->v_dirtyblkhd.lh_first) + if (np->n_dirtyblkhd.lh_first) np->n_flag |= NMODIFIED; + off = (u_quad_t)-1; endoff = 0; bvecpos = 0; - buplpos = 0; - if (NFS_ISV3(vp) && commit) { - s = splbio(); + wcred_set = 0; + + if (!VFSTONFS(vp->v_mount)) { + error = ENXIO; + goto done; + } + if (!NFS_ISV3(vp)) { + error = EINVAL; + goto done; + } + s = splbio(); + + /* + * Allocate space to remember the list of bufs to commit. It is + * important to use M_NOWAIT here to avoid a race with nfs_write + */ + MALLOC(bvec, struct nfsbuf **, + bvecsize * sizeof(struct nfsbuf *), M_TEMP, + M_NOWAIT); + if (bvec == NULL) { + bvec = bvec_on_stack; + bvecsize = NFS_COMMITBVECSIZ; + } + for (bp = np->n_dirtyblkhd.lh_first; bp && bvecpos < bvecsize; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + + if (((bp->nb_flags & (NB_BUSY | NB_DELWRI | NB_NEEDCOMMIT)) + != (NB_DELWRI | NB_NEEDCOMMIT))) + continue; + + nfs_buf_remfree(bp); + SET(bp->nb_flags, NB_BUSY); /* - * Count up how many buffers waiting for a commit. 
+ * we need a upl to see if the page has been + * dirtied (think mmap) since the unstable write, and + * also to prevent vm from paging it during our commit rpc */ - bveccount = 0; - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - == (B_DELWRI | B_NEEDCOMMIT)) - bveccount++; + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* this could be fatal if we need */ + /* to write the data again, we'll see... */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; + } } + nfs_buf_upl_check(bp); + + FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); + FSDBG(557, bp->nb_validoff, bp->nb_validend, + bp->nb_dirtyoff, bp->nb_dirtyend); + /* - * Allocate space to remember the list of bufs to commit. It is - * important to use M_NOWAIT here to avoid a race with nfs_write. - * If we can't get memory (for whatever reason), we will end up - * committing the buffers one-by-one in the loop below. + * We used to check for dirty pages here; if there were any + * we'd abort the commit and force the entire buffer to be + * written again. + * + * Instead of doing that, we now go ahead and commit the dirty + * range, and then leave the buffer around with dirty pages + * that will be written out later. 
*/ - if (bveccount > NFS_COMMITBVECSIZ) { - if (bvec != NULL && bvec != bvec_on_stack) - _FREE(bvec, M_TEMP); - MALLOC(bvec, struct buf **, - bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); - if (bvec == NULL) { - bvec = bvec_on_stack; - bvecsize = NFS_COMMITBVECSIZ; - } else - bvecsize = bveccount; - /* allocate the upl structure before the loop based on buffers to commit */ - if (upls != NULL && upls != upls_on_stack) - _FREE(upls, M_TEMP); - MALLOC(upls, struct upl_t *, - bveccount * sizeof(upl_t), M_TEMP, M_NOWAIT); - if (upls == NULL) - upls = upls_on_stack; - } else { - if (bvec && bvec != bvec_on_stack) - _FREE(bvec, M_TEMP); - bvec = bvec_on_stack; - bvecsize = NFS_COMMITBVECSIZ; - if (upls && upls != upls_on_stack) - _FREE(upls, M_TEMP); - upls = upls_on_stack; - } - - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (bvecpos >= bvecsize) - break; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - != (B_DELWRI | B_NEEDCOMMIT)) - continue; - bremfree(bp); - /* - * Work out if all buffers are using the same cred - * so we can deal with them all with one commit. - */ - if (wcred == NULL) - wcred = bp->b_wcred; - else if (wcred != bp->b_wcred) - wcred = NOCRED; - SET(bp->b_flags, (B_BUSY | B_WRITEINPROG)); - /* - * we need vm_fault_list_request so if vm decides to - * do paging while we are waiting on commit rpc, - * that it doesn't pick these pages. 
- */ - if (!ISSET(bp->b_flags, B_PAGELIST)) { - /* if pagelist exists, assume vm pages are locked/busy already */ off_t file_offset = ubc_blktooff(vp, bp->b_lblkno); - object = ubc_getobject(vp, (UBC_NOREACTIVATE|UBC_HOLDOBJECT)); - if (object == (void*)NULL) - panic("nfs_getcacheblk: NULL vmobject"); - if(bp->b_bufsize & 0xfff) - panic("nfs_getcacheblk: list request is less than 4k"); - kret = vm_fault_list_request( - object, (vm_object_offset_t)file_offset, - bp->b_bufsize, &(upls[buplpos]), NULL, 0, - (int)(UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |UPL_PRECIOUS | - UPL_SET_INTERNAL)); - if (kret != KERN_SUCCESS) - panic("nfs_getcacheblk: get pagelists failed with (%d)", kret); - -#ifdef UBC_DEBUG - upl_ubc_alias_set(pl, ioaddr, 1); -#endif /* UBC_DEBUG */ - buplpos++; /* not same as bvecpos if upl existed already */ - } + /* in case blocking calls were made, re-evaluate nbp */ + nbp = bp->nb_vnbufs.le_next; - /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. - */ - bvec[bvecpos++] = bp; - toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + - bp->b_dirtyoff; - if (toff < off) - off = toff; - toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); - if (toff > endoff) - endoff = toff; - } - splx(s); - } - if (bvecpos > 0) { /* - * Commit data on the server, as required. - * If all bufs are using the same wcred, then use that with - * one call for all of them, otherwise commit each one - * separately. + * Work out if all buffers are using the same cred + * so we can deal with them all with one commit. 
*/ - if (wcred != NOCRED) - retv = nfs_commit(vp, off, (int)(endoff - off), - wcred, p); - else { - retv = 0; - for (i = 0; i < bvecpos; i++) { - off_t off, size; - bp = bvec[i]; - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + - bp->b_dirtyoff; - size = (u_quad_t)(bp->b_dirtyend - - bp->b_dirtyoff); - retv = nfs_commit(vp, off, (int)size, - bp->b_wcred, p); - if (retv) break; - } + if (wcred_set == 0) { + wcred = bp->nb_wcred; + if (wcred == NOCRED) + panic("nfs: needcommit w/out wcred"); + wcred_set = 1; + } else if ((wcred_set == 1) && crcmp(wcred, bp->nb_wcred)) { + wcred_set = -1; } - - if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); - - for (i = 0; i < buplpos; i++) { - /* - * before the VOP_BWRITE and biodone(ASYNC)/brelse, we have to undo - * holding the vm page or we we will deadlock on another vm_fault_list_request. - * Here's a convenient place to put it. - * Better if we could hold it by setting the PAGELIST flag and kernel_upl_map - * as does nfs_writebp. Then normal biodones and brelse will clean it up and - * we can avoid this abort. For now make minimal changse and test this out. - */ - err = kernel_upl_abort(upls[i], NULL); - if (err) - printf("nfs_flush: kernel_upl_abort %d\n", err); - } + SET(bp->nb_flags, NB_WRITEINPROG); /* - * Now, either mark the blocks I/O done or mark the - * blocks dirty, depending on whether the commit - * succeeded. + * A list of these buffers is kept so that the + * second loop knows which buffers have actually + * been committed. This is necessary, since there + * may be a race between the commit rpc and new + * uncommitted writes on the file. */ + bvec[bvecpos++] = bp; + toff = NBOFF(bp) + bp->nb_dirtyoff; + if (toff < off) + off = toff; + toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + if (toff > endoff) + endoff = toff; + } + splx(s); + + if (bvecpos == 0) { + error = ENOBUFS; + goto done; + } + + /* + * Commit data on the server, as required. 
+ * If all bufs are using the same wcred, then use that with + * one call for all of them, otherwise commit each one + * separately. + */ + if (wcred_set == 1) + retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); + else { + retv = 0; + for (i = 0; i < bvecpos; i++) { - + off_t off, size; bp = bvec[i]; - CLR(bp->b_flags, (B_NEEDCOMMIT | B_WRITEINPROG)); - if (retv) { - brelse(bp); - } else { - vp->v_numoutput++; - SET(bp->b_flags, B_ASYNC); - s = splbio(); - CLR(bp->b_flags, (B_READ|B_DONE|B_ERROR|B_DELWRI)); - bp->b_dirtyoff = bp->b_dirtyend = 0; - reassignbuf(bp, vp); - splx(s); - biodone(bp); + off = NBOFF(bp) + bp->nb_dirtyoff; + size = (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + retv = nfs_commit(vp, off, (int)size, bp->nb_wcred, p); + if (retv) break; + } + } + if (retv == NFSERR_STALEWRITEVERF) + nfs_clearcommit(vp->v_mount); + + /* + * Now, either mark the blocks I/O done or mark the + * blocks dirty, depending on whether the commit + * succeeded. + */ + for (i = 0; i < bvecpos; i++) { + bp = bvec[i]; + FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); + + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); + + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + + if (retv) { + nfs_buf_release(bp); + } else { + s = splbio(); + vp->v_numoutput++; + + if (ISSET(bp->nb_flags, NB_DELWRI)) { + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); + } + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + /* if block still has dirty pages, we don't want it to */ + /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. 
*/ + if (!bp->nb_dirty) + SET(bp->nb_flags, NB_ASYNC); + + /* move to clean list */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); + + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + splx(s); + + nfs_buf_iodone(bp); + if (bp->nb_dirty) { + /* throw it back in as a delayed write buffer */ + CLR(bp->nb_flags, NB_DONE); + nfs_buf_write_delayed(bp); } } + } + +done: + if (bvec != NULL && bvec != bvec_on_stack) + _FREE(bvec, M_TEMP); + FSDBG_BOT(557, vp, np, 0, error); + return (error); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. + */ +static int +nfs_flush(vp, cred, waitfor, p, commit) + register struct vnode *vp; + struct ucred *cred; + int waitfor; + struct proc *p; + int commit; +{ + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp, *nbp; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + int i, s, error = 0, error2, slptimeo = 0, slpflag = 0; + int passone = 1; + FSDBG_TOP(517, vp, np, waitfor, commit); + + if (!nmp) { + error = ENXIO; + goto done; } + if (nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + if (!commit) + passone = 0; /* - * Start/do any write(s) that are required. - * There is a window here where B_BUSY protects the buffer. The vm pages have been - * freed up, yet B_BUSY is set. Don't think you will hit any busy/incore problems while - * we sleep, but not absolutely sure. Keep an eye on it. Otherwise we will have to hold - * vm page across this locked. - EKN + * On the first pass, commit all the bufs that can be. + * On the second pass, nfs_buf_write() is called to do the job. 
*/ -loop: - if (current_thread_aborted()) { - error = EINTR; +again: + FSDBG(518, np->n_dirtyblkhd.lh_first, np->n_flag, 0, 0); + if (np->n_dirtyblkhd.lh_first) + np->n_flag |= NMODIFIED; + if (!VFSTONFS(vp->v_mount)) { + error = ENXIO; goto done; } + if (NFS_ISV3(vp) && commit) { + /* loop while it looks like there are still buffers to be */ + /* commited and nfs_flushcommits() seems to be handling them. */ + while (np->n_needcommitcnt) + if (nfs_flushcommits(vp, p)) + break; + } + + /* Start/do any write(s) that are required. */ +loop: s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (ISSET(bp->b_flags, B_BUSY)) { + for (bp = np->n_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + if (ISSET(bp->nb_flags, NB_BUSY)) { + FSDBG(524, bp, waitfor, passone, bp->nb_flags); if (waitfor != MNT_WAIT || passone) continue; - SET(bp->b_flags, B_WANTED); + SET(bp->nb_flags, NB_WANTED); error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), - "nfsfsync", slptimeo); + "nfsfsync", slptimeo); splx(s); if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { - error = EINTR; - goto done; - } - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } + error2 = nfs_sigintr(VFSTONFS(vp->v_mount), + (struct nfsreq *)0, p); + if (error2) { + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } } goto loop; } - if (!ISSET(bp->b_flags, B_DELWRI)) + if (!ISSET(bp->nb_flags, NB_DELWRI)) panic("nfs_fsync: not dirty"); - if ((passone || !commit) && ISSET(bp->b_flags, B_NEEDCOMMIT)) + FSDBG(525, bp, passone, commit, bp->nb_flags); + if ((passone || !commit) && ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + continue; + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_ERROR)) { + np->n_error = bp->nb_error ? 
bp->nb_error : EIO; + np->n_flag |= NWRITEERR; + nfs_buf_release(bp); continue; - bremfree(bp); + } if (passone || !commit) - SET(bp->b_flags, (B_BUSY|B_ASYNC)); - else - SET(bp->b_flags, (B_BUSY|B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT)); - + SET(bp->nb_flags, NB_BUSY|NB_ASYNC); + else { + /* the NB_STABLE forces this to be written FILESYNC */ + SET(bp->nb_flags, NB_BUSY|NB_ASYNC|NB_STABLE); + } splx(s); - VOP_BWRITE(bp); + nfs_buf_write(bp); goto loop; } splx(s); + if (passone) { passone = 0; goto again; } + if (waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { - error = EINTR; + error2 = nfs_sigintr(VFSTONFS(vp->v_mount), + (struct nfsreq *)0, p); + if (error2) { + error = error2; goto done; } if (slpflag == PCATCH) { @@ -3342,19 +3753,17 @@ loop: } } } - if (vp->v_dirtyblkhd.lh_first && commit) { + if (np->n_dirtyblkhd.lh_first && commit) { goto loop; } } + FSDBG(526, np->n_flag, np->n_error, 0, 0); if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } done: - if (bvec != NULL && bvec != bvec_on_stack) - _FREE(bvec, M_TEMP); - if (upls != NULL && upls != upls_on_stack) - _FREE(upls, M_TEMP); + FSDBG_BOT(517, vp, np, error, 0); return (error); } @@ -3378,8 +3787,7 @@ nfs_pathconf(ap) } /* - * NFS advisory byte-level locks. - * Currently unsupported. + * NFS advisory byte-level locks (client) */ static int nfs_advlock(ap) @@ -3391,21 +3799,7 @@ nfs_advlock(ap) int a_flags; } */ *ap; { -#ifdef __FreeBSD__ - register struct nfsnode *np = VTONFS(ap->a_vp); - - /* - * The following kludge is to allow diskless support to work - * until a real NFS lockd is implemented. Basically, just pretend - * that this is a local lock. 
- */ - return (lf_advlock(ap, &(np->n_lockf), np->n_size)); -#else -#if DIAGNOSTIC - printf("nfs_advlock: pid %d comm %s\n", current_proc()->p_pid, current_proc()->p_comm); -#endif - return (EOPNOTSUPP); -#endif + return (nfs_dolock(ap)); } /* @@ -3525,213 +3919,74 @@ nfs_update(ap) return (EOPNOTSUPP); } -int nfs_aio_threads = 0; /* 1 per nfd (arbitrary) */ -struct slock nfs_aio_slock; -TAILQ_HEAD(bqueues, buf) nfs_aio_bufq; -int nfs_aio_bufq_len = 0; /* diagnostic only */ - -void -nfs_aio_thread() -{ /* see comment below in nfs_bwrite() for some rationale */ - struct buf *bp; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - for(;;) { - simple_lock(&nfs_aio_slock); - if ((bp = nfs_aio_bufq.tqh_first)) { - TAILQ_REMOVE(&nfs_aio_bufq, bp, b_freelist); - nfs_aio_bufq_len--; - simple_unlock(&nfs_aio_slock); - nfs_writebp(bp, 1); - } else { /* nothing to do - goodnight */ - assert_wait(&nfs_aio_bufq, THREAD_UNINT); - simple_unlock(&nfs_aio_slock); - (void)tsleep((caddr_t)0, PRIBIO+1, "nfs_aio_bufq", 0); - } - } - (void) thread_funnel_set(kernel_flock, FALSE); -} - - -void -nfs_aio_thread_init() -{ - if (nfs_aio_threads++ == 0) { - simple_lock_init(&nfs_aio_slock); - TAILQ_INIT(&nfs_aio_bufq); - } - kernel_thread(kernel_task, nfs_aio_thread); -} - - /* - * Just call nfs_writebp() with the force argument set to 1. - */ -static int -nfs_bwrite(ap) - struct vop_bwrite_args /* { - struct vnode *a_bp; - } */ *ap; -{ - extern void wakeup_one(caddr_t chan); - - /* - * nfs_writebp will issue a synchronous rpc to if B_ASYNC then - * to avoid distributed deadlocks we handoff the write to the - * nfs_aio threads. Doing so allows us to complete the - * current request, rather than blocking on a server which may - * be ourself (or blocked on ourself). - * - * Note the loopback deadlocks happened when the thread - * invoking us was nfsd, and also when it was the pagedaemon. - * - * This solution has one known problem. 
If *ALL* buffers get - * on the nfs_aio queue then no forward progress can be made - * until one of those writes complete. And if the current - * nfs_aio writes-in-progress block due to a non-responsive server we - * are in a deadlock circle. Probably the cure is to limit the - * async write concurrency in getnewbuf as in FreeBSD 3.2. - */ - if (nfs_aio_threads && ISSET(ap->a_bp->b_flags, B_ASYNC)) { - simple_lock(&nfs_aio_slock); - nfs_aio_bufq_len++; - TAILQ_INSERT_TAIL(&nfs_aio_bufq, ap->a_bp, b_freelist); - simple_unlock(&nfs_aio_slock); - wakeup_one((caddr_t)&nfs_aio_bufq); - return (0); - } - return (nfs_writebp(ap->a_bp, 1)); -} - -/* - * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless - * the force flag is one and it also handles the B_NEEDCOMMIT flag. + * write (or commit) the given NFS buffer */ int -nfs_writebp(bp, force) - register struct buf *bp; - int force; +nfs_buf_write(struct nfsbuf *bp) { int s; - register int oldflags = bp->b_flags, retv = 1; + int oldflags = bp->nb_flags, rv = 0; off_t off; - upl_t upl; - void * object; - kern_return_t kret; - struct vnode *vp = bp->b_vp; - upl_page_info_t *pl; + struct vnode *vp = bp->nb_vp; + struct ucred *cr; + struct proc *p = current_proc(); + + FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); - if(!ISSET(bp->b_flags, B_BUSY)) - panic("nfs_writebp: buffer is not busy???"); + if (!ISSET(bp->nb_flags, NB_BUSY)) + panic("nfs_buf_write: buffer is not busy???"); s = splbio(); - CLR(bp->b_flags, (B_READ|B_DONE|B_ERROR|B_DELWRI)); + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + if (ISSET(oldflags, NB_DELWRI)) { + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); + } - if (ISSET(oldflags, (B_ASYNC|B_DELWRI))) { - reassignbuf(bp, vp); + /* move to clean list */ + if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); } vp->v_numoutput++; 
- current_proc()->p_stats->p_ru.ru_oublock++; + if (p && p->p_stats) + p->p_stats->p_ru.ru_oublock++; splx(s); - - /* - * Since the B_BUSY flag is set, we need to lock the page before doing nfs_commit. - * Otherwise we may block and get a busy incore pages during a vm pageout. - * Move the existing code up before the commit. - */ - - if (!ISSET(bp->b_flags, B_META) && UBCISVALID(vp)) { - - if (!ISSET(bp->b_flags, B_PAGELIST)) { - - off_t file_offset = ubc_blktooff(vp, bp->b_lblkno); - - object = ubc_getobject(vp, (UBC_NOREACTIVATE|UBC_HOLDOBJECT)); - if (object == (void*)NULL) - panic("nfs_writebp: NULL vmobject"); - - if(bp->b_bufsize & 0xfff) - panic("nfs_writebp: list request is with less than 4k"); - - kret = vm_fault_list_request(object, (vm_object_offset_t)file_offset, - bp->b_bufsize, &upl, NULL, 0, - (int)(UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_SET_INTERNAL)); - if (kret != KERN_SUCCESS) { - panic("nfs_writebp: get pagelists failed with (%d)", kret); - } - -#ifdef UBC_DEBUG - upl_ubc_alias_set(pl, ioaddr, 2); -#endif /* UBC_DEBUG */ - - s = splbio(); - - pl = UPL_GET_INTERNAL_PAGE_LIST(upl); - bp->b_pagelist = upl; - SET(bp->b_flags, B_PAGELIST); - splx(s); - - kret = kernel_upl_map(kernel_map, upl, - (vm_address_t *)&(bp->b_data)); - if (kret != KERN_SUCCESS) { - panic("nfs_writebp: kernel_upl_map() failed with (%d)", kret); - } - if(bp->b_data == 0) - panic("nfs_writebp: upl_map mapped 0"); - if (!upl_page_present(pl, 0)) { - /* - * may be the page got paged out. - * let's just read it in. It is marked - * busy so we should not have any one - * yanking this page underneath the fileIO - */ - panic("nfs_writebp: nopage"); - } - } - } /* - * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be scheduled via. VOP_STRATEGY(). - * If B_WRITEINPROG is already set, then push it with a write anyhow. 
+ * For async requests when nfsiod(s) are running, queue the request by + * calling nfs_asyncio(), otherwise just all nfs_doio() to do the request. */ - if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; - SET(bp->b_flags, B_WRITEINPROG); - retv = nfs_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff, - bp->b_wcred, bp->b_proc); - CLR(bp->b_flags, B_WRITEINPROG); - if (!retv) { - bp->b_dirtyoff = bp->b_dirtyend = 0; - CLR(bp->b_flags, B_NEEDCOMMIT); - biodone(bp); /* on B_ASYNC will brelse the buffer */ - - } else if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); - } - if (retv) { - if (force) - SET(bp->b_flags, B_WRITEINPROG); - - VOP_STRATEGY(bp); - - } - - if( (oldflags & B_ASYNC) == 0) { - int rtval = biowait(bp); - - if (oldflags & B_DELWRI) { + if (ISSET(bp->nb_flags, NB_ASYNC)) + p = (struct proc *)0; + if (ISSET(bp->nb_flags, NB_READ)) + cr = bp->nb_rcred; + else + cr = bp->nb_wcred; + if (!ISSET(bp->nb_flags, NB_ASYNC) || nfs_asyncio(bp, NOCRED)) + rv = nfs_doio(bp, cr, p); + + if ((oldflags & NB_ASYNC) == 0) { + rv = nfs_buf_iowait(bp); + /* move to clean list */ + if (oldflags & NB_DELWRI) { s = splbio(); - reassignbuf(bp, vp); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); splx(s); } - brelse(bp); - return (rtval); + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, rv); + nfs_buf_release(bp); + return (rv); } - return (0); + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, rv); + return (rv); } /* @@ -3762,7 +4017,7 @@ nfsspec_access(ap) * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. 
*/ - if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { + if ((mode & VWRITE) && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); @@ -3810,13 +4065,15 @@ nfsspec_read(ap) } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set access flag. */ np->n_flag |= NACC; - np->n_atim.tv_sec = time.tv_sec; - np->n_atim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_atim.tv_sec = now.tv_sec; + np->n_atim.tv_nsec = now.tv_usec * 1000; return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); } @@ -3833,13 +4090,15 @@ nfsspec_write(ap) } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set update flag. */ np->n_flag |= NUPD; - np->n_mtim.tv_sec = time.tv_sec; - np->n_mtim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_mtim.tv_sec = now.tv_sec; + np->n_mtim.tv_nsec = now.tv_usec * 1000; return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); } @@ -3863,7 +4122,7 @@ nfsspec_close(ap) if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; - if (vp->v_usecount == 1 && + if (vp->v_usecount == 1 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) @@ -3890,13 +4149,15 @@ nfsfifo_read(ap) { extern vop_t **fifo_vnodeop_p; register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set access flag. */ np->n_flag |= NACC; - np->n_atim.tv_sec = time.tv_sec; - np->n_atim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_atim.tv_sec = now.tv_sec; + np->n_atim.tv_nsec = now.tv_usec * 1000; return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); } @@ -3914,13 +4175,15 @@ nfsfifo_write(ap) { extern vop_t **fifo_vnodeop_p; register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set update flag. 
*/ np->n_flag |= NUPD; - np->n_mtim.tv_sec = time.tv_sec; - np->n_mtim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_mtim.tv_sec = now.tv_sec; + np->n_mtim.tv_nsec = now.tv_usec * 1000; return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); } @@ -3941,19 +4204,21 @@ nfsfifo_close(ap) register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; + struct timeval now; extern vop_t **fifo_vnodeop_p; if (np->n_flag & (NACC | NUPD)) { + microtime(&now); if (np->n_flag & NACC) { - np->n_atim.tv_sec = time.tv_sec; - np->n_atim.tv_nsec = time.tv_usec * 1000; + np->n_atim.tv_sec = now.tv_sec; + np->n_atim.tv_nsec = now.tv_usec * 1000; } if (np->n_flag & NUPD) { - np->n_mtim.tv_sec = time.tv_sec; - np->n_mtim.tv_nsec = time.tv_usec * 1000; + np->n_mtim.tv_sec = now.tv_sec; + np->n_mtim.tv_nsec = now.tv_usec * 1000; } np->n_flag |= NCHG; - if (vp->v_usecount == 1 && + if (vp->v_usecount == 1 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) @@ -3989,7 +4254,6 @@ nfs_select(ap) return (1); } -/* XXX Eliminate use of struct bp here */ /* * Vnode op for pagein using getblk_pages * derived from nfs_bioread() @@ -4014,232 +4278,122 @@ nfs_pagein(ap) vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; struct ucred *cred; - register struct nfsnode *np = VTONFS(vp); - register int biosize; - register int xsize; + struct nfsnode *np = VTONFS(vp); + int biosize, xsize, iosize; struct vattr vattr; struct proc *p = current_proc(); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; int error = 0; vm_offset_t ioaddr; struct uio auio; struct iovec aiov; struct uio * uio = &auio; - int nocommit = flags & UPL_NOCOMMIT; + int nofreeupl = flags & UPL_NOCOMMIT; + upl_page_info_t *plinfo; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 322)) | DBG_FUNC_NONE, - (int)f_offset, size, pl, pl_offset, 0); + FSDBG(322, vp, f_offset, size, flags); + if (pl == (upl_t)NULL) + 
panic("nfs_pagein: no upl"); if (UBCINVALID(vp)) { -#if DIAGNOSTIC - panic("nfs_pagein: invalid vp"); -#endif /* DIAGNOSTIC */ + printf("nfs_pagein: invalid vnode 0x%x", (int)vp); + if (!nofreeupl) + (void) ubc_upl_abort(pl, NULL); return (EPERM); } - UBCINFOCHECK("nfs_pagein", vp); - if(pl == (upl_t)NULL) { - panic("nfs_pagein: no upl"); - } - cred = ubc_getcred(vp); - if (cred == NOCRED) - cred = ap->a_cred; - - if (size <= 0) + if (size <= 0) { + printf("nfs_pagein: invalid size %d", size); + if (!nofreeupl) + (void) ubc_upl_abort(pl, NULL); return (EINVAL); - - if (f_offset < 0 || f_offset >= np->n_size - || (f_offset & PAGE_MASK_64)) { - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, size, + } + if (f_offset < 0 || f_offset >= np->n_size || (f_offset & PAGE_MASK_64)) { + if (!nofreeupl) + ubc_upl_abort_range(pl, pl_offset, size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); return (EINVAL); } + cred = ubc_getcred(vp); + if (cred == NOCRED) + cred = ap->a_cred; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; auio.uio_offset = f_offset; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = NULL; - - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) - (void)nfs_fsinfo(nmp, vp, cred, p); - biosize = min(vp->v_mount->mnt_stat.f_iosize, size); - - if (biosize & PAGE_MASK) - panic("nfs_pagein(%x): biosize not page aligned", biosize); - -#if 0 /* Why bother? */ -/* DO NOT BOTHER WITH "approximately maintained cache consistency" */ -/* Does not make sense in paging paths -- Umesh*/ - /* - * For nfs, cache consistency can only be maintained approximately. - * Although RFC1094 does not specify the criteria, the following is - * believed to be compatible with the reference port. - * For nqnfs, full cache consistency is maintained within the loop. 
- * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. - * NB: This implies that cache data can be read when up to - * NFS_ATTRTIMEO seconds out of date. If you find that you need current - * attributes this could be forced by setting n_attrstamp to 0 before - * the VOP_GETATTR() call. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { - if (np->n_flag & NMODIFIED) { - np->n_attrstamp = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) { - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, - size, - UPL_ABORT_ERROR | - UPL_ABORT_FREE_ON_EMPTY); - return (error); - } - np->n_mtime = vattr.va_mtime.tv_sec; - } else { - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error){ - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_ERROR | - UPL_ABORT_FREE_ON_EMPTY); - return (error); - } - if (np->n_mtime != vattr.va_mtime.tv_sec) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error){ - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_ERROR | - UPL_ABORT_FREE_ON_EMPTY); - return (error); - } - np->n_mtime = vattr.va_mtime.tv_sec; - } - } + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + if (!nofreeupl) + ubc_upl_abort_range(pl, pl_offset, size, + UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + return (ENXIO); } -#endif 0 /* Why bother? 
*/ + if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO)) + (void)nfs_fsinfo(nmp, vp, cred, p); + biosize = vp->v_mount->mnt_stat.f_iosize; - kernel_upl_map(kernel_map, pl, &ioaddr); + plinfo = ubc_upl_pageinfo(pl); + ubc_upl_map(pl, &ioaddr); ioaddr += pl_offset; xsize = size; do { - uio->uio_resid = min(biosize, xsize); - aiov.iov_len = uio->uio_resid; - aiov.iov_base = (caddr_t)ioaddr; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 322)) | DBG_FUNC_NONE, - (int)uio->uio_offset, uio->uio_resid, ioaddr, xsize, 0); - -#warning nfs_pagein does not support NQNFS yet. -#if 0 /* why bother? */ -/* NO RESOURCES TO FIX NQNFS CASE */ -/* We need to deal with this later -- Umesh */ /* - * Get a valid lease. If cached data is stale, flush it. + * It would be nice to be able to issue all these requests + * in parallel instead of waiting for each one to complete + * before sending the next one. + * XXX Should we align these requests to block boundaries? */ - if (nmp->nm_flag & NFSMNT_NQNFS) { - if (NQNFS_CKINVALID(vp, np, ND_READ)) { - do { - error = nqnfs_getlease(vp, ND_READ, cred, p); - } while (error == NQNFS_EXPIRED); - if (error){ - kernel_upl_unmap(kernel_map, pl); - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, - size ,UPL_ABORT_ERROR | - UPL_ABORT_FREE_ON_EMPTY); - - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - kernel_upl_unmap(kernel_map, pl); - if (!nocommit) - kernel_upl_abort_range(pl, - pl_offset,size , - UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); - return (error); - } - np->n_brev = np->n_lrev; - } - } - } -#endif 0 /* why bother? 
*/ - - if (np->n_flag & NQNFSNONCACHE) { - error = nfs_readrpc(vp, uio, cred); - kernel_upl_unmap(kernel_map, pl); - - if (!nocommit) { - if(error) - kernel_upl_abort_range(pl, pl_offset, size , - UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); - else - kernel_upl_commit_range(pl, - pl_offset, size, - UPL_COMMIT_CLEAR_DIRTY - | UPL_COMMIT_FREE_ON_EMPTY, - UPL_GET_INTERNAL_PAGE_LIST(pl), - MAX_UPL_TRANSFER); - } - return (error); - } + iosize = min(biosize, xsize); + uio->uio_resid = iosize; + aiov.iov_len = iosize; + aiov.iov_base = (caddr_t)ioaddr; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + FSDBG(322, uio->uio_offset, uio->uio_resid, ioaddr, xsize); +// XXX #warning our nfs_pagein does not support NQNFS /* * With UBC we get here only when the file data is not in the VM * page cache, so go ahead and read in. */ #ifdef UBC_DEBUG - upl_ubc_alias_set(pl, ioaddr, 2); + upl_ubc_alias_set(pl, current_act(), 2); #endif /* UBC_DEBUG */ nfsstats.pageins++; + error = nfs_readrpc(vp, uio, cred); if (!error) { - int zoff; - int zcnt; - if (uio->uio_resid) { /* - * If uio_resid > 0, there is a hole in the file and - * no writes after the hole have been pushed to - * the server yet... or we're at the EOF + * If uio_resid > 0, there is a hole in the file + * and no writes after the hole have been pushed + * to the server yet... or we're at the EOF * Just zero fill the rest of the valid area. 
*/ - zcnt = uio->uio_resid; - zoff = biosize - zcnt; + int zcnt = uio->uio_resid; + int zoff = iosize - zcnt; bzero((char *)ioaddr + zoff, zcnt); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 324)) | DBG_FUNC_NONE, - (int)uio->uio_offset, zoff, zcnt, ioaddr, 0); - + FSDBG(324, uio->uio_offset, zoff, zcnt, ioaddr); uio->uio_offset += zcnt; } - ioaddr += biosize; - xsize -= biosize; + ioaddr += iosize; + xsize -= iosize; } else - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 322)) | DBG_FUNC_NONE, - (int)uio->uio_offset, uio->uio_resid, error, -1, 0); - - if (p && (vp->v_flag & VTEXT) && - (((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_READ) && - np->n_lrev != np->n_brev) || - (!(nmp->nm_flag & NFSMNT_NQNFS) && - np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { + FSDBG(322, uio->uio_offset, uio->uio_resid, error, -1); + + nmp = VFSTONFS(vp->v_mount); + if (p && (vp->v_flag & VTEXT) && nmp && + ((nmp->nm_flag & NFSMNT_NQNFS && + NQNFS_CKINVALID(vp, np, ND_READ) && + np->n_lrev != np->n_brev) || + (!(nmp->nm_flag & NFSMNT_NQNFS) && + np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { uprintf("Process killed due to text file modification\n"); psignal(p, SIGKILL); p->p_flag |= P_NOSWAP; @@ -4247,23 +4401,22 @@ nfs_pagein(ap) } while (error == 0 && xsize > 0); - kernel_upl_unmap(kernel_map, pl); + ubc_upl_unmap(pl); - if (!nocommit) { + if (!nofreeupl) { if (error) - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(pl, pl_offset, size, + UPL_ABORT_ERROR | + UPL_ABORT_FREE_ON_EMPTY); else - kernel_upl_commit_range(pl, pl_offset, size, - UPL_COMMIT_CLEAR_DIRTY - | UPL_COMMIT_FREE_ON_EMPTY, - UPL_GET_INTERNAL_PAGE_LIST(pl), - MAX_UPL_TRANSFER); + ubc_upl_commit_range(pl, pl_offset, size, + UPL_COMMIT_CLEAR_DIRTY | + UPL_COMMIT_FREE_ON_EMPTY); } - return (error); } + /* * Vnode op for pageout using UPL * Derived from nfs_write() @@ -4288,75 +4441,120 @@ nfs_pageout(ap) vm_offset_t pl_offset = ap->a_pl_offset; int flags = 
ap->a_flags; int ioflag = ap->a_flags; - register int biosize; struct proc *p = current_proc(); struct nfsnode *np = VTONFS(vp); register struct ucred *cred; - struct buf *bp; + struct nfsbuf *bp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; - int bufsize; int n = 0, on, error = 0, iomode, must_commit, s; off_t off; vm_offset_t ioaddr; struct uio auio; struct iovec aiov; - struct uio * uio = &auio; - int nocommit = flags & UPL_NOCOMMIT; - int iosize; - int pgsize; + int nofreeupl = flags & UPL_NOCOMMIT; + int biosize, iosize, pgsize, xsize; + + FSDBG(323, f_offset, size, pl, pl_offset); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 323)) | DBG_FUNC_NONE, - (int)f_offset, size, pl, pl_offset, 0); + if (pl == (upl_t)NULL) + panic("nfs_pageout: no upl"); if (UBCINVALID(vp)) { -#if DIAGNOSTIC - panic("nfs_pageout: invalid vnode"); -#endif + printf("nfs_pageout: invalid vnode 0x%x", (int)vp); + if (!nofreeupl) + ubc_upl_abort(pl, 0); return (EIO); } UBCINFOCHECK("nfs_pageout", vp); - if (size <= 0) + if (size <= 0) { + printf("nfs_pageout: invalid size %d", size); + if (!nofreeupl) + ubc_upl_abort(pl, 0); return (EINVAL); - - if (pl == (upl_t)NULL) { - panic("nfs_pageout: no upl"); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. 
- */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, size); - - if (biosize & PAGE_MASK) - panic("nfs_pageout(%x): biosize not page aligned", biosize); - + if (!nmp) { + if (!nofreeupl) + ubc_upl_abort(pl, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); + return (ENXIO); + } + biosize = vp->v_mount->mnt_stat.f_iosize; /* - * Check to see whether the buffer is incore - * If incore and not busy invalidate it from the cache - * we should not find it BUSY, since we always do a - * vm_fault_list_request in 'getblk' before returning - * which would block on the page busy status + * Check to see whether the buffer is incore. + * If incore and not busy, invalidate it from the cache. */ - lbn = f_offset / PAGE_SIZE; /* to match the size getblk uses */ - - for (iosize = size; iosize > 0; iosize -= PAGE_SIZE, lbn++) { - + for (iosize = 0; iosize < size; iosize += xsize) { + off = f_offset + iosize; + /* need make sure we do things on block boundaries */ + xsize = biosize - (off % biosize); + if (off + xsize > f_offset + size) + xsize = f_offset + size - off; + lbn = ubc_offtoblk(vp, off); s = splbio(); - if (bp = incore(vp, lbn)) { - if (ISSET(bp->b_flags, B_BUSY)) { - /* don't panic incore. just tell vm we are busy */ - (void) kernel_upl_abort(pl, NULL); - return(EBUSY); - }; - - bremfree(bp); - SET(bp->b_flags, (B_BUSY | B_INVAL)); - brelse(bp); + if (bp = nfs_buf_incore(vp, lbn)) { + FSDBG(323, off, 1, bp, bp->nb_flags); + if (ISSET(bp->nb_flags, NB_BUSY)) { + /* no panic. just tell vm we are busy */ + if (!nofreeupl) + ubc_upl_abort(pl, 0); + return (EBUSY); + } + if (bp->nb_dirtyend > 0) { + /* + * if there's a dirty range in the buffer, check to + * see if it extends beyond the pageout region + * + * if the dirty region lies completely within the + * pageout region, we just invalidate the buffer + * because it's all being written out now anyway. 
+ * + * if any of the dirty region lies outside the + * pageout region, we'll try to clip the dirty + * region to eliminate the portion that's being + * paged out. If that's not possible, because + * the dirty region extends before and after the + * pageout region, then we'll just return EBUSY. + */ + off_t boff, start, end; + boff = NBOFF(bp); + start = off; + end = off + xsize; + /* clip end to EOF */ + if (end > np->n_size) + end = np->n_size; + start -= boff; + end -= boff; + if ((bp->nb_dirtyoff < start) && + (bp->nb_dirtyend > end)) { + /* not gonna be able to clip the dirty region */ + FSDBG(323, vp, bp, 0xd00deebc, EBUSY); + if (!nofreeupl) + ubc_upl_abort(pl, 0); + return (EBUSY); + } + if ((bp->nb_dirtyoff < start) || + (bp->nb_dirtyend > end)) { + /* clip dirty region, if necessary */ + if (bp->nb_dirtyoff < start) + bp->nb_dirtyend = min(bp->nb_dirtyend, start); + if (bp->nb_dirtyend > end) + bp->nb_dirtyoff = max(bp->nb_dirtyoff, end); + FSDBG(323, bp, bp->nb_dirtyoff, bp->nb_dirtyend, 0xd00dee00); + /* we're leaving this block dirty */ + continue; + } + } + nfs_buf_remfree(bp); + SET(bp->nb_flags, (NB_BUSY | NB_INVAL)); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + CLR(bp->nb_flags, NB_NEEDCOMMIT); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + nfs_buf_release(bp); } splx(s); } @@ -4367,216 +4565,153 @@ nfs_pageout(ap) if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_FREE_ON_EMPTY); + if (!nofreeupl) + ubc_upl_abort_range(pl, pl_offset, size, + UPL_ABORT_FREE_ON_EMPTY); return (np->n_error); } - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); if (f_offset < 0 || f_offset >= np->n_size || - (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) { - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, size, - 
UPL_ABORT_FREE_ON_EMPTY); + f_offset & PAGE_MASK_64 || size & PAGE_MASK_64) { + if (!nofreeupl) + ubc_upl_abort_range(pl, pl_offset, size, + UPL_ABORT_FREE_ON_EMPTY); return (EINVAL); } - kernel_upl_map(kernel_map, pl, &ioaddr); + ubc_upl_map(pl, &ioaddr); + ioaddr += pl_offset; - if ((f_offset + size) > np->n_size) - iosize = np->n_size - f_offset; + if (f_offset + size > np->n_size) + xsize = np->n_size - f_offset; else - iosize = size; - - pgsize = (iosize + (PAGE_SIZE - 1)) & ~PAGE_MASK; + xsize = size; + pgsize = round_page_64(xsize); if (size > pgsize) { - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset + pgsize, size - pgsize, - UPL_ABORT_FREE_ON_EMPTY); + if (!nofreeupl) + ubc_upl_abort_range(pl, pl_offset + pgsize, + size - pgsize, + UPL_ABORT_FREE_ON_EMPTY); } - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = f_offset; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_resid = iosize; - auio.uio_procp = NULL; - - aiov.iov_len = iosize; - aiov.iov_base = (caddr_t)ioaddr + pl_offset; /* * check for partial page and clear the * contents past end of the file before * releasing it in the VM page cache */ - if ((f_offset < np->n_size) && (f_offset + size) > np->n_size) { + if (f_offset < np->n_size && f_offset + size > np->n_size) { size_t io = np->n_size - f_offset; - - bzero((caddr_t)(ioaddr + pl_offset + io), size - io); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 321)) | DBG_FUNC_NONE, - (int)np->n_size, (int)f_offset, (int)f_offset + io, size - io, 0); + bzero((caddr_t)(ioaddr + io), size - io); + FSDBG(321, np->n_size, f_offset, f_offset + io, size - io); } - do { - -#warning nfs_pageout does not support NQNFS yet. -#if 0 /* why bother? */ -/* NO RESOURCES TO FIX NQNFS CASE */ -/* We need to deal with this later -- Umesh */ + auio.uio_offset = f_offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = NULL; + do { /* - * Check for a valid write lease. 
+ * It would be nice to be able to issue all these requests + * in parallel instead of waiting for each one to complete + * before sending the next one. + * XXX Should we align these requests to block boundaries? */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - kernel_upl_unmap(kernel_map, pl); - if (!nocommit) - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_FREE_ON_EMPTY); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - kernel_upl_unmap(kernel_map, pl); - if (!nocommit) - kernel_upl_abort_range(pl, - pl_offset, size, - UPL_ABORT_FREE_ON_EMPTY); - return (error); - } - np->n_brev = np->n_lrev; - } - } -#endif 0 /* why bother? */ - - if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { - iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - kernel_upl_unmap(kernel_map, pl); - - /* see comments below after other nfs_writerpc and ESTALE */ - if (error == ESTALE) { - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); - } else { - if (!nocommit) { - if(error) - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_FREE_ON_EMPTY); - else - kernel_upl_commit_range(pl, - pl_offset, size, - UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY, - UPL_GET_INTERNAL_PAGE_LIST(pl), MAX_UPL_TRANSFER); - } - } - return (error); - } - nfsstats.pageouts++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); - n = min((unsigned)(biosize - on), uio->uio_resid); -again: - bufsize = biosize; -#if 0 - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } -#endif - vp->v_numoutput++; - - 
np->n_flag |= NMODIFIED; + iosize = min(biosize, xsize); + auio.uio_resid = iosize; + aiov.iov_len = iosize; + aiov.iov_base = (caddr_t)ioaddr; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; -#if 0 /* why bother? */ -/* NO RESOURCES TO FIX NQNFS CASE */ -/* We need to deal with this later -- Umesh */ - /* - * Check for valid write lease and get one as required. - * In case getblk() and/or bwrite() delayed us. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) - goto cleanup; - - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - kernel_upl_unmap(kernel_map, pl); - if (!nocommit) - kernel_upl_abort_range(pl, - pl_offset, - size, - UPL_ABORT_FREE_ON_EMPTY); - - return (error); - } - np->n_brev = np->n_lrev; - goto again; - } - } -#endif 0 /* why bother? */ + FSDBG(323, auio.uio_offset, auio.uio_resid, ioaddr, xsize); +// XXX #warning our nfs_pageout does not support NQNFS + nfsstats.pageouts++; + vp->v_numoutput++; + /* NMODIFIED would be set here if doing unstable writes */ iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); + error = nfs_writerpc(vp, &auio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); - vp->v_numoutput--; - + vpwakeup(vp); if (error) goto cleanup; - - if (n > 0) { - uio->uio_resid -= n; - uio->uio_offset += n; - uio->uio_iov->iov_base += n; - uio->uio_iov->iov_len -= n; - } - } while (uio->uio_resid > 0 && n > 0); + /* Note: no need to check uio_resid, because */ + /* it'll only be set if there was an error. */ + ioaddr += iosize; + xsize -= iosize; + } while (xsize > 0); cleanup: - kernel_upl_unmap(kernel_map, pl); - /* - * EStale is special. In this case, we want vm to dump out - * the pages. 
Better yet, sever the object so we don't come - * back here on each page of the object to page out. For now, - * just dump. - * XXX What about !nocommit case? Should ESTALE only be checked - * in that portion? - EKN - */ - if (error == ESTALE) { - kernel_upl_abort_range(pl, pl_offset, size, - UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); - } else { - if (!nocommit) { - if(error) - kernel_upl_abort_range(pl, pl_offset, pgsize, - UPL_ABORT_FREE_ON_EMPTY); - else - kernel_upl_commit_range(pl, pl_offset, pgsize, - UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY, - UPL_GET_INTERNAL_PAGE_LIST(pl), MAX_UPL_TRANSFER); - } + ubc_upl_unmap(pl); + /* + * We've had several different solutions on what to do when the pageout + * gets an error. If we don't handle it, and return an error to the + * caller, vm, it will retry. This can end in endless looping + * between vm and here doing retries of the same page. Doing a dump + * back to vm, will get it out of vm's knowledge and we lose whatever + * data existed. This is risky, but in some cases necessary. For + * example, the initial fix here was to do that for ESTALE. In that case + * the server is telling us that the file is no longer the same. We + * would not want to keep paging out to that. We also saw some 151 + * errors from Auspex server and NFSv3 can return errors higher than + * ELAST. Those along with NFS known server errors we will "dump" from + * vm. Errors we don't expect to occur, we dump and log for further + * analysis. Errors that could be transient, networking ones, + * we let vm "retry". Lastly, errors that we retry, but may have potential + * to storm the network, we "retrywithsleep". "sever" will be used + * in the future to dump all pages of object for cases like ESTALE. + * All this is the basis for the states returned and first guesses on + * error handling. Tweaking expected as more statistics are gathered.
+ * Note, in the long run we may need another more robust solution to + * have some kind of persistent store when the vm cannot dump nor keep + * retrying as a solution, but this would be a file architectural change + */ + + if (!nofreeupl) { /* otherwise stacked file system has to handle this */ + if (error) { + int abortflags; + short action = nfs_pageouterrorhandler(error); + + switch (action) { + case DUMP: + abortflags = UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY; + break; + case DUMPANDLOG: + abortflags = UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY; + if (error <= ELAST && + (errorcount[error] % 100 == 0)) + printf("nfs_pageout: unexpected error %d. dumping vm page\n", error); + errorcount[error]++; + break; + case RETRY: + abortflags = UPL_ABORT_FREE_ON_EMPTY; + break; + case RETRYWITHSLEEP: + abortflags = UPL_ABORT_FREE_ON_EMPTY; + /* pri unused. PSOCK for placeholder. */ + (void) tsleep(&lbolt, PSOCK, + "nfspageout", 0); + break; + case SEVER: /* not implemented */ + default: + printf("nfs_pageout: action %d not expected\n", action); + break; + } + + ubc_upl_abort_range(pl, pl_offset, size, abortflags); + /* return error in all cases above */ + + } else + ubc_upl_commit_range(pl, pl_offset, pgsize, + UPL_COMMIT_CLEAR_DIRTY | + UPL_COMMIT_FREE_ON_EMPTY); } - return (error); } @@ -4592,14 +4727,16 @@ nfs_blktooff(ap) int biosize; register struct vnode *vp = ap->a_vp; - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */ + if (!vp->v_mount) + return (ENXIO); - *ap->a_offset = (off_t)(ap->a_lblkno * biosize); + biosize = vp->v_mount->mnt_stat.f_iosize; + + *ap->a_offset = (off_t)ap->a_lblkno * biosize; return (0); } -/* Blktooff derives file offset given a logical block number */ static int nfs_offtoblk(ap) struct vop_offtoblk_args /* { @@ -4611,9 +4748,12 @@ nfs_offtoblk(ap) int biosize; register struct vnode *vp = ap->a_vp; - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */ + if (!vp->v_mount) + return
(ENXIO); + + biosize = vp->v_mount->mnt_stat.f_iosize; - *ap->a_lblkno = (daddr_t)(ap->a_offset / biosize); + *ap->a_lblkno = (daddr_t)(ap->a_offset / biosize); return (0); }