X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/1c79356b52d46aa6b508fb032f5ae709b1f2897b..5eebf7385fedb1517b66b53c28e5aa6bb0a2be50:/bsd/nfs/nfs_socket.c diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index 22d5a17ba..ff2f55066 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -81,6 +81,9 @@ #include #include +#include +#include +#include #include #include @@ -95,6 +98,18 @@ #include #include +#include + +#define FSDBG(A, B, C, D, E) \ + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \ + (int)(B), (int)(C), (int)(D), (int)(E), 0) +#define FSDBG_TOP(A, B, C, D, E) \ + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \ + (int)(B), (int)(C), (int)(D), (int)(E), 0) +#define FSDBG_BOT(A, B, C, D, E) \ + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ + (int)(B), (int)(C), (int)(D), (int)(E), 0) + #define TRUE 1 #define FALSE 0 @@ -128,6 +143,7 @@ extern time_t nqnfsstarttime; extern struct nfsstats nfsstats; extern int nfsv3_procid[NFS_NPROCS]; extern int nfs_ticks; +extern u_long nfs_xidwrap; /* * Defines which timer to use for the procnum. @@ -161,12 +177,20 @@ static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; int nfsrtton = 0; struct nfsrtt nfsrtt; -static int nfs_msg __P((struct proc *,char *,char *)); +static int nfs_msg __P((struct proc *, const char *, const char *, int)); static int nfs_rcvlock __P((struct nfsreq *)); -static void nfs_rcvunlock __P((int *flagp)); +static void nfs_rcvunlock __P((struct nfsreq *)); static int nfs_receive __P((struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)); static int nfs_reconnect __P((struct nfsreq *rep)); +static void nfs_repbusy(struct nfsreq *rep); +static struct nfsreq * nfs_repnext(struct nfsreq *rep); +static void nfs_repdequeue(struct nfsreq *rep); + +/* XXX */ +boolean_t current_thread_aborted(void); +kern_return_t thread_terminate(thread_act_t); + #ifndef NFS_NOSERVER static int nfsrv_getstream __P((struct nfssvc_sock *,int)); @@ -203,6 +227,11 @@ int (*nfsrv3_procs[NFS_NPROCS]) __P((struct nfsrv_descript *nd, }; #endif /* NFS_NOSERVER */ +/* + * NFSTRACE points were changed to FSDBG (KERNEL_DEBUG) + * But some of this code may prove useful someday... + */ +#undef NFSDIAG #if NFSDIAG int nfstraceindx = 0; struct nfstracerec nfstracebuf[NFSTBUFSIZ] = {{0,0,0,0}}; @@ -322,20 +351,139 @@ nfsdup(struct nfsreq *rep) } #endif /* NFSDIAG */ + +/* + * attempt to bind a socket to a reserved port + */ +static int +nfs_bind_resv(struct nfsmount *nmp) +{ + struct socket *so = nmp->nm_so; + struct sockaddr_in sin; + int error; + u_short tport; + + if (!so) + return (EINVAL); + + sin.sin_len = sizeof (struct sockaddr_in); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + tport = IPPORT_RESERVED - 1; + sin.sin_port = htons(tport); + + while (((error = sobind(so, (struct sockaddr *) &sin)) == EADDRINUSE) && + (--tport > IPPORT_RESERVED / 2)) + sin.sin_port = htons(tport); + return (error); +} + +/* + * variables for managing the nfs_bind_resv_thread + */ +int nfs_resv_mounts = 0; +static int nfs_bind_resv_thread_state = 0; +#define NFS_BIND_RESV_THREAD_STATE_INITTED 1 +#define NFS_BIND_RESV_THREAD_STATE_RUNNING 2 +static struct slock nfs_bind_resv_slock; +struct nfs_bind_resv_request { + TAILQ_ENTRY(nfs_bind_resv_request) brr_chain; + struct nfsmount *brr_nmp; + int brr_error; +}; +static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue; + +/* + * thread to handle any reserved port bind requests + */ +static void +nfs_bind_resv_thread(void) +{ + struct nfs_bind_resv_request *brreq; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(network_flock, TRUE); + nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING; + + while (nfs_resv_mounts > 0) { + simple_lock(&nfs_bind_resv_slock); + while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) { + TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain); + simple_unlock(&nfs_bind_resv_slock); + brreq->brr_error = nfs_bind_resv(brreq->brr_nmp); + wakeup(brreq); + simple_lock(&nfs_bind_resv_slock); + } + simple_unlock(&nfs_bind_resv_slock); + (void)tsleep((caddr_t)&nfs_bind_resv_request_queue, PSOCK, + "nfs_bind_resv_request_queue", 0); + } + + nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED; + (void) thread_funnel_set(network_flock, funnel_state); + (void) thread_terminate(current_act()); +} + +int +nfs_bind_resv_thread_wake(void) +{ + if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) + return (EIO); + wakeup(&nfs_bind_resv_request_queue); + return (0); +} + +/* + * underprivileged procs call this to request nfs_bind_resv_thread + * to perform the reserved port binding for them. + */ +static int +nfs_bind_resv_nopriv(struct nfsmount *nmp) +{ + struct nfs_bind_resv_request brreq; + int error; + + if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) { + if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) { + simple_lock_init(&nfs_bind_resv_slock); + TAILQ_INIT(&nfs_bind_resv_request_queue); + nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED; + } + kernel_thread(kernel_task, nfs_bind_resv_thread); + nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING; + } + + brreq.brr_nmp = nmp; + brreq.brr_error = 0; + + simple_lock(&nfs_bind_resv_slock); + TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain); + simple_unlock(&nfs_bind_resv_slock); + + error = nfs_bind_resv_thread_wake(); + if (error) { + TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain); + /* Note: we might be able to simply restart the thread */ + return (error); + } + + (void) tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0); + + return (brreq.brr_error); +} + /* * Initialize sockets and congestion for a new NFS connection. * We do not free the sockaddr if error. */ int nfs_connect(nmp, rep) - register struct nfsmount *nmp; + struct nfsmount *nmp; struct nfsreq *rep; { - register struct socket *so; + struct socket *so; int s, error, rcvreserve, sndreserve; struct sockaddr *saddr; - struct sockaddr_in sin; - u_short tport; thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); nmp->nm_so = (struct socket *)0; @@ -352,18 +500,22 @@ nfs_connect(nmp, rep) * Some servers require that the client port be a reserved port number. */ if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { - sin.sin_len = sizeof (struct sockaddr_in); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = INADDR_ANY; - tport = IPPORT_RESERVED - 1; - sin.sin_port = htons(tport); - - while ((error = sobind(so, (struct sockaddr *) &sin) == EADDRINUSE) && - (--tport > IPPORT_RESERVED / 2)) - sin.sin_port = htons(tport); - if (error) { - goto bad; + struct proc *p; + /* + * sobind() requires current_proc() to have superuser privs. + * If this bind is part of a reconnect, and the current proc + * doesn't have superuser privs, we hand the sobind() off to + * a kernel thread to process. + */ + if ((nmp->nm_state & NFSSTA_MOUNTED) && + (p = current_proc()) && suser(p->p_ucred, &p->p_acflag)) { + /* request nfs_bind_resv_thread() to do bind */ + error = nfs_bind_resv_nopriv(nmp); + } else { + error = nfs_bind_resv(nmp); } + if (error) + goto bad; } /* @@ -406,19 +558,24 @@ nfs_connect(nmp, rep) } splx(s); } + /* + * Always time out on recieve, this allows us to reconnect the + * socket to deal with network changes. + */ + so->so_rcv.sb_timeo = (2 * hz); if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { - so->so_rcv.sb_timeo = (5 * hz); so->so_snd.sb_timeo = (5 * hz); } else { - so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; } if (nmp->nm_sotype == SOCK_DGRAM) { - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * + (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2); } else if (nmp->nm_sotype == SOCK_SEQPACKET) { - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * + (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2); } else { if (nmp->nm_sotype != SOCK_STREAM) panic("nfscon sotype"); @@ -428,6 +585,7 @@ nfs_connect(nmp, rep) int val; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &val; @@ -440,6 +598,7 @@ nfs_connect(nmp, rep) int val; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &val; @@ -448,12 +607,15 @@ nfs_connect(nmp, rep) sosetopt(so, &sopt); } - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) - * 2; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) - * 2; + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) * + (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2); } + if (sndreserve > NFS_MAXSOCKBUF) + sndreserve = NFS_MAXSOCKBUF; + if (rcvreserve > NFS_MAXSOCKBUF) + rcvreserve = NFS_MAXSOCKBUF; error = soreserve(so, sndreserve, rcvreserve); if (error) { goto bad; @@ -470,8 +632,7 @@ nfs_connect(nmp, rep) nmp->nm_sdrtt[3] = 0; nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ nmp->nm_sent = 0; - NFSTRACE4(NFSTRC_CWND_INIT, nmp, nmp->nm_flag, nmp->nm_soflags, - nmp->nm_cwnd); + FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd); nmp->nm_timeouts = 0; return (0); @@ -502,6 +663,17 @@ nfs_reconnect(rep) while ((error = nfs_connect(nmp, rep))) { if (error == EINTR || error == ERESTART) return (EINTR); + if (error == EIO) + return (EIO); + nfs_down(rep, rep->r_nmp, rep->r_procp, "can not connect", + error, NFSSTA_TIMEO); + if (!(nmp->nm_state & NFSSTA_MOUNTED)) { + /* we're not yet completely mounted and */ + /* we can't reconnect, so we fail */ + return (error); + } + if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) + return (error); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } @@ -510,7 +682,7 @@ nfs_reconnect(rep) * Loop through outstanding request list and fix up all requests * on old socket. */ - for (rp = nfs_reqq.tqh_first; rp != 0; rp = rp->r_chain.tqe_next) { + TAILQ_FOREACH(rp, &nfs_reqq, r_chain) { if (rp->r_nmp == nmp) rp->r_flags |= R_MUSTRESEND; } @@ -557,15 +729,16 @@ nfs_send(so, nam, top, rep) struct nfsreq *rep; { struct sockaddr *sendnam; - int error, soflags, flags; + int error, error2, soflags, flags; int xidqueued = 0; struct nfsreq *rp; char savenametolog[MNAMELEN]; if (rep) { - if (rep->r_flags & R_SOFTTERM) { + error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp); + if (error) { m_freem(top); - return (EINTR); + return (error); } if ((so = rep->r_nmp->nm_so) == NULL) { rep->r_flags |= R_MUSTRESEND; @@ -574,7 +747,7 @@ nfs_send(so, nam, top, rep) } rep->r_flags &= ~R_MUSTRESEND; soflags = rep->r_nmp->nm_soflags; - for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) + TAILQ_FOREACH(rp, &nfs_reqq, r_chain) if (rp == rep) break; if (rp) @@ -613,8 +786,7 @@ nfs_send(so, nam, top, rep) if (error) { if (rep) { if (xidqueued) { - for (rp = nfs_reqq.tqh_first; rp; - rp = rp->r_chain.tqe_next) + TAILQ_FOREACH(rp, &nfs_reqq, r_chain) if (rp == rep && rp->r_xid == xidqueued) break; if (!rp) @@ -626,9 +798,10 @@ nfs_send(so, nam, top, rep) /* * Deal with errors for the client side. */ - if (rep->r_flags & R_SOFTTERM) - error = EINTR; - else { + error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp); + if (error2) { + error = error2; + } else { rep->r_flags |= R_MUSTRESEND; NFS_DPF(DUP, ("nfs_send RESEND error=%d\n", error)); @@ -639,9 +812,10 @@ nfs_send(so, nam, top, rep) /* * Handle any recoverable (soft) socket errors here. (???) */ - if (error != EINTR && error != ERESTART && - error != EWOULDBLOCK && error != EPIPE) + if (error != EINTR && error != ERESTART && error != EIO && + error != EWOULDBLOCK && error != EPIPE) { error = 0; + } } return (error); } @@ -671,7 +845,7 @@ nfs_receive(rep, aname, mp) struct sockaddr *tmp_nam; struct mbuf *mhck; struct sockaddr_in *sin; - int error, sotype, rcvflg; + int error, error2, sotype, rcvflg; struct proc *p = current_proc(); /* XXX */ /* @@ -690,7 +864,7 @@ nfs_receive(rep, aname, mp) * until we have an entire rpc request/reply. */ if (sotype != SOCK_DGRAM) { - error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); + error = nfs_sndlock(rep); if (error) return (error); tryagain: @@ -703,15 +877,17 @@ tryagain: * attempt that has essentially shut down this * mount point. */ - if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { - nfs_sndunlock(&rep->r_nmp->nm_flag); + if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) { + nfs_sndunlock(rep); + if (error) + return (error); return (EINTR); } so = rep->r_nmp->nm_so; if (!so) { error = nfs_reconnect(rep); if (error) { - nfs_sndunlock(&rep->r_nmp->nm_flag); + nfs_sndunlock(rep); return (error); } goto tryagain; @@ -730,13 +906,13 @@ tryagain: if (error) { if (error == EINTR || error == ERESTART || (error = nfs_reconnect(rep))) { - nfs_sndunlock(&rep->r_nmp->nm_flag); + nfs_sndunlock(rep); return (error); } goto tryagain; } } - nfs_sndunlock(&rep->r_nmp->nm_flag); + nfs_sndunlock(rep); if (sotype == SOCK_STREAM) { aio.iov_base = (caddr_t) &len; aio.iov_len = sizeof(u_long); @@ -752,12 +928,13 @@ tryagain: thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); error = soreceive(so, (struct sockaddr **)0, &auio, (struct mbuf **)0, (struct mbuf **)0, &rcvflg); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); if (!rep->r_nmp) /* if unmounted then bailout */ goto shutout; if (error == EWOULDBLOCK && rep) { - if (rep->r_flags & R_SOFTTERM) - return (EINTR); + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) + error = error2; } } while (error == EWOULDBLOCK); if (!error && auio.uio_resid > 0) { @@ -820,19 +997,22 @@ tryagain: thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); do { + control = NULL; rcvflg = 0; error = soreceive(so, (struct sockaddr **)0, &auio, mp, &control, &rcvflg); + if (control) + m_freem(control); if (!rep->r_nmp) /* if unmounted then bailout */ { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto shutout; } - if (control) - m_freem(control); if (error == EWOULDBLOCK && rep) { - if (rep->r_flags & R_SOFTTERM) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (EINTR); + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) { + thread_funnel_switch(NETWORK_FUNNEL, + KERNEL_FUNNEL); + return (error2); } } } while (error == EWOULDBLOCK || @@ -855,15 +1035,31 @@ errout: "receive error %d from nfs server %s\n", error, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); - error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); - if (!error) + error = nfs_sndlock(rep); + if (!error) { error = nfs_reconnect(rep); - if (!error) - goto tryagain; + if (!error) + goto tryagain; + nfs_sndunlock(rep); + } } } else { - if ((so = rep->r_nmp->nm_so) == NULL) - return (EACCES); + /* + * We could have failed while rebinding the datagram socket + * so we need to attempt to rebind here. + */ + if ((so = rep->r_nmp->nm_so) == NULL) { + error = nfs_sndlock(rep); + if (!error) { + error = nfs_reconnect(rep); + nfs_sndunlock(rep); + } + if (error) + return (error); + if (!rep->r_nmp) /* if unmounted then bailout */ + return (ENXIO); + so = rep->r_nmp->nm_so; + } if (so->so_state & SS_ISCONNECTED) getnam = (struct sockaddr **)0; else @@ -886,18 +1082,44 @@ errout: FREE(*getnam, M_SONAME); *aname = mhck; } - if (!rep->r_nmp) /* if unmounted then bailout */ { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - goto shutout; - } - - if (error == EWOULDBLOCK && - (rep->r_flags & R_SOFTTERM)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (EINTR); + if (!rep->r_nmp) /* if unmounted then bailout */ + goto dgramout; + if (error) { + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) { + error = error2; + goto dgramout; + } + } + /* Reconnect for all errors. We may be receiving + * soft/hard/blocking errors because of a network + * change. + * XXX: we should rate limit or delay this + * to once every N attempts or something. + * although TCP doesn't seem to. + */ + if (error) { + thread_funnel_switch(NETWORK_FUNNEL, + KERNEL_FUNNEL); + error2 = nfs_sndlock(rep); + if (!error2) { + error2 = nfs_reconnect(rep); + if (error2) + error = error2; + else if (!rep->r_nmp) /* if unmounted then bailout */ + error = ENXIO; + else + so = rep->r_nmp->nm_so; + nfs_sndunlock(rep); + } else { + error = error2; + } + thread_funnel_switch(KERNEL_FUNNEL, + NETWORK_FUNNEL); } } while (error == EWOULDBLOCK); +dgramout: thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); len -= auio.uio_resid; } @@ -948,59 +1170,52 @@ nfs_reply(myrep) return (0); if (error) return (error); - - /* - * This is being checked after nfs_receive, but - * it doesn't hurt to check prior, since nfs_receive - * will dereference r_nmp also. Bullet-proofing code - * since changing funnels since the request to the - * receive can leave us vulnerable for kernel to unmount - * us. - */ - if (!myrep->r_nmp) { - NFSTRACE4(NFSTRC_ECONN, myrep->r_xid, myrep, nmp, 1); - return (ECONNABORTED); - } + /* * If we slept after putting bits otw, then reply may have * arrived. In which case returning is required, or we * would hang trying to nfs_receive an already received reply. */ if (myrep->r_mrep != NULL) { - nfs_rcvunlock(&nmp->nm_flag); - NFSTRACE4(NFSTRC_RCVALREADY, myrep->r_xid, myrep, - myrep->r_nmp, 2); + nfs_rcvunlock(myrep); + FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1); return (0); } /* - * Get the next Rpc reply off the socket + * Get the next Rpc reply off the socket. Assume myrep->r_nmp + * is still intact by checks done in nfs_rcvlock. */ + /* XXX why do we ask for nam here? we don't use it! */ error = nfs_receive(myrep, &nam, &mrep); + if (nam) + m_freem(nam); /* - * Bailout asap if nfsmount struct gone (unmounted) + * Bailout asap if nfsmount struct gone (unmounted). */ if (!myrep->r_nmp) { - NFSTRACE4(NFSTRC_ECONN, myrep->r_xid, myrep, nmp, 2); - return (ECONNABORTED); + FSDBG(530, myrep->r_xid, myrep, nmp, -2); + return (ENXIO); } if (error) { - NFSTRACE4(NFSTRC_RCVERR, myrep->r_xid, myrep, nmp, - error); - nfs_rcvunlock(&nmp->nm_flag); + FSDBG(530, myrep->r_xid, myrep, nmp, error); + nfs_rcvunlock(myrep); + + /* Bailout asap if nfsmount struct gone (unmounted). */ + if (!myrep->r_nmp) + return (ENXIO); /* * Ignore routing errors on connectionless protocols?? */ if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { - nmp->nm_so->so_error = 0; + if (nmp->nm_so) + nmp->nm_so->so_error = 0; if (myrep->r_flags & R_GETONEREP) return (0); continue; } return (error); } - if (nam) - m_freem(nam); /* * We assume all is fine, but if we did not have an error @@ -1016,8 +1231,9 @@ nfs_reply(myrep) * just check here and get out. (ekn) */ if (!mrep) { - NFSTRACE4(NFSTRC_ECONN, myrep->r_xid, myrep, nmp, 3); - return (ECONNABORTED); /* sounds good */ + nfs_rcvunlock(myrep); + FSDBG(530, myrep->r_xid, myrep, nmp, -3); + return (ENXIO); /* sounds good */ } /* @@ -1041,8 +1257,8 @@ nfs_reply(myrep) m_freem(mrep); #endif nfsmout: - if (nmp->nm_flag & NFSMNT_RCVLOCK) - nfs_rcvunlock(&nmp->nm_flag); + if (nmp->nm_state & NFSSTA_RCVLOCK) + nfs_rcvunlock(myrep); if (myrep->r_flags & R_GETONEREP) return (0); /* this path used by NQNFS */ continue; @@ -1052,13 +1268,17 @@ nfsmout: * Loop through the request list to match up the reply * Iff no match, just drop the datagram */ - for (rep = nfs_reqq.tqh_first; rep != 0; - rep = rep->r_chain.tqe_next) { + TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { if (rep->r_mrep == NULL && rxid == rep->r_xid) { /* Found it.. */ rep->r_mrep = mrep; rep->r_md = md; rep->r_dpos = dpos; + /* + * If we're tracking the round trip time + * then we update the circular log here + * with the stats from our current request. + */ if (nfsrtton) { struct rttl *rt; @@ -1072,7 +1292,7 @@ nfsmout: rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; - rt->tstamp = time; + microtime(&rt->tstamp); // XXX unused if (rep->r_flags & R_TIMING) rt->rtt = rep->r_rtt; else @@ -1084,8 +1304,8 @@ nfsmout: * Do the additive increase of * one rpc/rtt. */ - NFSTRACE4(NFSTRC_CWND_REPLY, rep->r_xid, rep, - nmp->nm_sent, nmp->nm_cwnd); + FSDBG(530, rep->r_xid, rep, nmp->nm_sent, + nmp->nm_cwnd); if (nmp->nm_cwnd <= nmp->nm_sent) { nmp->nm_cwnd += (NFS_CWNDSCALE * NFS_CWNDSCALE + @@ -1093,11 +1313,10 @@ nfsmout: if (nmp->nm_cwnd > NFS_MAXCWND) nmp->nm_cwnd = NFS_MAXCWND; } - if (!(rep->r_flags & R_SENT)) - printf("nfs_reply: unsent xid=%x", - rep->r_xid); - rep->r_flags &= ~R_SENT; - nmp->nm_sent -= NFS_CWNDSCALE; + if (rep->r_flags & R_SENT) { + rep->r_flags &= ~R_SENT; + nmp->nm_sent -= NFS_CWNDSCALE; + } /* * Update rtt using a gain of 0.125 on the mean * and a gain of 0.25 on the deviation. @@ -1125,7 +1344,7 @@ nfsmout: break; } } - nfs_rcvunlock(&nmp->nm_flag); + nfs_rcvunlock(myrep); /* * If not matched to a request, drop it. * If it's mine, get out. @@ -1138,8 +1357,8 @@ nfsmout: panic("nfs_reply: nil r_mrep"); return (0); } - NFSTRACE4(NFSTRC_NOTMINE, myrep->r_xid, myrep, rep, - rep ? rep->r_xid : myrep->r_flags); + FSDBG(530, myrep->r_xid, myrep, rep, + rep ? rep->r_xid : myrep->r_flags); if (myrep->r_flags & R_GETONEREP) return (0); /* this path used by NQNFS */ } @@ -1156,7 +1375,7 @@ nfsmout: * nb: always frees up mreq mbuf list */ int -nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) +nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp) struct vnode *vp; struct mbuf *mrest; int procnum; @@ -1165,8 +1384,9 @@ nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) struct mbuf **mrp; struct mbuf **mdp; caddr_t *dposp; + u_int64_t *xidp; { - register struct mbuf *m, *mrep; + register struct mbuf *m, *mrep, *m2; register struct nfsreq *rep, *rp; register u_long *tl; register int i; @@ -1183,31 +1403,35 @@ nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) u_quad_t frev; char *auth_str, *verf_str; NFSKERBKEY_T key; /* save session key */ + int nmsotype; + struct timeval now; + + if (mrp) + *mrp = NULL; + if (xidp) + *xidp = 0; - nmp = VFSTONFS(vp->v_mount); MALLOC_ZONE(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); - NFSTRACE4(NFSTRC_REQ, vp, procnum, nmp, rep); - - /* - * make sure if we blocked above, that the file system didn't get - * unmounted leaving nmp bogus value to trip on later and crash. - * Note nfs_unmount will set rep->r_nmp if unmounted volume, but we - * aren't that far yet. SO this is best we can do. I wanted to check - * for vp->v_mount = 0 also below, but that caused reboot crash. - * Something must think it's okay for vp-v_mount=0 during booting. - * Thus the best I can do here is see if we still have a vnode. - */ - if (vp->v_type == VBAD) { - NFSTRACE4(NFSTRC_VBAD, 1, vp, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); - return (EINVAL); + nmp = VFSTONFS(vp->v_mount); + if (nmp == NULL || + (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) == + (NFSSTA_FORCE|NFSSTA_TIMEO)) { + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + return (ENXIO); } + nmsotype = nmp->nm_sotype; + + FSDBG_TOP(531, vp, procnum, nmp, rep); + rep->r_nmp = nmp; rep->r_vp = vp; rep->r_procp = procp; rep->r_procnum = procnum; + microuptime(&now); + rep->r_lastmsg = now.tv_sec - + ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); i = 0; m = mrest; while (m) { @@ -1220,6 +1444,12 @@ nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) * Get the RPC header with authorization. */ kerbauth: + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + return (ENXIO); + } verf_str = auth_str = (char *)0; if (nmp->nm_flag & NFSMNT_KERB) { verf_str = nickv; @@ -1228,10 +1458,22 @@ kerbauth: bzero((caddr_t)key, sizeof (key)); if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str, &auth_len, verf_str, verf_len)) { + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + FSDBG_BOT(531, 2, vp, error, rep); + FREE_ZONE((caddr_t)rep, + sizeof (struct nfsreq), M_NFSREQ); + m_freem(mrest); + return (ENXIO); + } error = nfs_getauth(nmp, rep, cred, &auth_str, &auth_len, verf_str, &verf_len, key); + nmp = VFSTONFS(vp->v_mount); + if (!error && !nmp) + error = ENXIO; if (error) { - _FREE_ZONE((caddr_t)rep, + FSDBG_BOT(531, 2, vp, error, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); m_freem(mrest); return (error); @@ -1247,13 +1489,15 @@ kerbauth: } m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid); + if (xidp) + *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32); if (auth_str) _FREE(auth_str, M_TEMP); /* * For stream protocols, insert a Sun RPC Record Mark. */ - if (nmp->nm_sotype == SOCK_STREAM) { + if (nmsotype == SOCK_STREAM) { M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); *mtod(m, u_long *) = htonl(0x80000000 | (m->m_pkthdr.len - NFSX_UNSIGNED)); @@ -1261,7 +1505,8 @@ kerbauth: rep->r_mreq = m; rep->r_xid = xid; tryagain: - if (nmp->nm_flag & NFSMNT_SOFT) + nmp = VFSTONFS(vp->v_mount); + if (nmp && (nmp->nm_flag & NFSMNT_SOFT)) rep->r_retry = nmp->nm_retry; else rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ @@ -1284,19 +1529,22 @@ tryagain: TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); /* Get send time for nqnfs */ - reqtime = time.tv_sec; + microtime(&now); + reqtime = now.tv_sec; /* * If backing off another request or avoiding congestion, don't * send this one now but let timer do it. If not timing a request, * do it now. */ - if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || + if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) { + int connrequired = (nmp->nm_soflags & PR_CONNREQUIRED); + splx(s); - if (nmp->nm_soflags & PR_CONNREQUIRED) - error = nfs_sndlock(&nmp->nm_flag, rep); + if (connrequired) + error = nfs_sndlock(rep); /* * Set the R_SENT before doing the send in case another thread @@ -1304,19 +1552,21 @@ tryagain: */ if (!error) { if ((rep->r_flags & R_MUSTRESEND) == 0) { - NFSTRACE4(NFSTRC_CWND_REQ1, rep->r_xid, rep, - nmp->nm_sent, nmp->nm_cwnd); + FSDBG(531, rep->r_xid, rep, nmp->nm_sent, + nmp->nm_cwnd); nmp->nm_sent += NFS_CWNDSCALE; rep->r_flags |= R_SENT; } - m = m_copym(m, 0, M_COPYALL, M_WAIT); - error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep); - if (nmp->nm_soflags & PR_CONNREQUIRED) - nfs_sndunlock(&nmp->nm_flag); + m2 = m_copym(m, 0, M_COPYALL, M_WAIT); + error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); + if (connrequired) + nfs_sndunlock(rep); } + nmp = VFSTONFS(vp->v_mount); if (error) { - nmp->nm_sent -= NFS_CWNDSCALE; + if (nmp) + nmp->nm_sent -= NFS_CWNDSCALE; rep->r_flags &= ~R_SENT; } } else { @@ -1333,40 +1583,36 @@ tryagain: /* * RPC done, unlink the request. */ - s = splsoftclock(); - for (rp = nfs_reqq.tqh_first; rp; - rp = rp->r_chain.tqe_next) - if (rp == rep && rp->r_xid == xid) - break; - if (!rp) - panic("nfs_request race, rep %x xid %x", rep, xid); - TAILQ_REMOVE(&nfs_reqq, rep, r_chain); - splx(s); + nfs_repdequeue(rep); + + nmp = VFSTONFS(vp->v_mount); /* * Decrement the outstanding request count. */ if (rep->r_flags & R_SENT) { - NFSTRACE4(NFSTRC_CWND_REQ2, rep->r_xid, rep, nmp->nm_sent, - nmp->nm_cwnd); rep->r_flags &= ~R_SENT; /* paranoia */ - nmp->nm_sent -= NFS_CWNDSCALE; + if (nmp) { + FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd); + nmp->nm_sent -= NFS_CWNDSCALE; + } } /* * If there was a successful reply and a tprintf msg. * tprintf a response. */ - if (!error && (rep->r_flags & R_TPRINTFMSG)) - nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, - "is alive again"); + if (!error) + nfs_up(rep, nmp, procp, "is alive again", NFSSTA_TIMEO); mrep = rep->r_mrep; md = rep->r_md; dpos = rep->r_dpos; + if (!error && !nmp) + error = ENXIO; if (error) { m_freem(rep->r_mreq); - NFSTRACE4(NFSTRC_REQERR, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1390,8 +1636,8 @@ tryagain: error = EACCES; m_freem(mrep); m_freem(rep->r_mreq); - NFSTRACE4(NFSTRC_RPCERR, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1416,16 +1662,21 @@ tryagain: error == NFSERR_TRYLATER) { m_freem(mrep); error = 0; - waituntil = time.tv_sec + trylater_delay; + microuptime(&now); + waituntil = now.tv_sec + trylater_delay; NFS_DPF(DUP, ("nfs_request %s flag=%x trylater_cnt=%x waituntil=%lx trylater_delay=%x\n", nmp->nm_mountp->mnt_stat.f_mntfromname, nmp->nm_flag, trylater_cnt, waituntil, trylater_delay)); - while (time.tv_sec < waituntil) + while (now.tv_sec < waituntil) { (void)tsleep((caddr_t)&lbolt, PSOCK, "nqnfstry", 0); - trylater_delay *= nfs_backoff[trylater_cnt]; + microuptime(&now); + } + trylater_delay *= 2; + if (trylater_delay > 60) + trylater_delay = 60; if (trylater_cnt < 7) trylater_cnt++; goto tryagain; @@ -1442,12 +1693,13 @@ tryagain: *mdp = md; *dposp = dpos; error |= NFSERR_RETERR; - } else + } else { m_freem(mrep); + error &= ~NFSERR_RETERR; + } m_freem(rep->r_mreq); - NFSTRACE4(NFSTRC_DISSECTERR, error, rep->r_xid, nmp, - rep); - _FREE_ZONE((caddr_t)rep, + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1463,7 +1715,8 @@ tryagain: nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); cachable = fxdr_unsigned(int, *tl++); reqtime += fxdr_unsigned(int, *tl++); - if (reqtime > time.tv_sec) { + microtime(&now); + if (reqtime > now.tv_sec) { fxdr_hyper(tl, &frev); nqnfs_clientlease(nmp, np, nqlflag, cachable, reqtime, frev); @@ -1474,7 +1727,7 @@ tryagain: *mdp = md; *dposp = dpos; m_freem(rep->r_mreq); - NFSTRACE4(NFSTRC_REQFREE, 0xf0f0f0f0, rep->r_xid, nmp, rep); + FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep); FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (0); } @@ -1482,8 +1735,8 @@ tryagain: error = EPROTONOSUPPORT; nfsmout: m_freem(rep->r_mreq); - NFSTRACE4(NFSTRC_REQFREE, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1654,10 +1907,11 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) static void nfs_softterm(struct nfsreq *rep) { + rep->r_flags |= R_SOFTTERM; if (rep->r_flags & R_SENT) { - NFSTRACE4(NFSTRC_CWND_SOFT, rep->r_xid, rep, - rep->r_nmp->nm_sent, rep->r_nmp->nm_cwnd); + FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent, + rep->r_nmp->nm_cwnd); rep->r_nmp->nm_sent -= NFS_CWNDSCALE; rep->r_flags &= ~R_SENT; } @@ -1673,6 +1927,63 @@ nfs_timer_funnel(arg) } +/* + * Ensure rep isn't in use by the timer, then dequeue it. + */ +void +nfs_repdequeue(struct nfsreq *rep) +{ + int s; + + while ((rep->r_flags & R_BUSY)) { + rep->r_flags |= R_WAITING; + tsleep(rep, PSOCK, "repdeq", 0); + } + s = splsoftclock(); + TAILQ_REMOVE(&nfs_reqq, rep, r_chain); + splx(s); +} + +/* + * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not + * free()'d out from under it. + */ +void +nfs_repbusy(struct nfsreq *rep) +{ + + if ((rep->r_flags & R_BUSY)) + panic("rep locked"); + rep->r_flags |= R_BUSY; +} + +/* + * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied. + */ +struct nfsreq * +nfs_repnext(struct nfsreq *rep) +{ + struct nfsreq * nextrep; + + if (rep == NULL) + return (NULL); + /* + * We need to get and busy the next req before signalling the + * current one, otherwise wakeup() may block us and we'll race to + * grab the next req. + */ + nextrep = TAILQ_NEXT(rep, r_chain); + if (nextrep != NULL) + nfs_repbusy(nextrep); + /* unbusy and signal. */ + rep->r_flags &= ~R_BUSY; + if ((rep->r_flags & R_WAITING)) { + rep->r_flags &= ~R_WAITING; + wakeup(rep); + } + return (nextrep); +} + /* * Nfs timer routine * Scan the nfsreq list and retranmit any requests that have timed out @@ -1683,7 +1994,7 @@ void nfs_timer(arg) void *arg; /* never used */ { - register struct nfsreq *rep, *rp; + register struct nfsreq *rep; register struct mbuf *m; register struct socket *so; register struct nfsmount *nmp; @@ -1699,17 +2010,16 @@ nfs_timer(arg) #endif int flags, rexmit, cwnd, sent; u_long xid; + struct timeval now; s = splnet(); /* * XXX If preemptable threads are implemented the spls used for the * outstanding request queue must be replaced with mutexes. */ -rescan: #ifdef NFSTRACESUSPENDERS if (NFSTRACE_SUSPENDING) { - for (rep = nfs_reqq.tqh_first; rep != 0; - rep = rep->r_chain.tqe_next) + TAILQ_FOREACH(rep, &nfs_reqq, r_chain) if (rep->r_xid == nfstracexid) break; if (!rep) { @@ -1719,7 +2029,11 @@ rescan: } } #endif - for (rep = nfs_reqq.tqh_first; rep != 0; rep = rep->r_chain.tqe_next) { + rep = TAILQ_FIRST(&nfs_reqq); + if (rep != NULL) + nfs_repbusy(rep); + microuptime(&now); + for ( ; rep != NULL ; rep = nfs_repnext(rep)) { #ifdef NFSTRACESUSPENDERS if (rep->r_mrep && !NFSTRACE_SUSPENDING) { nfstracexid = rep->r_xid; @@ -1731,9 +2045,21 @@ rescan: continue; if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) continue; - if (nfs_sigintr(nmp, rep, rep->r_procp)) { - nfs_softterm(rep); + if (nfs_sigintr(nmp, rep, rep->r_procp)) continue; + if (nmp->nm_tprintf_initial_delay != 0 && + (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) && + rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) { + rep->r_lastmsg = now.tv_sec; + nfs_down(rep, rep->r_nmp, rep->r_procp, "not responding", + 0, NFSSTA_TIMEO); + if (!(nmp->nm_state & NFSSTA_MOUNTED)) { + /* we're not yet completely mounted and */ + /* we can't complete an RPC, so we fail */ + nfsstats.rpctimeouts++; + nfs_softterm(rep); + continue; + } } if (rep->r_rtt >= 0) { rep->r_rtt++; @@ -1752,15 +2078,10 @@ rescan: nmp->nm_timeouts++; } /* - * Check for server not responding + * Check for too many retransmits. This is never true for + * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1 + * and never allow r_rexmit to be more than NFS_MAXREXMIT. */ - if ((rep->r_flags & R_TPRINTFMSG) == 0 && - rep->r_rexmit > nmp->nm_deadthresh) { - nfs_msg(rep->r_procp, - nmp->nm_mountp->mnt_stat.f_mntfromname, - "not responding"); - rep->r_flags |= R_TPRINTFMSG; - } if (rep->r_rexmit >= rep->r_retry) { /* too many */ nfsstats.rpctimeouts++; nfs_softterm(rep); @@ -1827,8 +2148,7 @@ rescan: rep->r_flags |= R_SENT; nmp->nm_sent += NFS_CWNDSCALE; } - NFSTRACE4(NFSTRC_CWND_TIMER, xid, rep, - nmp->nm_sent, nmp->nm_cwnd); + FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd); thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); @@ -1841,30 +2161,12 @@ rescan: thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - NFSTRACE4(NFSTRC_CWND_TIMER, xid, error, sent, cwnd); - /* - * This is to fix "nfs_sigintr" DSI panics. - * We may have slept during the send so the current - * place in the request queue may have been released. - * Due to zone_gc it may even be part of an - * unrelated newly allocated data structure. - * Restart the list scan from the top if needed... - */ - for (rp = nfs_reqq.tqh_first; rp; - rp = rp->r_chain.tqe_next) - if (rp == rep && rp->r_xid == xid) - break; - if (!rp) { - if (!error) - goto rescan; - panic("nfs_timer: race error %d xid 0x%x\n", - error, xid); - } + FSDBG(535, xid, error, sent, cwnd); if (error) { if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) so->so_error = 0; - rep->r_flags = flags; + rep->r_flags = flags | R_RESENDERR; rep->r_rexmit = rexmit; nmp->nm_cwnd = cwnd; nmp->nm_sent = sent; @@ -1874,12 +2176,13 @@ rescan: rep->r_rtt = 0; } } + microuptime(&now); #ifndef NFS_NOSERVER /* * Call the nqnfs server timer once a second to handle leases. */ - if (lasttime != time.tv_sec) { - lasttime = time.tv_sec; + if (lasttime != now.tv_sec) { + lasttime = now.tv_sec; nqnfs_serverd(); } @@ -1887,14 +2190,23 @@ rescan: * Scan the write gathering queues for writes that need to be * completed now. */ - cur_usec = (u_quad_t)time.tv_sec * 1000000 + (u_quad_t)time.tv_usec; - for (slp = nfssvc_sockhead.tqh_first; slp != 0; - slp = slp->ns_chain.tqe_next) { - if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec) + cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; + TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { + if (LIST_FIRST(&slp->ns_tq) && + LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) nfsrv_wakenfsd(slp); } #endif /* NFS_NOSERVER */ splx(s); + + if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) { + /* + * We haven't called nfs_buf_freeup() in a little while. + * So, see if we can free up any stale/unused bufs now. + */ + nfs_buf_freeup(1); + } + timeout(nfs_timer_funnel, (void *)0, nfs_ticks); } @@ -1902,22 +2214,82 @@ rescan: /* * Test for a termination condition pending on the process. - * This is used for NFSMNT_INT mounts. + * This is used to determine if we need to bail on a mount. + * EIO is returned if there has been a soft timeout. + * EINTR is returned if there is a signal pending that is not being ignored + * and the mount is interruptable, or if we are a thread that is in the process + * of cancellation (also SIGKILL posted). */ int nfs_sigintr(nmp, rep, p) struct nfsmount *nmp; struct nfsreq *rep; - register struct proc *p; + struct proc *p; { + struct uthread *curr_td; + sigset_t pending_sigs; + int context_good = 0; + struct nfsmount *repnmp; + + if (nmp == NULL) + return (ENXIO); + if (rep != NULL) { + repnmp = rep->r_nmp; + /* we've had a forced unmount. */ + if (repnmp == NULL) + return (ENXIO); + /* request has timed out on a 'soft' mount. */ + if (rep->r_flags & R_SOFTTERM) + return (EIO); + /* + * We're in the progress of a force unmount and there's + * been a timeout we're dead and fail IO. + */ + if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) == + (NFSSTA_FORCE|NFSSTA_TIMEO)) + return (EIO); + /* Someone is unmounting us, go soft and mark it. */ + if ((repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT)) { + repnmp->nm_flag |= NFSMNT_SOFT; + nmp->nm_state |= NFSSTA_FORCE; + } + /* + * If the mount is hung and we've requested not to hang + * on remote filesystems, then bail now. + */ + if (p != NULL && (p->p_flag & P_NOREMOTEHANG) != 0 && + (repnmp->nm_state & NFSSTA_TIMEO) != 0) + return (EIO); + } + /* XXX: is this valid? this probably should be an assertion. */ + if (p == NULL) + return (0); - if (rep && (rep->r_flags & R_SOFTTERM)) + /* + * XXX: Since nfs doesn't have a good shot at getting the current + * thread we take a guess. (only struct proc * are passed to VOPs) + * What we do is look at the current thread, if it belongs to the + * passed in proc pointer then we have a "good/accurate" context + * and can make an accurate guess as to what to do. + * However if we have a bad context we have to make due with what + * is in the proc struct which may not be as up to date as we'd + * like. + * This is ok because the process will call us with the correct + * context after a short timeout while waiting for a response. + */ + curr_td = (struct uthread *)get_bsdthread_info(current_act()); + if (curr_td->uu_proc == p) + context_good = 1; + if (context_good && current_thread_aborted()) return (EINTR); - if (!(nmp->nm_flag & NFSMNT_INT)) - return (0); - if (p && p->p_siglist && - (((p->p_siglist & ~p->p_sigmask) & ~p->p_sigignore) & - NFSINT_SIGMASK)) + /* mask off thread and process blocked signals. */ + if (context_good) + pending_sigs = curr_td->uu_siglist & ~curr_td->uu_sigmask; + else + pending_sigs = p->p_siglist; + /* mask off process level and NFS ignored signals. */ + pending_sigs &= ~p->p_sigignore & NFSINT_SIGMASK; + if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0) return (EINTR); return (0); } @@ -1929,31 +2301,41 @@ nfs_sigintr(nmp, rep, p) * in progress when a reconnect is necessary. */ int -nfs_sndlock(flagp, rep) - register int *flagp; +nfs_sndlock(rep) struct nfsreq *rep; { + register int *statep; struct proc *p; - int slpflag = 0, slptimeo = 0; + int error, slpflag = 0, slptimeo = 0; - if (rep) { - p = rep->r_procp; - if (rep->r_nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - } else - p = (struct proc *)0; - while (*flagp & NFSMNT_SNDLOCK) { - if (nfs_sigintr(rep->r_nmp, rep, p)) - return (EINTR); - *flagp |= NFSMNT_WANTSND; - (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck", - slptimeo); + if (rep->r_nmp == NULL) + return (ENXIO); + statep = &rep->r_nmp->nm_state; + + p = rep->r_procp; + if (rep->r_nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + while (*statep & NFSSTA_SNDLOCK) { + error = nfs_sigintr(rep->r_nmp, rep, p); + if (error) + return (error); + *statep |= NFSSTA_WANTSND; + if (p != NULL && (p->p_flag & P_NOREMOTEHANG) != 0) + slptimeo = hz; + (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), + "nfsndlck", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } + /* + * Make sure while we slept that the mountpoint didn't go away. + * nfs_sigintr and callers expect it in tact. + */ + if (!rep->r_nmp) + return (ENXIO); /* don't have lock until out of loop */ } - *flagp |= NFSMNT_SNDLOCK; + *statep |= NFSSTA_SNDLOCK; return (0); } @@ -1961,16 +2343,20 @@ nfs_sndlock(flagp, rep) * Unlock the stream socket for others. */ void -nfs_sndunlock(flagp) - register int *flagp; +nfs_sndunlock(rep) + struct nfsreq *rep; { + register int *statep; - if ((*flagp & NFSMNT_SNDLOCK) == 0) + if (rep->r_nmp == NULL) + return; + statep = &rep->r_nmp->nm_state; + if ((*statep & NFSSTA_SNDLOCK) == 0) panic("nfs sndunlock"); - *flagp &= ~NFSMNT_SNDLOCK; - if (*flagp & NFSMNT_WANTSND) { - *flagp &= ~NFSMNT_WANTSND; - wakeup((caddr_t)flagp); + *statep &= ~NFSSTA_SNDLOCK; + if (*statep & NFSSTA_WANTSND) { + *statep &= ~NFSSTA_WANTSND; + wakeup((caddr_t)statep); } } @@ -1978,41 +2364,63 @@ static int nfs_rcvlock(rep) register struct nfsreq *rep; { - register int *flagp = &rep->r_nmp->nm_flag; - int slpflag, slptimeo = 0; + register int *statep; + int error, slpflag, slptimeo = 0; + + /* make sure we still have our mountpoint */ + if (!rep->r_nmp) { + if (rep->r_mrep != NULL) + return (EALREADY); + return (ENXIO); + } - if (*flagp & NFSMNT_INT) + statep = &rep->r_nmp->nm_state; + FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep); + if (rep->r_nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; else slpflag = 0; - while (*flagp & NFSMNT_RCVLOCK) { - if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp)) { - NFSTRACE4(NFSTRC_RCVLCKINTR, rep->r_xid, rep, - rep->r_nmp, *flagp); - return (EINTR); + while (*statep & NFSSTA_RCVLOCK) { + if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) { + FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100); + return (error); } else if (rep->r_mrep != NULL) { /* * Don't bother sleeping if reply already arrived */ - NFSTRACE4(NFSTRC_RCVALREADY, rep->r_xid, rep, - rep->r_nmp, 1); + FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101); return (EALREADY); } - NFSTRACE4(NFSTRC_RCVLCKW, rep->r_xid, rep, rep->r_nmp, *flagp); - *flagp |= NFSMNT_WANTRCV; - (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", - slptimeo); + FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102); + *statep |= NFSSTA_WANTRCV; + /* + * We need to poll if we're P_NOREMOTEHANG so that we + * call nfs_sigintr periodically above. + */ + if (rep->r_procp != NULL && + (rep->r_procp->p_flag & P_NOREMOTEHANG) != 0) + slptimeo = hz; + (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), + "nfsrcvlk", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } + /* + * Make sure while we slept that the mountpoint didn't go away. + * nfs_sigintr and caller nfs_reply expect it intact. + */ + if (!rep->r_nmp) { + FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103); + return (ENXIO); /* don't have lock until out of loop */ + } } /* * nfs_reply will handle it if reply already arrived. * (We may have slept or been preempted while on network funnel). */ - NFSTRACE4(NFSTRC_RCVLCK, rep->r_xid, rep, rep->r_nmp, *flagp); - *flagp |= NFSMNT_RCVLOCK; + FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep); + *statep |= NFSSTA_RCVLOCK; return (0); } @@ -2020,19 +2428,22 @@ nfs_rcvlock(rep) * Unlock the stream socket for others. */ static void -nfs_rcvunlock(flagp) - register int *flagp; +nfs_rcvunlock(rep) + register struct nfsreq *rep; { + register int *statep; + + if (rep->r_nmp == NULL) + return; + statep = &rep->r_nmp->nm_state; - if ((*flagp & NFSMNT_RCVLOCK) == 0) + FSDBG(533, statep, *statep, 0, 0); + if ((*statep & NFSSTA_RCVLOCK) == 0) panic("nfs rcvunlock"); - *flagp &= ~NFSMNT_RCVLOCK; - if (*flagp & NFSMNT_WANTRCV) { - NFSTRACE(NFSTRC_RCVUNLW, flagp); - *flagp &= ~NFSMNT_WANTRCV; - wakeup((caddr_t)flagp); - } else { - NFSTRACE(NFSTRC_RCVUNL, flagp); + *statep &= ~NFSSTA_RCVLOCK; + if (*statep & NFSSTA_WANTRCV) { + *statep &= ~NFSSTA_WANTRCV; + wakeup((caddr_t)statep); } } @@ -2045,7 +2456,7 @@ nfs_rcvunlock(flagp) * be called with M_WAIT from an nfsd. */ /* - * Needs to eun under network funnel + * Needs to run under network funnel */ void nfsrv_rcv(so, arg, waitflag) @@ -2056,9 +2467,9 @@ nfsrv_rcv(so, arg, waitflag) register struct nfssvc_sock *slp = (struct nfssvc_sock *)arg; register struct mbuf *m; struct mbuf *mp, *mhck; - struct sockaddr *nam=0; + struct sockaddr *nam; struct uio auio; - int flags, error; + int flags, ns_nflag=0, error; struct sockaddr_in *sin; if ((slp->ns_flag & SLP_VALID) == 0) @@ -2068,7 +2479,8 @@ nfsrv_rcv(so, arg, waitflag) * Define this to test for nfsds handling this under heavy load. */ if (waitflag == M_DONTWAIT) { - slp->ns_flag |= SLP_NEEDQ; goto dorecs; + ns_nflag = SLPN_NEEDQ; + goto dorecs; } #endif auio.uio_procp = NULL; @@ -2079,7 +2491,7 @@ nfsrv_rcv(so, arg, waitflag) * the nfs servers are heavily loaded. */ if (slp->ns_rec && waitflag == M_DONTWAIT) { - slp->ns_flag |= SLP_NEEDQ; + ns_nflag = SLPN_NEEDQ; goto dorecs; } @@ -2091,9 +2503,9 @@ nfsrv_rcv(so, arg, waitflag) error = soreceive(so, (struct sockaddr **) 0, &auio, &mp, (struct mbuf **)0, &flags); if (error || mp == (struct mbuf *)0) { if (error == EWOULDBLOCK) - slp->ns_flag |= SLP_NEEDQ; + ns_nflag = SLPN_NEEDQ; else - slp->ns_flag |= SLP_DISCONN; + ns_nflag = SLPN_DISCONN; goto dorecs; } m = mp; @@ -2114,15 +2526,16 @@ nfsrv_rcv(so, arg, waitflag) error = nfsrv_getstream(slp, waitflag); if (error) { if (error == EPERM) - slp->ns_flag |= SLP_DISCONN; + ns_nflag = SLPN_DISCONN; else - slp->ns_flag |= SLP_NEEDQ; + ns_nflag = SLPN_NEEDQ; } } else { do { auio.uio_resid = 1000000000; - flags = MSG_DONTWAIT; + flags = MSG_DONTWAIT | MSG_NEEDSA; nam = 0; + mp = 0; error = soreceive(so, &nam, &auio, &mp, (struct mbuf **)0, &flags); @@ -2133,7 +2546,6 @@ nfsrv_rcv(so, arg, waitflag) sin = mtod(mhck, struct sockaddr_in *); bcopy(nam, sin, sizeof(struct sockaddr_in)); mhck->m_hdr.mh_len = sizeof(struct sockaddr_in); - FREE(nam, M_SONAME); m = mhck; m->m_next = mp; @@ -2146,10 +2558,13 @@ nfsrv_rcv(so, arg, waitflag) slp->ns_recend = m; m->m_nextpkt = (struct mbuf *)0; } + if (nam) { + FREE(nam, M_SONAME); + } if (error) { if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && error != EWOULDBLOCK) { - slp->ns_flag |= SLP_DISCONN; + ns_nflag = SLPN_DISCONN; goto dorecs; } } @@ -2160,8 +2575,10 @@ nfsrv_rcv(so, arg, waitflag) * Now try and process the request records, non-blocking. */ dorecs: + if (ns_nflag) + slp->ns_nflag |= ns_nflag; if (waitflag == M_DONTWAIT && - (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) { + (slp->ns_rec || (slp->ns_nflag & (SLPN_NEEDQ | SLPN_DISCONN)))) { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); nfsrv_wakenfsd(slp); thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); @@ -2181,16 +2598,16 @@ nfsrv_getstream(slp, waitflag) register struct mbuf *m, **mpp; register char *cp1, *cp2; register int len; - struct mbuf *om, *m2, *recm = 0; + struct mbuf *om, *m2, *recm; u_long recmark; - if (slp->ns_flag & SLP_GETSTREAM) + if (slp->ns_nflag & SLPN_GETSTREAM) panic("nfs getstream"); - slp->ns_flag |= SLP_GETSTREAM; + slp->ns_nflag |= SLPN_GETSTREAM; for (;;) { if (slp->ns_reclen == 0) { if (slp->ns_cc < NFSX_UNSIGNED) { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (0); } m = slp->ns_raw; @@ -2215,18 +2632,22 @@ nfsrv_getstream(slp, waitflag) recmark = ntohl(recmark); slp->ns_reclen = recmark & ~0x80000000; if (recmark & 0x80000000) - slp->ns_flag |= SLP_LASTFRAG; + slp->ns_nflag |= SLPN_LASTFRAG; else - slp->ns_flag &= ~SLP_LASTFRAG; + slp->ns_nflag &= ~SLPN_LASTFRAG; if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (EPERM); } } /* * Now get the record part. + * + * Note that slp->ns_reclen may be 0. Linux sometimes + * generates 0-length RPCs */ + recm = NULL; if (slp->ns_cc == slp->ns_reclen) { recm = slp->ns_raw; slp->ns_raw = slp->ns_rawend = (struct mbuf *)0; @@ -2249,7 +2670,7 @@ nfsrv_getstream(slp, waitflag) m->m_len -= slp->ns_reclen - len; len = slp->ns_reclen; } else { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (EWOULDBLOCK); } } else if ((len + m->m_len) == slp->ns_reclen) { @@ -2268,7 +2689,7 @@ nfsrv_getstream(slp, waitflag) slp->ns_cc -= len; slp->ns_reclen = 0; } else { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (0); } @@ -2279,7 +2700,7 @@ nfsrv_getstream(slp, waitflag) while (*mpp) mpp = &((*mpp)->m_next); *mpp = recm; - if (slp->ns_flag & SLP_LASTFRAG) { + if (slp->ns_nflag & SLPN_LASTFRAG) { if (slp->ns_recend) slp->ns_recend->m_nextpkt = slp->ns_frag; else @@ -2326,8 +2747,9 @@ nfsrv_dorec(slp, nfsd, ndp) nd->nd_dpos = mtod(m, caddr_t); error = nfs_getreq(nd, nfsd, TRUE); if (error) { - m_freem(nam); - _FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); + if (nam) + m_freem(nam); + FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); return (error); } *ndp = nd; @@ -2357,7 +2779,7 @@ nfs_getreq(nd, nfsd, has_header) int error = 0, nqnfs = 0, ticklen; struct mbuf *mrep, *md; register struct nfsuid *nuidp; - struct timeval tvin, tvout; + struct timeval tvin, tvout, now; #if 0 /* until encrypted keys are implemented */ NFSKERBKEYSCHED_T keys; /* stores key schedule */ #endif @@ -2543,7 +2965,8 @@ nfs_getreq(nd, nfsd, has_header) tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec); tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec); - if (nuidp->nu_expire < time.tv_sec || + microtime(&now); + if (nuidp->nu_expire < now.tv_sec || nuidp->nu_timestamp.tv_sec > tvout.tv_sec || (nuidp->nu_timestamp.tv_sec == tvout.tv_sec && nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) { @@ -2595,7 +3018,7 @@ nfsrv_wakenfsd(slp) if ((slp->ns_flag & SLP_VALID) == 0) return; - for (nd = nfsd_head.tqh_first; nd != 0; nd = nd->nfsd_chain.tqe_next) { + TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) { if (nd->nfsd_flag & NFSD_WAITING) { nd->nfsd_flag &= ~NFSD_WAITING; if (nd->nfsd_slp) @@ -2612,9 +3035,10 @@ nfsrv_wakenfsd(slp) #endif /* NFS_NOSERVER */ static int -nfs_msg(p, server, msg) +nfs_msg(p, server, msg, error) struct proc *p; - char *server, *msg; + const char *server, *msg; + int error; { tpr_t tpr; @@ -2622,7 +3046,61 @@ nfs_msg(p, server, msg) tpr = tprintf_open(p); else tpr = NULL; - tprintf(tpr, "nfs server %s: %s\n", server, msg); + if (error) + tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg, + error); + else + tprintf(tpr, "nfs server %s: %s\n", server, msg); tprintf_close(tpr); return (0); } + +void +nfs_down(rep, nmp, proc, msg, error, flags) + struct nfsreq *rep; + struct nfsmount *nmp; + struct proc *proc; + const char *msg; + int error, flags; +{ + if (nmp == NULL) + return; + if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) { + vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, + VQ_NOTRESP, 0); + nmp->nm_state |= NFSSTA_TIMEO; + } + if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) { + vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, + VQ_NOTRESPLOCK, 0); + nmp->nm_state |= NFSSTA_LOCKTIMEO; + } + if (rep) + rep->r_flags |= R_TPRINTFMSG; + nfs_msg(proc, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error); +} + +void +nfs_up(rep, nmp, proc, msg, flags) + struct nfsreq *rep; + struct nfsmount *nmp; + struct proc *proc; + const char *msg; + int flags; +{ + if (nmp == NULL) + return; + if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0) + nfs_msg(proc, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0); + if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) { + nmp->nm_state &= ~NFSSTA_TIMEO; + vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, + VQ_NOTRESP, 1); + } + if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) { + nmp->nm_state &= ~NFSSTA_LOCKTIMEO; + vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, + VQ_NOTRESPLOCK, 1); + } +} +