1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
24 /*
25 * Copyright (c) 1989, 1991, 1993, 1995
26 * The Regents of the University of California. All rights reserved.
27 *
28 * This code is derived from software contributed to Berkeley by
29 * Rick Macklem at The University of Guelph.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in the
38 * documentation and/or other materials provided with the distribution.
39 * 3. All advertising materials mentioning features or use of this software
40 * must display the following acknowledgement:
41 * This product includes software developed by the University of
42 * California, Berkeley and its contributors.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
60 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
61 */
62
63 /*
64 * Socket operations for use by nfs
65 */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>
70 #include <sys/kauth.h>
71 #include <sys/mount_internal.h>
72 #include <sys/kernel.h>
73 #include <sys/kpi_mbuf.h>
74 #include <sys/malloc.h>
75 #include <sys/vnode.h>
76 #include <sys/domain.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/syslog.h>
80 #include <sys/tprintf.h>
81 #include <sys/uio_internal.h>
82 #include <libkern/OSAtomic.h>
83
84 #include <sys/time.h>
85 #include <kern/clock.h>
86 #include <kern/task.h>
87 #include <kern/thread.h>
88 #include <sys/user.h>
89
90 #include <netinet/in.h>
91 #include <netinet/tcp.h>
92
93 #include <nfs/rpcv2.h>
94 #include <nfs/nfsproto.h>
95 #include <nfs/nfs.h>
96 #include <nfs/xdr_subs.h>
97 #include <nfs/nfsm_subs.h>
98 #include <nfs/nfsmount.h>
99 #include <nfs/nfsnode.h>
100 #include <nfs/nfsrtt.h>
101
102 #include <sys/kdebug.h>
103
104 #define FSDBG(A, B, C, D, E) \
105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
106 (int)(B), (int)(C), (int)(D), (int)(E), 0)
107 #define FSDBG_TOP(A, B, C, D, E) \
108 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
109 (int)(B), (int)(C), (int)(D), (int)(E), 0)
110 #define FSDBG_BOT(A, B, C, D, E) \
111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
112 (int)(B), (int)(C), (int)(D), (int)(E), 0)
113
114 /*
115 * Estimate rto for an nfs rpc sent via an unreliable datagram.
116 * Use the mean and mean deviation of rtt for the appropriate type of rpc
117 * for the frequent rpcs and a default for the others.
118 * The justification for doing "other" this way is that these rpcs
119 * happen so infrequently that timer est. would probably be stale.
120 * Also, since many of these rpcs are
121 * non-idempotent, a conservative timeout is desired.
122 * getattr, lookup - A+2D
123 * read, write - A+4D
124 * other - nm_timeo
125 */
126 #define NFS_RTO(n, t) \
127 ((t) == 0 ? (n)->nm_timeo : \
128 ((t) < 3 ? \
129 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
130 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
131 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
132 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
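/*
 * Illustrative sketch (not part of the original file): how the fixed-point
 * scaling behind NFS_RTO yields the "A+2D" / "A+4D" values described above.
 * nm_srtt is kept scaled by 8 and nm_sdrtt by 4 (see the smoothing code in
 * nfs_reply()), so the shifts here recover the unscaled estimates.
 */
#if 0
static int
example_rto(void)
{
	/* assume a smoothed rtt (A) of 4 ticks, stored scaled by 8, and a
	 * mean deviation (D) of 2 ticks, stored scaled by 4 */
	int srtt = 4 << 3;	/* 32 */
	int sdrtt = 2 << 2;	/* 8 */

	/* getattr/lookup timers (t == 1, 2): roughly A + 2D */
	int rto_small = ((((srtt) + 3) >> 2) + (sdrtt) + 1) >> 1;	/* 8 == 4 + 2*2 */

	/* read/write timers (t == 3, 4): roughly A + 4D */
	int rto_big = (((srtt) + 7) >> 3) + (sdrtt) + 1;		/* 13 ~= 4 + 4*2 */

	return (rto_small + rto_big);
}
#endif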
133 /*
134 * External data, mostly RPC constants in XDR form
135 */
136 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
137 rpc_msgaccepted, rpc_call, rpc_autherr,
138 rpc_auth_kerb;
139 extern u_long nfs_prog;
140 extern struct nfsstats nfsstats;
141 extern int nfsv3_procid[NFS_NPROCS];
142 extern int nfs_ticks;
143 extern u_long nfs_xidwrap;
144
145 /*
146 * Defines which timer to use for the procnum.
147 * 0 - default
148 * 1 - getattr
149 * 2 - lookup
150 * 3 - read
151 * 4 - write
152 */
153 static int proct[NFS_NPROCS] = {
154 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
155 };
156
157 /*
158 * There is a congestion window for outstanding rpcs maintained per mount
159 * point. The cwnd size is adjusted in roughly the way that:
160 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
161 * SIGCOMM '88". ACM, August 1988.
162 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
163 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
164 * of rpcs is in progress.
165 * (The sent count and cwnd are scaled for integer arith.)
166 * Variants of "slow start" were tried and were found to be too much of a
167 * performance hit (ave. rtt 3 times larger),
168 * I suspect due to the large rtt that nfs rpcs have.
169 */
170 #define NFS_CWNDSCALE 256
171 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
172 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
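/*
 * Illustrative sketch (not part of the original file) consolidating the
 * AIMD updates performed on nm_cwnd elsewhere in this file: nfs_reply()
 * grows the window by roughly one scaled rpc per window's worth of
 * replies, and nfs_timer() halves it on a retransmit timeout, clamping
 * at one scaled rpc.
 */
#if 0
	/* additive increase on reply, when a full window is in flight */
	if (nmp->nm_cwnd <= nmp->nm_sent) {
		nmp->nm_cwnd += (NFS_CWNDSCALE * NFS_CWNDSCALE +
		    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;	/* += ~CWNDSCALE^2/cwnd */
		if (nmp->nm_cwnd > NFS_MAXCWND)
			nmp->nm_cwnd = NFS_MAXCWND;
	}

	/* multiplicative decrease on retransmit timeout */
	nmp->nm_cwnd >>= 1;
	if (nmp->nm_cwnd < NFS_CWNDSCALE)
		nmp->nm_cwnd = NFS_CWNDSCALE;
#endif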
173 int nfsrtton = 0;
174 struct nfsrtt nfsrtt;
175
176 static int nfs_rcvlock(struct nfsreq *);
177 static void nfs_rcvunlock(struct nfsreq *);
178 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
179 static int nfs_reconnect(struct nfsreq *rep);
180 static void nfs_repdequeue(struct nfsreq *rep);
181
182 /* XXX */
183 boolean_t current_thread_aborted(void);
184 kern_return_t thread_terminate(thread_t);
185
186 #ifndef NFS_NOSERVER
187 static int nfsrv_getstream(struct nfssvc_sock *,int);
188
189 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
190 struct nfssvc_sock *slp,
191 proc_t procp,
192 mbuf_t *mreqp) = {
193 nfsrv_null,
194 nfsrv_getattr,
195 nfsrv_setattr,
196 nfsrv_lookup,
197 nfsrv3_access,
198 nfsrv_readlink,
199 nfsrv_read,
200 nfsrv_write,
201 nfsrv_create,
202 nfsrv_mkdir,
203 nfsrv_symlink,
204 nfsrv_mknod,
205 nfsrv_remove,
206 nfsrv_rmdir,
207 nfsrv_rename,
208 nfsrv_link,
209 nfsrv_readdir,
210 nfsrv_readdirplus,
211 nfsrv_statfs,
212 nfsrv_fsinfo,
213 nfsrv_pathconf,
214 nfsrv_commit,
215 nfsrv_noop
216 };
217 #endif /* NFS_NOSERVER */
218
219
220 /*
221 * attempt to bind a socket to a reserved port
222 */
223 static int
224 nfs_bind_resv(struct nfsmount *nmp)
225 {
226 socket_t so = nmp->nm_so;
227 struct sockaddr_in sin;
228 int error;
229 u_short tport;
230
231 if (!so)
232 return (EINVAL);
233
234 sin.sin_len = sizeof (struct sockaddr_in);
235 sin.sin_family = AF_INET;
236 sin.sin_addr.s_addr = INADDR_ANY;
237 tport = IPPORT_RESERVED - 1;
238 sin.sin_port = htons(tport);
239
240 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
241 (--tport > IPPORT_RESERVED / 2))
242 sin.sin_port = htons(tport);
243 return (error);
244 }
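/*
 * Illustrative sketch (not part of the original file): a minimal user-space
 * analogue of the descending reserved-port scan above, using the standard
 * BSD socket API. The kernel version differs mainly in using sock_bind()
 * on a socket_t; the port-walking logic is the same.
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>
#include <errno.h>

static int
bind_reserved(int fd)
{
	struct sockaddr_in sin;
	u_short tport = IPPORT_RESERVED - 1;
	int error;

	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	sin.sin_port = htons(tport);
	/* walk down from 1023 until a free reserved port is found */
	while ((error = bind(fd, (struct sockaddr *)&sin, sizeof(sin))) < 0 &&
	    errno == EADDRINUSE && --tport > IPPORT_RESERVED / 2)
		sin.sin_port = htons(tport);
	return (error ? errno : 0);
}
#endif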
245
246 /*
247 * variables for managing the nfs_bind_resv_thread
248 */
249 int nfs_resv_mounts = 0;
250 static int nfs_bind_resv_thread_state = 0;
251 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
252 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
253 lck_grp_t *nfs_bind_resv_lck_grp;
254 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
255 lck_attr_t *nfs_bind_resv_lck_attr;
256 lck_mtx_t *nfs_bind_resv_mutex;
257 struct nfs_bind_resv_request {
258 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
259 struct nfsmount *brr_nmp;
260 int brr_error;
261 };
262 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
263
264 /*
265 * thread to handle any reserved port bind requests
266 */
267 static void
268 nfs_bind_resv_thread(void)
269 {
270 struct nfs_bind_resv_request *brreq;
271
272 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
273
274 while (nfs_resv_mounts > 0) {
275 lck_mtx_lock(nfs_bind_resv_mutex);
276 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
277 TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
278 lck_mtx_unlock(nfs_bind_resv_mutex);
279 brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
280 wakeup(brreq);
281 lck_mtx_lock(nfs_bind_resv_mutex);
282 }
283 msleep((caddr_t)&nfs_bind_resv_request_queue,
284 nfs_bind_resv_mutex, PSOCK | PDROP,
285 "nfs_bind_resv_request_queue", 0);
286 }
287
288 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
289 (void) thread_terminate(current_thread());
290 }
291
292 int
293 nfs_bind_resv_thread_wake(void)
294 {
295 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
296 return (EIO);
297 wakeup(&nfs_bind_resv_request_queue);
298 return (0);
299 }
300
301 /*
302 * underprivileged procs call this to request nfs_bind_resv_thread
303 * to perform the reserved port binding for them.
304 */
305 static int
306 nfs_bind_resv_nopriv(struct nfsmount *nmp)
307 {
308 struct nfs_bind_resv_request brreq;
309 int error;
310
311 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
312 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
313 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
314 lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
315 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
316 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
317 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
318 TAILQ_INIT(&nfs_bind_resv_request_queue);
319 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
320 }
321 kernel_thread(kernel_task, nfs_bind_resv_thread);
322 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
323 }
324
325 brreq.brr_nmp = nmp;
326 brreq.brr_error = 0;
327
328 lck_mtx_lock(nfs_bind_resv_mutex);
329 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
330 lck_mtx_unlock(nfs_bind_resv_mutex);
331
332 error = nfs_bind_resv_thread_wake();
333 if (error) {
334 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
335 /* Note: we might be able to simply restart the thread */
336 return (error);
337 }
338
339 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
340
341 return (brreq.brr_error);
342 }
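/*
 * Illustrative sketch (not part of the original file) of the handoff
 * pattern used above: the unprivileged caller queues a request that lives
 * on its own stack, wakes the worker thread, and sleeps on the request
 * until the worker fills in brr_error.
 */
#if 0
	/* caller (nfs_bind_resv_nopriv) */
	lck_mtx_lock(nfs_bind_resv_mutex);
	TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
	lck_mtx_unlock(nfs_bind_resv_mutex);
	wakeup(&nfs_bind_resv_request_queue);	/* via nfs_bind_resv_thread_wake() */
	tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);	/* wait for result */

	/* worker (nfs_bind_resv_thread) */
	brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
	wakeup(brreq);				/* release the sleeping caller */
#endif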
343
344 /*
345 * Initialize sockets and congestion for a new NFS connection.
346 * We do not free the sockaddr if error.
347 */
348 int
349 nfs_connect(
350 struct nfsmount *nmp,
351 __unused struct nfsreq *rep)
352 {
353 socket_t so;
354 int error, rcvreserve, sndreserve;
355 struct sockaddr *saddr;
356 struct timeval timeo;
357
358 nmp->nm_so = 0;
359 saddr = mbuf_data(nmp->nm_nam);
360 error = sock_socket(saddr->sa_family, nmp->nm_sotype,
361 nmp->nm_soproto, 0, 0, &nmp->nm_so);
362 if (error) {
363 goto bad;
364 }
365 so = nmp->nm_so;
366
367 /*
368 * Some servers require that the client port be a reserved port number.
369 */
370 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
371 proc_t p;
372 /*
373 * sobind() requires current_proc() to have superuser privs.
374 * If this bind is part of a reconnect, and the current proc
375 * doesn't have superuser privs, we hand the sobind() off to
376 * a kernel thread to process.
377 */
378 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
379 (p = current_proc()) && suser(kauth_cred_get(), 0)) {
380 /* request nfs_bind_resv_thread() to do bind */
381 error = nfs_bind_resv_nopriv(nmp);
382 } else {
383 error = nfs_bind_resv(nmp);
384 }
385 if (error)
386 goto bad;
387 }
388
389 /*
390 * Protocols that do not require connections may be optionally left
391 * unconnected for servers that reply from a port other than NFS_PORT.
392 */
393 if (nmp->nm_flag & NFSMNT_NOCONN) {
394 if (nmp->nm_sotype == SOCK_STREAM) {
395 error = ENOTCONN;
396 goto bad;
397 }
398 } else {
399 struct timeval tv;
400 tv.tv_sec = 2;
401 tv.tv_usec = 0;
402 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
403 if (error && error != EINPROGRESS) {
404 goto bad;
405 }
406
407 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
408 if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
409 goto bad;
410 }
411 }
412 }
413
414 /*
415 * Always time out on receive; this allows us to reconnect the
416 * socket to deal with network changes.
417 */
418 timeo.tv_usec = 0;
419 timeo.tv_sec = 2;
420 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
421 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
422 timeo.tv_sec = 5;
423 } else {
424 timeo.tv_sec = 0;
425 }
426 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
427
428 if (nmp->nm_sotype == SOCK_DGRAM) {
429 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
430 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
431 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
432 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
433 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
434 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
435 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
436 } else {
437 int proto;
438 int on = 1;
439
440 sock_gettype(so, NULL, NULL, &proto);
441 if (nmp->nm_sotype != SOCK_STREAM)
442 panic("nfscon sotype");
443
444 // Assume that SOCK_STREAM always requires a connection
445 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
446
447 if (proto == IPPROTO_TCP) {
448 sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
449 }
450
451 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
452 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
453 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
454 }
455
456 if (sndreserve > NFS_MAXSOCKBUF)
457 sndreserve = NFS_MAXSOCKBUF;
458 if (rcvreserve > NFS_MAXSOCKBUF)
459 rcvreserve = NFS_MAXSOCKBUF;
460 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
461 if (error) {
462 goto bad;
463 }
464 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
465 if (error) {
466 goto bad;
467 }
468
469 sock_nointerrupt(so, 1);
470
471 /* Initialize other non-zero congestion variables */
472 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
473 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
474 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
475 nmp->nm_sdrtt[3] = 0;
476 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
477 nmp->nm_sent = 0;
478 FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
479 nmp->nm_timeouts = 0;
480 return (0);
481
482 bad:
483 nfs_disconnect(nmp);
484 return (error);
485 }
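/*
 * Illustrative worked example (not part of the original file) of the
 * socket buffer sizing above for a UDP mount. With nm_wsize == nm_rsize
 * == 8192 and nm_readahead == 4 (example values, not asserted defaults):
 *
 *	sndreserve = (8192 + NFS_MAXPKTHDR) * 3
 *	rcvreserve = (8192 + NFS_MAXPKTHDR) * (4 + 1)
 *
 * i.e. room for three outgoing requests and readahead+1 incoming replies,
 * both clipped to NFS_MAXSOCKBUF.
 */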
486
487 /*
488 * Reconnect routine:
489 * Called when a connection is broken on a reliable protocol.
490 * - clean up the old socket
491 * - nfs_connect() again
492 * - set R_MUSTRESEND for all outstanding requests on mount point
493 * If this fails the mount point is DEAD!
494 * nb: Must be called with the nfs_sndlock() set on the mount point.
495 */
496 static int
497 nfs_reconnect(struct nfsreq *rep)
498 {
499 struct nfsreq *rp;
500 struct nfsmount *nmp = rep->r_nmp;
501 int error;
502
503 nfs_disconnect(nmp);
504 while ((error = nfs_connect(nmp, rep))) {
505 if (error == EINTR || error == ERESTART)
506 return (EINTR);
507 if (error == EIO)
508 return (EIO);
509 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
510 "can not connect");
511 rep->r_flags |= R_TPRINTFMSG;
512 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
513 /* we're not yet completely mounted and */
514 /* we can't reconnect, so we fail */
515 return (error);
516 }
517 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
518 return (error);
519 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
520 }
521
522 /*
523 * Loop through outstanding request list and fix up all requests
524 * on old socket.
525 */
526 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
527 if (rp->r_nmp == nmp)
528 rp->r_flags |= R_MUSTRESEND;
529 }
530 return (0);
531 }
532
533 /*
534 * NFS disconnect. Clean up and unlink.
535 */
536 void
537 nfs_disconnect(struct nfsmount *nmp)
538 {
539 socket_t so;
540
541 if (nmp->nm_so) {
542 so = nmp->nm_so;
543 nmp->nm_so = 0;
544 sock_shutdown(so, 2);
545 sock_close(so);
546 }
547 }
548
549 /*
550 * This is the nfs send routine. For connection based socket types, it
551 * must be called with an nfs_sndlock() on the socket.
552 * "rep == NULL" indicates that it has been called from a server.
553 * For the client side:
554 * - return EINTR if the RPC is terminated, 0 otherwise
555 * - set R_MUSTRESEND if the send fails for any reason
556 * - do any cleanup required by recoverable socket errors (???)
557 * For the server side:
558 * - return EINTR or ERESTART if interrupted by a signal
559 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
560 * - do any cleanup required by recoverable socket errors (???)
561 */
562 int
563 nfs_send(so, nam, top, rep)
564 socket_t so;
565 mbuf_t nam;
566 mbuf_t top;
567 struct nfsreq *rep;
568 {
569 struct sockaddr *sendnam;
570 int error, error2, sotype, flags;
571 u_long xidqueued = 0;
572 struct nfsreq *rp;
573 char savenametolog[MAXPATHLEN];
574 struct msghdr msg;
575
576 if (rep) {
577 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
578 if (error) {
579 mbuf_freem(top);
580 return (error);
581 }
582 if ((so = rep->r_nmp->nm_so) == NULL) {
583 rep->r_flags |= R_MUSTRESEND;
584 mbuf_freem(top);
585 return (0);
586 }
587 rep->r_flags &= ~R_MUSTRESEND;
588 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
589 if (rp == rep)
590 break;
591 if (rp)
592 xidqueued = rp->r_xid;
593 }
594 sock_gettype(so, NULL, &sotype, NULL);
595 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
596 (nam == 0))
597 sendnam = (struct sockaddr *)0;
598 else
599 sendnam = mbuf_data(nam);
600
601 if (sotype == SOCK_SEQPACKET)
602 flags = MSG_EOR;
603 else
604 flags = 0;
605
606 /*
607 * Save the name here in case the mount point goes away while we block.
608 * The name uses the local stack and is large, but we don't
609 * want to risk blocking in malloc.
610 */
611 if (rep)
612 strncpy(savenametolog,
613 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
614 MAXPATHLEN - 1);
615 bzero(&msg, sizeof(msg));
616 msg.msg_name = (caddr_t)sendnam;
617 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
618 error = sock_sendmbuf(so, &msg, top, flags, NULL);
619
620 if (error) {
621 if (rep) {
622 if (xidqueued) {
623 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
624 if (rp == rep && rp->r_xid == xidqueued)
625 break;
626 if (!rp)
627 panic("nfs_send: error %d xid %x gone",
628 error, xidqueued);
629 }
630 log(LOG_INFO, "nfs send error %d for server %s\n",
631 error, savenametolog);
632 /*
633 * Deal with errors for the client side.
634 */
635 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
636 if (error2) {
637 error = error2;
638 } else {
639 rep->r_flags |= R_MUSTRESEND;
640 }
641 } else
642 log(LOG_INFO, "nfsd send error %d\n", error);
643
644 /*
645 * Handle any recoverable (soft) socket errors here. (???)
646 */
647 if (error != EINTR && error != ERESTART && error != EIO &&
648 error != EWOULDBLOCK && error != EPIPE) {
649 error = 0;
650 }
651 }
652 return (error);
653 }
654
655 /*
656 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
657 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
658 * Mark and consolidate the data into a new mbuf list.
659 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
660 * small mbufs.
661 * For SOCK_STREAM we must be very careful to read an entire record once
662 * we have read any of it, even if the system call has been interrupted.
663 */
664 static int
665 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
666 {
667 socket_t so;
668 struct iovec_32 aio;
669 mbuf_t m, mlast;
670 u_long len, fraglen;
671 int error, error2, sotype;
672 proc_t p = current_proc(); /* XXX */
673 struct msghdr msg;
674 size_t rcvlen;
675 int lastfragment;
676
677 /*
678 * Set up arguments for soreceive()
679 */
680 *mp = NULL;
681 sotype = rep->r_nmp->nm_sotype;
682
683 /*
684 * For reliable protocols, lock against other senders/receivers
685 * in case a reconnect is necessary.
686 * For SOCK_STREAM, first get the Record Mark to find out how much
687 * more there is to get.
688 * We must lock the socket against other receivers
689 * until we have an entire rpc request/reply.
690 */
691 if (sotype != SOCK_DGRAM) {
692 error = nfs_sndlock(rep);
693 if (error)
694 return (error);
695 tryagain:
696 /*
697 * Check for fatal errors and resending request.
698 */
699 /*
700 * Ugh: If a reconnect attempt just happened, nm_so
701 * would have changed. NULL indicates a failed
702 * attempt that has essentially shut down this
703 * mount point.
704 */
705 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
706 nfs_sndunlock(rep);
707 if (error)
708 return (error);
709 return (EINTR);
710 }
711 so = rep->r_nmp->nm_so;
712 if (!so) {
713 error = nfs_reconnect(rep);
714 if (error) {
715 nfs_sndunlock(rep);
716 return (error);
717 }
718 goto tryagain;
719 }
720 while (rep->r_flags & R_MUSTRESEND) {
721 error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
722 if (!error) {
723 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
724 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
725 }
726 /*
727 * we also hold the rcv lock, so rep is still
728 * legit at this point
729 */
730 if (error) {
731 if (error == EINTR || error == ERESTART ||
732 (error = nfs_reconnect(rep))) {
733 nfs_sndunlock(rep);
734 return (error);
735 }
736 goto tryagain;
737 }
738 }
739 nfs_sndunlock(rep);
740 if (sotype == SOCK_STREAM) {
741 error = 0;
742 len = 0;
743 lastfragment = 0;
744 mlast = NULL;
745 while (!error && !lastfragment) {
746 aio.iov_base = (uintptr_t) &fraglen;
747 aio.iov_len = sizeof(u_long);
748 bzero(&msg, sizeof(msg));
749 msg.msg_iov = (struct iovec *) &aio;
750 msg.msg_iovlen = 1;
751 do {
752 error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
753 if (!rep->r_nmp) /* if unmounted then bailout */
754 goto shutout;
755 if (error == EWOULDBLOCK && rep) {
756 error2 = nfs_sigintr(rep->r_nmp, rep, p);
757 if (error2)
758 error = error2;
759 }
760 } while (error == EWOULDBLOCK);
761 if (!error && rcvlen < aio.iov_len) {
762 /* only log a message if we got a partial word */
763 if (rcvlen != 0)
764 log(LOG_INFO,
765 "short receive (%d/%d) from nfs server %s\n",
766 rcvlen, sizeof(u_long),
767 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
768 error = EPIPE;
769 }
770 if (error)
771 goto errout;
772 lastfragment = ntohl(fraglen) & 0x80000000;
773 fraglen = ntohl(fraglen) & ~0x80000000;
774 len += fraglen;
775 /*
776 * This is SERIOUS! We are out of sync with the sender
777 * and forcing a disconnect/reconnect is all I can do.
778 */
779 if (len > NFS_MAXPACKET) {
780 log(LOG_ERR, "%s (%d) from nfs server %s\n",
781 "impossible RPC record length", len,
782 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
783 error = EFBIG;
784 goto errout;
785 }
786
787 m = NULL;
788 do {
789 rcvlen = fraglen;
790 error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
791 if (!rep->r_nmp) /* if unmounted then bailout */ {
792 goto shutout;
793 }
794 } while (error == EWOULDBLOCK || error == EINTR ||
795 error == ERESTART);
796
797 if (!error && fraglen > rcvlen) {
798 log(LOG_INFO,
799 "short receive (%d/%d) from nfs server %s\n",
800 rcvlen, fraglen,
801 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
802 error = EPIPE;
803 mbuf_freem(m);
804 }
805 if (!error) {
806 if (!*mp) {
807 *mp = m;
808 mlast = m;
809 } else {
810 error = mbuf_setnext(mlast, m);
811 if (error) {
812 printf("nfs_receive: mbuf_setnext failed %d\n", error);
813 mbuf_freem(m);
814 }
815 }
816 while (mbuf_next(mlast))
817 mlast = mbuf_next(mlast);
818 }
819 }
820 } else {
821 bzero(&msg, sizeof(msg));
822 do {
823 rcvlen = 100000000;
824 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
825 if (!rep->r_nmp) /* if unmounted then bailout */ {
826 goto shutout;
827 }
828 if (error == EWOULDBLOCK && rep) {
829 error2 = nfs_sigintr(rep->r_nmp, rep, p);
830 if (error2) {
831 return (error2);
832 }
833 }
834 } while (error == EWOULDBLOCK);
835
836 if ((msg.msg_flags & MSG_EOR) == 0)
837 printf("Egad!!\n");
838 if (!error && *mp == NULL)
839 error = EPIPE;
840 len = rcvlen;
841 }
842 errout:
843 if (error && error != EINTR && error != ERESTART) {
844 mbuf_freem(*mp);
845 *mp = NULL;
846 if (error != EPIPE)
847 log(LOG_INFO,
848 "receive error %d from nfs server %s\n", error,
849 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
850 error = nfs_sndlock(rep);
851 if (!error) {
852 error = nfs_reconnect(rep);
853 if (!error)
854 goto tryagain;
855 nfs_sndunlock(rep);
856 }
857 }
858 } else {
859 /*
860 * We could have failed while rebinding the datagram socket
861 * so we need to attempt to rebind here.
862 */
863 if ((so = rep->r_nmp->nm_so) == NULL) {
864 error = nfs_sndlock(rep);
865 if (!error) {
866 error = nfs_reconnect(rep);
867 nfs_sndunlock(rep);
868 }
869 if (error)
870 return (error);
871 if (!rep->r_nmp) /* if unmounted then bailout */
872 return (ENXIO);
873 so = rep->r_nmp->nm_so;
874 }
875 bzero(&msg, sizeof(msg));
876 len = 0;
877 do {
878 rcvlen = 1000000;
879 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
880 if (!rep->r_nmp) /* if unmounted then bailout */
881 goto shutout;
882 if (error) {
883 error2 = nfs_sigintr(rep->r_nmp, rep, p);
884 if (error2) {
885 error = error2;
886 goto shutout;
887 }
888 }
889 /* Reconnect for all errors. We may be receiving
890 * soft/hard/blocking errors because of a network
891 * change.
892 * XXX: we should rate limit or delay this
893 * to once every N attempts or something.
894 * although TCP doesn't seem to.
895 */
896 if (error) {
897 error2 = nfs_sndlock(rep);
898 if (!error2) {
899 error2 = nfs_reconnect(rep);
900 if (error2)
901 error = error2;
902 else if (!rep->r_nmp) /* if unmounted then bailout */
903 error = ENXIO;
904 else
905 so = rep->r_nmp->nm_so;
906 nfs_sndunlock(rep);
907 } else {
908 error = error2;
909 }
910 }
911 } while (error == EWOULDBLOCK);
912 }
913 shutout:
914 if (error) {
915 mbuf_freem(*mp);
916 *mp = NULL;
917 }
918 return (error);
919 }
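/*
 * Illustrative sketch (not part of the original file): the Sun RPC record
 * mark handled in the SOCK_STREAM path above. Each fragment is preceded by
 * a 4-byte big-endian word whose high bit flags the last fragment of the
 * record and whose low 31 bits give the fragment length.
 */
#if 0
static void
example_record_mark(u_long rawmark, u_long reqlen)
{
	/* receiving: decode the mark read from the stream */
	u_long mark = ntohl(rawmark);
	int lastfragment = (mark & 0x80000000) != 0;	/* high bit set? */
	u_long fraglen = mark & ~0x80000000;		/* low 31 bits: length */

	/* sending: a single-fragment record of reqlen bytes is prepended
	 * the same way (see the SOCK_STREAM case in nfs_request()) */
	u_long sendmark = htonl(0x80000000 | reqlen);
}
#endif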
920
921 /*
922 * Implement receipt of reply on a socket.
923 * We must search through the list of received datagrams matching them
924 * with outstanding requests using the xid, until ours is found.
925 */
926 /* ARGSUSED */
927 int
928 nfs_reply(myrep)
929 struct nfsreq *myrep;
930 {
931 struct nfsreq *rep;
932 struct nfsmount *nmp = myrep->r_nmp;
933 long t1;
934 mbuf_t mrep, md;
935 u_long rxid, *tl;
936 caddr_t dpos, cp2;
937 int error;
938
939 /*
940 * Loop around until we get our own reply
941 */
942 for (;;) {
943 /*
944 * Lock against other receivers so that I don't get stuck in
945 * sbwait() after someone else has received my reply for me.
946 * Also necessary for connection based protocols to avoid
947 * race conditions during a reconnect.
948 * If nfs_rcvlock() returns EALREADY, that means that
949 * the reply has already been received by another
950 * process and we can return immediately. In this
951 * case, the lock is not taken to avoid races with
952 * other processes.
953 */
954 error = nfs_rcvlock(myrep);
955 if (error == EALREADY)
956 return (0);
957 if (error)
958 return (error);
959
960 /*
961 * If we slept after putting bits otw, then the reply may have
962 * arrived, in which case we must return, or we
963 * would hang trying to nfs_receive an already received reply.
964 */
965 if (myrep->r_mrep != NULL) {
966 nfs_rcvunlock(myrep);
967 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
968 return (0);
969 }
970 /*
971 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
972 * is still intact by checks done in nfs_rcvlock.
973 */
974 error = nfs_receive(myrep, &mrep);
975 /*
976 * Bailout asap if nfsmount struct gone (unmounted).
977 */
978 if (!myrep->r_nmp) {
979 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
980 if (mrep)
981 mbuf_freem(mrep);
982 return (ENXIO);
983 }
984 if (error) {
985 FSDBG(530, myrep->r_xid, myrep, nmp, error);
986 nfs_rcvunlock(myrep);
987
988 /* Bailout asap if nfsmount struct gone (unmounted). */
989 if (!myrep->r_nmp) {
990 if (mrep)
991 mbuf_freem(mrep);
992 return (ENXIO);
993 }
994
995 /*
996 * Ignore routing errors on connectionless protocols??
997 */
998 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
999 if (nmp->nm_so) {
1000 int clearerror;
1001 int optlen = sizeof(clearerror);
1002 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1003 }
1004 continue;
1005 }
1006 if (mrep)
1007 mbuf_freem(mrep);
1008 return (error);
1009 }
1010
1011 /*
1012 * We assume all is fine, but if we did not have an error
1013 * and mrep is 0, better not dereference it. nfs_receive
1014 * calls soreceive which carefully sets error=0 when it got
1015 * errors on sbwait (tsleep). In most cases, I assume that's
1016 * so we could go back again. In tcp case, EPIPE is returned.
1017 * In the udp case, nfs_receive gets back here with no error and no
1018 * mrep. Is the right fix to have soreceive check for process
1019 * aborted after sbwait and return something non-zero? Should
1020 * nfs_receive give an EPIPE? Too risky to play with those
1021 * two this late in game for a shutdown problem. Instead,
1022 * just check here and get out. (ekn)
1023 */
1024 if (!mrep) {
1025 nfs_rcvunlock(myrep);
1026 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1027 return (ENXIO); /* sounds good */
1028 }
1029
1030 /*
1031 * Get the xid and check that it is an rpc reply
1032 */
1033 md = mrep;
1034 dpos = mbuf_data(md);
1035 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1036 rxid = *tl++;
1037 if (*tl != rpc_reply) {
1038 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1039 mbuf_freem(mrep);
1040 nfsmout:
1041 if (nmp->nm_state & NFSSTA_RCVLOCK)
1042 nfs_rcvunlock(myrep);
1043 continue;
1044 }
1045
1046 /*
1047 * Loop through the request list to match up the reply
1048 * Iff no match, just drop the datagram
1049 */
1050 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1051 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1052 /* Found it.. */
1053 rep->r_mrep = mrep;
1054 rep->r_md = md;
1055 rep->r_dpos = dpos;
1056 /*
1057 * If we're tracking the round trip time
1058 * then we update the circular log here
1059 * with the stats from our current request.
1060 */
1061 if (nfsrtton) {
1062 struct rttl *rt;
1063
1064 rt = &nfsrtt.rttl[nfsrtt.pos];
1065 rt->proc = rep->r_procnum;
1066 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1067 rt->sent = nmp->nm_sent;
1068 rt->cwnd = nmp->nm_cwnd;
1069 if (proct[rep->r_procnum] == 0)
1070 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1071 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1072 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1073 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1074 microtime(&rt->tstamp); // XXX unused
1075 if (rep->r_flags & R_TIMING)
1076 rt->rtt = rep->r_rtt;
1077 else
1078 rt->rtt = 1000000;
1079 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1080 }
1081 /*
1082 * Update congestion window.
1083 * Do the additive increase of
1084 * one rpc/rtt.
1085 */
1086 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1087 nmp->nm_cwnd);
1088 if (nmp->nm_cwnd <= nmp->nm_sent) {
1089 nmp->nm_cwnd +=
1090 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1091 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1092 if (nmp->nm_cwnd > NFS_MAXCWND)
1093 nmp->nm_cwnd = NFS_MAXCWND;
1094 }
1095 if (rep->r_flags & R_SENT) {
1096 rep->r_flags &= ~R_SENT;
1097 nmp->nm_sent -= NFS_CWNDSCALE;
1098 }
1099 /*
1100 * Update rtt using a gain of 0.125 on the mean
1101 * and a gain of 0.25 on the deviation.
1102 */
1103 if (rep->r_flags & R_TIMING) {
1104 /*
1105 * Since the timer resolution of
1106 * NFS_HZ is so coarse, it can often
1107 * result in r_rtt == 0. Since
1108 * r_rtt == N means that the actual
1109 * rtt is between N+dt and N+2-dt ticks,
1110 * add 1.
1111 */
1112 if (proct[rep->r_procnum] == 0)
1113 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1114 t1 = rep->r_rtt + 1;
1115 t1 -= (NFS_SRTT(rep) >> 3);
1116 NFS_SRTT(rep) += t1;
1117 if (t1 < 0)
1118 t1 = -t1;
1119 t1 -= (NFS_SDRTT(rep) >> 2);
1120 NFS_SDRTT(rep) += t1;
1121 }
1122 nmp->nm_timeouts = 0;
1123 break;
1124 }
1125 }
1126 nfs_rcvunlock(myrep);
1127 /*
1128 * If not matched to a request, drop it.
1129 * If it's mine, get out.
1130 */
1131 if (rep == 0) {
1132 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1133 mbuf_freem(mrep);
1134 } else if (rep == myrep) {
1135 if (rep->r_mrep == NULL)
1136 panic("nfs_reply: nil r_mrep");
1137 return (0);
1138 }
1139 FSDBG(530, myrep->r_xid, myrep, rep,
1140 rep ? rep->r_xid : myrep->r_flags);
1141 }
1142 }
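/*
 * Illustrative worked example (not part of the original file) of one round
 * of the rtt smoothing above, with srtt scaled by 8 and sdrtt scaled by 4.
 * Assume a stored srtt of 32 (~4 ticks), a stored sdrtt of 8 (~2 ticks),
 * and a measured r_rtt of 7.
 */
#if 0
	long t1 = 7 + 1;	/* r_rtt + 1 */
	t1 -= (32 >> 3);	/* minus srtt/8: t1 = 4, the error term */
	/* srtt += t1 -> 36: the mean moved 1/8 of the way toward 8 ticks */
	if (t1 < 0)
		t1 = -t1;
	t1 -= (8 >> 2);		/* minus sdrtt/4: t1 = 2 */
	/* sdrtt += t1 -> 10: the deviation moved 1/4 of the way toward 4 */
#endif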
1143
1144 /*
1145 * nfs_request - goes something like this
1146 * - fill in request struct
1147 * - links it into list
1148 * - calls nfs_send() for first transmit
1149 * - calls nfs_receive() to get reply
1150 * - break down rpc header and return with nfs reply pointed to
1151 * by mrep or error
1152 * nb: always frees up mreq mbuf list
1153 */
1154 int
1155 nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
1156 vnode_t vp;
1157 mount_t mp;
1158 mbuf_t mrest;
1159 int procnum;
1160 proc_t procp;
1161 kauth_cred_t cred;
1162 mbuf_t *mrp;
1163 mbuf_t *mdp;
1164 caddr_t *dposp;
1165 u_int64_t *xidp;
1166 {
1167 mbuf_t m, mrep, m2;
1168 struct nfsreq re, *rep;
1169 u_long *tl;
1170 int i;
1171 struct nfsmount *nmp;
1172 mbuf_t md, mheadend;
1173 char nickv[RPCX_NICKVERF];
1174 time_t waituntil;
1175 caddr_t dpos, cp2;
1176 int t1, error = 0, mrest_len, auth_len, auth_type;
1177 int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1178 int verf_len, verf_type;
1179 u_long xid;
1180 char *auth_str, *verf_str;
1181 NFSKERBKEY_T key; /* save session key */
1182 int nmsotype;
1183 struct timeval now;
1184
1185 if (mrp)
1186 *mrp = NULL;
1187 if (xidp)
1188 *xidp = 0;
1189 nmp = VFSTONFS(mp);
1190
1191 rep = &re;
1192
1193 if (vp)
1194 nmp = VFSTONFS(vnode_mount(vp));
1195 if (nmp == NULL ||
1196 (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1197 (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1198 mbuf_freem(mrest);
1199 return (ENXIO);
1200 }
1201 nmsotype = nmp->nm_sotype;
1202
1203 FSDBG_TOP(531, vp, procnum, nmp, rep);
1204
1205 rep->r_nmp = nmp;
1206 rep->r_vp = vp;
1207 rep->r_procp = procp;
1208 rep->r_procnum = procnum;
1209 microuptime(&now);
1210 rep->r_lastmsg = now.tv_sec -
1211 ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1212 i = 0;
1213 m = mrest;
1214 while (m) {
1215 i += mbuf_len(m);
1216 m = mbuf_next(m);
1217 }
1218 mrest_len = i;
1219
1220 /*
1221 * Get the RPC header with authorization.
1222 */
1223 kerbauth:
1224 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1225 if (!nmp) {
1226 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1227 mbuf_freem(mrest);
1228 return (ENXIO);
1229 }
1230 verf_str = auth_str = (char *)0;
1231 if (nmp->nm_flag & NFSMNT_KERB) {
1232 verf_str = nickv;
1233 verf_len = sizeof (nickv);
1234 auth_type = RPCAUTH_KERB4;
1235 bzero((caddr_t)key, sizeof (key));
1236 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1237 &auth_len, verf_str, verf_len)) {
1238 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1239 if (!nmp) {
1240 FSDBG_BOT(531, 2, vp, error, rep);
1241 mbuf_freem(mrest);
1242 return (ENXIO);
1243 }
1244 error = nfs_getauth(nmp, rep, cred, &auth_str,
1245 &auth_len, verf_str, &verf_len, key);
1246 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1247 if (!error && !nmp)
1248 error = ENXIO;
1249 if (error) {
1250 FSDBG_BOT(531, 2, vp, error, rep);
1251 mbuf_freem(mrest);
1252 return (error);
1253 }
1254 }
1255 } else {
1256 auth_type = RPCAUTH_UNIX;
1257 if (cred->cr_ngroups < 1)
1258 panic("nfsreq nogrps");
1259 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1260 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1261 5 * NFSX_UNSIGNED;
1262 }
1263 error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1264 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1265 if (auth_str)
1266 _FREE(auth_str, M_TEMP);
1267 if (error) {
1268 mbuf_freem(mrest);
1269 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1270 return (error);
1271 }
1272 if (xidp)
1273 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1274
1275 /*
1276 * For stream protocols, insert a Sun RPC Record Mark.
1277 */
1278 if (nmsotype == SOCK_STREAM) {
1279 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1280 if (error) {
1281 mbuf_freem(m);
1282 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1283 return (error);
1284 }
1285 *((u_long*)mbuf_data(m)) =
1286 htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1287 }
1288 rep->r_mreq = m;
1289 rep->r_xid = xid;
1290 tryagain:
1291 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1292 if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1293 rep->r_retry = nmp->nm_retry;
1294 else
1295 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1296 rep->r_rtt = rep->r_rexmit = 0;
1297 if (proct[procnum] > 0)
1298 rep->r_flags = R_TIMING;
1299 else
1300 rep->r_flags = 0;
1301 rep->r_mrep = NULL;
1302
1303 /*
1304 * Do the client side RPC.
1305 */
1306 OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1307 /*
1308 * Chain request into list of outstanding requests. Be sure
1309 * to put it LAST so timer finds oldest requests first.
1310 */
1311 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1312
1313 /*
1314 * If backing off another request or avoiding congestion, don't
1315 * send this one now but let timer do it. If not timing a request,
1316 * do it now.
1317 */
1318 if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1319 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1320 nmp->nm_sent < nmp->nm_cwnd)) {
1321 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1322
1323 if (connrequired)
1324 error = nfs_sndlock(rep);
1325
1326 /*
1327 * Set the R_SENT before doing the send in case another thread
1328 * processes the reply before the nfs_send returns here
1329 */
1330 if (!error) {
1331 if ((rep->r_flags & R_MUSTRESEND) == 0) {
1332 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1333 nmp->nm_cwnd);
1334 nmp->nm_sent += NFS_CWNDSCALE;
1335 rep->r_flags |= R_SENT;
1336 }
1337
1338 error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1339 if (!error)
1340 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1341 if (connrequired)
1342 nfs_sndunlock(rep);
1343 }
1344 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1345 if (error) {
1346 if (nmp)
1347 nmp->nm_sent -= NFS_CWNDSCALE;
1348 rep->r_flags &= ~R_SENT;
1349 }
1350 } else {
1351 rep->r_rtt = -1;
1352 }
1353
1354 /*
1355 * Wait for the reply from our send or the timer's.
1356 */
1357 if (!error || error == EPIPE)
1358 error = nfs_reply(rep);
1359
1360 /*
1361 * RPC done, unlink the request.
1362 */
1363 nfs_repdequeue(rep);
1364
1365 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1366
1367 /*
1368 * Decrement the outstanding request count.
1369 */
1370 if (rep->r_flags & R_SENT) {
1371 rep->r_flags &= ~R_SENT; /* paranoia */
1372 if (nmp) {
1373 FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1374 nmp->nm_sent -= NFS_CWNDSCALE;
1375 }
1376 }
1377
1378 /*
1379 * If there was a successful reply and a tprintf msg,
1380 * tprintf a response.
1381 */
1382 if (!error)
1383 nfs_up(nmp, procp, NFSSTA_TIMEO,
1384 (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1385 mrep = rep->r_mrep;
1386 md = rep->r_md;
1387 dpos = rep->r_dpos;
1388 if (!error && !nmp)
1389 error = ENXIO;
1390 if (error) {
1391 mbuf_freem(rep->r_mreq);
1392 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1393 return (error);
1394 }
1395
1396 /*
1397 * break down the rpc header and check if ok
1398 */
1399 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1400 if (*tl++ == rpc_msgdenied) {
1401 if (*tl == rpc_mismatch)
1402 error = EOPNOTSUPP;
1403 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1404 if (!failed_auth) {
1405 failed_auth++;
1406 error = mbuf_setnext(mheadend, NULL);
1407 mbuf_freem(mrep);
1408 mbuf_freem(rep->r_mreq);
1409 if (!error)
1410 goto kerbauth;
1411 printf("nfs_request: mbuf_setnext failed\n");
1412 } else
1413 error = EAUTH;
1414 } else
1415 error = EACCES;
1416 mbuf_freem(mrep);
1417 mbuf_freem(rep->r_mreq);
1418 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1419 return (error);
1420 }
1421
1422 /*
1423 * Grab any Kerberos verifier, otherwise just throw it away.
1424 */
1425 verf_type = fxdr_unsigned(int, *tl++);
1426 i = fxdr_unsigned(int, *tl);
1427 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1428 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1429 if (error)
1430 goto nfsmout;
1431 } else if (i > 0)
1432 nfsm_adv(nfsm_rndup(i));
1433 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1434 /* 0 == ok */
1435 if (*tl == 0) {
1436 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1437 if (*tl != 0) {
1438 error = fxdr_unsigned(int, *tl);
1439 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1440 error == NFSERR_TRYLATER) {
1441 mbuf_freem(mrep);
1442 error = 0;
1443 microuptime(&now);
1444 waituntil = now.tv_sec + trylater_delay;
1445 while (now.tv_sec < waituntil) {
1446 tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1447 microuptime(&now);
1448 }
1449 trylater_delay *= 2;
1450 if (trylater_delay > 60)
1451 trylater_delay = 60;
1452 goto tryagain;
1453 }
1454
1455 /*
1456 * If the File Handle was stale, invalidate the
1457 * lookup cache, just in case.
1458 */
1459 if ((error == ESTALE) && vp)
1460 cache_purge(vp);
1461 if (nmp->nm_flag & NFSMNT_NFSV3) {
1462 *mrp = mrep;
1463 *mdp = md;
1464 *dposp = dpos;
1465 error |= NFSERR_RETERR;
1466 } else {
1467 mbuf_freem(mrep);
1468 error &= ~NFSERR_RETERR;
1469 }
1470 mbuf_freem(rep->r_mreq);
1471 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1472 return (error);
1473 }
1474
1475 *mrp = mrep;
1476 *mdp = md;
1477 *dposp = dpos;
1478 mbuf_freem(rep->r_mreq);
1479 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1480 return (0);
1481 }
1482 mbuf_freem(mrep);
1483 error = EPROTONOSUPPORT;
1484 nfsmout:
1485 mbuf_freem(rep->r_mreq);
1486 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1487 return (error);
1488 }
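/*
 * Illustrative sketch (not part of the original file): the NFSERR_TRYLATER
 * backoff in nfs_request() above. Each "try later" reply doubles the delay
 * before the retry, capped at 60 seconds.
 */
#if 0
	int trylater_delay = NFS_TRYLATERDEL;	/* initial delay, seconds */
	for (;;) {
		/* ... sleep trylater_delay seconds and resend the request;
		 * if the server again replies NFSERR_TRYLATER: */
		trylater_delay *= 2;
		if (trylater_delay > 60)
			trylater_delay = 60;
	}
#endif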
1489
1490 #ifndef NFS_NOSERVER
1491 /*
1492 * Generate the rpc reply header
1493 * siz arg. is used to decide if adding a cluster is worthwhile
1494 */
1495 int
1496 nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
1497 int siz;
1498 struct nfsrv_descript *nd;
1499 struct nfssvc_sock *slp;
1500 int err;
1501 mbuf_t *mrq;
1502 mbuf_t *mbp;
1503 caddr_t *bposp;
1504 {
1505 u_long *tl;
1506 mbuf_t mreq;
1507 caddr_t bpos;
1508 mbuf_t mb, mb2;
1509 int error, mlen;
1510
1511 /*
1512 * If this is a big reply, use a cluster else
1513 * try and leave leading space for the lower level headers.
1514 */
1515 siz += RPC_REPLYSIZ;
1516 if (siz >= nfs_mbuf_minclsize) {
1517 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1518 } else {
1519 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1520 }
1521 if (error) {
1522 /* unable to allocate packet */
1523 /* XXX nfsstat? */
1524 return (error);
1525 }
1526 mb = mreq;
1527 tl = mbuf_data(mreq);
1528 mlen = 6 * NFSX_UNSIGNED;
1529 if (siz < nfs_mbuf_minclsize) {
1530 /* leave space for lower level headers */
1531 tl += 80/sizeof(*tl); /* XXX max_hdr? XXX */
1532 mbuf_setdata(mreq, tl, mlen);
1533 } else {
1534 mbuf_setlen(mreq, mlen);
1535 }
1536 bpos = ((caddr_t)tl) + mlen;
1537 *tl++ = txdr_unsigned(nd->nd_retxid);
1538 *tl++ = rpc_reply;
1539 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1540 *tl++ = rpc_msgdenied;
1541 if (err & NFSERR_AUTHERR) {
1542 *tl++ = rpc_autherr;
1543 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1544 mlen -= NFSX_UNSIGNED;
1545 mbuf_setlen(mreq, mlen);
1546 bpos -= NFSX_UNSIGNED;
1547 } else {
1548 *tl++ = rpc_mismatch;
1549 *tl++ = txdr_unsigned(RPC_VER2);
1550 *tl = txdr_unsigned(RPC_VER2);
1551 }
1552 } else {
1553 *tl++ = rpc_msgaccepted;
1554
1555 /*
1556 * For Kerberos authentication, we must send the nickname
1557 * verifier back, otherwise just RPCAUTH_NULL.
1558 */
1559 if (nd->nd_flag & ND_KERBFULL) {
1560 struct nfsuid *nuidp;
1561 struct timeval ktvin, ktvout;
1562 uid_t uid = kauth_cred_getuid(nd->nd_cr);
1563
1564 lck_rw_lock_shared(&slp->ns_rwlock);
1565 for (nuidp = NUIDHASH(slp, uid)->lh_first;
1566 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1567 if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1568 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1569 &nuidp->nu_haddr, nd->nd_nam2)))
1570 break;
1571 }
1572 if (nuidp) {
1573 ktvin.tv_sec =
1574 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1575 ktvin.tv_usec =
1576 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1577
1578 /*
1579 * Encrypt the timestamp in ecb mode using the
1580 * session key.
1581 */
1582 #if NFSKERB
1583 XXX
1584 #endif
1585
1586 *tl++ = rpc_auth_kerb;
1587 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1588 *tl = ktvout.tv_sec;
1589 nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1590 *tl++ = ktvout.tv_usec;
1591 *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1592 } else {
1593 *tl++ = 0;
1594 *tl++ = 0;
1595 }
1596 lck_rw_done(&slp->ns_rwlock);
1597 } else {
1598 *tl++ = 0;
1599 *tl++ = 0;
1600 }
1601 switch (err) {
1602 case EPROGUNAVAIL:
1603 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1604 break;
1605 case EPROGMISMATCH:
1606 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1607 nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1608 // XXX hard coded versions
1609 *tl++ = txdr_unsigned(2);
1610 *tl = txdr_unsigned(3);
1611 break;
1612 case EPROCUNAVAIL:
1613 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1614 break;
1615 case EBADRPC:
1616 *tl = txdr_unsigned(RPC_GARBAGE);
1617 break;
1618 default:
1619 *tl = 0;
1620 if (err != NFSERR_RETVOID) {
1621 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1622 if (err)
1623 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1624 else
1625 *tl = 0;
1626 }
1627 break;
1628 }
1629 }
1630
1631 if (mrq != NULL)
1632 *mrq = mreq;
1633 *mbp = mb;
1634 *bposp = bpos;
1635 if (err != 0 && err != NFSERR_RETVOID) {
1636 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1637 }
1638 return (0);
1639 }
1640
1641
1642 #endif /* NFS_NOSERVER */
1643
1644
1645 /*
1646 * From FreeBSD 1.58, a Matt Dillon fix...
1647 * Flag a request as being about to terminate.
1648 * The nm_sent count is decremented now to avoid deadlocks when the process
1649 * in soreceive() hasn't yet managed to send its own request.
1650 */
1651 static void
1652 nfs_softterm(struct nfsreq *rep)
1653 {
1654
1655 rep->r_flags |= R_SOFTTERM;
1656 if (rep->r_flags & R_SENT) {
1657 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1658 rep->r_nmp->nm_cwnd);
1659 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1660 rep->r_flags &= ~R_SENT;
1661 }
1662 }
1663
1664 void
1665 nfs_timer_funnel(void * arg)
1666 {
1667 (void) thread_funnel_set(kernel_flock, TRUE);
1668 nfs_timer(arg);
1669 (void) thread_funnel_set(kernel_flock, FALSE);
1670
1671 }
1672
1673 /*
1674 * Ensure rep isn't in use by the timer, then dequeue it.
1675 */
1676 static void
1677 nfs_repdequeue(struct nfsreq *rep)
1678 {
1679
1680 while ((rep->r_flags & R_BUSY)) {
1681 rep->r_flags |= R_WAITING;
1682 tsleep(rep, PSOCK, "repdeq", 0);
1683 }
1684 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1685 }
1686
1687 /*
1688 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
1689 * free()'d out from under it.
1690 */
1691 static void
1692 nfs_repbusy(struct nfsreq *rep)
1693 {
1694
1695 if ((rep->r_flags & R_BUSY))
1696 panic("rep locked");
1697 rep->r_flags |= R_BUSY;
1698 }
1699
1700 /*
1701 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1702 */
1703 static struct nfsreq *
1704 nfs_repnext(struct nfsreq *rep)
1705 {
1706 struct nfsreq * nextrep;
1707
1708 if (rep == NULL)
1709 return (NULL);
1710 /*
1711 * We need to get and busy the next req before signalling the
1712 * current one, otherwise wakeup() may block us and we'll race to
1713 * grab the next req.
1714 */
1715 nextrep = TAILQ_NEXT(rep, r_chain);
1716 if (nextrep != NULL)
1717 nfs_repbusy(nextrep);
1718 /* unbusy and signal. */
1719 rep->r_flags &= ~R_BUSY;
1720 if ((rep->r_flags & R_WAITING)) {
1721 rep->r_flags &= ~R_WAITING;
1722 wakeup(rep);
1723 }
1724 return (nextrep);
1725 }
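/*
 * Illustrative sketch (not part of the original file): how nfs_timer()
 * uses nfs_repbusy()/nfs_repnext() to walk nfs_reqq hand-over-hand, so
 * the request being examined can't be freed out from under it.
 */
#if 0
	struct nfsreq *rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);		/* pin the first request */
	for (; rep != NULL; rep = nfs_repnext(rep)) {
		/* rep is busied here; nfs_repnext() busies the successor
		 * before unbusying (and waking any waiter on) rep */
	}
#endif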
1726
1727 /*
1728 * Nfs timer routine
1729 * Scan the nfsreq list and retransmit any requests that have timed out.
1730 * To avoid retransmission attempts on STREAM sockets (in the future) make
1731 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1732 */
1733 void
1734 nfs_timer(__unused void *arg)
1735 {
1736 struct nfsreq *rep;
1737 mbuf_t m;
1738 socket_t so;
1739 struct nfsmount *nmp;
1740 int timeo;
1741 int error;
1742 #ifndef NFS_NOSERVER
1743 struct nfssvc_sock *slp;
1744 u_quad_t cur_usec;
1745 #endif /* NFS_NOSERVER */
1746 int flags, rexmit, cwnd, sent;
1747 u_long xid;
1748 struct timeval now;
1749
1750 rep = TAILQ_FIRST(&nfs_reqq);
1751 if (rep != NULL)
1752 nfs_repbusy(rep);
1753 microuptime(&now);
1754 for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1755 nmp = rep->r_nmp;
1756 if (!nmp) /* unmounted */
1757 continue;
1758 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1759 continue;
1760 if (nfs_sigintr(nmp, rep, rep->r_procp))
1761 continue;
1762 if (nmp->nm_tprintf_initial_delay != 0 &&
1763 (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1764 rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1765 rep->r_lastmsg = now.tv_sec;
1766 nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1767 "not responding");
1768 rep->r_flags |= R_TPRINTFMSG;
1769 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1770 /* we're not yet completely mounted and */
1771 /* we can't complete an RPC, so we fail */
1772 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1773 nfs_softterm(rep);
1774 continue;
1775 }
1776 }
1777 if (rep->r_rtt >= 0) {
1778 rep->r_rtt++;
1779 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1780 timeo = nmp->nm_timeo;
1781 else
1782 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1783 /* ensure 62.5 ms floor */
1784 while (16 * timeo < hz)
1785 timeo *= 2;
1786 if (nmp->nm_timeouts > 0)
1787 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1788 if (rep->r_rtt <= timeo)
1789 continue;
1790 if (nmp->nm_timeouts < 8)
1791 nmp->nm_timeouts++;
1792 }
1793 /*
1794 * Check for too many retransmits. This is never true for
1795 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1796 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1797 */
1798 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1799 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1800 nfs_softterm(rep);
1801 continue;
1802 }
1803 if (nmp->nm_sotype != SOCK_DGRAM) {
1804 if (++rep->r_rexmit > NFS_MAXREXMIT)
1805 rep->r_rexmit = NFS_MAXREXMIT;
1806 continue;
1807 }
1808 if ((so = nmp->nm_so) == NULL)
1809 continue;
1810
1811 /*
1812 * If there is enough space and the window allows,
1813 * resend it.
1814 * Set r_rtt to -1 in case we fail to send it now.
1815 */
1816 rep->r_rtt = -1;
1817 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1818 (rep->r_flags & R_SENT) ||
1819 nmp->nm_sent < nmp->nm_cwnd) &&
1820 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1821 struct msghdr msg;
1822 /*
1823 * Iff first send, start timing
1824 * else turn timing off, backoff timer
1825 * and divide congestion window by 2.
1826 * We update these *before* the send to avoid
1827 * racing against receiving the reply.
1828 * We save them so we can restore them on send error.
1829 */
1830 flags = rep->r_flags;
1831 rexmit = rep->r_rexmit;
1832 cwnd = nmp->nm_cwnd;
1833 sent = nmp->nm_sent;
1834 xid = rep->r_xid;
1835 if (rep->r_flags & R_SENT) {
1836 rep->r_flags &= ~R_TIMING;
1837 if (++rep->r_rexmit > NFS_MAXREXMIT)
1838 rep->r_rexmit = NFS_MAXREXMIT;
1839 nmp->nm_cwnd >>= 1;
1840 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1841 nmp->nm_cwnd = NFS_CWNDSCALE;
1842 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1843 } else {
1844 rep->r_flags |= R_SENT;
1845 nmp->nm_sent += NFS_CWNDSCALE;
1846 }
1847 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1848
1849 bzero(&msg, sizeof(msg));
1850 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1851 msg.msg_name = mbuf_data(nmp->nm_nam);
1852 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1853 }
1854 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1855
1856 FSDBG(535, xid, error, sent, cwnd);
1857
1858 if (error) {
1859 if (error == EWOULDBLOCK) {
1860 rep->r_flags = flags;
1861 rep->r_rexmit = rexmit;
1862 nmp->nm_cwnd = cwnd;
1863 nmp->nm_sent = sent;
1864 rep->r_xid = xid;
1865 }
1866 else {
1867 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1868 int clearerror;
1869 int optlen = sizeof(clearerror);
1870 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1871 }
1872 rep->r_flags = flags | R_RESENDERR;
1873 rep->r_rexmit = rexmit;
1874 nmp->nm_cwnd = cwnd;
1875 nmp->nm_sent = sent;
1876 if (flags & R_SENT)
1877 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1878 }
1879 } else
1880 rep->r_rtt = 0;
1881 }
1882 }
1883 microuptime(&now);
1884 #ifndef NFS_NOSERVER
1885 /*
1886 * Scan the write gathering queues for writes that need to be
1887 * completed now.
1888 */
1889 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1890 lck_mtx_lock(nfsd_mutex);
1891 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1892 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1893 nfsrv_wakenfsd(slp);
1894 }
1895 lck_mtx_unlock(nfsd_mutex);
1896 #endif /* NFS_NOSERVER */
1897
1898 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1899 /*
1900 * We haven't called nfs_buf_freeup() in a little while.
1901 * So, see if we can free up any stale/unused bufs now.
1902 */
1903 nfs_buf_freeup(1);
1904 }
1905
1906 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1907
1908 }
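/*
 * Illustrative worked example (not part of the original file) of the
 * timeout test in nfs_timer() above. With hz == 100 and a computed rto of
 * 3 ticks, the "while (16 * timeo < hz)" loop doubles 3 -> 6 -> 12 so that
 * timeo reaches hz/16 ticks (the 62.5 ms floor). With two prior timeouts
 * on the mount, nfs_backoff[1] == 4 then scales that to 48 ticks, and the
 * request is retransmitted only once r_rtt exceeds it.
 */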
1909
1910
1911 /*
1912 * Test for a termination condition pending on the process.
1913 * This is used to determine if we need to bail on a mount.
1914 * EIO is returned if there has been a soft timeout.
1915 * EINTR is returned if there is a signal pending that is not being ignored
1916 * and the mount is interruptible, or if we are a thread that is in the process
1917 * of cancellation (also SIGKILL posted).
1918 */
1919 int
1920 nfs_sigintr(nmp, rep, p)
1921 struct nfsmount *nmp;
1922 struct nfsreq *rep;
1923 proc_t p;
1924 {
1925 sigset_t pending_sigs;
1926 int context_good = 0;
1927 struct nfsmount *repnmp;
1928 extern proc_t kernproc;
1929
1930 if (nmp == NULL)
1931 return (ENXIO);
1932 if (rep != NULL) {
1933 repnmp = rep->r_nmp;
1934 /* we've had a forced unmount. */
1935 if (repnmp == NULL)
1936 return (ENXIO);
1937 /* request has timed out on a 'soft' mount. */
1938 if (rep->r_flags & R_SOFTTERM)
1939 return (EIO);
1940 /*
1941 * We're in the middle of a forced unmount and there's
1942 * been a timeout; we're dead and fail I/O.
1943 */
1944 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1945 (NFSSTA_FORCE|NFSSTA_TIMEO))
1946 return (EIO);
1947 /* Someone is unmounting us, go soft and mark it. */
1948 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1949 repnmp->nm_flag |= NFSMNT_SOFT;
1950 nmp->nm_state |= NFSSTA_FORCE;
1951 }
1952 /*
1953 * If the mount is hung and we've requested not to hang
1954 * on remote filesystems, then bail now.
1955 */
1956 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1957 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1958 return (EIO);
1959 }
1960 /* XXX: is this valid? this probably should be an assertion. */
1961 if (p == NULL)
1962 return (0);
1963
1964 /* If this thread belongs to the kernel task, the abort check is not needed. */
1965 if ((current_proc() != kernproc) && current_thread_aborted()) {
1966 return (EINTR);
1967 }
1968 /* Mask off signals that are blocked by the thread or the process. */
1969
1970 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1971 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1972 return (EINTR);
1973 return (0);
1974 }
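
/*
 * Usage sketch (example only, not in the original source): code that
 * blocks waiting on NFS state re-checks nfs_sigintr() each time around
 * its sleep loop, so a soft timeout (EIO), a pending signal on an
 * interruptible mount (EINTR), or a vanished mount (ENXIO) aborts the
 * wait.  ex_wait_for_reply is a hypothetical caller:
 */
#if 0	/* example only */
static int
ex_wait_for_reply(struct nfsmount *nmp, struct nfsreq *rep, proc_t p)
{
	int error;

	while (rep->r_mrep == NULL) {		/* reply not yet arrived */
		if ((error = nfs_sigintr(nmp, rep, p)))
			return (error);		/* EIO, EINTR, or ENXIO */
		tsleep((caddr_t)rep, PZERO - 1, "exwait", hz);
	}
	return (0);
}
#endif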
1975
1976 /*
1977 * Lock a socket against others.
1978 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1979 * and also to avoid race conditions between the processes with nfs requests
1980 * in progress when a reconnect is necessary.
1981 */
1982 int
1983 nfs_sndlock(rep)
1984 struct nfsreq *rep;
1985 {
1986 int *statep;
1987 proc_t p;
1988 int error, slpflag = 0, slptimeo = 0;
1989
1990 if (rep->r_nmp == NULL)
1991 return (ENXIO);
1992 statep = &rep->r_nmp->nm_state;
1993
1994 p = rep->r_procp;
1995 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1996 slpflag = PCATCH;
1997 while (*statep & NFSSTA_SNDLOCK) {
1998 error = nfs_sigintr(rep->r_nmp, rep, p);
1999 if (error)
2000 return (error);
2001 *statep |= NFSSTA_WANTSND;
2002 if (p != NULL && (proc_noremotehang(p)) != 0)
2003 slptimeo = hz;
2004 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2005 if (slpflag == PCATCH) {
2006 slpflag = 0;
2007 slptimeo = 2 * hz;
2008 }
2009 /*
2010 * Make sure while we slept that the mountpoint didn't go away.
2011 * nfs_sigintr and callers expect it intact.
2012 */
2013 if (!rep->r_nmp)
2014 return (ENXIO); /* don't have lock until out of loop */
2015 }
2016 *statep |= NFSSTA_SNDLOCK;
2017 return (0);
2018 }
2019
2020 /*
2021 * Unlock the stream socket for others.
2022 */
2023 void
2024 nfs_sndunlock(rep)
2025 struct nfsreq *rep;
2026 {
2027 int *statep;
2028
2029 if (rep->r_nmp == NULL)
2030 return;
2031 statep = &rep->r_nmp->nm_state;
2032 if ((*statep & NFSSTA_SNDLOCK) == 0)
2033 panic("nfs sndunlock");
2034 *statep &= ~NFSSTA_SNDLOCK;
2035 if (*statep & NFSSTA_WANTSND) {
2036 *statep &= ~NFSSTA_WANTSND;
2037 wakeup((caddr_t)statep);
2038 }
2039 }
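
/*
 * The send lock above is the classic BSD flag-based sleep lock: a LOCKED
 * bit guards the resource, a WANTED bit records that a sleeper exists,
 * and tsleep()/wakeup() on the address of the flag word provide the
 * rendezvous.  A minimal sketch of the bare pattern (hypothetical names,
 * no signal or timeout handling):
 */
#if 0	/* example only */
#define EX_LOCKED	0x01
#define EX_WANTED	0x02

static void
ex_flag_lock(int *statep)
{
	while (*statep & EX_LOCKED) {
		*statep |= EX_WANTED;
		tsleep((caddr_t)statep, PZERO - 1, "exlck", 0);
	}
	*statep |= EX_LOCKED;
}

static void
ex_flag_unlock(int *statep)
{
	*statep &= ~EX_LOCKED;
	if (*statep & EX_WANTED) {
		*statep &= ~EX_WANTED;
		wakeup((caddr_t)statep);
	}
}
#endif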
2040
2041 static int
2042 nfs_rcvlock(struct nfsreq *rep)
2043 {
2044 int *statep;
2045 int error, slpflag, slptimeo = 0;
2046
2047 /* make sure we still have our mountpoint */
2048 if (!rep->r_nmp) {
2049 if (rep->r_mrep != NULL)
2050 return (EALREADY);
2051 return (ENXIO);
2052 }
2053
2054 statep = &rep->r_nmp->nm_state;
2055 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2056 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2057 slpflag = PCATCH;
2058 else
2059 slpflag = 0;
2060 while (*statep & NFSSTA_RCVLOCK) {
2061 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2062 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2063 return (error);
2064 } else if (rep->r_mrep != NULL) {
2065 /*
2066 * Don't bother sleeping if reply already arrived
2067 */
2068 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2069 return (EALREADY);
2070 }
2071 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2072 *statep |= NFSSTA_WANTRCV;
2073 /*
2074 * We need to poll if we're P_NOREMOTEHANG so that we
2075 * call nfs_sigintr periodically above.
2076 */
2077 if (rep->r_procp != NULL &&
2078 (proc_noremotehang(rep->r_procp)) != 0)
2079 slptimeo = hz;
2080 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2081 if (slpflag == PCATCH) {
2082 slpflag = 0;
2083 slptimeo = 2 * hz;
2084 }
2085 /*
2086 * Make sure while we slept that the mountpoint didn't go away.
2087 * nfs_sigintr and caller nfs_reply expect it intact.
2088 */
2089 if (!rep->r_nmp) {
2090 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2091 return (ENXIO); /* don't have lock until out of loop */
2092 }
2093 }
2094 /*
2095 * nfs_reply will handle it if reply already arrived.
2096 * (We may have slept or been preempted).
2097 */
2098 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2099 *statep |= NFSSTA_RCVLOCK;
2100 return (0);
2101 }
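
/*
 * Note on the sleep parameters used above (illustration, not original
 * code): the first tsleep() uses PCATCH so a signal wakes the sleeper
 * immediately; after the first wakeup the loop drops PCATCH and bounds
 * every later sleep at 2 seconds, which guarantees the nfs_sigintr()
 * check at the top of the loop still runs periodically.  The same shape,
 * reduced to its skeleton with hypothetical names:
 */
#if 0	/* example only */
static void
ex_poll_until_clear(int *statep, int flagbit, int interruptible)
{
	int slpflag = interruptible ? PCATCH : 0;
	int slptimeo = 0;

	while (*statep & flagbit) {
		/* a real caller would check nfs_sigintr() here */
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "expoll", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;		/* only catch signals once */
			slptimeo = 2 * hz;	/* then poll every 2 seconds */
		}
	}
}
#endif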
2102
2103 /*
2104 * Unlock the socket receive lock for others.
2105 */
2106 static void
2107 nfs_rcvunlock(struct nfsreq *rep)
2108 {
2109 int *statep;
2110
2111 if (rep->r_nmp == NULL)
2112 return;
2113 statep = &rep->r_nmp->nm_state;
2114
2115 FSDBG(533, statep, *statep, 0, 0);
2116 if ((*statep & NFSSTA_RCVLOCK) == 0)
2117 panic("nfs rcvunlock");
2118 *statep &= ~NFSSTA_RCVLOCK;
2119 if (*statep & NFSSTA_WANTRCV) {
2120 *statep &= ~NFSSTA_WANTRCV;
2121 wakeup((caddr_t)statep);
2122 }
2123 }
2124
2125
2126 #ifndef NFS_NOSERVER
2127 /*
2128 * Socket upcall routine for the nfsd sockets.
2129 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2130 * Essentially do as much as possible non-blocking; otherwise punt, and
2131 * this routine will be called again with MBUF_WAITOK from an nfsd.
2132 */
2133 void
2134 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2135 {
2136 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2137
2138 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2139 return;
2140
2141 lck_rw_lock_exclusive(&slp->ns_rwlock);
2142 nfsrv_rcv_locked(so, slp, waitflag);
2143 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2144 }
2145 void
2146 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2147 {
2148 mbuf_t m, mp, mhck, m2;
2149 int ns_flag=0, error;
2150 struct msghdr msg;
2151 size_t bytes_read;
2152
2153 if ((slp->ns_flag & SLP_VALID) == 0) {
2154 if (waitflag == MBUF_DONTWAIT)
2155 lck_rw_done(&slp->ns_rwlock);
2156 return;
2157 }
2158
2159 #ifdef notdef
2160 /*
2161 * Define this to test how the nfsds handle this case under heavy load.
2162 */
2163 if (waitflag == MBUF_DONTWAIT) {
2164 ns_flag = SLP_NEEDQ;
2165 goto dorecs;
2166 }
2167 #endif
2168 if (slp->ns_sotype == SOCK_STREAM) {
2169 /*
2170 * If there are already records on the queue, defer soreceive()
2171 * to an nfsd so that there is feedback to the TCP layer that
2172 * the nfs servers are heavily loaded.
2173 */
2174 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2175 ns_flag = SLP_NEEDQ;
2176 goto dorecs;
2177 }
2178
2179 /*
2180 * Do soreceive().
2181 */
2182 bytes_read = 1000000000;
2183 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2184 if (error || mp == NULL) {
2185 if (error == EWOULDBLOCK)
2186 ns_flag = SLP_NEEDQ;
2187 else
2188 ns_flag = SLP_DISCONN;
2189 goto dorecs;
2190 }
2191 m = mp;
2192 if (slp->ns_rawend) {
2193 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2194 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2195 slp->ns_cc += bytes_read;
2196 } else {
2197 slp->ns_raw = m;
2198 slp->ns_cc = bytes_read;
2199 }
2200 while ((m2 = mbuf_next(m)))
2201 m = m2;
2202 slp->ns_rawend = m;
2203
2204 /*
2205 * Now try to parse record(s) out of the raw stream data.
2206 */
2207 error = nfsrv_getstream(slp, waitflag);
2208 if (error) {
2209 if (error == EPERM)
2210 ns_flag = SLP_DISCONN;
2211 else
2212 ns_flag = SLP_NEEDQ;
2213 }
2214 } else {
2215 struct sockaddr_storage nam;
2216
2217 bzero(&msg, sizeof(msg));
2218 msg.msg_name = (caddr_t)&nam;
2219 msg.msg_namelen = sizeof(nam);
2220
2221 do {
2222 bytes_read = 1000000000;
2223 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2224 if (mp) {
2225 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2226 mbuf_setlen(mhck, nam.ss_len);
2227 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2228 m = mhck;
2229 if (mbuf_setnext(m, mp)) {
2230 /* trouble... just drop it */
2231 printf("nfsrv_rcv: mbuf_setnext failed\n");
2232 mbuf_free(mhck);
2233 m = mp;
2234 }
2235 } else {
2236 m = mp;
2237 }
2238 if (slp->ns_recend)
2239 mbuf_setnextpkt(slp->ns_recend, m);
2240 else
2241 slp->ns_rec = m;
2242 slp->ns_recend = m;
2243 mbuf_setnextpkt(m, NULL);
2244 }
2245 #if 0
2246 if (error) {
2247 /*
2248 * This may be needed in the future to support
2249 * non-byte-stream connection-oriented protocols
2250 * such as SCTP.
2251 */
2252 /*
2253 * This (slp->ns_sotype == SOCK_STREAM) should really
2254 * be a check for PR_CONNREQUIRED.
2255 */
2256 if ((slp->ns_sotype == SOCK_STREAM)
2257 && error != EWOULDBLOCK) {
2258 ns_flag = SLP_DISCONN;
2259 goto dorecs;
2260 }
2261 }
2262 #endif
2263 } while (mp);
2264 }
2265
2266 /*
2267 * Now try to process the request records, non-blocking.
2268 */
2269 dorecs:
2270 if (ns_flag)
2271 slp->ns_flag |= ns_flag;
2272 if (waitflag == MBUF_DONTWAIT) {
2273 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2274 lck_rw_done(&slp->ns_rwlock);
2275 if (wake && nfs_numnfsd) {
2276 lck_mtx_lock(nfsd_mutex);
2277 nfsrv_wakenfsd(slp);
2278 lck_mtx_unlock(nfsd_mutex);
2279 }
2280 }
2281 }
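
/*
 * For datagram sockets, the receive loop above tags each record with its
 * source: the sender's sockaddr is copied into a leading MBUF_TYPE_SONAME
 * mbuf and the payload chained behind it, so nfsrv_dorec() can split the
 * address back off later.  A sketch of that record construction
 * (hypothetical helper, example only; falls back to the bare payload if
 * the address mbuf can't be set up):
 */
#if 0	/* example only */
static mbuf_t
ex_tag_record(struct sockaddr_storage *nam, mbuf_t payload)
{
	mbuf_t mhck;

	if (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) != 0)
		return (payload);
	mbuf_setlen(mhck, nam->ss_len);
	bcopy(nam, mbuf_data(mhck), nam->ss_len);
	if (mbuf_setnext(mhck, payload)) {	/* chain: [SONAME] -> [data] */
		mbuf_free(mhck);
		return (payload);
	}
	return (mhck);
}
#endif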
2282
2283 /*
2284 * Try to extract an RPC request from the mbuf data list received on a
2285 * stream socket. The "waitflag" argument indicates whether or not it
2286 * can sleep.
2287 */
2288 static int
2289 nfsrv_getstream(slp, waitflag)
2290 struct nfssvc_sock *slp;
2291 int waitflag;
2292 {
2293 mbuf_t m;
2294 char *cp1, *cp2, *mdata;
2295 int len, mlen, error;
2296 mbuf_t om, m2, recm;
2297 u_long recmark;
2298
2299 if (slp->ns_flag & SLP_GETSTREAM)
2300 panic("nfs getstream");
2301 slp->ns_flag |= SLP_GETSTREAM;
2302 for (;;) {
2303 if (slp->ns_reclen == 0) {
2304 if (slp->ns_cc < NFSX_UNSIGNED) {
2305 slp->ns_flag &= ~SLP_GETSTREAM;
2306 return (0);
2307 }
2308 m = slp->ns_raw;
2309 mdata = mbuf_data(m);
2310 mlen = mbuf_len(m);
2311 if (mlen >= NFSX_UNSIGNED) {
2312 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2313 mdata += NFSX_UNSIGNED;
2314 mlen -= NFSX_UNSIGNED;
2315 mbuf_setdata(m, mdata, mlen);
2316 } else {
2317 cp1 = (caddr_t)&recmark;
2318 cp2 = mdata;
2319 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2320 while (mlen == 0) {
2321 m = mbuf_next(m);
2322 cp2 = mbuf_data(m);
2323 mlen = mbuf_len(m);
2324 }
2325 *cp1++ = *cp2++;
2326 mlen--;
2327 mbuf_setdata(m, cp2, mlen);
2328 }
2329 }
2330 slp->ns_cc -= NFSX_UNSIGNED;
2331 recmark = ntohl(recmark);
2332 slp->ns_reclen = recmark & ~0x80000000;
2333 if (recmark & 0x80000000)
2334 slp->ns_flag |= SLP_LASTFRAG;
2335 else
2336 slp->ns_flag &= ~SLP_LASTFRAG;
2337 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2338 slp->ns_flag &= ~SLP_GETSTREAM;
2339 return (EPERM);
2340 }
2341 }
2342
2343 /*
2344 * Now get the record part.
2345 *
2346 * Note that slp->ns_reclen may be 0. Linux sometimes
2347 * generates 0-length RPCs.
2348 */
2349 recm = NULL;
2350 if (slp->ns_cc == slp->ns_reclen) {
2351 recm = slp->ns_raw;
2352 slp->ns_raw = slp->ns_rawend = NULL;
2353 slp->ns_cc = slp->ns_reclen = 0;
2354 } else if (slp->ns_cc > slp->ns_reclen) {
2355 len = 0;
2356 m = slp->ns_raw;
2357 mlen = mbuf_len(m);
2358 mdata = mbuf_data(m);
2359 om = NULL;
2360 while (len < slp->ns_reclen) {
2361 if ((len + mlen) > slp->ns_reclen) {
2362 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2363 slp->ns_flag &= ~SLP_GETSTREAM;
2364 return (EWOULDBLOCK);
2365 }
2366 if (om) {
2367 if (mbuf_setnext(om, m2)) {
2368 /* trouble... just drop it */
2369 printf("nfsrv_getstream: mbuf_setnext failed\n");
2370 mbuf_freem(m2);
2371 slp->ns_flag &= ~SLP_GETSTREAM;
2372 return (EWOULDBLOCK);
2373 }
2374 recm = slp->ns_raw;
2375 } else {
2376 recm = m2;
2377 }
2378 mdata += slp->ns_reclen - len;
2379 mlen -= slp->ns_reclen - len;
2380 mbuf_setdata(m, mdata, mlen);
2381 len = slp->ns_reclen;
2382 } else if ((len + mlen) == slp->ns_reclen) {
2383 om = m;
2384 len += mlen;
2385 m = mbuf_next(m);
2386 recm = slp->ns_raw;
2387 if (mbuf_setnext(om, NULL)) {
2388 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2389 slp->ns_flag &= ~SLP_GETSTREAM;
2390 return (EWOULDBLOCK);
2391 }
2392 mlen = mbuf_len(m);
2393 mdata = mbuf_data(m);
2394 } else {
2395 om = m;
2396 len += mlen;
2397 m = mbuf_next(m);
2398 mlen = mbuf_len(m);
2399 mdata = mbuf_data(m);
2400 }
2401 }
2402 slp->ns_raw = m;
2403 slp->ns_cc -= len;
2404 slp->ns_reclen = 0;
2405 } else {
2406 slp->ns_flag &= ~SLP_GETSTREAM;
2407 return (0);
2408 }
2409
2410 /*
2411 * Accumulate the fragments into a record.
2412 */
2413 if (slp->ns_frag == NULL) {
2414 slp->ns_frag = recm;
2415 } else {
2416 m = slp->ns_frag;
2417 while ((m2 = mbuf_next(m)))
2418 m = m2;
2419 if ((error = mbuf_setnext(m, recm)))
2420 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2421 }
2422 if (slp->ns_flag & SLP_LASTFRAG) {
2423 if (slp->ns_recend)
2424 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2425 else
2426 slp->ns_rec = slp->ns_frag;
2427 slp->ns_recend = slp->ns_frag;
2428 slp->ns_frag = NULL;
2429 }
2430 }
2431 }
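
/*
 * RPC-over-TCP framing recap (example only, not original code): each
 * fragment on the stream is preceded by a 4-byte big-endian record mark
 * whose high bit flags the last fragment of a record and whose low 31
 * bits give the fragment length.  nfsrv_getstream() above peels these
 * marks off the raw byte stream; decoding one mark in isolation:
 */
#if 0	/* example only */
static void
ex_decode_recmark(u_long recmark_be, u_long *fraglen, int *lastfrag)
{
	u_long recmark = ntohl(recmark_be);	/* wire order -> host order */

	*lastfrag = (recmark & 0x80000000) != 0;
	*fraglen = recmark & ~0x80000000;	/* length of this fragment */
}
#endif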
2432
2433 /*
2434 * Dequeue the next request record and parse its RPC header.
2435 */
2436 int
2437 nfsrv_dorec(slp, nfsd, ndp)
2438 struct nfssvc_sock *slp;
2439 struct nfsd *nfsd;
2440 struct nfsrv_descript **ndp;
2441 {
2442 mbuf_t m;
2443 mbuf_t nam;
2444 struct nfsrv_descript *nd;
2445 int error;
2446
2447 *ndp = NULL;
2448 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2449 return (ENOBUFS);
2450 MALLOC_ZONE(nd, struct nfsrv_descript *,
2451 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2452 if (!nd)
2453 return (ENOMEM);
2454 m = slp->ns_rec;
2455 slp->ns_rec = mbuf_nextpkt(m);
2456 if (slp->ns_rec)
2457 mbuf_setnextpkt(m, NULL);
2458 else
2459 slp->ns_recend = NULL;
2460 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2461 nam = m;
2462 m = mbuf_next(m);
2463 if ((error = mbuf_setnext(nam, NULL)))
2464 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2465 } else
2466 nam = NULL;
2467 nd->nd_md = nd->nd_mrep = m;
2468 nd->nd_nam2 = nam;
2469 nd->nd_dpos = mbuf_data(m);
2470 error = nfs_getreq(nd, nfsd, TRUE);
2471 if (error) {
2472 if (nam)
2473 mbuf_freem(nam);
2474 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2475 return (error);
2476 }
2477 *ndp = nd;
2478 nfsd->nfsd_nd = nd;
2479 return (0);
2480 }
2481
2482 /*
2483 * Parse an RPC request
2484 * - verify it
2485 * - fill in the cred struct.
2486 */
2487 int
2488 nfs_getreq(nd, nfsd, has_header)
2489 struct nfsrv_descript *nd;
2490 struct nfsd *nfsd;
2491 int has_header;
2492 {
2493 int len, i;
2494 u_long *tl;
2495 long t1;
2496 uio_t uiop;
2497 caddr_t dpos, cp2, cp;
2498 u_long nfsvers, auth_type;
2499 uid_t nickuid;
2500 int error = 0, ticklen;
2501 mbuf_t mrep, md;
2502 struct nfsuid *nuidp;
2503 uid_t user_id;
2504 gid_t group_id;
2505 int ngroups;
2506 struct ucred temp_cred;
2507 struct timeval tvin, tvout, now;
2508 char uio_buf[ UIO_SIZEOF(1) ];
2509 #if 0 /* until encrypted keys are implemented */
2510 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2511 #endif
2512
2513 nd->nd_cr = NULL;
2514
2515 mrep = nd->nd_mrep;
2516 md = nd->nd_md;
2517 dpos = nd->nd_dpos;
2518 if (has_header) {
2519 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2520 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2521 if (*tl++ != rpc_call) {
2522 mbuf_freem(mrep);
2523 return (EBADRPC);
2524 }
2525 } else
2526 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2527 nd->nd_repstat = 0;
2528 nd->nd_flag = 0;
2529 if (*tl++ != rpc_vers) {
2530 nd->nd_repstat = ERPCMISMATCH;
2531 nd->nd_procnum = NFSPROC_NOOP;
2532 return (0);
2533 }
2534 if (*tl != nfs_prog) {
2535 nd->nd_repstat = EPROGUNAVAIL;
2536 nd->nd_procnum = NFSPROC_NOOP;
2537 return (0);
2538 }
2539 tl++;
2540 nfsvers = fxdr_unsigned(u_long, *tl++);
2541 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2542 nd->nd_repstat = EPROGMISMATCH;
2543 nd->nd_procnum = NFSPROC_NOOP;
2544 return (0);
2545 }
2546 else if (nfsvers == NFS_VER3)
2547 nd->nd_flag = ND_NFSV3;
2548 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2549 if (nd->nd_procnum == NFSPROC_NULL)
2550 return (0);
2551 if ((nd->nd_procnum >= NFS_NPROCS) ||
2552 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2553 nd->nd_repstat = EPROCUNAVAIL;
2554 nd->nd_procnum = NFSPROC_NOOP;
2555 return (0);
2556 }
2557 if ((nd->nd_flag & ND_NFSV3) == 0)
2558 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2559 auth_type = *tl++;
2560 len = fxdr_unsigned(int, *tl++);
2561 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2562 mbuf_freem(mrep);
2563 return (EBADRPC);
2564 }
2565
2566 nd->nd_flag &= ~ND_KERBAUTH;
2567 /*
2568 * Handle auth_unix or auth_kerb.
2569 */
2570 if (auth_type == rpc_auth_unix) {
2571 len = fxdr_unsigned(int, *++tl);
2572 if (len < 0 || len > NFS_MAXNAMLEN) {
2573 mbuf_freem(mrep);
2574 return (EBADRPC);
2575 }
2576 bzero(&temp_cred, sizeof(temp_cred));
2577 nfsm_adv(nfsm_rndup(len));
2578 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2579 user_id = fxdr_unsigned(uid_t, *tl++);
2580 group_id = fxdr_unsigned(gid_t, *tl++);
2581 temp_cred.cr_groups[0] = group_id;
2582 len = fxdr_unsigned(int, *tl);
2583 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2584 mbuf_freem(mrep);
2585 return (EBADRPC);
2586 }
2587 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2588 for (i = 1; i <= len; i++)
2589 if (i < NGROUPS)
2590 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2591 else
2592 tl++;
2593 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2594 if (ngroups > 1)
2595 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2596 len = fxdr_unsigned(int, *++tl);
2597 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2598 mbuf_freem(mrep);
2599 return (EBADRPC);
2600 }
2601 temp_cred.cr_uid = user_id;
2602 temp_cred.cr_ngroups = ngroups;
2603 nd->nd_cr = kauth_cred_create(&temp_cred);
2604 if (nd->nd_cr == NULL) {
2605 nd->nd_repstat = ENOMEM;
2606 nd->nd_procnum = NFSPROC_NOOP;
2607 return (0);
2608 }
2609 if (len > 0)
2610 nfsm_adv(nfsm_rndup(len));
2611 } else if (auth_type == rpc_auth_kerb) {
2612 switch (fxdr_unsigned(int, *tl++)) {
2613 case RPCAKN_FULLNAME:
2614 ticklen = fxdr_unsigned(int, *tl);
2615 *((u_long *)nfsd->nfsd_authstr) = *tl;
2616 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2617 &uio_buf[0], sizeof(uio_buf));
2618 if (!uiop) {
2619 nd->nd_repstat = ENOMEM;
2620 nd->nd_procnum = NFSPROC_NOOP;
2621 return (0);
2622 }
2623
2624 // LP64todo - fix this
2625 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2626 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2627 mbuf_freem(mrep);
2628 return (EBADRPC);
2629 }
2630 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2631 // LP64todo - fix this
2632 nfsm_mtouio(uiop, uio_resid(uiop));
2633 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2634 if (*tl++ != rpc_auth_kerb ||
2635 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2636 printf("Bad kerb verifier\n");
2637 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2638 nd->nd_procnum = NFSPROC_NOOP;
2639 return (0);
2640 }
2641 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2642 tl = (u_long *)cp;
2643 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2644 printf("Not fullname kerb verifier\n");
2645 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2646 nd->nd_procnum = NFSPROC_NOOP;
2647 return (0);
2648 }
2649 cp += NFSX_UNSIGNED;
2650 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2651 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2652 nd->nd_flag |= ND_KERBFULL;
2653 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2654 break;
2655 case RPCAKN_NICKNAME:
2656 if (len != 2 * NFSX_UNSIGNED) {
2657 printf("Kerb nickname short\n");
2658 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2659 nd->nd_procnum = NFSPROC_NOOP;
2660 return (0);
2661 }
2662 nickuid = fxdr_unsigned(uid_t, *tl);
2663 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2664 if (*tl++ != rpc_auth_kerb ||
2665 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2666 printf("Kerb nick verifier bad\n");
2667 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2668 nd->nd_procnum = NFSPROC_NOOP;
2669 return (0);
2670 }
2671 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2672 tvin.tv_sec = *tl++;
2673 tvin.tv_usec = *tl;
2674
2675 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2676 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2677 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2678 (!nd->nd_nam2 ||
2679 netaddr_match(NU_NETFAM(nuidp),
2680 &nuidp->nu_haddr, nd->nd_nam2)))
2681 break;
2682 }
2683 if (!nuidp) {
2684 nd->nd_repstat =
2685 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2686 nd->nd_procnum = NFSPROC_NOOP;
2687 return (0);
2688 }
2689
2690 /*
2691 * Now, decrypt the timestamp using the session key
2692 * and validate it.
2693 */
2694 #if NFSKERB
2695 XXX
2696 #endif
2697
2698 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2699 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2700 microtime(&now);
2701 if (nuidp->nu_expire < now.tv_sec ||
2702 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2703 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2704 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2705 nuidp->nu_expire = 0;
2706 nd->nd_repstat =
2707 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2708 nd->nd_procnum = NFSPROC_NOOP;
2709 return (0);
2710 }
2711 bzero(&temp_cred, sizeof(temp_cred));
2712 ngroups = nuidp->nu_cr->cr_ngroups;
2713 for (i = 0; i < ngroups; i++)
2714 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2715 if (ngroups > 1)
2716 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2717
2718 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2719 temp_cred.cr_ngroups = ngroups;
2720 nd->nd_cr = kauth_cred_create(&temp_cred);
2721 if (!nd->nd_cr) {
2722 nd->nd_repstat = ENOMEM;
2723 nd->nd_procnum = NFSPROC_NOOP;
2724 return (0);
2725 }
2726 nd->nd_flag |= ND_KERBNICK;
2727 }
2728 } else {
2729 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2730 nd->nd_procnum = NFSPROC_NOOP;
2731 return (0);
2732 }
2733
2734 nd->nd_md = md;
2735 nd->nd_dpos = dpos;
2736 return (0);
2737 nfsmout:
2738 if (nd->nd_cr)
2739 kauth_cred_rele(nd->nd_cr);
2740 return (error);
2741 }
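
/*
 * For reference (annotation, not original code): the AUTH_UNIX credential
 * body that the rpc_auth_unix branch above walks is laid out in XDR as a
 * stamp, a machine-name string, the uid and gid, and a counted array of
 * supplementary gids.  A sketch of the same walk over a flat buffer of
 * 4-byte XDR words, assuming the caller has already bounds-checked it:
 */
#if 0	/* example only */
static void
ex_parse_auth_unix(const u_long *tl, uid_t *uidp, gid_t *gidp)
{
	u_long namelen;

	tl++;					/* skip the stamp */
	namelen = ntohl(*tl++);			/* machine-name length */
	tl += (namelen + 3) / 4;		/* skip name, XDR-rounded */
	*uidp = (uid_t)ntohl(*tl++);
	*gidp = (gid_t)ntohl(*tl++);
	/* next word: gid count, followed by that many gids */
}
#endif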
2742
2743 /*
2744 * Search for a sleeping nfsd and wake it up.
2745 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2746 * running nfsds will go look for the work in the nfssvc_sock list.
2747 * Note: Must be called with nfsd_mutex held.
2748 */
2749 void
2750 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2751 {
2752 struct nfsd *nd;
2753
2754 if ((slp->ns_flag & SLP_VALID) == 0)
2755 return;
2756
2757 lck_rw_lock_exclusive(&slp->ns_rwlock);
2758
2759 if (nfsd_waiting) {
2760 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2761 if (nd->nfsd_flag & NFSD_WAITING) {
2762 nd->nfsd_flag &= ~NFSD_WAITING;
2763 if (nd->nfsd_slp)
2764 panic("nfsd wakeup");
2765 slp->ns_sref++;
2766 nd->nfsd_slp = slp;
2767 lck_rw_done(&slp->ns_rwlock);
2768 wakeup((caddr_t)nd);
2769 return;
2770 }
2771 }
2772 }
2773
2774 slp->ns_flag |= SLP_DOREC;
2775
2776 lck_rw_done(&slp->ns_rwlock);
2777
2778 nfsd_head_flag |= NFSD_CHECKSLP;
2779 }
2780 #endif /* NFS_NOSERVER */
2781
2782 static int
2783 nfs_msg(proc_t p,
2784 const char *server,
2785 const char *msg,
2786 int error)
2787 {
2788 tpr_t tpr;
2789
2790 if (p)
2791 tpr = tprintf_open(p);
2792 else
2793 tpr = NULL;
2794 if (error)
2795 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2796 error);
2797 else
2798 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2799 tprintf_close(tpr);
2800 return (0);
2801 }
2802
2803 void
2804 nfs_down(nmp, proc, error, flags, msg)
2805 struct nfsmount *nmp;
2806 proc_t proc;
2807 int error, flags;
2808 const char *msg;
2809 {
2810 if (nmp == NULL)
2811 return;
2812 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2813 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2814 nmp->nm_state |= NFSSTA_TIMEO;
2815 }
2816 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2817 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2818 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2819 }
2820 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2821 }
2822
2823 void
2824 nfs_up(nmp, proc, flags, msg)
2825 struct nfsmount *nmp;
2826 proc_t proc;
2827 int flags;
2828 const char *msg;
2829 {
2830 if (nmp == NULL)
2831 return;
2832 if (msg)
2833 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2834 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2835 nmp->nm_state &= ~NFSSTA_TIMEO;
2836 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2837 }
2838 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2839 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2840 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2841 }
2842 }
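
/*
 * nfs_down()/nfs_up() above implement edge-triggered reporting: a state
 * bit (NFSSTA_TIMEO or NFSSTA_LOCKTIMEO) remembers that the "server not
 * responding" event has been raised, so vfs_event_signal() fires only on
 * each transition, not on every timeout or reply.  The bare pattern, with
 * a hypothetical flag (example only):
 */
#if 0	/* example only */
#define EX_DOWN	0x01

static void
ex_mark_down(int *state)
{
	if (!(*state & EX_DOWN)) {	/* first timeout of this outage */
		*state |= EX_DOWN;
		/* raise "not responding" event here */
	}
}

static void
ex_mark_up(int *state)
{
	if (*state & EX_DOWN) {		/* first sign of life after outage */
		*state &= ~EX_DOWN;
		/* raise "responding again" event here */
	}
}
#endif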
2843