bsd/nfs/nfs_socket.c

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1989, 1991, 1993, 1995
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * This code is derived from software contributed to Berkeley by
  28  * Rick Macklem at The University of Guelph.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted provided that the following conditions
  32  * are met:
  33  * 1. Redistributions of source code must retain the above copyright
  34  *    notice, this list of conditions and the following disclaimer.
  35  * 2. Redistributions in binary form must reproduce the above copyright
  36  *    notice, this list of conditions and the following disclaimer in the
  37  *    documentation and/or other materials provided with the distribution.
  38  * 3. All advertising materials mentioning features or use of this software
  39  *    must display the following acknowledgement:
  40  *      This product includes software developed by the University of
  41  *      California, Berkeley and its contributors.
  42  * 4. Neither the name of the University nor the names of its contributors
  43  *    may be used to endorse or promote products derived from this software
  44  *    without specific prior written permission.
  45  *
  46  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  56  * SUCH DAMAGE.
  57  *
  58  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
  59  * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
  60  */
  61
  62 /*
  63  * Socket operations for use by nfs
  64  */
  65
  66 #include <sys/param.h>
  67 #include <sys/systm.h>
  68 #include <sys/proc.h>
  69 #include <sys/kauth.h>
  70 #include <sys/mount_internal.h>
  71 #include <sys/kernel.h>
  72 #include <sys/kpi_mbuf.h>
  73 #include <sys/malloc.h>
  74 #include <sys/vnode.h>
  75 #include <sys/domain.h>
  76 #include <sys/protosw.h>
  77 #include <sys/socket.h>
  78 #include <sys/syslog.h>
  79 #include <sys/tprintf.h>
  80 #include <sys/uio_internal.h>
  81 #include <libkern/OSAtomic.h>
  82
  83 #include <sys/time.h>
  84 #include <kern/clock.h>
  85 #include <kern/task.h>
  86 #include <kern/thread.h>
  87 #include <sys/user.h>
  88
  89 #include <netinet/in.h>
  90 #include <netinet/tcp.h>
  91
  92 #include <nfs/rpcv2.h>
  93 #include <nfs/nfsproto.h>
  94 #include <nfs/nfs.h>
  95 #include <nfs/xdr_subs.h>
  96 #include <nfs/nfsm_subs.h>
  97 #include <nfs/nfsmount.h>
  98 #include <nfs/nfsnode.h>
  99 #include <nfs/nfsrtt.h>
 100
 101 #include <sys/kdebug.h>
 102
 103 #define FSDBG(A, B, C, D, E) \
 104         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
 105                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
 106 #define FSDBG_TOP(A, B, C, D, E) \
 107         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
 108                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
 109 #define FSDBG_BOT(A, B, C, D, E) \
 110         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
 111                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
 112
 113 /*
 114  * Estimate rto for an nfs rpc sent via an unreliable datagram.
 115  * Use the mean and mean deviation of rtt for the appropriate type of rpc
 116  * for the frequent rpcs and a default for the others.
 117  * The justification for doing "other" this way is that these rpcs
 118  * happen so infrequently that timer est. would probably be stale.
 119  * Also, since many of these rpcs are
 120  * non-idempotent, a conservative timeout is desired.
 121  * getattr, lookup - A+2D
 122  * read, write     - A+4D
 123  * other           - nm_timeo
 124  */
 125 #define NFS_RTO(n, t) \
 126         ((t) == 0 ? (n)->nm_timeo : \
 127          ((t) < 3 ? \
 128           (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
 129           ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
 130 #define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
 131 #define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
 132 /*
 133  * External data, mostly RPC constants in XDR form
 134  */
 135 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
 136         rpc_msgaccepted, rpc_call, rpc_autherr,
 137         rpc_auth_kerb;
 138 extern u_long nfs_prog;
 139 extern struct nfsstats nfsstats;
 140 extern int nfsv3_procid[NFS_NPROCS];
 141 extern int nfs_ticks;
 142 extern u_long nfs_xidwrap;
 143
 144 /*
 145  * Defines which timer to use for the procnum.
 146  * 0 - default
 147  * 1 - getattr
 148  * 2 - lookup
 149  * 3 - read
 150  * 4 - write
 151  */
 152 static int proct[NFS_NPROCS] = {
 153         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
 154 };
 155
 156 /*
 157  * There is a congestion window for outstanding rpcs maintained per mount
 158  * point. The cwnd size is adjusted in roughly the way that:
 159  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 160  * SIGCOMM '88". ACM, August 1988.
 161  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 162  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 163  * of rpcs is in progress.
 164  * (The sent count and cwnd are scaled for integer arith.)
 165  * Variants of "slow start" were tried and were found to be too much of a
 166  * performance hit (ave. rtt 3 times larger),
 167  * I suspect due to the large rtt that nfs rpcs have.
 168  */
 169 #define NFS_CWNDSCALE   256
 170 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
 171 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
 172 int nfsrtton = 0;
 173 struct nfsrtt nfsrtt;
 174
 175 static int      nfs_rcvlock(struct nfsreq *);
 176 static void     nfs_rcvunlock(struct nfsreq *);
 177 static int      nfs_receive(struct nfsreq *rep, mbuf_t *mp);
 178 static int      nfs_reconnect(struct nfsreq *rep);
 179 static void     nfs_repdequeue(struct nfsreq *rep);
 180
 181 /* XXX */
 182 boolean_t       current_thread_aborted(void);
 183 kern_return_t   thread_terminate(thread_t);
 184
 185 #ifndef NFS_NOSERVER
 186 static int      nfsrv_getstream(struct nfssvc_sock *,int);
 187
 188 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
 189                                     struct nfssvc_sock *slp,
 190                                     proc_t procp,
 191                                     mbuf_t *mreqp) = {
 192         nfsrv_null,
 193         nfsrv_getattr,
 194         nfsrv_setattr,
 195         nfsrv_lookup,
 196         nfsrv3_access,
 197         nfsrv_readlink,
 198         nfsrv_read,
 199         nfsrv_write,
 200         nfsrv_create,
 201         nfsrv_mkdir,
 202         nfsrv_symlink,
 203         nfsrv_mknod,
 204         nfsrv_remove,
 205         nfsrv_rmdir,
 206         nfsrv_rename,
 207         nfsrv_link,
 208         nfsrv_readdir,
 209         nfsrv_readdirplus,
 210         nfsrv_statfs,
 211         nfsrv_fsinfo,
 212         nfsrv_pathconf,
 213         nfsrv_commit,
 214         nfsrv_noop
 215 };
 216 #endif /* NFS_NOSERVER */
 217
 218
 219 /*
 220  * attempt to bind a socket to a reserved port
 221  */
 222 static int
 223 nfs_bind_resv(struct nfsmount *nmp)
 224 {
 225         socket_t so = nmp->nm_so;
 226         struct sockaddr_in sin;
 227         int error;
 228         u_short tport;
 229
 230         if (!so)
 231                 return (EINVAL);
 232
 233         sin.sin_len = sizeof (struct sockaddr_in);
 234         sin.sin_family = AF_INET;
 235         sin.sin_addr.s_addr = INADDR_ANY;
 236         tport = IPPORT_RESERVED - 1;
 237         sin.sin_port = htons(tport);
 238
 239         while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
 240                (--tport > IPPORT_RESERVED / 2))
 241                 sin.sin_port = htons(tport);
 242         return (error);
 243 }
 244
 245 /*
 246  * variables for managing the nfs_bind_resv_thread
 247  */
 248 int nfs_resv_mounts = 0;
 249 static int nfs_bind_resv_thread_state = 0;
 250 #define NFS_BIND_RESV_THREAD_STATE_INITTED      1
 251 #define NFS_BIND_RESV_THREAD_STATE_RUNNING      2
 252 lck_grp_t *nfs_bind_resv_lck_grp;
 253 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
 254 lck_attr_t *nfs_bind_resv_lck_attr;
 255 lck_mtx_t *nfs_bind_resv_mutex;
 256 struct nfs_bind_resv_request {
 257         TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
 258         struct nfsmount *brr_nmp;
 259         int brr_error;
 260 };
 261 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
 262
 263 /*
 264  * thread to handle any reserved port bind requests
 265  */
 266 static void
 267 nfs_bind_resv_thread(void)
 268 {
 269         struct nfs_bind_resv_request *brreq;
 270
 271         nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
 272
 273         while (nfs_resv_mounts > 0) {
 274                 lck_mtx_lock(nfs_bind_resv_mutex);
 275                 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
 276                         TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
 277                         lck_mtx_unlock(nfs_bind_resv_mutex);
 278                         brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
 279                         wakeup(brreq);
 280                         lck_mtx_lock(nfs_bind_resv_mutex);
 281                 }
 282                 msleep((caddr_t)&nfs_bind_resv_request_queue,
 283                                 nfs_bind_resv_mutex, PSOCK | PDROP,
 284                                 "nfs_bind_resv_request_queue", 0);
 285         }
 286
 287         nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
 288         (void) thread_terminate(current_thread());
 289 }
 290
 291 int
 292 nfs_bind_resv_thread_wake(void)
 293 {
 294         if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
 295                 return (EIO);
 296         wakeup(&nfs_bind_resv_request_queue);
 297         return (0);
 298 }
 299
 300 /*
 301  * underprivileged procs call this to request nfs_bind_resv_thread
 302  * to perform the reserved port binding for them.
 303  */
 304 static int
 305 nfs_bind_resv_nopriv(struct nfsmount *nmp)
 306 {
 307         struct nfs_bind_resv_request brreq;
 308         int error;
 309
 310         if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
 311                 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
 312                         nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
 313                         nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
 314                         nfs_bind_resv_lck_attr = lck_attr_alloc_init();
 315                         nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
 316                         TAILQ_INIT(&nfs_bind_resv_request_queue);
 317                         nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
 318                 }
 319                 kernel_thread(kernel_task, nfs_bind_resv_thread);
 320                 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
 321         }
 322
 323         brreq.brr_nmp = nmp;
 324         brreq.brr_error = 0;
 325
 326         lck_mtx_lock(nfs_bind_resv_mutex);
 327         TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
 328         lck_mtx_unlock(nfs_bind_resv_mutex);
 329
 330         error = nfs_bind_resv_thread_wake();
 331         if (error) {
 332                 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
 333                 /* Note: we might be able to simply restart the thread */
 334                 return (error);
 335         }
 336
 337         tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
 338
 339         return (brreq.brr_error);
 340 }
 341
 342 /*
 343  * Initialize sockets and congestion for a new NFS connection.
 344  * We do not free the sockaddr if error.
 345  */
 346 int
 347 nfs_connect(
 348         struct nfsmount *nmp,
 349         __unused struct nfsreq *rep)
 350 {
 351         socket_t so;
 352         int error, rcvreserve, sndreserve;
 353         struct sockaddr *saddr;
 354         struct timeval timeo;
 355
 356         nmp->nm_so = 0;
 357         saddr = mbuf_data(nmp->nm_nam);
 358         error = sock_socket(saddr->sa_family, nmp->nm_sotype,
 359                                                 nmp->nm_soproto, 0, 0, &nmp->nm_so);
 360         if (error) {
 361                 goto bad;
 362         }
 363         so = nmp->nm_so;
 364
 365         /*
 366          * Some servers require that the client port be a reserved port number.
 367          */
 368         if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
 369                 proc_t p;
 370                 /*
 371                  * sobind() requires current_proc() to have superuser privs.
 372                  * If this bind is part of a reconnect, and the current proc
 373                  * doesn't have superuser privs, we hand the sobind() off to
 374                  * a kernel thread to process.
 375                  */
 376                 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
 377                     (p = current_proc()) && suser(kauth_cred_get(), 0)) {
 378                         /* request nfs_bind_resv_thread() to do bind */
 379                         error = nfs_bind_resv_nopriv(nmp);
 380                 } else {
 381                         error = nfs_bind_resv(nmp);
 382                 }
 383                 if (error)
 384                         goto bad;
 385         }
 386
 387         /*
 388          * Protocols that do not require connections may be optionally left
 389          * unconnected for servers that reply from a port other than NFS_PORT.
 390          */
 391         if (nmp->nm_flag & NFSMNT_NOCONN) {
 392                 if (nmp->nm_sotype == SOCK_STREAM) {
 393                         error = ENOTCONN;
 394                         goto bad;
 395                 }
 396         } else {
 397                 struct timeval  tv;
 398                 tv.tv_sec = 2;
 399                 tv.tv_usec = 0;
 400                 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
 401                 if (error && error != EINPROGRESS) {
 402                         goto bad;
 403                 }
 404
 405                 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
 406                         if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
 407                                 goto bad;
 408                         }
 409                 }
 410         }
 411
 412         /*
 413          * Always time out on recieve, this allows us to reconnect the
 414          * socket to deal with network changes.
 415          */
 416         timeo.tv_usec = 0;
 417         timeo.tv_sec = 2;
 418         error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
 419         if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
 420                 timeo.tv_sec = 5;
 421         } else {
 422                 timeo.tv_sec = 0;
 423         }
 424         error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
 425
 426         if (nmp->nm_sotype == SOCK_DGRAM) {
 427                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
 428                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
 429                         (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 430         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
 431                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
 432                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
 433                         (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 434         } else {
 435                 int proto;
 436                 int on = 1;
 437
 438                 sock_gettype(so, NULL, NULL, &proto);
 439                 if (nmp->nm_sotype != SOCK_STREAM)
 440                         panic("nfscon sotype");
 441
 442                 // Assume that SOCK_STREAM always requires a connection
 443                 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 444
 445                 if (proto == IPPROTO_TCP) {
 446                         sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
 447                 }
 448
 449                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
 450                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
 451                                 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 452         }
 453
 454         if (sndreserve > NFS_MAXSOCKBUF)
 455                 sndreserve = NFS_MAXSOCKBUF;
 456         if (rcvreserve > NFS_MAXSOCKBUF)
 457                 rcvreserve = NFS_MAXSOCKBUF;
 458         error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
 459         if (error) {
 460                 goto bad;
 461         }
 462         error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
 463         if (error) {
 464                 goto bad;
 465         }
 466
 467         sock_nointerrupt(so, 1);
 468
 469         /* Initialize other non-zero congestion variables */
 470         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
 471                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
 472         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
 473                 nmp->nm_sdrtt[3] = 0;
 474         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
 475         nmp->nm_sent = 0;
 476         FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
 477         nmp->nm_timeouts = 0;
 478         return (0);
 479
 480 bad:
 481         nfs_disconnect(nmp);
 482         return (error);
 483 }
 484
 485 /*
 486  * Reconnect routine:
 487  * Called when a connection is broken on a reliable protocol.
 488  * - clean up the old socket
 489  * - nfs_connect() again
 490  * - set R_MUSTRESEND for all outstanding requests on mount point
 491  * If this fails the mount point is DEAD!
 492  * nb: Must be called with the nfs_sndlock() set on the mount point.
 493  */
 494 static int
 495 nfs_reconnect(struct nfsreq *rep)
 496 {
 497         struct nfsreq *rp;
 498         struct nfsmount *nmp = rep->r_nmp;
 499         int error;
 500
 501         nfs_disconnect(nmp);
 502         while ((error = nfs_connect(nmp, rep))) {
 503                 if (error == EINTR || error == ERESTART)
 504                         return (EINTR);
 505                 if (error == EIO)
 506                         return (EIO);
 507                 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
 508                         "can not connect");
 509                 rep->r_flags |= R_TPRINTFMSG;
 510                 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
 511                         /* we're not yet completely mounted and */
 512                         /* we can't reconnect, so we fail */
 513                         return (error);
 514                 }
 515                 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
 516                         return (error);
 517                 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
 518         }
 519
 520         /*
 521          * Loop through outstanding request list and fix up all requests
 522          * on old socket.
 523          */
 524         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
 525                 if (rp->r_nmp == nmp)
 526                         rp->r_flags |= R_MUSTRESEND;
 527         }
 528         return (0);
 529 }
 530
 531 /*
 532  * NFS disconnect. Clean up and unlink.
 533  */
 534 void
 535 nfs_disconnect(struct nfsmount *nmp)
 536 {
 537         socket_t so;
 538
 539         if (nmp->nm_so) {
 540                 so = nmp->nm_so;
 541                 nmp->nm_so = 0;
 542                 sock_shutdown(so, 2);
 543                 sock_close(so);
 544         }
 545 }
 546
 547 /*
 548  * This is the nfs send routine. For connection based socket types, it
 549  * must be called with an nfs_sndlock() on the socket.
 550  * "rep == NULL" indicates that it has been called from a server.
 551  * For the client side:
 552  * - return EINTR if the RPC is terminated, 0 otherwise
 553  * - set R_MUSTRESEND if the send fails for any reason
 554  * - do any cleanup required by recoverable socket errors (???)
 555  * For the server side:
 556  * - return EINTR or ERESTART if interrupted by a signal
 557  * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 558  * - do any cleanup required by recoverable socket errors (???)
 559  */
 560 int
 561 nfs_send(so, nam, top, rep)
 562         socket_t so;
 563         mbuf_t nam;
 564         mbuf_t top;
 565         struct nfsreq *rep;
 566 {
 567         struct sockaddr *sendnam;
 568         int error, error2, sotype, flags;
 569         u_long xidqueued = 0;
 570         struct nfsreq *rp;
 571         char savenametolog[MAXPATHLEN];
 572         struct msghdr msg;
 573
 574         if (rep) {
 575                 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
 576                 if (error) {
 577                         mbuf_freem(top);
 578                         return (error);
 579                 }
 580                 if ((so = rep->r_nmp->nm_so) == NULL) {
 581                         rep->r_flags |= R_MUSTRESEND;
 582                         mbuf_freem(top);
 583                         return (0);
 584                 }
 585                 rep->r_flags &= ~R_MUSTRESEND;
 586                 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
 587                         if (rp == rep)
 588                                 break;
 589                 if (rp)
 590                         xidqueued = rp->r_xid;
 591         }
 592         sock_gettype(so, NULL, &sotype, NULL);
 593         if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
 594             (nam == 0))
 595                 sendnam = (struct sockaddr *)0;
 596         else
 597                 sendnam = mbuf_data(nam);
 598
 599         if (sotype == SOCK_SEQPACKET)
 600                 flags = MSG_EOR;
 601         else
 602                 flags = 0;
 603
 604         /*
 605          * Save the name here in case mount point goes away if we block.
 606          * The name is using local stack and is large, but don't
 607          * want to block if we malloc.
 608          */
 609         if (rep)
 610                 strncpy(savenametolog,
 611                         vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
 612                         MAXPATHLEN - 1);
 613         bzero(&msg, sizeof(msg));
 614         msg.msg_name = (caddr_t)sendnam;
 615         msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
 616         error = sock_sendmbuf(so, &msg, top, flags, NULL);
 617
 618         if (error) {
 619                 if (rep) {
 620                         if (xidqueued) {
 621                                 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
 622                                         if (rp == rep && rp->r_xid == xidqueued)
 623                                                 break;
 624                                 if (!rp)
 625                                         panic("nfs_send: error %d xid %x gone",
 626                                               error, xidqueued);
 627                         }
 628                         log(LOG_INFO, "nfs send error %d for server %s\n",
 629                             error, savenametolog);
 630                         /*
 631                          * Deal with errors for the client side.
 632                          */
 633                         error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
 634                         if (error2) {
 635                                 error = error2;
 636                         } else {
 637                                 rep->r_flags |= R_MUSTRESEND;
 638                         }
 639                 } else
 640                         log(LOG_INFO, "nfsd send error %d\n", error);
 641
 642                 /*
 643                  * Handle any recoverable (soft) socket errors here. (???)
 644                  */
 645                 if (error != EINTR && error != ERESTART && error != EIO &&
 646                         error != EWOULDBLOCK && error != EPIPE) {
 647                         error = 0;
 648                 }
 649         }
 650         return (error);
 651 }
 652
 653 /*
 654  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 655  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 656  * Mark and consolidate the data into a new mbuf list.
 657  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 658  *     small mbufs.
 659  * For SOCK_STREAM we must be very careful to read an entire record once
 660  * we have read any of it, even if the system call has been interrupted.
 661  */
 662 static int
 663 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
 664 {
 665         socket_t so;
 666         struct iovec_32 aio;
 667         mbuf_t m, mlast;
 668         u_long len, fraglen;
 669         int error, error2, sotype;
 670         proc_t p = current_proc();      /* XXX */
 671         struct msghdr msg;
 672         size_t rcvlen;
 673         int lastfragment;
 674
 675         /*
 676          * Set up arguments for soreceive()
 677          */
 678         *mp = NULL;
 679         sotype = rep->r_nmp->nm_sotype;
 680
 681         /*
 682          * For reliable protocols, lock against other senders/receivers
 683          * in case a reconnect is necessary.
 684          * For SOCK_STREAM, first get the Record Mark to find out how much
 685          * more there is to get.
 686          * We must lock the socket against other receivers
 687          * until we have an entire rpc request/reply.
 688          */
 689         if (sotype != SOCK_DGRAM) {
 690                 error = nfs_sndlock(rep);
 691                 if (error)
 692                         return (error);
 693 tryagain:
 694                 /*
 695                  * Check for fatal errors and resending request.
 696                  */
 697                 /*
 698                  * Ugh: If a reconnect attempt just happened, nm_so
 699                  * would have changed. NULL indicates a failed
 700                  * attempt that has essentially shut down this
 701                  * mount point.
 702                  */
 703                 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
 704                         nfs_sndunlock(rep);
 705                         if (error)
 706                                 return (error);
 707                         return (EINTR);
 708                 }
 709                 so = rep->r_nmp->nm_so;
 710                 if (!so) {
 711                         error = nfs_reconnect(rep);
 712                         if (error) {
 713                                 nfs_sndunlock(rep);
 714                                 return (error);
 715                         }
 716                         goto tryagain;
 717                 }
 718                 while (rep->r_flags & R_MUSTRESEND) {
 719                         error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
 720                         if (!error) {
 721                                 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
 722                                 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
 723                         }
 724                         /*
 725                          * we also hold rcv lock so rep is still
 726                          * legit this point
 727                          */
 728                         if (error) {
 729                                 if (error == EINTR || error == ERESTART ||
 730                                     (error = nfs_reconnect(rep))) {
 731                                         nfs_sndunlock(rep);
 732                                         return (error);
 733                                 }
 734                                 goto tryagain;
 735                         }
 736                 }
 737                 nfs_sndunlock(rep);
 738                 if (sotype == SOCK_STREAM) {
 739                         error = 0;
 740                         len = 0;
 741                         lastfragment = 0;
 742                         mlast = NULL;
 743                         while (!error && !lastfragment) {
 744                                 aio.iov_base = (uintptr_t) &fraglen;
 745                                 aio.iov_len = sizeof(u_long);
 746                                 bzero(&msg, sizeof(msg));
 747                                 msg.msg_iov = (struct iovec *) &aio;
 748                                 msg.msg_iovlen = 1;
 749                                 do {
 750                                    error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
 751                                    if (!rep->r_nmp) /* if unmounted then bailout */
 752                                         goto shutout;
 753                                    if (error == EWOULDBLOCK && rep) {
 754                                         error2 = nfs_sigintr(rep->r_nmp, rep, p);
 755                                         if (error2)
 756                                                 error = error2;
 757                                    }
 758                                 } while (error == EWOULDBLOCK);
 759                                 if (!error && rcvlen < aio.iov_len) {
 760                                     /* only log a message if we got a partial word */
 761                                     if (rcvlen != 0)
 762                                             log(LOG_INFO,
 763                                                  "short receive (%d/%d) from nfs server %s\n",
 764                                                  rcvlen, sizeof(u_long),
 765                                                  vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 766                                     error = EPIPE;
 767                                 }
 768                                 if (error)
 769                                         goto errout;
 770                                 lastfragment = ntohl(fraglen) & 0x80000000;
 771                                 fraglen = ntohl(fraglen) & ~0x80000000;
 772                                 len += fraglen;
 773                                 /*
 774                                  * This is SERIOUS! We are out of sync with the sender
 775                                  * and forcing a disconnect/reconnect is all I can do.
 776                                  */
 777                                 if (len > NFS_MAXPACKET) {
 778                                     log(LOG_ERR, "%s (%d) from nfs server %s\n",
 779                                         "impossible RPC record length", len,
 780                                         vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 781                                     error = EFBIG;
 782                                     goto errout;
 783                                 }
 784
 785                                 m = NULL;
 786                                 do {
 787                                     rcvlen = fraglen;
 788                                     error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
 789                                     if (!rep->r_nmp) /* if unmounted then bailout */ {
 790                                         goto shutout;
 791                                     }
 792                                 } while (error == EWOULDBLOCK || error == EINTR ||
 793                                          error == ERESTART);
 794
 795                                 if (!error && fraglen > rcvlen) {
 796                                     log(LOG_INFO,
 797                                         "short receive (%d/%d) from nfs server %s\n",
 798                                         rcvlen, fraglen,
 799                                         vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 800                                     error = EPIPE;
 801                                     mbuf_freem(m);
 802                                 }
 803                                 if (!error) {
 804                                         if (!*mp) {
 805                                                 *mp = m;
 806                                                 mlast = m;
 807                                         } else {
 808                                                 error = mbuf_setnext(mlast, m);
 809                                                 if (error) {
 810                                                         printf("nfs_receive: mbuf_setnext failed %d\n", error);
 811                                                         mbuf_freem(m);
 812                                                 }
 813                                         }
 814                                         while (mbuf_next(mlast))
 815                                                 mlast = mbuf_next(mlast);
 816                                 }
 817                         }
 818                 } else {
 819                         bzero(&msg, sizeof(msg));
 820                         do {
 821                             rcvlen = 100000000;
 822                             error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
 823                             if (!rep->r_nmp) /* if unmounted then bailout */ {
 824                                 goto shutout;
 825                             }
 826                             if (error == EWOULDBLOCK && rep) {
 827                                 error2 = nfs_sigintr(rep->r_nmp, rep, p);
 828                                 if (error2) {
 829                                         return (error2);
 830                                 }
 831                             }
 832                         } while (error == EWOULDBLOCK);
 833
 834                         if ((msg.msg_flags & MSG_EOR) == 0)
 835                                 printf("Egad!!\n");
 836                         if (!error && *mp == NULL)
 837                                 error = EPIPE;
 838                         len = rcvlen;
 839                 }
 840 errout:
 841                 if (error && error != EINTR && error != ERESTART) {
 842                         mbuf_freem(*mp);
 843                         *mp = NULL;
 844                         if (error != EPIPE)
 845                                 log(LOG_INFO,
 846                                     "receive error %d from nfs server %s\n", error,
 847                                     vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 848                         error = nfs_sndlock(rep);
 849                         if (!error) {
 850                                 error = nfs_reconnect(rep);
 851                                 if (!error)
 852                                         goto tryagain;
 853                                 nfs_sndunlock(rep);
 854                         }
 855                 }
 856         } else {
 857                 /*
 858                  * We could have failed while rebinding the datagram socket
 859                  * so we need to attempt to rebind here.
 860                  */
 861                 if ((so = rep->r_nmp->nm_so) == NULL) {
 862                         error = nfs_sndlock(rep);
 863                         if (!error) {
 864                                 error = nfs_reconnect(rep);
 865                                 nfs_sndunlock(rep);
 866                         }
 867                         if (error)
 868                                 return (error);
 869                         if (!rep->r_nmp) /* if unmounted then bailout */
 870                                 return (ENXIO);
 871                         so = rep->r_nmp->nm_so;
 872                 }
 873                 bzero(&msg, sizeof(msg));
 874                 len = 0;
 875                 do {
 876                         rcvlen = 1000000;
 877                         error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
 878                         if (!rep->r_nmp) /* if unmounted then bailout */
 879                                 goto shutout;
 880                         if (error) {
 881                                 error2 = nfs_sigintr(rep->r_nmp, rep, p);
 882                                 if (error2) {
 883                                         error = error2;
 884                                         goto shutout;
 885                                 }
 886                         }
 887                         /* Reconnect for all errors.  We may be receiving
 888                          * soft/hard/blocking errors because of a network
 889                          * change.
 890                          * XXX: we should rate limit or delay this
 891                          * to once every N attempts or something.
 892                          * although TCP doesn't seem to.
 893                          */
 894                         if (error) {
 895                                 error2 = nfs_sndlock(rep);
 896                                 if (!error2) {
 897                                         error2 = nfs_reconnect(rep);
 898                                         if (error2)
 899                                                 error = error2;
 900                                         else if (!rep->r_nmp) /* if unmounted then bailout */
 901                                                 error = ENXIO;
 902                                         else
 903                                                 so = rep->r_nmp->nm_so;
 904                                         nfs_sndunlock(rep);
 905                                 } else {
 906                                         error = error2;
 907                                 }
 908                         }
 909                 } while (error == EWOULDBLOCK);
 910         }
 911 shutout:
 912         if (error) {
 913                 mbuf_freem(*mp);
 914                 *mp = NULL;
 915         }
 916         return (error);
 917 }
 918
 919 /*
 920  * Implement receipt of reply on a socket.
 921  * We must search through the list of received datagrams matching them
 922  * with outstanding requests using the xid, until ours is found.
      *
      * Each pass takes the receive lock via nfs_rcvlock(), pulls one RPC
      * message off the socket with nfs_receive(), and attaches it to the
      * request on nfs_reqq whose r_xid matches and whose r_mrep is still
      * NULL.  Messages that are not RPC replies, or that match no request,
      * are freed and counted in nfsstats (rpcinvalid / rpcunexpected).
      *
      * myrep: the request whose reply the caller is waiting for.
      *
      * Returns 0 once myrep's reply is available (nfs_rcvlock() returned
      * EALREADY, myrep->r_mrep was already set, or this thread matched it).
      * Returns ENXIO if myrep->r_nmp is NULL after nfs_receive(), or if
      * nfs_receive() reported success without producing an mbuf chain.
      * Otherwise returns the error from nfs_rcvlock() or nfs_receive(),
      * except that errors selected by NFSIGNORE_SOERROR() are retried.
 923  */
 924 /* ARGSUSED */
 925 int
 926 nfs_reply(myrep)
 927         struct nfsreq *myrep;
 928 {
 929         struct nfsreq *rep;
 930         struct nfsmount *nmp = myrep->r_nmp; /* snapshot; myrep->r_nmp itself is re-tested below */
 931         long t1;
 932         mbuf_t mrep, md;
 933         u_long rxid, *tl;
 934         caddr_t dpos, cp2;
 935         int error;
 936
 937         /*
 938          * Loop around until we get our own reply
 939          */
 940         for (;;) {
 941                 /*
 942                  * Lock against other receivers so that I don't get stuck in
 943                  * sbwait() after someone else has received my reply for me.
 944                  * Also necessary for connection based protocols to avoid
 945                  * race conditions during a reconnect.
 946                  * If nfs_rcvlock() returns EALREADY, that means that
 947                  * the reply has already been received by another
 948                  * process and we can return immediately.  In this
 949                  * case, the lock is not taken to avoid races with
 950                  * other processes.
 951                  */
 952                 error = nfs_rcvlock(myrep);
 953                 if (error == EALREADY)
 954                         return (0);
 955                 if (error)
 956                         return (error);
 957
 958                 /*
 959                  * If we slept after putting bits otw, then reply may have
 960                  * arrived.  In which case returning is required, or we
 961                  * would hang trying to nfs_receive an already received reply.
 962                  */
 963                 if (myrep->r_mrep != NULL) {
 964                         nfs_rcvunlock(myrep);
 965                         FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
 966                         return (0);
 967                 }
 968                 /*
 969                  * Get the next Rpc reply off the socket. Assume myrep->r_nmp
 970                  * is still intact by checks done in nfs_rcvlock.
 971                  */
 972                 error = nfs_receive(myrep, &mrep);
 973                 /*
 974                  * Bailout asap if nfsmount struct gone (unmounted).
                      * Note that this path returns without calling
                      * nfs_rcvunlock().
 975                  */
 976                 if (!myrep->r_nmp) {
 977                         FSDBG(530, myrep->r_xid, myrep, nmp, -2);
 978                         if (mrep)
 979                                 mbuf_freem(mrep);
 980                         return (ENXIO);
 981                 }
 982                 if (error) {
 983                         FSDBG(530, myrep->r_xid, myrep, nmp, error);
 984                         nfs_rcvunlock(myrep);
 985
 986                         /* Bailout asap if nfsmount struct gone (unmounted). */
 987                         if (!myrep->r_nmp) {
 988                                 if (mrep)
 989                                         mbuf_freem(mrep);
 990                                 return (ENXIO);
 991                         }
 992
 993                         /*
 994                          * Ignore routing errors on connectionless protocols??
                              * If the socket still exists, SO_ERROR is read and the
                              * value discarded (presumably to clear the pending
                              * socket error -- confirm); the sock_getsockopt()
                              * return value is not checked.  Then go back for the
                              * next message.
 995                          */
 996                         if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
 997                                 if (nmp->nm_so) {
 998                                         int clearerror;
 999                                         int optlen = sizeof(clearerror);
1000                                         sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1001                                 }
1002                                 continue;
1003                         }
1004                         if (mrep)
1005                                 mbuf_freem(mrep);
1006                         return (error);
1007                 }
1008
1009                 /*
1010                  * We assume all is fine, but if we did not have an error
1011                  * and mrep is 0, better not dereference it. nfs_receive
1012                  * calls soreceive which carefully sets error=0 when it got
1013                  * errors on sbwait (tsleep). In most cases, I assume that's
1014                  * so we could go back again. In tcp case, EPIPE is returned.
1015                  * In udp, case nfs_receive gets back here with no error and no
1016                  * mrep. Is the right fix to have soreceive check for process
1017                  * aborted after sbwait and return something non-zero? Should
1018                  * nfs_receive give an EPIPE?  Too risky to play with those
1019                  * two this late in game for a shutdown problem. Instead,
1020                  * just check here and get out. (ekn)
1021                  */
1022                 if (!mrep) {
1023                         nfs_rcvunlock(myrep);
1024                         FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1025                         return (ENXIO); /* sounds good */
1026                 }
1027
1028                 /*
1029                  * Get the xid and check that it is an rpc reply
                      *
                      * NOTE(review): nfsm_dissect() is a macro defined elsewhere.
                      * It presumably relies on the locals md, dpos, t1, cp2, error
                      * and mrep, and jumps to the nfsmout label below on failure
                      * (nothing else in this function jumps there) -- confirm
                      * against its definition before renaming or reordering.
1030                  */
1031                 md = mrep;
1032                 dpos = mbuf_data(md);
1033                 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1034                 rxid = *tl++; /* first word: xid; tl now points at the second word */
1035                 if (*tl != rpc_reply) {
1036                         OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1037                         mbuf_freem(mrep);
                              /*
                               * The non-reply case above falls through to here.
                               * Release the receive lock if NFSSTA_RCVLOCK is set in
                               * nm_state, then go read the next message.
                               */
1038 nfsmout:
1039                         if (nmp->nm_state & NFSSTA_RCVLOCK)
1040                                 nfs_rcvunlock(myrep);
1041                         continue;
1042                 }
1043
1044                 /*
1045                  * Loop through the request list to match up the reply
1046                  * Iff no match, just drop the datagram
                      * Only requests that have no reply yet (r_mrep == NULL) are
                      * candidates.
1047                  */
1048                 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1049                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1050                                 /* Found it.. */
1051                                 rep->r_mrep = mrep;
1052                                 rep->r_md = md;
1053                                 rep->r_dpos = dpos;
1054                                 /*
1055                                  * If we're tracking the round trip time
1056                                  * then we update the circular log here
1057                                  * with the stats from our current request.
                                      * Note the panic below if proct[] is zero for
                                      * this procedure while nfsrtton is set.
1058                                  */
1059                                 if (nfsrtton) {
1060                                         struct rttl *rt;
1061
1062                                         rt = &nfsrtt.rttl[nfsrtt.pos];
1063                                         rt->proc = rep->r_procnum;
1064                                         rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1065                                         rt->sent = nmp->nm_sent;
1066                                         rt->cwnd = nmp->nm_cwnd;
1067                                         if (proct[rep->r_procnum] == 0)
1068                                                 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1069                                         rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1070                                         rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1071                                         rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1072                                         microtime(&rt->tstamp); // XXX unused
1073                                         if (rep->r_flags & R_TIMING)
1074                                                 rt->rtt = rep->r_rtt;
1075                                         else
1076                                                 rt->rtt = 1000000;
1077                                         nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1078                                 }
1079                                 /*
1080                                  * Update congestion window.
1081                                  * Do the additive increase of
1082                                  * one rpc/rtt.
                                      * The window only grows when it is not larger
                                      * than nm_sent, and is capped at NFS_MAXCWND.
1083                                  */
1084                                 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1085                                       nmp->nm_cwnd);
1086                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
1087                                         nmp->nm_cwnd +=
1088                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
1089                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1090                                         if (nmp->nm_cwnd > NFS_MAXCWND)
1091                                                 nmp->nm_cwnd = NFS_MAXCWND;
1092                                 }
                                     /* The request is no longer outstanding: clear R_SENT and give back its nm_sent share. */
1093                                 if (rep->r_flags & R_SENT) {
1094                                     rep->r_flags &= ~R_SENT;
1095                                     nmp->nm_sent -= NFS_CWNDSCALE;
1096                                }
1097                                 /*
1098                                  * Update rtt using a gain of 0.125 on the mean
1099                                  * and a gain of 0.25 on the deviation.
1100                                  */
1101                                 if (rep->r_flags & R_TIMING) {
1102                                         /*
1103                                          * Since the timer resolution of
1104                                          * NFS_HZ is so coarse, it can often
1105                                          * result in r_rtt == 0. Since
1106                                          * r_rtt == N means that the actual
1107                                          * rtt is between N+dt and N+2-dt ticks,
1108                                          * add 1.
1109                                          */
1110                                         if (proct[rep->r_procnum] == 0)
1111                                                 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1112                                         t1 = rep->r_rtt + 1;
1113                                         t1 -= (NFS_SRTT(rep) >> 3);
1114                                         NFS_SRTT(rep) += t1;
1115                                         if (t1 < 0)
1116                                                 t1 = -t1;
1117                                         t1 -= (NFS_SDRTT(rep) >> 2);
1118                                         NFS_SDRTT(rep) += t1;
1119                                 }
1120                                 nmp->nm_timeouts = 0;
1121                                 break;
1122                         }
1123                 }
1124                 nfs_rcvunlock(myrep);
1125                 /*
1126                  * If not matched to a request, drop it.
1127                  * If it's mine, get out.
                      * If it matched some other request, that request now owns
                      * mrep (stored in its r_mrep above) and we loop again for
                      * our own reply.
1128                  */
1129                 if (rep == 0) {
1130                         OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1131                         mbuf_freem(mrep);
1132                 } else if (rep == myrep) {
1133                         if (rep->r_mrep == NULL)
1134                                 panic("nfs_reply: nil r_mrep");
1135                         return (0);
1136                 }
1137                 FSDBG(530, myrep->r_xid, myrep, rep,
1138                       rep ? rep->r_xid : myrep->r_flags);
1139         }
1140 }
1141
1142 /*
1143  * nfs_request - goes something like this
1144  *      - fill in request struct
1145  *      - links it into list
1146  *      - calls nfs_send() for first transmit
1147  *      - calls nfs_reply() to wait for the reply (which uses nfs_receive())
1148  *      - break down rpc header and return with nfs reply pointed to
1149  *        by mrep or error
1150  * nb: always frees up mreq mbuf list
1151  */
1152 int
1153 nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
1154         vnode_t vp;
1155         mount_t mp;
1156         mbuf_t mrest;
1157         int procnum;
1158         proc_t procp;
1159         kauth_cred_t cred;
1160         mbuf_t *mrp;
1161         mbuf_t *mdp;
1162         caddr_t *dposp;
1163         u_int64_t *xidp;
1164 {
1165         mbuf_t m, mrep, m2;
1166         struct nfsreq re, *rep;
1167         u_long *tl;
1168         int i;
1169         struct nfsmount *nmp;
1170         mbuf_t md, mheadend;
1171         char nickv[RPCX_NICKVERF];
1172         time_t waituntil;
1173         caddr_t dpos, cp2;
1174         int t1, error = 0, mrest_len, auth_len, auth_type;
1175         int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1176         int verf_len, verf_type;
1177         u_long xid;
1178         char *auth_str, *verf_str;
1179         NFSKERBKEY_T key;               /* save session key */
1180         int nmsotype;
1181         struct timeval now;
1182
1183         if (mrp)
1184                 *mrp = NULL;
1185         if (xidp)
1186                 *xidp = 0;
1187         nmp = VFSTONFS(mp);
1188
1189         rep = &re;
1190
1191         if (vp)
1192                 nmp = VFSTONFS(vnode_mount(vp));
1193         if (nmp == NULL ||
1194             (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1195             (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1196                 mbuf_freem(mrest);
1197                 return (ENXIO);
1198         }
1199         nmsotype = nmp->nm_sotype;
1200
1201         FSDBG_TOP(531, vp, procnum, nmp, rep);
1202
1203         rep->r_nmp = nmp;
1204         rep->r_vp = vp;
1205         rep->r_procp = procp;
1206         rep->r_procnum = procnum;
1207         microuptime(&now);
1208         rep->r_lastmsg = now.tv_sec -
1209             ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1210         i = 0;
1211         m = mrest;
1212         while (m) {
1213                 i += mbuf_len(m);
1214                 m = mbuf_next(m);
1215         }
1216         mrest_len = i;
1217
1218         /*
1219          * Get the RPC header with authorization.
1220          */
1221 kerbauth:
1222         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1223         if (!nmp) {
1224                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1225                 mbuf_freem(mrest);
1226                 return (ENXIO);
1227         }
1228         verf_str = auth_str = (char *)0;
1229         if (nmp->nm_flag & NFSMNT_KERB) {
1230                 verf_str = nickv;
1231                 verf_len = sizeof (nickv);
1232                 auth_type = RPCAUTH_KERB4;
1233                 bzero((caddr_t)key, sizeof (key));
1234                 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1235                         &auth_len, verf_str, verf_len)) {
1236                         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1237                         if (!nmp) {
1238                                 FSDBG_BOT(531, 2, vp, error, rep);
1239                                 mbuf_freem(mrest);
1240                                 return (ENXIO);
1241                         }
1242                         error = nfs_getauth(nmp, rep, cred, &auth_str,
1243                                 &auth_len, verf_str, &verf_len, key);
1244                         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1245                         if (!error && !nmp)
1246                                 error = ENXIO;
1247                         if (error) {
1248                                 FSDBG_BOT(531, 2, vp, error, rep);
1249                                 mbuf_freem(mrest);
1250                                 return (error);
1251                         }
1252                 }
1253         } else {
1254                 auth_type = RPCAUTH_UNIX;
1255                 if (cred->cr_ngroups < 1)
1256                         panic("nfsreq nogrps");
1257                 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1258                         nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1259                         5 * NFSX_UNSIGNED;
1260         }
1261         error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1262              auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1263         if (auth_str)
1264                 _FREE(auth_str, M_TEMP);
1265         if (error) {
1266                 mbuf_freem(mrest);
1267                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1268                 return (error);
1269         }
1270         if (xidp)
1271                 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1272
1273         /*
1274          * For stream protocols, insert a Sun RPC Record Mark.
1275          */
1276         if (nmsotype == SOCK_STREAM) {
1277                 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1278                 if (error) {
1279                         mbuf_freem(m);
1280                         FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1281                         return (error);
1282                 }
1283                 *((u_long*)mbuf_data(m)) =
1284                         htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1285         }
1286         rep->r_mreq = m;
1287         rep->r_xid = xid;
1288 tryagain:
1289         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1290         if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1291                 rep->r_retry = nmp->nm_retry;
1292         else
1293                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
1294         rep->r_rtt = rep->r_rexmit = 0;
1295         if (proct[procnum] > 0)
1296                 rep->r_flags = R_TIMING;
1297         else
1298                 rep->r_flags = 0;
1299         rep->r_mrep = NULL;
1300
1301         /*
1302          * Do the client side RPC.
1303          */
1304         OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1305         /*
1306          * Chain request into list of outstanding requests. Be sure
1307          * to put it LAST so timer finds oldest requests first.
1308          */
1309         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1310
1311         /*
1312          * If backing off another request or avoiding congestion, don't
1313          * send this one now but let timer do it. If not timing a request,
1314          * do it now.
1315          */
1316         if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1317                            (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1318                            nmp->nm_sent < nmp->nm_cwnd)) {
1319                 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1320
1321                 if (connrequired)
1322                         error = nfs_sndlock(rep);
1323
1324                 /*
1325                  * Set the R_SENT before doing the send in case another thread
1326                  * processes the reply before the nfs_send returns here
1327                  */
1328                 if (!error) {
1329                         if ((rep->r_flags & R_MUSTRESEND) == 0) {
1330                                 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1331                                       nmp->nm_cwnd);
1332                                 nmp->nm_sent += NFS_CWNDSCALE;
1333                                 rep->r_flags |= R_SENT;
1334                         }
1335
1336                         error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1337                         if (!error)
1338                                 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1339                         if (connrequired)
1340                                 nfs_sndunlock(rep);
1341                 }
1342                 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1343                 if (error) {
1344                         if (nmp)
1345                                 nmp->nm_sent -= NFS_CWNDSCALE;
1346                         rep->r_flags &= ~R_SENT;
1347                 }
1348         } else {
1349                 rep->r_rtt = -1;
1350         }
1351
1352         /*
1353          * Wait for the reply from our send or the timer's.
1354          */
1355         if (!error || error == EPIPE)
1356                 error = nfs_reply(rep);
1357
1358         /*
1359          * RPC done, unlink the request.
1360          */
1361         nfs_repdequeue(rep);
1362
1363         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1364
1365         /*
1366          * Decrement the outstanding request count.
1367          */
1368         if (rep->r_flags & R_SENT) {
1369                 rep->r_flags &= ~R_SENT;        /* paranoia */
1370                 if (nmp) {
1371                         FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1372                         nmp->nm_sent -= NFS_CWNDSCALE;
1373                 }
1374         }
1375
1376         /*
1377          * If there was a successful reply and a tprintf msg.
1378          * tprintf a response.
1379          */
1380         if (!error)
1381                 nfs_up(nmp, procp, NFSSTA_TIMEO,
1382                         (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1383         mrep = rep->r_mrep;
1384         md = rep->r_md;
1385         dpos = rep->r_dpos;
1386         if (!error && !nmp)
1387                 error = ENXIO;
1388         if (error) {
1389                 mbuf_freem(rep->r_mreq);
1390                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1391                 return (error);
1392         }
1393
1394         /*
1395          * break down the rpc header and check if ok
1396          */
1397         nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1398         if (*tl++ == rpc_msgdenied) {
1399                 if (*tl == rpc_mismatch)
1400                         error = EOPNOTSUPP;
1401                 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1402                         if (!failed_auth) {
1403                                 failed_auth++;
1404                                 error = mbuf_setnext(mheadend, NULL);
1405                                 mbuf_freem(mrep);
1406                                 mbuf_freem(rep->r_mreq);
1407                                 if (!error)
1408                                         goto kerbauth;
1409                                 printf("nfs_request: mbuf_setnext failed\n");
1410                         } else
1411                                 error = EAUTH;
1412                 } else
1413                         error = EACCES;
1414                 mbuf_freem(mrep);
1415                 mbuf_freem(rep->r_mreq);
1416                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1417                 return (error);
1418         }
1419
1420         /*
1421          * Grab any Kerberos verifier, otherwise just throw it away.
1422          */
1423         verf_type = fxdr_unsigned(int, *tl++);
1424         i = fxdr_unsigned(int, *tl);
1425         if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1426                 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1427                 if (error)
1428                         goto nfsmout;
1429         } else if (i > 0)
1430                 nfsm_adv(nfsm_rndup(i));
1431         nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1432         /* 0 == ok */
1433         if (*tl == 0) {
1434                 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1435                 if (*tl != 0) {
1436                         error = fxdr_unsigned(int, *tl);
1437                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1438                                 error == NFSERR_TRYLATER) {
1439                                 mbuf_freem(mrep);
1440                                 error = 0;
1441                                 microuptime(&now);
1442                                 waituntil = now.tv_sec + trylater_delay;
1443                                 while (now.tv_sec < waituntil) {
1444                                         tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1445                                         microuptime(&now);
1446                                 }
1447                                 trylater_delay *= 2;
1448                                 if (trylater_delay > 60)
1449                                         trylater_delay = 60;
1450                                 goto tryagain;
1451                         }
1452
1453                         /*
1454                          * If the File Handle was stale, invalidate the
1455                          * lookup cache, just in case.
1456                          */
1457                         if ((error == ESTALE) && vp)
1458                                 cache_purge(vp);
1459                         if (nmp->nm_flag & NFSMNT_NFSV3) {
1460                                 *mrp = mrep;
1461                                 *mdp = md;
1462                                 *dposp = dpos;
1463                                 error |= NFSERR_RETERR;
1464                         } else {
1465                                 mbuf_freem(mrep);
1466                                 error &= ~NFSERR_RETERR;
1467                         }
1468                         mbuf_freem(rep->r_mreq);
1469                         FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1470                         return (error);
1471                 }
1472
1473                 *mrp = mrep;
1474                 *mdp = md;
1475                 *dposp = dpos;
1476                 mbuf_freem(rep->r_mreq);
1477                 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1478                 return (0);
1479         }
1480         mbuf_freem(mrep);
1481         error = EPROTONOSUPPORT;
1482 nfsmout:
1483         mbuf_freem(rep->r_mreq);
1484         FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1485         return (error);
1486 }
1487
1488 #ifndef NFS_NOSERVER
1489 /*
1490  * Generate the rpc reply header
1491  * siz arg. is used to decide if adding a cluster is worthwhile
1492  */
1493 int
1494 nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
1495         int siz;
1496         struct nfsrv_descript *nd;
1497         struct nfssvc_sock *slp;
1498         int err;
1499         mbuf_t *mrq;
1500         mbuf_t *mbp;
1501         caddr_t *bposp;
1502 {
1503         u_long *tl;
1504         mbuf_t mreq;
1505         caddr_t bpos;
1506         mbuf_t mb, mb2;
1507         int error, mlen;
1508
1509         /*
1510          * If this is a big reply, use a cluster else
1511          * try and leave leading space for the lower level headers.
1512          */
1513         siz += RPC_REPLYSIZ;
1514         if (siz >= nfs_mbuf_minclsize) {
1515                 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1516         } else {
1517                 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1518         }
1519         if (error) {
1520                 /* unable to allocate packet */
1521                 /* XXX nfsstat? */
1522                 return (error);
1523         }
1524         mb = mreq;
1525         tl = mbuf_data(mreq);
1526         mlen = 6 * NFSX_UNSIGNED;
1527         if (siz < nfs_mbuf_minclsize) {
1528                 /* leave space for lower level headers */
1529                 tl += 80/sizeof(*tl);  /* XXX max_hdr? XXX */
1530                 mbuf_setdata(mreq, tl, mlen);
1531         } else {
1532                 mbuf_setlen(mreq, mlen);
1533         }
1534         bpos = ((caddr_t)tl) + mlen;
1535         *tl++ = txdr_unsigned(nd->nd_retxid);
1536         *tl++ = rpc_reply;
1537         if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1538                 *tl++ = rpc_msgdenied;
1539                 if (err & NFSERR_AUTHERR) {
1540                         *tl++ = rpc_autherr;
1541                         *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1542                         mlen -= NFSX_UNSIGNED;
1543                         mbuf_setlen(mreq, mlen);
1544                         bpos -= NFSX_UNSIGNED;
1545                 } else {
1546                         *tl++ = rpc_mismatch;
1547                         *tl++ = txdr_unsigned(RPC_VER2);
1548                         *tl = txdr_unsigned(RPC_VER2);
1549                 }
1550         } else {
1551                 *tl++ = rpc_msgaccepted;
1552
1553                 /*
1554                  * For Kerberos authentication, we must send the nickname
1555                  * verifier back, otherwise just RPCAUTH_NULL.
1556                  */
1557                 if (nd->nd_flag & ND_KERBFULL) {
1558                     struct nfsuid *nuidp;
1559                     struct timeval ktvin, ktvout;
1560                     uid_t uid = kauth_cred_getuid(nd->nd_cr);
1561
1562                     lck_rw_lock_shared(&slp->ns_rwlock);
1563                     for (nuidp = NUIDHASH(slp, uid)->lh_first;
1564                         nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1565                         if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1566                             (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1567                              &nuidp->nu_haddr, nd->nd_nam2)))
1568                             break;
1569                     }
1570                     if (nuidp) {
1571                         ktvin.tv_sec =
1572                             txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1573                         ktvin.tv_usec =
1574                             txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1575
1576                         /*
1577                          * Encrypt the timestamp in ecb mode using the
1578                          * session key.
1579                          */
1580 #if NFSKERB
1581                         XXX
1582 #endif
1583
1584                         *tl++ = rpc_auth_kerb;
1585                         *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1586                         *tl = ktvout.tv_sec;
1587                         nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1588                         *tl++ = ktvout.tv_usec;
1589                         *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1590                     } else {
1591                         *tl++ = 0;
1592                         *tl++ = 0;
1593                     }
1594                     lck_rw_done(&slp->ns_rwlock);
1595                 } else {
1596                         *tl++ = 0;
1597                         *tl++ = 0;
1598                 }
1599                 switch (err) {
1600                 case EPROGUNAVAIL:
1601                         *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1602                         break;
1603                 case EPROGMISMATCH:
1604                         *tl = txdr_unsigned(RPC_PROGMISMATCH);
1605                         nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1606                         // XXX hard coded versions
1607                         *tl++ = txdr_unsigned(2);
1608                         *tl = txdr_unsigned(3);
1609                         break;
1610                 case EPROCUNAVAIL:
1611                         *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1612                         break;
1613                 case EBADRPC:
1614                         *tl = txdr_unsigned(RPC_GARBAGE);
1615                         break;
1616                 default:
1617                         *tl = 0;
1618                         if (err != NFSERR_RETVOID) {
1619                                 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1620                                 if (err)
1621                                     *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1622                                 else
1623                                     *tl = 0;
1624                         }
1625                         break;
1626                 }
1627         }
1628
1629         if (mrq != NULL)
1630                 *mrq = mreq;
1631         *mbp = mb;
1632         *bposp = bpos;
1633         if (err != 0 && err != NFSERR_RETVOID) {
1634                 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1635         }
1636         return (0);
1637 }
1638
1639
1640 #endif /* NFS_NOSERVER */
1641
1642
1643 /*
1644  * From FreeBSD 1.58, a Matt Dillon fix...
1645  * Flag a request as being about to terminate.
1646  * The nm_sent count is decremented now to avoid deadlocks when the process
1647  * in soreceive() hasn't yet managed to send its own request.
1648  */
1649 static void
1650 nfs_softterm(struct nfsreq *rep)
1651 {
1652
1653         rep->r_flags |= R_SOFTTERM;
1654         if (rep->r_flags & R_SENT) {
1655                 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1656                       rep->r_nmp->nm_cwnd);
1657                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1658                 rep->r_flags &= ~R_SENT;
1659         }
1660 }
1661
1662 void
1663 nfs_timer_funnel(void * arg)
1664 {
1665         (void) thread_funnel_set(kernel_flock, TRUE);
1666         nfs_timer(arg);
1667         (void) thread_funnel_set(kernel_flock, FALSE);
1668
1669 }
1670
1671 /*
1672  * Ensure rep isn't in use by the timer, then dequeue it.
1673  */
1674 static void
1675 nfs_repdequeue(struct nfsreq *rep)
1676 {
1677
1678         while ((rep->r_flags & R_BUSY)) {
1679                 rep->r_flags |= R_WAITING;
1680                 tsleep(rep, PSOCK, "repdeq", 0);
1681         }
1682         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1683 }
1684
1685 /*
1686  * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1687  * free()'d out from under it.
1688  */
1689 static void
1690 nfs_repbusy(struct nfsreq *rep)
1691 {
1692
1693         if ((rep->r_flags & R_BUSY))
1694                 panic("rep locked");
1695         rep->r_flags |= R_BUSY;
1696 }
1697
1698 /*
1699  * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1700  */
1701 static struct nfsreq *
1702 nfs_repnext(struct nfsreq *rep)
1703 {
1704         struct nfsreq * nextrep;
1705
1706         if (rep == NULL)
1707                 return (NULL);
1708         /*
1709          * We need to get and busy the next req before signalling the
1710          * current one, otherwise wakeup() may block us and we'll race to
1711          * grab the next req.
1712          */
1713         nextrep = TAILQ_NEXT(rep, r_chain);
1714         if (nextrep != NULL)
1715                 nfs_repbusy(nextrep);
1716         /* unbusy and signal. */
1717         rep->r_flags &= ~R_BUSY;
1718         if ((rep->r_flags & R_WAITING)) {
1719                 rep->r_flags &= ~R_WAITING;
1720                 wakeup(rep);
1721         }
1722         return (nextrep);
1723 }
1724
1725 /*
1726  * Nfs timer routine
1727  * Scan the nfsreq list and retranmit any requests that have timed out
1728  * To avoid retransmission attempts on STREAM sockets (in the future) make
1729  * sure to set the r_retry field to 0 (implies nm_retry == 0).
1730  */
1731 void
1732 nfs_timer(__unused void *arg)
1733 {
1734         struct nfsreq *rep;
1735         mbuf_t m;
1736         socket_t so;
1737         struct nfsmount *nmp;
1738         int timeo;
1739         int error;
1740 #ifndef NFS_NOSERVER
1741         struct nfssvc_sock *slp;
1742         u_quad_t cur_usec;
1743 #endif /* NFS_NOSERVER */
1744         int flags, rexmit, cwnd, sent;
1745         u_long xid;
1746         struct timeval now;
1747
1748         rep = TAILQ_FIRST(&nfs_reqq);
1749         if (rep != NULL)
1750                 nfs_repbusy(rep);
1751         microuptime(&now);
1752         for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1753                 nmp = rep->r_nmp;
1754                 if (!nmp) /* unmounted */
1755                     continue;
1756                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1757                         continue;
1758                 if (nfs_sigintr(nmp, rep, rep->r_procp))
1759                         continue;
1760                 if (nmp->nm_tprintf_initial_delay != 0 &&
1761                     (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1762                     rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1763                         rep->r_lastmsg = now.tv_sec;
1764                         nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1765                                 "not responding");
1766                         rep->r_flags |= R_TPRINTFMSG;
1767                         if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1768                                 /* we're not yet completely mounted and */
1769                                 /* we can't complete an RPC, so we fail */
1770                                 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1771                                 nfs_softterm(rep);
1772                                 continue;
1773                         }
1774                 }
1775                 if (rep->r_rtt >= 0) {
1776                         rep->r_rtt++;
1777                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1778                                 timeo = nmp->nm_timeo;
1779                         else
1780                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1781                         /* ensure 62.5 ms floor */
1782                         while (16 * timeo < hz)
1783                             timeo *= 2;
1784                         if (nmp->nm_timeouts > 0)
1785                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1786                         if (rep->r_rtt <= timeo)
1787                                 continue;
1788                         if (nmp->nm_timeouts < 8)
1789                                 nmp->nm_timeouts++;
1790                 }
1791                 /*
1792                  * Check for too many retransmits.  This is never true for
1793                  * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1794                  * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1795                  */
1796                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
1797                         OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1798                         nfs_softterm(rep);
1799                         continue;
1800                 }
1801                 if (nmp->nm_sotype != SOCK_DGRAM) {
1802                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1803                                 rep->r_rexmit = NFS_MAXREXMIT;
1804                         continue;
1805                 }
1806                 if ((so = nmp->nm_so) == NULL)
1807                         continue;
1808
1809                 /*
1810                  * If there is enough space and the window allows..
1811                  *      Resend it
1812                  * Set r_rtt to -1 in case we fail to send it now.
1813                  */
1814                 rep->r_rtt = -1;
1815                 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1816                     (rep->r_flags & R_SENT) ||
1817                     nmp->nm_sent < nmp->nm_cwnd) &&
1818                    (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1819                         struct msghdr   msg;
1820                         /*
1821                          * Iff first send, start timing
1822                          * else turn timing off, backoff timer
1823                          * and divide congestion window by 2.
1824                          * We update these *before* the send to avoid
1825                          * racing against receiving the reply.
1826                          * We save them so we can restore them on send error.
1827                          */
1828                         flags = rep->r_flags;
1829                         rexmit = rep->r_rexmit;
1830                         cwnd = nmp->nm_cwnd;
1831                         sent = nmp->nm_sent;
1832                         xid = rep->r_xid;
1833                         if (rep->r_flags & R_SENT) {
1834                                 rep->r_flags &= ~R_TIMING;
1835                                 if (++rep->r_rexmit > NFS_MAXREXMIT)
1836                                         rep->r_rexmit = NFS_MAXREXMIT;
1837                                 nmp->nm_cwnd >>= 1;
1838                                 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1839                                         nmp->nm_cwnd = NFS_CWNDSCALE;
1840                                 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1841                         } else {
1842                                 rep->r_flags |= R_SENT;
1843                                 nmp->nm_sent += NFS_CWNDSCALE;
1844                         }
1845                         FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1846
1847                         bzero(&msg, sizeof(msg));
1848                         if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1849                                 msg.msg_name = mbuf_data(nmp->nm_nam);
1850                                 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1851                         }
1852                         error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1853
1854                         FSDBG(535, xid, error, sent, cwnd);
1855
1856                         if (error) {
1857                                 if (error == EWOULDBLOCK) {
1858                                         rep->r_flags = flags;
1859                                         rep->r_rexmit = rexmit;
1860                                         nmp->nm_cwnd = cwnd;
1861                                         nmp->nm_sent = sent;
1862                                         rep->r_xid = xid;
1863                                 }
1864                                 else {
1865                                         if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1866                                                 int clearerror;
1867                                                 int optlen = sizeof(clearerror);
1868                                                 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1869                                         }
1870                                         rep->r_flags  = flags | R_RESENDERR;
1871                                         rep->r_rexmit = rexmit;
1872                                         nmp->nm_cwnd = cwnd;
1873                                         nmp->nm_sent = sent;
1874                                         if (flags & R_SENT)
1875                                                 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1876                                 }
1877                         } else
1878                                 rep->r_rtt = 0;
1879                 }
1880         }
1881         microuptime(&now);
1882 #ifndef NFS_NOSERVER
1883         /*
1884          * Scan the write gathering queues for writes that need to be
1885          * completed now.
1886          */
1887         cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1888         lck_mtx_lock(nfsd_mutex);
1889         TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1890             if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1891                 nfsrv_wakenfsd(slp);
1892         }
1893         while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1894                 if ((slp->ns_timestamp + 5) > now.tv_sec)
1895                         break;
1896                 TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1897                 nfsrv_slpfree(slp);
1898         }
1899         lck_mtx_unlock(nfsd_mutex);
1900 #endif /* NFS_NOSERVER */
1901
1902         if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1903                 /*
1904                  * We haven't called nfs_buf_freeup() in a little while.
1905                  * So, see if we can free up any stale/unused bufs now.
1906                  */
1907                 nfs_buf_freeup(1);
1908         }
1909
1910         timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1911
1912 }
1913
1914
1915 /*
1916  * Test for a termination condition pending on the process.
1917  * This is used to determine if we need to bail on a mount.
1918  * EIO is returned if there has been a soft timeout.
1919  * EINTR is returned if there is a signal pending that is not being ignored
1920  * and the mount is interruptable, or if we are a thread that is in the process
1921  * of cancellation (also SIGKILL posted).
1922  */
1923 int
1924 nfs_sigintr(nmp, rep, p)
1925         struct nfsmount *nmp;
1926         struct nfsreq *rep;
1927         proc_t p;
1928 {
1929         sigset_t pending_sigs;
1930         int context_good = 0;
1931         struct nfsmount *repnmp;
1932         extern proc_t kernproc;
1933
1934         if (nmp == NULL)
1935                 return (ENXIO);
1936         if (rep != NULL) {
1937                 repnmp = rep->r_nmp;
1938                 /* we've had a forced unmount. */
1939                 if (repnmp == NULL)
1940                         return (ENXIO);
1941                 /* request has timed out on a 'soft' mount. */
1942                 if (rep->r_flags & R_SOFTTERM)
1943                         return (EIO);
1944                 /*
1945                  * We're in the progress of a force unmount and there's
1946                  * been a timeout we're dead and fail IO.
1947                  */
1948                 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1949                    (NFSSTA_FORCE|NFSSTA_TIMEO))
1950                         return (EIO);
1951                 /* Someone is unmounting us, go soft and mark it. */
1952                 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1953                         repnmp->nm_flag |= NFSMNT_SOFT;
1954                         nmp->nm_state |= NFSSTA_FORCE;
1955                 }
1956                 /*
1957                  * If the mount is hung and we've requested not to hang
1958                  * on remote filesystems, then bail now.
1959                  */
1960                 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1961                     (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1962                         return (EIO);
1963         }
1964         /* XXX: is this valid?  this probably should be an assertion. */
1965         if (p == NULL)
1966                 return (0);
1967
1968         /* Is this thread belongs to kernel task; then abort check  is not needed */
1969         if ((current_proc() != kernproc) && current_thread_aborted()) {
1970                 return (EINTR);
1971         }
1972         /* mask off thread and process blocked signals. */
1973
1974         pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1975         if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1976                 return (EINTR);
1977         return (0);
1978 }
1979
1980 /*
1981  * Lock a socket against others.
1982  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1983  * and also to avoid race conditions between the processes with nfs requests
1984  * in progress when a reconnect is necessary.
1985  */
1986 int
1987 nfs_sndlock(rep)
1988         struct nfsreq *rep;
1989 {
1990         int *statep;
1991         proc_t p;
1992         int error, slpflag = 0, slptimeo = 0;
1993
1994         if (rep->r_nmp == NULL)
1995                 return (ENXIO);
1996         statep = &rep->r_nmp->nm_state;
1997
1998         p = rep->r_procp;
1999         if (rep->r_nmp->nm_flag & NFSMNT_INT)
2000                 slpflag = PCATCH;
2001         while (*statep & NFSSTA_SNDLOCK) {
2002                 error = nfs_sigintr(rep->r_nmp, rep, p);
2003                 if (error)
2004                         return (error);
2005                 *statep |= NFSSTA_WANTSND;
2006                 if (p != NULL && (proc_noremotehang(p)) != 0)
2007                         slptimeo = hz;
2008                 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2009                 if (slpflag == PCATCH) {
2010                         slpflag = 0;
2011                         slptimeo = 2 * hz;
2012                 }
2013                 /*
2014                  * Make sure while we slept that the mountpoint didn't go away.
2015                  * nfs_sigintr and callers expect it in tact.
2016                  */
2017                 if (!rep->r_nmp)
2018                         return (ENXIO); /* don't have lock until out of loop */
2019         }
2020         *statep |= NFSSTA_SNDLOCK;
2021         return (0);
2022 }
2023
2024 /*
2025  * Unlock the stream socket for others.
2026  */
2027 void
2028 nfs_sndunlock(rep)
2029         struct nfsreq *rep;
2030 {
2031         int *statep;
2032
2033         if (rep->r_nmp == NULL)
2034                 return;
2035         statep = &rep->r_nmp->nm_state;
2036         if ((*statep & NFSSTA_SNDLOCK) == 0)
2037                 panic("nfs sndunlock");
2038         *statep &= ~NFSSTA_SNDLOCK;
2039         if (*statep & NFSSTA_WANTSND) {
2040                 *statep &= ~NFSSTA_WANTSND;
2041                 wakeup((caddr_t)statep);
2042         }
2043 }
2044
2045 static int
2046 nfs_rcvlock(struct nfsreq *rep)
2047 {
2048         int *statep;
2049         int error, slpflag, slptimeo = 0;
2050
2051         /* make sure we still have our mountpoint */
2052         if (!rep->r_nmp) {
2053                 if (rep->r_mrep != NULL)
2054                         return (EALREADY);
2055                 return (ENXIO);
2056         }
2057
2058         statep = &rep->r_nmp->nm_state;
2059         FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2060         if (rep->r_nmp->nm_flag & NFSMNT_INT)
2061                 slpflag = PCATCH;
2062         else
2063                 slpflag = 0;
2064         while (*statep & NFSSTA_RCVLOCK) {
2065                 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2066                         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2067                         return (error);
2068                 } else if (rep->r_mrep != NULL) {
2069                         /*
2070                          * Don't bother sleeping if reply already arrived
2071                          */
2072                         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2073                         return (EALREADY);
2074                 }
2075                 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2076                 *statep |= NFSSTA_WANTRCV;
2077                 /*
2078                  * We need to poll if we're P_NOREMOTEHANG so that we
2079                  * call nfs_sigintr periodically above.
2080                  */
2081                 if (rep->r_procp != NULL &&
2082                     (proc_noremotehang(rep->r_procp)) != 0)
2083                         slptimeo = hz;
2084                 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2085                 if (slpflag == PCATCH) {
2086                         slpflag = 0;
2087                         slptimeo = 2 * hz;
2088                 }
2089                 /*
2090                  * Make sure while we slept that the mountpoint didn't go away.
2091                  * nfs_sigintr and caller nfs_reply expect it intact.
2092                  */
2093                 if (!rep->r_nmp)  {
2094                         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2095                         return (ENXIO); /* don't have lock until out of loop */
2096                 }
2097         }
2098         /*
2099          * nfs_reply will handle it if reply already arrived.
2100          * (We may have slept or been preempted).
2101          */
2102         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2103         *statep |= NFSSTA_RCVLOCK;
2104         return (0);
2105 }
2106
2107 /*
2108  * Unlock the stream socket for others.
2109  */
2110 static void
2111 nfs_rcvunlock(struct nfsreq *rep)
2112 {
2113         int *statep;
2114
2115         if (rep->r_nmp == NULL)
2116                 return;
2117         statep = &rep->r_nmp->nm_state;
2118
2119         FSDBG(533, statep, *statep, 0, 0);
2120         if ((*statep & NFSSTA_RCVLOCK) == 0)
2121                 panic("nfs rcvunlock");
2122         *statep &= ~NFSSTA_RCVLOCK;
2123         if (*statep & NFSSTA_WANTRCV) {
2124                 *statep &= ~NFSSTA_WANTRCV;
2125                 wakeup((caddr_t)statep);
2126         }
2127 }
2128
2129
2130 #ifndef NFS_NOSERVER
2131 /*
2132  * Socket upcall routine for the nfsd sockets.
2133  * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2134  * Essentially do as much as possible non-blocking, else punt and it will
2135  * be called with MBUF_WAITOK from an nfsd.
2136  */
2137 void
2138 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2139 {
2140         struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2141
2142         if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2143                 return;
2144
2145         lck_rw_lock_exclusive(&slp->ns_rwlock);
2146         nfsrv_rcv_locked(so, slp, waitflag);
2147         /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2148 }
2149 void
2150 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2151 {
2152         mbuf_t m, mp, mhck, m2;
2153         int ns_flag=0, error;
2154         struct msghdr   msg;
2155         size_t bytes_read;
2156
2157         if ((slp->ns_flag & SLP_VALID) == 0) {
2158                 if (waitflag == MBUF_DONTWAIT)
2159                         lck_rw_done(&slp->ns_rwlock);
2160                 return;
2161         }
2162
2163 #ifdef notdef
2164         /*
2165          * Define this to test for nfsds handling this under heavy load.
2166          */
2167         if (waitflag == MBUF_DONTWAIT) {
2168                 ns_flag = SLP_NEEDQ;
2169                 goto dorecs;
2170         }
2171 #endif
2172         if (slp->ns_sotype == SOCK_STREAM) {
2173                 /*
2174                  * If there are already records on the queue, defer soreceive()
2175                  * to an nfsd so that there is feedback to the TCP layer that
2176                  * the nfs servers are heavily loaded.
2177                  */
2178                 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2179                         ns_flag = SLP_NEEDQ;
2180                         goto dorecs;
2181                 }
2182
2183                 /*
2184                  * Do soreceive().
2185                  */
2186                 bytes_read = 1000000000;
2187                 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2188                 if (error || mp == NULL) {
2189                         if (error == EWOULDBLOCK)
2190                                 ns_flag = SLP_NEEDQ;
2191                         else
2192                                 ns_flag = SLP_DISCONN;
2193                         goto dorecs;
2194                 }
2195                 m = mp;
2196                 if (slp->ns_rawend) {
2197                         if ((error = mbuf_setnext(slp->ns_rawend, m)))
2198                                 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2199                         slp->ns_cc += bytes_read;
2200                 } else {
2201                         slp->ns_raw = m;
2202                         slp->ns_cc = bytes_read;
2203                 }
2204                 while ((m2 = mbuf_next(m)))
2205                         m = m2;
2206                 slp->ns_rawend = m;
2207
2208                 /*
2209                  * Now try and parse record(s) out of the raw stream data.
2210                  */
2211                 error = nfsrv_getstream(slp, waitflag);
2212                 if (error) {
2213                         if (error == EPERM)
2214                                 ns_flag = SLP_DISCONN;
2215                         else
2216                                 ns_flag = SLP_NEEDQ;
2217                 }
2218         } else {
2219                 struct sockaddr_storage nam;
2220
2221                 bzero(&msg, sizeof(msg));
2222                 msg.msg_name = (caddr_t)&nam;
2223                 msg.msg_namelen = sizeof(nam);
2224
2225                 do {
2226                         bytes_read = 1000000000;
2227                         error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2228                         if (mp) {
2229                                 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2230                                         mbuf_setlen(mhck, nam.ss_len);
2231                                         bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2232                                         m = mhck;
2233                                         if (mbuf_setnext(m, mp)) {
2234                                                 /* trouble... just drop it */
2235                                                 printf("nfsrv_rcv: mbuf_setnext failed\n");
2236                                                 mbuf_free(mhck);
2237                                                 m = mp;
2238                                         }
2239                                 } else {
2240                                         m = mp;
2241                                 }
2242                                 if (slp->ns_recend)
2243                                         mbuf_setnextpkt(slp->ns_recend, m);
2244                                 else
2245                                         slp->ns_rec = m;
2246                                 slp->ns_recend = m;
2247                                 mbuf_setnextpkt(m, NULL);
2248                         }
2249 #if 0
2250                         if (error) {
2251                                 /*
2252                                  * This may be needed in the future to support
2253                                  * non-byte-stream connection-oriented protocols
2254                                  * such as SCTP.
2255                                  */
2256                                 /*
2257                                  * This (slp->ns_sotype == SOCK_STREAM) should really
2258                                  * be a check for PR_CONNREQUIRED.
2259                                  */
2260                                 if ((slp->ns_sotype == SOCK_STREAM)
2261                                         && error != EWOULDBLOCK) {
2262                                         ns_flag = SLP_DISCONN;
2263                                         goto dorecs;
2264                                 }
2265                         }
2266 #endif
2267                 } while (mp);
2268         }
2269
2270         /*
2271          * Now try and process the request records, non-blocking.
2272          */
2273 dorecs:
2274         if (ns_flag)
2275                 slp->ns_flag |= ns_flag;
2276         if (waitflag == MBUF_DONTWAIT) {
2277                 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2278                 lck_rw_done(&slp->ns_rwlock);
2279                 if (wake && nfs_numnfsd) {
2280                         lck_mtx_lock(nfsd_mutex);
2281                         nfsrv_wakenfsd(slp);
2282                         lck_mtx_unlock(nfsd_mutex);
2283                 }
2284         }
2285 }
2286
2287 /*
2288  * Try and extract an RPC request from the mbuf data list received on a
2289  * stream socket. The "waitflag" argument indicates whether or not it
2290  * can sleep.
      *
      * Operates on the raw chain slp->ns_raw (slp->ns_cc bytes).  Each pass
      * of the loop reads a 4-byte record mark if no fragment is in progress
      * (slp->ns_reclen == 0), then detaches ns_reclen bytes as one fragment.
      * Fragments are chained on slp->ns_frag; when the fragment flagged
      * SLP_LASTFRAG arrives, the record is appended to the ns_rec/ns_recend
      * packet queue.
      *
      * waitflag is passed through to mbuf_copym() as its "how" argument.
      *
      * Returns:
      *   0            - not enough data buffered yet for the next mark or
      *                  fragment (state is kept for the next call)
      *   EPERM        - record length outside NFS_MINPACKET..NFS_MAXPACKET
      *   EWOULDBLOCK  - mbuf_copym() or mbuf_setnext() failed
      *
      * SLP_GETSTREAM is set for the duration of the call and cleared on
      * every return path; finding it already set on entry is a panic.
2291  */
2292 static int
2293 nfsrv_getstream(slp, waitflag)
2294         struct nfssvc_sock *slp;
2295         int waitflag;
2296 {
2297         mbuf_t m;
2298         char *cp1, *cp2, *mdata;
2299         int len, mlen, error;
2300         mbuf_t om, m2, recm;
2301         u_long recmark;
2302
2303         if (slp->ns_flag & SLP_GETSTREAM)
2304                 panic("nfs getstream");
2305         slp->ns_flag |= SLP_GETSTREAM;
2306         for (;;) {
                 /* No fragment in progress: the next thing in the stream is a record mark. */
2307             if (slp->ns_reclen == 0) {
2308                 if (slp->ns_cc < NFSX_UNSIGNED) {
2309                         slp->ns_flag &= ~SLP_GETSTREAM;
2310                         return (0);
2311                 }
2312                 m = slp->ns_raw;
2313                 mdata = mbuf_data(m);
2314                 mlen = mbuf_len(m);
2315                 if (mlen >= NFSX_UNSIGNED) {
                             /* Whole mark lies in the first mbuf: copy it and trim it off. */
2316                         bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2317                         mdata += NFSX_UNSIGNED;
2318                         mlen -= NFSX_UNSIGNED;
2319                         mbuf_setdata(m, mdata, mlen);
2320                 } else {
                             /*
                              * Mark straddles mbufs: consume it one byte at a
                              * time, stepping over exhausted mbufs.
                              */
2321                         cp1 = (caddr_t)&recmark;
2322                         cp2 = mdata;
2323                         while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2324                                 while (mlen == 0) {
2325                                         m = mbuf_next(m);
2326                                         cp2 = mbuf_data(m);
2327                                         mlen = mbuf_len(m);
2328                                 }
2329                                 *cp1++ = *cp2++;
2330                                 mlen--;
2331                                 mbuf_setdata(m, cp2, mlen);
2332                         }
2333                 }
2334                 slp->ns_cc -= NFSX_UNSIGNED;
                     /* Top bit marks the last fragment; the other bits are the length. */
2335                 recmark = ntohl(recmark);
2336                 slp->ns_reclen = recmark & ~0x80000000;
2337                 if (recmark & 0x80000000)
2338                         slp->ns_flag |= SLP_LASTFRAG;
2339                 else
2340                         slp->ns_flag &= ~SLP_LASTFRAG;
                     /*
                      * NOTE(review): the comment below says ns_reclen may be 0, yet
                      * lengths under NFS_MINPACKET are rejected here; NFS_MINPACKET
                      * is defined elsewhere - confirm which is intended.
                      */
2341                 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2342                         slp->ns_flag &= ~SLP_GETSTREAM;
2343                         return (EPERM);
2344                 }
2345             }
2346
2347             /*
2348              * Now get the record part.
2349              *
2350              * Note that slp->ns_reclen may be 0.  Linux sometimes
2351              * generates 0-length RPCs
2352              */
2353             recm = NULL;
2354             if (slp->ns_cc == slp->ns_reclen) {
                     /* Buffered data is exactly one fragment: take the whole raw chain. */
2355                 recm = slp->ns_raw;
2356                 slp->ns_raw = slp->ns_rawend = NULL;
2357                 slp->ns_cc = slp->ns_reclen = 0;
2358             } else if (slp->ns_cc > slp->ns_reclen) {
                     /* More than one fragment buffered: split the chain after ns_reclen bytes. */
2359                 len = 0;
2360                 m = slp->ns_raw;
2361                 mlen = mbuf_len(m);
2362                 mdata = mbuf_data(m);
2363                 om = NULL;
2364                 while (len < slp->ns_reclen) {
2365                         if ((len + mlen) > slp->ns_reclen) {
                                     /*
                                      * Boundary falls inside this mbuf: copy the
                                      * leading bytes into m2, hang m2 after the
                                      * previous mbuf (if any), then advance this
                                      * mbuf's data pointer past the copied bytes.
                                      */
2366                                 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2367                                         slp->ns_flag &= ~SLP_GETSTREAM;
2368                                         return (EWOULDBLOCK);
2369                                 }
2370                                 if (om) {
2371                                         if (mbuf_setnext(om, m2)) {
2372                                                 /* trouble... just drop it */
2373                                                 printf("nfsrv_getstream: mbuf_setnext failed\n");
2374                                                 mbuf_freem(m2);
2375                                                 slp->ns_flag &= ~SLP_GETSTREAM;
2376                                                 return (EWOULDBLOCK);
2377                                         }
2378                                         recm = slp->ns_raw;
2379                                 } else {
2380                                         recm = m2;
2381                                 }
2382                                 mdata += slp->ns_reclen - len;
2383                                 mlen -= slp->ns_reclen - len;
2384                                 mbuf_setdata(m, mdata, mlen);
2385                                 len = slp->ns_reclen;
2386                         } else if ((len + mlen) == slp->ns_reclen) {
                                     /* Boundary is exactly the end of this mbuf: cut the chain after it. */
2387                                 om = m;
2388                                 len += mlen;
2389                                 m = mbuf_next(m);
2390                                 recm = slp->ns_raw;
2391                                 if (mbuf_setnext(om, NULL)) {
2392                                         printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2393                                         slp->ns_flag &= ~SLP_GETSTREAM;
2394                                         return (EWOULDBLOCK);
2395                                 }
2396                                 mlen = mbuf_len(m);
2397                                 mdata = mbuf_data(m);
2398                         } else {
                                     /* Entire mbuf belongs to this fragment: keep walking. */
2399                                 om = m;
2400                                 len += mlen;
2401                                 m = mbuf_next(m);
2402                                 mlen = mbuf_len(m);
2403                                 mdata = mbuf_data(m);
2404                         }
2405                 }
                     /* What remains, starting at m, is the new head of the raw chain. */
2406                 slp->ns_raw = m;
2407                 slp->ns_cc -= len;
2408                 slp->ns_reclen = 0;
2409             } else {
                     /* Fragment not fully received; ns_reclen is kept for the next call. */
2410                 slp->ns_flag &= ~SLP_GETSTREAM;
2411                 return (0);
2412             }
2413
2414             /*
2415              * Accumulate the fragments into a record.
2416              */
2417             if (slp->ns_frag == NULL) {
2418                 slp->ns_frag = recm;
2419             } else {
2420                 m = slp->ns_frag;
2421                 while ((m2 = mbuf_next(m)))
2422                     m = m2;
2423                 if ((error = mbuf_setnext(m, recm)))
2424                     panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2425             }
                 /* Last fragment seen: move the assembled record to the ns_rec queue. */
2426             if (slp->ns_flag & SLP_LASTFRAG) {
2427                 if (slp->ns_recend)
2428                     mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2429                 else
2430                     slp->ns_rec = slp->ns_frag;
2431                 slp->ns_recend = slp->ns_frag;
2432                 slp->ns_frag = NULL;
2433             }
2434         }
2435 }
2436
2437 /*
2438  * Parse an RPC header.
2439  */
2440 int
2441 nfsrv_dorec(slp, nfsd, ndp)
2442         struct nfssvc_sock *slp;
2443         struct nfsd *nfsd;
2444         struct nfsrv_descript **ndp;
2445 {
2446         mbuf_t m;
2447         mbuf_t nam;
2448         struct nfsrv_descript *nd;
2449         int error;
2450
2451         *ndp = NULL;
2452         if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2453                 return (ENOBUFS);
2454         MALLOC_ZONE(nd, struct nfsrv_descript *,
2455                         sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2456         if (!nd)
2457                 return (ENOMEM);
2458         m = slp->ns_rec;
2459         slp->ns_rec = mbuf_nextpkt(m);
2460         if (slp->ns_rec)
2461                 mbuf_setnextpkt(m, NULL);
2462         else
2463                 slp->ns_recend = NULL;
2464         if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2465                 nam = m;
2466                 m = mbuf_next(m);
2467                 if ((error = mbuf_setnext(nam, NULL)))
2468                         panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2469         } else
2470                 nam = NULL;
2471         nd->nd_md = nd->nd_mrep = m;
2472         nd->nd_nam2 = nam;
2473         nd->nd_dpos = mbuf_data(m);
2474         error = nfs_getreq(nd, nfsd, TRUE);
2475         if (error) {
2476                 if (nam)
2477                         mbuf_freem(nam);
2478                 FREE_ZONE((caddr_t)nd,  sizeof *nd, M_NFSRVDESC);
2479                 return (error);
2480         }
2481         *ndp = nd;
2482         nfsd->nfsd_nd = nd;
2483         return (0);
2484 }
2485
2486 /*
2487  * Parse an RPC request
2488  * - verify it
2489  * - fill in the cred struct.
      *
      * nd          request descriptor; nd_mrep/nd_md/nd_dpos give the request
      *             mbuf chain and the current parse position on entry.
      * nfsd        server thread state; receives the Kerberos auth/verifier
      *             strings and is used to look up nickname credentials.
      * has_header  non-zero when the xid and message-direction words are
      *             still at the front of the request.
      *
      * Outcomes visible in this function:
      * - returns EBADRPC after freeing mrep when a length field is out of
      *   range or the message is not an RPC call;
      * - returns 0 with nd_repstat set and nd_procnum = NFSPROC_NOOP when the
      *   request is rejected (version/program/procedure/auth problems);
      * - returns 0 with nd_procnum = NFSPROC_NULL before any credential is
      *   parsed;
      * - otherwise returns 0 with nd_cr set (auth_unix, Kerberos nickname) or
      *   NFSD_NEEDAUTH flagged (Kerberos fullname), and nd_md/nd_dpos
      *   advanced past the credential and verifier.
      *
      * The nfsm_dissect/nfsm_adv/nfsm_mtouio macros are defined elsewhere and
      * operate on the locals md/dpos/mrep/error; the nfsmout label has no
      * explicit goto in this function, so it is presumably their failure
      * exit - confirm against the macro definitions.
2490  */
2491 int
2492 nfs_getreq(nd, nfsd, has_header)
2493         struct nfsrv_descript *nd;
2494         struct nfsd *nfsd;
2495         int has_header;
2496 {
2497         int len, i;
2498         u_long *tl;
2499         long t1;
2500         uio_t uiop;
2501         caddr_t dpos, cp2, cp;
2502         u_long nfsvers, auth_type;
2503         uid_t nickuid;
2504         int error = 0, ticklen;
2505         mbuf_t mrep, md;
2506         struct nfsuid *nuidp;
2507         uid_t user_id;
2508         gid_t group_id;
2509         int ngroups;
2510         struct ucred temp_cred;
2511         struct timeval tvin, tvout, now;
2512         char uio_buf[ UIO_SIZEOF(1) ];
2513 #if 0                           /* until encrypted keys are implemented */
2514         NFSKERBKEYSCHED_T keys; /* stores key schedule */
2515 #endif
2516
             /* nfsmout releases nd_cr only when it is non-NULL. */
2517         nd->nd_cr = NULL;
2518
2519         mrep = nd->nd_mrep;
2520         md = nd->nd_md;
2521         dpos = nd->nd_dpos;
2522         if (has_header) {
                     /* Ten words: xid and direction, then the eight read in the other case. */
2523                 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2524                 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2525                 if (*tl++ != rpc_call) {
2526                         mbuf_freem(mrep);
2527                         return (EBADRPC);
2528                 }
2529         } else
2530                 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2531         nd->nd_repstat = 0;
2532         nd->nd_flag = 0;
             /* RPC version, program and NFS version checks; mismatches get a NOOP reply status. */
2533         if (*tl++ != rpc_vers) {
2534                 nd->nd_repstat = ERPCMISMATCH;
2535                 nd->nd_procnum = NFSPROC_NOOP;
2536                 return (0);
2537         }
2538         if (*tl != nfs_prog) {
2539                 nd->nd_repstat = EPROGUNAVAIL;
2540                 nd->nd_procnum = NFSPROC_NOOP;
2541                 return (0);
2542         }
2543         tl++;
2544         nfsvers = fxdr_unsigned(u_long, *tl++);
2545         if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2546                 nd->nd_repstat = EPROGMISMATCH;
2547                 nd->nd_procnum = NFSPROC_NOOP;
2548                 return (0);
2549         }
2550         else if (nfsvers == NFS_VER3)
2551                 nd->nd_flag = ND_NFSV3;
2552         nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
             /* The NULL procedure returns here, before any credential parsing. */
2553         if (nd->nd_procnum == NFSPROC_NULL)
2554                 return (0);
2555         if ((nd->nd_procnum >= NFS_NPROCS) ||
2556                 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2557                 nd->nd_repstat = EPROCUNAVAIL;
2558                 nd->nd_procnum = NFSPROC_NOOP;
2559                 return (0);
2560         }
             /* Non-v3 requests: translate the procedure number through nfsv3_procid[]. */
2561         if ((nd->nd_flag & ND_NFSV3) == 0)
2562                 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
             /* auth_type is kept as read (no fxdr_unsigned) and compared to rpc_auth_* below. */
2563         auth_type = *tl++;
2564         len = fxdr_unsigned(int, *tl++);
2565         if (len < 0 || len > RPCAUTH_MAXSIZ) {
2566                 mbuf_freem(mrep);
2567                 return (EBADRPC);
2568         }
2569
2570         nd->nd_flag &= ~ND_KERBAUTH;
2571         /*
2572          * Handle auth_unix or auth_kerb.
2573          */
2574         if (auth_type == rpc_auth_unix) {
                     /*
                      * Pre-increment skips one word and reads a length that is
                      * bounded by NFS_MAXNAMLEN and then skipped with nfsm_adv
                      * (presumably the AUTH_UNIX stamp and machine name - the
                      * wire layout is not shown in this file).
                      */
2575                 len = fxdr_unsigned(int, *++tl);
2576                 if (len < 0 || len > NFS_MAXNAMLEN) {
2577                         mbuf_freem(mrep);
2578                         return (EBADRPC);
2579                 }
2580                 bzero(&temp_cred, sizeof(temp_cred));
2581                 nfsm_adv(nfsm_rndup(len));
2582                 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2583                 user_id = fxdr_unsigned(uid_t, *tl++);
2584                 group_id = fxdr_unsigned(gid_t, *tl++);
2585                 temp_cred.cr_groups[0] = group_id;
                     /* Count of additional group ids that follow. */
2586                 len = fxdr_unsigned(int, *tl);
2587                 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2588                         mbuf_freem(mrep);
2589                         return (EBADRPC);
2590                 }
2591                 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
                     /* Slot 0 holds group_id; ids that do not fit below NGROUPS are skipped. */
2592                 for (i = 1; i <= len; i++)
2593                     if (i < NGROUPS)
2594                         temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2595                     else
2596                         tl++;
2597                 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2598                 if (ngroups > 1)
2599                     nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
                     /* Skip one more word, then read the length of the data skipped below. */
2600                 len = fxdr_unsigned(int, *++tl);
2601                 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2602                         mbuf_freem(mrep);
2603                         return (EBADRPC);
2604                 }
2605                 temp_cred.cr_uid = user_id;
2606                 temp_cred.cr_ngroups = ngroups;
2607                 nd->nd_cr = kauth_cred_create(&temp_cred);
2608                 if (nd->nd_cr == NULL) {
2609                         nd->nd_repstat = ENOMEM;
2610                         nd->nd_procnum = NFSPROC_NOOP;
2611                         return (0);
2612                 }
2613                 if (len > 0)
2614                         nfsm_adv(nfsm_rndup(len));
2615         } else if (auth_type == rpc_auth_kerb) {
2616                 switch (fxdr_unsigned(int, *tl++)) {
2617                 case RPCAKN_FULLNAME:
                             /* The raw ticket-length word becomes the first 4 bytes of nfsd_authstr. */
2618                         ticklen = fxdr_unsigned(int, *tl);
2619                         *((u_long *)nfsd->nfsd_authstr) = *tl;
2620                         uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2621                                                 &uio_buf[0], sizeof(uio_buf));
2622                         if (!uiop) {
2623                                 nd->nd_repstat = ENOMEM;
2624                                 nd->nd_procnum = NFSPROC_NOOP;
2625                                 return (0);
2626                         }
2627
2628                         // LP64todo - fix this
2629                         nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2630                         if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2631                                 mbuf_freem(mrep);
2632                                 return (EBADRPC);
2633                         }
                             /*
                              * NOTE(review): the iovec length is RPCAUTH_MAXSIZ - 4 and
                              * the copy length is uio_resid(uiop), neither derived from
                              * ticklen; ticklen is also not checked for being negative.
                              * Confirm the intended copy size.
                              */
2634                         uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2635                         // LP64todo - fix this
2636                         nfsm_mtouio(uiop, uio_resid(uiop));
                             /* Verifier must be rpc_auth_kerb with a 4-word body. */
2637                         nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2638                         if (*tl++ != rpc_auth_kerb ||
2639                                 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2640                                 printf("Bad kerb verifier\n");
2641                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2642                                 nd->nd_procnum = NFSPROC_NOOP;
2643                                 return (0);
2644                         }
2645                         nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2646                         tl = (u_long *)cp;
2647                         if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2648                                 printf("Not fullname kerb verifier\n");
2649                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2650                                 nd->nd_procnum = NFSPROC_NOOP;
2651                                 return (0);
2652                         }
                             /* Keep the three words after the type word as the verifier string. */
2653                         cp += NFSX_UNSIGNED;
2654                         bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2655                         nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2656                         nd->nd_flag |= ND_KERBFULL;
2657                         nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2658                         break;
2659                 case RPCAKN_NICKNAME:
2660                         if (len != 2 * NFSX_UNSIGNED) {
2661                                 printf("Kerb nickname short\n");
2662                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2663                                 nd->nd_procnum = NFSPROC_NOOP;
2664                                 return (0);
2665                         }
2666                         nickuid = fxdr_unsigned(uid_t, *tl);
                             /* Verifier must be rpc_auth_kerb with a 3-word body. */
2667                         nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2668                         if (*tl++ != rpc_auth_kerb ||
2669                                 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2670                                 printf("Kerb nick verifier bad\n");
2671                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2672                                 nd->nd_procnum = NFSPROC_NOOP;
2673                                 return (0);
2674                         }
2675                         nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
                             /* tvin is stored here but not read again in the visible code. */
2676                         tvin.tv_sec = *tl++;
2677                         tvin.tv_usec = *tl;
2678
                             /*
                              * Look up the cached credential for this nickname uid; when
                              * nd_nam2 is set the entry must also pass netaddr_match().
                              */
2679                         for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2680                             nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2681                                 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2682                                     (!nd->nd_nam2 ||
2683                                      netaddr_match(NU_NETFAM(nuidp),
2684                                       &nuidp->nu_haddr, nd->nd_nam2)))
2685                                         break;
2686                         }
2687                         if (!nuidp) {
2688                                 nd->nd_repstat =
2689                                         (NFSERR_AUTHERR|AUTH_REJECTCRED);
2690                                 nd->nd_procnum = NFSPROC_NOOP;
2691                                 return (0);
2692                         }
2693
2694                         /*
2695                          * Now, decrypt the timestamp using the session key
2696                          * and validate it.
2697                          */
2698 #if NFSKERB
2699                         XXX
2700 #endif
2701
                             /*
                              * NOTE(review): the decryption step above is only a
                              * placeholder, so nothing visible assigns tvout before
                              * it is read here.
                              */
2702                         tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2703                         tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2704                         microtime(&now);
                             /* Reject an expired entry or a timestamp older than the stored one. */
2705                         if (nuidp->nu_expire < now.tv_sec ||
2706                             nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2707                             (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2708                              nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2709                                 nuidp->nu_expire = 0;
2710                                 nd->nd_repstat =
2711                                     (NFSERR_AUTHERR|AUTH_REJECTVERF);
2712                                 nd->nd_procnum = NFSPROC_NOOP;
2713                                 return (0);
2714                         }
                             /* Build the request credential from the cached uid and groups. */
2715                         bzero(&temp_cred, sizeof(temp_cred));
2716                         ngroups = nuidp->nu_cr->cr_ngroups;
2717                         for (i = 0; i < ngroups; i++)
2718                                 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2719                         if (ngroups > 1)
2720                                 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2721
2722                         temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2723                         temp_cred.cr_ngroups = ngroups;
2724                         nd->nd_cr = kauth_cred_create(&temp_cred);
2725                         if (!nd->nd_cr) {
2726                                 nd->nd_repstat = ENOMEM;
2727                                 nd->nd_procnum = NFSPROC_NOOP;
2728                                 return (0);
2729                         }
2730                         nd->nd_flag |= ND_KERBNICK;
2731                 };
2732         } else {
                     /* Any other authentication flavor is rejected. */
2733                 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2734                 nd->nd_procnum = NFSPROC_NOOP;
2735                 return (0);
2736         }
2737
             /* Success: publish the advanced parse position back to the descriptor. */
2738         nd->nd_md = md;
2739         nd->nd_dpos = dpos;
2740         return (0);
2741 nfsmout:
             /* Error exit: drop any credential reference taken above. */
2742         if (nd->nd_cr)
2743                 kauth_cred_rele(nd->nd_cr);
2744         return (error);
2745 }
2746
2747 /*
2748  * Search for a sleeping nfsd and wake it up.
2749  * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2750  * running nfsds will go look for the work in the nfssvc_sock list.
2751  * Note: Must be called with nfsd_mutex held.
2752  */
2753 void
2754 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2755 {
2756         struct nfsd *nd;
2757
2758         if ((slp->ns_flag & SLP_VALID) == 0)
2759                 return;
2760
2761         lck_rw_lock_exclusive(&slp->ns_rwlock);
2762
2763         if (nfsd_waiting) {
2764                 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2765                         if (nd->nfsd_flag & NFSD_WAITING) {
2766                                 nd->nfsd_flag &= ~NFSD_WAITING;
2767                                 if (nd->nfsd_slp)
2768                                         panic("nfsd wakeup");
2769                                 slp->ns_sref++;
2770                                 nd->nfsd_slp = slp;
2771                                 lck_rw_done(&slp->ns_rwlock);
2772                                 wakeup((caddr_t)nd);
2773                                 return;
2774                         }
2775                 }
2776         }
2777
2778         slp->ns_flag |= SLP_DOREC;
2779
2780         lck_rw_done(&slp->ns_rwlock);
2781
2782         nfsd_head_flag |= NFSD_CHECKSLP;
2783 }
2784 #endif /* NFS_NOSERVER */
2785
2786 static int
2787 nfs_msg(proc_t p,
2788         const char *server,
2789         const char *msg,
2790         int error)
2791 {
2792         tpr_t tpr;
2793
2794         if (p)
2795                 tpr = tprintf_open(p);
2796         else
2797                 tpr = NULL;
2798         if (error)
2799                 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2800                     error);
2801         else
2802                 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2803         tprintf_close(tpr);
2804         return (0);
2805 }
2806
2807 void
2808 nfs_down(nmp, proc, error, flags, msg)
2809         struct nfsmount *nmp;
2810         proc_t proc;
2811         int error, flags;
2812         const char *msg;
2813 {
2814         if (nmp == NULL)
2815                 return;
2816         if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2817                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2818                 nmp->nm_state |= NFSSTA_TIMEO;
2819         }
2820         if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2821                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2822                 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2823         }
2824         nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2825 }
2826
2827 void
2828 nfs_up(nmp, proc, flags, msg)
2829         struct nfsmount *nmp;
2830         proc_t proc;
2831         int flags;
2832         const char *msg;
2833 {
2834         if (nmp == NULL)
2835                 return;
2836         if (msg)
2837                 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2838         if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2839                 nmp->nm_state &= ~NFSSTA_TIMEO;
2840                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2841         }
2842         if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2843                 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2844                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2845         }
2846 }
2847