1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
31 /*
32 * Copyright (c) 1989, 1991, 1993, 1995
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * Rick Macklem at The University of Guelph.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
67 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
68 */
69
70 /*
71 * Socket operations for use by nfs
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>
77 #include <sys/kauth.h>
78 #include <sys/mount_internal.h>
79 #include <sys/kernel.h>
80 #include <sys/kpi_mbuf.h>
81 #include <sys/malloc.h>
82 #include <sys/vnode.h>
83 #include <sys/domain.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/syslog.h>
87 #include <sys/tprintf.h>
88 #include <sys/uio_internal.h>
89 #include <libkern/OSAtomic.h>
90
91 #include <sys/time.h>
92 #include <kern/clock.h>
93 #include <kern/task.h>
94 #include <kern/thread.h>
95 #include <sys/user.h>
96
97 #include <netinet/in.h>
98 #include <netinet/tcp.h>
99
100 #include <nfs/rpcv2.h>
101 #include <nfs/nfsproto.h>
102 #include <nfs/nfs.h>
103 #include <nfs/xdr_subs.h>
104 #include <nfs/nfsm_subs.h>
105 #include <nfs/nfsmount.h>
106 #include <nfs/nfsnode.h>
107 #include <nfs/nfsrtt.h>
108
109 #include <sys/kdebug.h>
110
111 #define FSDBG(A, B, C, D, E) \
112 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
113 (int)(B), (int)(C), (int)(D), (int)(E), 0)
114 #define FSDBG_TOP(A, B, C, D, E) \
115 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
116 (int)(B), (int)(C), (int)(D), (int)(E), 0)
117 #define FSDBG_BOT(A, B, C, D, E) \
118 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
119 (int)(B), (int)(C), (int)(D), (int)(E), 0)
120
121 /*
122 * Estimate rto for an nfs rpc sent via an unreliable datagram.
123 * Use the mean and mean deviation of rtt for the appropriate type of rpc
124 * for the frequent rpcs and a default for the others.
125 * The justification for doing "other" this way is that these rpcs
126 * happen so infrequently that timer est. would probably be stale.
127 * Also, since many of these rpcs are
128 * non-idempotent, a conservative timeout is desired.
129 * getattr, lookup - A+2D
130 * read, write - A+4D
131 * other - nm_timeo
132 */
133 #define NFS_RTO(n, t) \
134 ((t) == 0 ? (n)->nm_timeo : \
135 ((t) < 3 ? \
136 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
137 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
138 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
139 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
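/*
 * Illustrative sketch (not in the original source): a worked example of
 * the fixed-point arithmetic above.  nm_srtt[] holds the smoothed rtt
 * scaled by 8 and nm_sdrtt[] the smoothed deviation scaled by 4 (see the
 * R_TIMING update in nfs_reply() and the NFS_TIMEO << 3 initialization
 * in nfs_connect()).  Assuming nm_srtt[0] == 40 and nm_sdrtt[0] == 8,
 * then for getattr (timer type 1):
 *
 *	rto = (((40 + 3) >> 2) + 8 + 1) >> 1 = 9 ticks	(~ A + 2D)
 *
 * and for read/write (timer type 3 or 4) with the same values:
 *
 *	rto = ((40 + 7) >> 3) + 8 + 1 = 14 ticks	(~ A + 4D)
 */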
140 /*
141 * External data, mostly RPC constants in XDR form
142 */
143 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
144 rpc_msgaccepted, rpc_call, rpc_autherr,
145 rpc_auth_kerb;
146 extern u_long nfs_prog;
147 extern struct nfsstats nfsstats;
148 extern int nfsv3_procid[NFS_NPROCS];
149 extern int nfs_ticks;
150 extern u_long nfs_xidwrap;
151
152 /*
153 * Defines which timer to use for the procnum.
154 * 0 - default
155 * 1 - getattr
156 * 2 - lookup
157 * 3 - read
158 * 4 - write
159 */
160 static int proct[NFS_NPROCS] = {
161 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
162 };
163
164 /*
165 * There is a congestion window for outstanding rpcs maintained per mount
166 * point. The cwnd size is adjusted in roughly the way that:
167 * Van Jacobson, Congestion Avoidance and Control, in "Proceedings of
168 * SIGCOMM '88". ACM, August 1988.
169 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
170 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
171 * of rpcs is in progress.
172 * (The sent count and cwnd are scaled for integer arith.)
173 * Variants of "slow start" were tried and were found to be too much of a
174 * performance hit (ave. rtt 3 times larger),
175 * I suspect due to the large rtt that nfs rpcs have.
176 */
177 #define NFS_CWNDSCALE 256
178 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
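/*
 * Illustrative sketch (not in the original source): with NFS_CWNDSCALE
 * 256, a window of one rpc is 256 and NFS_MAXCWND caps the window at 32
 * rpcs in flight.  The additive increase in nfs_reply() is
 *
 *	nm_cwnd += (256 * 256 + (nm_cwnd >> 1)) / nm_cwnd;
 *
 * so at an assumed nm_cwnd of 2048 (8 rpcs) each reply adds 32, and one
 * full window of replies is needed to grow the window by a single rpc,
 * while a retransmit timeout in nfs_timer() halves nm_cwnd (with a
 * floor of NFS_CWNDSCALE) -- the usual AIMD shape.
 */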
179 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
180 int nfsrtton = 0;
181 struct nfsrtt nfsrtt;
182
183 static int nfs_rcvlock(struct nfsreq *);
184 static void nfs_rcvunlock(struct nfsreq *);
185 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
186 static int nfs_reconnect(struct nfsreq *rep);
187 static void nfs_repdequeue(struct nfsreq *rep);
188
189 /* XXX */
190 boolean_t current_thread_aborted(void);
191 kern_return_t thread_terminate(thread_t);
192
193 #ifndef NFS_NOSERVER
194 static int nfsrv_getstream(struct nfssvc_sock *,int);
195
196 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
197 struct nfssvc_sock *slp,
198 proc_t procp,
199 mbuf_t *mreqp) = {
200 nfsrv_null,
201 nfsrv_getattr,
202 nfsrv_setattr,
203 nfsrv_lookup,
204 nfsrv3_access,
205 nfsrv_readlink,
206 nfsrv_read,
207 nfsrv_write,
208 nfsrv_create,
209 nfsrv_mkdir,
210 nfsrv_symlink,
211 nfsrv_mknod,
212 nfsrv_remove,
213 nfsrv_rmdir,
214 nfsrv_rename,
215 nfsrv_link,
216 nfsrv_readdir,
217 nfsrv_readdirplus,
218 nfsrv_statfs,
219 nfsrv_fsinfo,
220 nfsrv_pathconf,
221 nfsrv_commit,
222 nfsrv_noop
223 };
224 #endif /* NFS_NOSERVER */
225
226
227 /*
228 * attempt to bind a socket to a reserved port
229 */
230 static int
231 nfs_bind_resv(struct nfsmount *nmp)
232 {
233 socket_t so = nmp->nm_so;
234 struct sockaddr_in sin;
235 int error;
236 u_short tport;
237
238 if (!so)
239 return (EINVAL);
240
241 sin.sin_len = sizeof (struct sockaddr_in);
242 sin.sin_family = AF_INET;
243 sin.sin_addr.s_addr = INADDR_ANY;
244 tport = IPPORT_RESERVED - 1;
245 sin.sin_port = htons(tport);
246
247 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
248 (--tport > IPPORT_RESERVED / 2))
249 sin.sin_port = htons(tport);
250 return (error);
251 }
252
253 /*
254 * variables for managing the nfs_bind_resv_thread
255 */
256 int nfs_resv_mounts = 0;
257 static int nfs_bind_resv_thread_state = 0;
258 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
259 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
260 lck_grp_t *nfs_bind_resv_lck_grp;
261 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
262 lck_attr_t *nfs_bind_resv_lck_attr;
263 lck_mtx_t *nfs_bind_resv_mutex;
264 struct nfs_bind_resv_request {
265 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
266 struct nfsmount *brr_nmp;
267 int brr_error;
268 };
269 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
270
271 /*
272 * thread to handle any reserved port bind requests
273 */
274 static void
275 nfs_bind_resv_thread(void)
276 {
277 struct nfs_bind_resv_request *brreq;
278
279 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
280
281 while (nfs_resv_mounts > 0) {
282 lck_mtx_lock(nfs_bind_resv_mutex);
283 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
284 TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
285 lck_mtx_unlock(nfs_bind_resv_mutex);
286 brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
287 wakeup(brreq);
288 lck_mtx_lock(nfs_bind_resv_mutex);
289 }
290 msleep((caddr_t)&nfs_bind_resv_request_queue,
291 nfs_bind_resv_mutex, PSOCK | PDROP,
292 "nfs_bind_resv_request_queue", 0);
293 }
294
295 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
296 (void) thread_terminate(current_thread());
297 }
298
299 int
300 nfs_bind_resv_thread_wake(void)
301 {
302 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
303 return (EIO);
304 wakeup(&nfs_bind_resv_request_queue);
305 return (0);
306 }
307
308 /*
309 * underprivileged procs call this to request nfs_bind_resv_thread
310 * to perform the reserved port binding for them.
311 */
312 static int
313 nfs_bind_resv_nopriv(struct nfsmount *nmp)
314 {
315 struct nfs_bind_resv_request brreq;
316 int error;
317
318 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
319 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
320 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
321 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
322 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
323 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
324 TAILQ_INIT(&nfs_bind_resv_request_queue);
325 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
326 }
327 kernel_thread(kernel_task, nfs_bind_resv_thread);
328 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
329 }
330
331 brreq.brr_nmp = nmp;
332 brreq.brr_error = 0;
333
334 lck_mtx_lock(nfs_bind_resv_mutex);
335 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
336 lck_mtx_unlock(nfs_bind_resv_mutex);
337
338 error = nfs_bind_resv_thread_wake();
339 if (error) {
340 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
341 /* Note: we might be able to simply restart the thread */
342 return (error);
343 }
344
345 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
346
347 return (brreq.brr_error);
348 }
349
350 /*
351 * Initialize sockets and congestion for a new NFS connection.
352 * We do not free the sockaddr if error.
353 */
354 int
355 nfs_connect(
356 struct nfsmount *nmp,
357 __unused struct nfsreq *rep)
358 {
359 socket_t so;
360 int error, rcvreserve, sndreserve;
361 struct sockaddr *saddr;
362 struct timeval timeo;
363
364 nmp->nm_so = 0;
365 saddr = mbuf_data(nmp->nm_nam);
366 error = sock_socket(saddr->sa_family, nmp->nm_sotype,
367 nmp->nm_soproto, 0, 0, &nmp->nm_so);
368 if (error) {
369 goto bad;
370 }
371 so = nmp->nm_so;
372
373 /*
374 * Some servers require that the client port be a reserved port number.
375 */
376 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
377 proc_t p;
378 /*
379 * sobind() requires current_proc() to have superuser privs.
380 * If this bind is part of a reconnect, and the current proc
381 * doesn't have superuser privs, we hand the sobind() off to
382 * a kernel thread to process.
383 */
384 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
385 (p = current_proc()) && suser(kauth_cred_get(), 0)) {
386 /* request nfs_bind_resv_thread() to do bind */
387 error = nfs_bind_resv_nopriv(nmp);
388 } else {
389 error = nfs_bind_resv(nmp);
390 }
391 if (error)
392 goto bad;
393 }
394
395 /*
396 * Protocols that do not require connections may be optionally left
397 * unconnected for servers that reply from a port other than NFS_PORT.
398 */
399 if (nmp->nm_flag & NFSMNT_NOCONN) {
400 if (nmp->nm_sotype == SOCK_STREAM) {
401 error = ENOTCONN;
402 goto bad;
403 }
404 } else {
405 struct timeval tv;
406 tv.tv_sec = 2;
407 tv.tv_usec = 0;
408 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
409 if (error && error != EINPROGRESS) {
410 goto bad;
411 }
412
413 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
414 if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
415 goto bad;
416 }
417 }
418 }
419
420 /*
421 * Always time out on receive; this allows us to reconnect the
422 * socket to deal with network changes.
423 */
424 timeo.tv_usec = 0;
425 timeo.tv_sec = 2;
426 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
427 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
428 timeo.tv_sec = 5;
429 } else {
430 timeo.tv_sec = 0;
431 }
432 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
433
434 if (nmp->nm_sotype == SOCK_DGRAM) {
435 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
436 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
437 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
438 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
439 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
440 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
441 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
442 } else {
443 int proto;
444 int on = 1;
445
446 sock_gettype(so, NULL, NULL, &proto);
447 if (nmp->nm_sotype != SOCK_STREAM)
448 panic("nfscon sotype");
449
450 // Assume that SOCK_STREAM always requires a connection
451 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
452
453 if (proto == IPPROTO_TCP) {
454 sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
455 }
456
457 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
458 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
459 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
460 }
461
462 if (sndreserve > NFS_MAXSOCKBUF)
463 sndreserve = NFS_MAXSOCKBUF;
464 if (rcvreserve > NFS_MAXSOCKBUF)
465 rcvreserve = NFS_MAXSOCKBUF;
466 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
467 if (error) {
468 goto bad;
469 }
470 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
471 if (error) {
472 goto bad;
473 }
474
475 sock_nointerrupt(so, 1);
476
477 /* Initialize other non-zero congestion variables */
478 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
479 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
480 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
481 nmp->nm_sdrtt[3] = 0;
482 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
483 nmp->nm_sent = 0;
484 FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
485 nmp->nm_timeouts = 0;
486 return (0);
487
488 bad:
489 nfs_disconnect(nmp);
490 return (error);
491 }
492
493 /*
494 * Reconnect routine:
495 * Called when a connection is broken on a reliable protocol.
496 * - clean up the old socket
497 * - nfs_connect() again
498 * - set R_MUSTRESEND for all outstanding requests on mount point
499 * If this fails the mount point is DEAD!
500 * nb: Must be called with the nfs_sndlock() set on the mount point.
501 */
502 static int
503 nfs_reconnect(struct nfsreq *rep)
504 {
505 struct nfsreq *rp;
506 struct nfsmount *nmp = rep->r_nmp;
507 int error;
508
509 nfs_disconnect(nmp);
510 while ((error = nfs_connect(nmp, rep))) {
511 if (error == EINTR || error == ERESTART)
512 return (EINTR);
513 if (error == EIO)
514 return (EIO);
515 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
516 "can not connect");
517 rep->r_flags |= R_TPRINTFMSG;
518 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
519 /* we're not yet completely mounted and */
520 /* we can't reconnect, so we fail */
521 return (error);
522 }
523 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
524 return (error);
525 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
526 }
527
528 /*
529 * Loop through outstanding request list and fix up all requests
530 * on old socket.
531 */
532 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
533 if (rp->r_nmp == nmp)
534 rp->r_flags |= R_MUSTRESEND;
535 }
536 return (0);
537 }
538
539 /*
540 * NFS disconnect. Clean up and unlink.
541 */
542 void
543 nfs_disconnect(struct nfsmount *nmp)
544 {
545 socket_t so;
546
547 if (nmp->nm_so) {
548 so = nmp->nm_so;
549 nmp->nm_so = 0;
550 sock_shutdown(so, 2);
551 sock_close(so);
552 }
553 }
554
555 /*
556 * This is the nfs send routine. For connection based socket types, it
557 * must be called with an nfs_sndlock() on the socket.
558 * "rep == NULL" indicates that it has been called from a server.
559 * For the client side:
560 * - return EINTR if the RPC is terminated, 0 otherwise
561 * - set R_MUSTRESEND if the send fails for any reason
562 * - do any cleanup required by recoverable socket errors (???)
563 * For the server side:
564 * - return EINTR or ERESTART if interrupted by a signal
565 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
566 * - do any cleanup required by recoverable socket errors (???)
567 */
568 int
569 nfs_send(
570 socket_t so,
571 mbuf_t nam,
572 mbuf_t top,
573 struct nfsreq *rep)
574 {
575 struct sockaddr *sendnam;
576 int error, error2, sotype, flags;
577 u_long xidqueued = 0;
578 struct nfsreq *rp;
579 char savenametolog[MAXPATHLEN];
580 struct msghdr msg;
581
582 if (rep) {
583 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
584 if (error) {
585 mbuf_freem(top);
586 return (error);
587 }
588 if ((so = rep->r_nmp->nm_so) == NULL) {
589 rep->r_flags |= R_MUSTRESEND;
590 mbuf_freem(top);
591 return (0);
592 }
593 rep->r_flags &= ~R_MUSTRESEND;
594 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
595 if (rp == rep)
596 break;
597 if (rp)
598 xidqueued = rp->r_xid;
599 }
600 sock_gettype(so, NULL, &sotype, NULL);
601 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
602 (nam == 0))
603 sendnam = (struct sockaddr *)0;
604 else
605 sendnam = mbuf_data(nam);
606
607 if (sotype == SOCK_SEQPACKET)
608 flags = MSG_EOR;
609 else
610 flags = 0;
611
612 /*
613 * Save the name here in case the mount point goes away while we block.
614 * The name is large, but it uses the local stack because we don't
615 * want to risk blocking in a malloc.
616 */
617 if (rep)
618 strncpy(savenametolog,
619 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
620 MAXPATHLEN - 1);
621 bzero(&msg, sizeof(msg));
622 msg.msg_name = (caddr_t)sendnam;
623 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
624 error = sock_sendmbuf(so, &msg, top, flags, NULL);
625
626 if (error) {
627 if (rep) {
628 if (xidqueued) {
629 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
630 if (rp == rep && rp->r_xid == xidqueued)
631 break;
632 if (!rp)
633 panic("nfs_send: error %d xid %x gone",
634 error, xidqueued);
635 }
636 log(LOG_INFO, "nfs send error %d for server %s\n",
637 error, savenametolog);
638 /*
639 * Deal with errors for the client side.
640 */
641 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
642 if (error2) {
643 error = error2;
644 } else {
645 rep->r_flags |= R_MUSTRESEND;
646 }
647 } else
648 log(LOG_INFO, "nfsd send error %d\n", error);
649
650 /*
651 * Handle any recoverable (soft) socket errors here. (???)
652 */
653 if (error != EINTR && error != ERESTART && error != EIO &&
654 error != EWOULDBLOCK && error != EPIPE) {
655 error = 0;
656 }
657 }
658 return (error);
659 }
660
661 /*
662 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
663 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
664 * Mark and consolidate the data into a new mbuf list.
665 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
666 * small mbufs.
667 * For SOCK_STREAM we must be very careful to read an entire record once
668 * we have read any of it, even if the system call has been interrupted.
669 */
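/*
 * Illustrative sketch (not in the original source): the Record Mark is
 * the standard RPC-over-TCP record marking (RFC 1831) -- each fragment
 * is preceded by a 4-byte big-endian word whose high bit flags the
 * final fragment and whose low 31 bits give the fragment length:
 *
 *	u_long mark = ntohl(fraglen);
 *	int last = mark & 0x80000000;		-- final fragment?
 *	u_long len = mark & ~0x80000000;	-- bytes that follow
 *
 * The sender side in nfs_request() builds the mark the same way, as
 * htonl(0x80000000 | length).
 */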
670 static int
671 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
672 {
673 socket_t so;
674 struct iovec_32 aio;
675 mbuf_t m, mlast;
676 u_long len, fraglen;
677 int error, error2, sotype;
678 proc_t p = current_proc(); /* XXX */
679 struct msghdr msg;
680 size_t rcvlen;
681 int lastfragment;
682
683 /*
684 * Set up arguments for soreceive()
685 */
686 *mp = NULL;
687 sotype = rep->r_nmp->nm_sotype;
688
689 /*
690 * For reliable protocols, lock against other senders/receivers
691 * in case a reconnect is necessary.
692 * For SOCK_STREAM, first get the Record Mark to find out how much
693 * more there is to get.
694 * We must lock the socket against other receivers
695 * until we have an entire rpc request/reply.
696 */
697 if (sotype != SOCK_DGRAM) {
698 error = nfs_sndlock(rep);
699 if (error)
700 return (error);
701 tryagain:
702 /*
703 * Check for fatal errors and resending request.
704 */
705 /*
706 * Ugh: If a reconnect attempt just happened, nm_so
707 * would have changed. NULL indicates a failed
708 * attempt that has essentially shut down this
709 * mount point.
710 */
711 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
712 nfs_sndunlock(rep);
713 if (error)
714 return (error);
715 return (EINTR);
716 }
717 so = rep->r_nmp->nm_so;
718 if (!so) {
719 error = nfs_reconnect(rep);
720 if (error) {
721 nfs_sndunlock(rep);
722 return (error);
723 }
724 goto tryagain;
725 }
726 while (rep->r_flags & R_MUSTRESEND) {
727 error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
728 if (!error) {
729 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
730 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
731 }
732 /*
733 * we also hold rcv lock so rep is still
734 * legit at this point
735 */
736 if (error) {
737 if (error == EINTR || error == ERESTART ||
738 (error = nfs_reconnect(rep))) {
739 nfs_sndunlock(rep);
740 return (error);
741 }
742 goto tryagain;
743 }
744 }
745 nfs_sndunlock(rep);
746 if (sotype == SOCK_STREAM) {
747 error = 0;
748 len = 0;
749 lastfragment = 0;
750 mlast = NULL;
751 while (!error && !lastfragment) {
752 aio.iov_base = (uintptr_t) &fraglen;
753 aio.iov_len = sizeof(u_long);
754 bzero(&msg, sizeof(msg));
755 msg.msg_iov = (struct iovec *) &aio;
756 msg.msg_iovlen = 1;
757 do {
758 error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
759 if (!rep->r_nmp) /* if unmounted then bail out */
760 goto shutout;
761 if (error == EWOULDBLOCK && rep) {
762 error2 = nfs_sigintr(rep->r_nmp, rep, p);
763 if (error2)
764 error = error2;
765 }
766 } while (error == EWOULDBLOCK);
767 if (!error && rcvlen < aio.iov_len) {
768 /* only log a message if we got a partial word */
769 if (rcvlen != 0)
770 log(LOG_INFO,
771 "short receive (%d/%d) from nfs server %s\n",
772 rcvlen, sizeof(u_long),
773 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
774 error = EPIPE;
775 }
776 if (error)
777 goto errout;
778 lastfragment = ntohl(fraglen) & 0x80000000;
779 fraglen = ntohl(fraglen) & ~0x80000000;
780 len += fraglen;
781 /*
782 * This is SERIOUS! We are out of sync with the sender
783 * and forcing a disconnect/reconnect is all I can do.
784 */
785 if (len > NFS_MAXPACKET) {
786 log(LOG_ERR, "%s (%d) from nfs server %s\n",
787 "impossible RPC record length", len,
788 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
789 error = EFBIG;
790 goto errout;
791 }
792
793 m = NULL;
794 do {
795 rcvlen = fraglen;
796 error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
797 if (!rep->r_nmp) /* if unmounted then bail out */ {
798 goto shutout;
799 }
800 } while (error == EWOULDBLOCK || error == EINTR ||
801 error == ERESTART);
802
803 if (!error && fraglen > rcvlen) {
804 log(LOG_INFO,
805 "short receive (%d/%d) from nfs server %s\n",
806 rcvlen, fraglen,
807 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
808 error = EPIPE;
809 mbuf_freem(m);
810 }
811 if (!error) {
812 if (!*mp) {
813 *mp = m;
814 mlast = m;
815 } else {
816 error = mbuf_setnext(mlast, m);
817 if (error) {
818 printf("nfs_receive: mbuf_setnext failed %d\n", error);
819 mbuf_freem(m);
820 }
821 }
822 while (mbuf_next(mlast))
823 mlast = mbuf_next(mlast);
824 }
825 }
826 } else {
827 bzero(&msg, sizeof(msg));
828 do {
829 rcvlen = 100000000;
830 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
831 if (!rep->r_nmp) /* if unmounted then bail out */ {
832 goto shutout;
833 }
834 if (error == EWOULDBLOCK && rep) {
835 error2 = nfs_sigintr(rep->r_nmp, rep, p);
836 if (error2) {
837 return (error2);
838 }
839 }
840 } while (error == EWOULDBLOCK);
841
842 if ((msg.msg_flags & MSG_EOR) == 0)
843 printf("Egad!!\n");
844 if (!error && *mp == NULL)
845 error = EPIPE;
846 len = rcvlen;
847 }
848 errout:
849 if (error && error != EINTR && error != ERESTART) {
850 mbuf_freem(*mp);
851 *mp = NULL;
852 if (error != EPIPE)
853 log(LOG_INFO,
854 "receive error %d from nfs server %s\n", error,
855 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
856 error = nfs_sndlock(rep);
857 if (!error) {
858 error = nfs_reconnect(rep);
859 if (!error)
860 goto tryagain;
861 nfs_sndunlock(rep);
862 }
863 }
864 } else {
865 /*
866 * We could have failed while rebinding the datagram socket
867 * so we need to attempt to rebind here.
868 */
869 if ((so = rep->r_nmp->nm_so) == NULL) {
870 error = nfs_sndlock(rep);
871 if (!error) {
872 error = nfs_reconnect(rep);
873 nfs_sndunlock(rep);
874 }
875 if (error)
876 return (error);
877 if (!rep->r_nmp) /* if unmounted then bail out */
878 return (ENXIO);
879 so = rep->r_nmp->nm_so;
880 }
881 bzero(&msg, sizeof(msg));
882 len = 0;
883 do {
884 rcvlen = 1000000;
885 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
886 if (!rep->r_nmp) /* if unmounted then bail out */
887 goto shutout;
888 if (error) {
889 error2 = nfs_sigintr(rep->r_nmp, rep, p);
890 if (error2) {
891 error = error2;
892 goto shutout;
893 }
894 }
895 /* Reconnect for all errors. We may be receiving
896 * soft/hard/blocking errors because of a network
897 * change.
898 * XXX: we should rate limit or delay this
899 * to once every N attempts or something,
900 * although TCP doesn't seem to.
901 */
902 if (error) {
903 error2 = nfs_sndlock(rep);
904 if (!error2) {
905 error2 = nfs_reconnect(rep);
906 if (error2)
907 error = error2;
908 else if (!rep->r_nmp) /* if unmounted then bail out */
909 error = ENXIO;
910 else
911 so = rep->r_nmp->nm_so;
912 nfs_sndunlock(rep);
913 } else {
914 error = error2;
915 }
916 }
917 } while (error == EWOULDBLOCK);
918 }
919 shutout:
920 if (error) {
921 mbuf_freem(*mp);
922 *mp = NULL;
923 }
924 return (error);
925 }
926
927 /*
928 * Implement receipt of reply on a socket.
929 * We must search through the list of received datagrams matching them
930 * with outstanding requests using the xid, until ours is found.
931 */
932 /* ARGSUSED */
933 int
934 nfs_reply(
935 struct nfsreq *myrep)
936 {
937 struct nfsreq *rep;
938 struct nfsmount *nmp = myrep->r_nmp;
939 long t1;
940 mbuf_t mrep, md;
941 u_long rxid, *tl;
942 caddr_t dpos, cp2;
943 int error;
944
945 /*
946 * Loop around until we get our own reply
947 */
948 for (;;) {
949 /*
950 * Lock against other receivers so that I don't get stuck in
951 * sbwait() after someone else has received my reply for me.
952 * Also necessary for connection based protocols to avoid
953 * race conditions during a reconnect.
954 * If nfs_rcvlock() returns EALREADY, that means that
955 * the reply has already been received by another
956 * process and we can return immediately. In this
957 * case, the lock is not taken to avoid races with
958 * other processes.
959 */
960 error = nfs_rcvlock(myrep);
961 if (error == EALREADY)
962 return (0);
963 if (error)
964 return (error);
965
966 /*
967 * If we slept after putting bits on the wire, the reply may
968 * have already arrived, in which case we must return, or we
969 * would hang trying to nfs_receive an already-received reply.
970 */
971 if (myrep->r_mrep != NULL) {
972 nfs_rcvunlock(myrep);
973 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
974 return (0);
975 }
976 /*
977 * Get the next RPC reply off the socket. The checks done in
978 * nfs_rcvlock ensure myrep->r_nmp is still intact.
979 */
980 error = nfs_receive(myrep, &mrep);
981 /*
982 * Bail out asap if nfsmount struct gone (unmounted).
983 */
984 if (!myrep->r_nmp) {
985 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
986 if (mrep)
987 mbuf_freem(mrep);
988 return (ENXIO);
989 }
990 if (error) {
991 FSDBG(530, myrep->r_xid, myrep, nmp, error);
992 nfs_rcvunlock(myrep);
993
994 /* Bail out asap if nfsmount struct gone (unmounted). */
995 if (!myrep->r_nmp) {
996 if (mrep)
997 mbuf_freem(mrep);
998 return (ENXIO);
999 }
1000
1001 /*
1002 * Ignore routing errors on connectionless protocols??
1003 */
1004 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1005 if (nmp->nm_so) {
1006 int clearerror;
1007 int optlen = sizeof(clearerror);
1008 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1009 }
1010 continue;
1011 }
1012 if (mrep)
1013 mbuf_freem(mrep);
1014 return (error);
1015 }
1016
1017 /*
1018 * We assume all is fine, but if we did not have an error
1019 * and mrep is 0, better not dereference it. nfs_receive
1020 * calls soreceive which carefully sets error=0 when it got
1021 * errors on sbwait (tsleep). In most cases, I assume that's
1022 * so we could go back again. In tcp case, EPIPE is returned.
1023 * In the udp case, nfs_receive gets back here with no error and no
1024 * mrep. Is the right fix to have soreceive check for process
1025 * aborted after sbwait and return something non-zero? Should
1026 * nfs_receive give an EPIPE? Too risky to play with those
1027 * two this late in the game for a shutdown problem. Instead,
1028 * just check here and get out. (ekn)
1029 */
1030 if (!mrep) {
1031 nfs_rcvunlock(myrep);
1032 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1033 return (ENXIO); /* sounds good */
1034 }
1035
1036 /*
1037 * Get the xid and check that it is an rpc reply
1038 */
1039 md = mrep;
1040 dpos = mbuf_data(md);
1041 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1042 rxid = *tl++;
1043 if (*tl != rpc_reply) {
1044 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1045 mbuf_freem(mrep);
1046 nfsmout:
1047 if (nmp->nm_state & NFSSTA_RCVLOCK)
1048 nfs_rcvunlock(myrep);
1049 continue;
1050 }
1051
1052 /*
1053 * Loop through the request list to match up the reply
1054 * Iff no match, just drop the datagram
1055 */
1056 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1057 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1058 /* Found it.. */
1059 rep->r_mrep = mrep;
1060 rep->r_md = md;
1061 rep->r_dpos = dpos;
1062 /*
1063 * If we're tracking the round trip time
1064 * then we update the circular log here
1065 * with the stats from our current request.
1066 */
1067 if (nfsrtton) {
1068 struct rttl *rt;
1069
1070 rt = &nfsrtt.rttl[nfsrtt.pos];
1071 rt->proc = rep->r_procnum;
1072 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1073 rt->sent = nmp->nm_sent;
1074 rt->cwnd = nmp->nm_cwnd;
1075 if (proct[rep->r_procnum] == 0)
1076 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1077 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1078 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1079 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1080 microtime(&rt->tstamp); // XXX unused
1081 if (rep->r_flags & R_TIMING)
1082 rt->rtt = rep->r_rtt;
1083 else
1084 rt->rtt = 1000000;
1085 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1086 }
1087 /*
1088 * Update congestion window.
1089 * Do the additive increase of
1090 * one rpc/rtt.
1091 */
1092 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1093 nmp->nm_cwnd);
1094 if (nmp->nm_cwnd <= nmp->nm_sent) {
1095 nmp->nm_cwnd +=
1096 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1097 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1098 if (nmp->nm_cwnd > NFS_MAXCWND)
1099 nmp->nm_cwnd = NFS_MAXCWND;
1100 }
1101 if (rep->r_flags & R_SENT) {
1102 rep->r_flags &= ~R_SENT;
1103 nmp->nm_sent -= NFS_CWNDSCALE;
1104 }
1105 /*
1106 * Update rtt using a gain of 0.125 on the mean
1107 * and a gain of 0.25 on the deviation.
1108 */
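/*
 * Illustrative worked example (not in the original source), using the
 * same scaling as noted at NFS_RTO above: with an assumed
 * NFS_SRTT(rep) of 40 (mean scaled by 8) and a measured r_rtt of 9
 * ticks,
 *
 *	t1 = (9 + 1) - (40 >> 3) = 5, so the new srtt is 45;
 *
 * the deviation term below then absorbs |t1| - (sdrtt >> 2) the same
 * way, e.g. an sdrtt of 8 becomes 8 + (5 - 2) = 11.
 */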
1109 if (rep->r_flags & R_TIMING) {
1110 /*
1111 * Since the timer resolution of
1112 * NFS_HZ is so coarse, it can often
1113 * result in r_rtt == 0. Since
1114 * r_rtt == N means that the actual
1115 * rtt is between N+dt and N+2-dt ticks,
1116 * add 1.
1117 */
1118 if (proct[rep->r_procnum] == 0)
1119 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1120 t1 = rep->r_rtt + 1;
1121 t1 -= (NFS_SRTT(rep) >> 3);
1122 NFS_SRTT(rep) += t1;
1123 if (t1 < 0)
1124 t1 = -t1;
1125 t1 -= (NFS_SDRTT(rep) >> 2);
1126 NFS_SDRTT(rep) += t1;
1127 }
1128 nmp->nm_timeouts = 0;
1129 break;
1130 }
1131 }
1132 nfs_rcvunlock(myrep);
1133 /*
1134 * If not matched to a request, drop it.
1135 * If it's mine, get out.
1136 */
1137 if (rep == 0) {
1138 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1139 mbuf_freem(mrep);
1140 } else if (rep == myrep) {
1141 if (rep->r_mrep == NULL)
1142 panic("nfs_reply: nil r_mrep");
1143 return (0);
1144 }
1145 FSDBG(530, myrep->r_xid, myrep, rep,
1146 rep ? rep->r_xid : myrep->r_flags);
1147 }
1148 }
1149
1150 /*
1151 * nfs_request - goes something like this
1152 * - fill in request struct
1153 * - links it into list
1154 * - calls nfs_send() for first transmit
1155 * - calls nfs_receive() to get reply
1156 * - break down rpc header and return with nfs reply pointed to
1157 * by mrep or error
1158 * nb: always frees up mreq mbuf list
1159 */
1160 int
1161 nfs_request(
1162 vnode_t vp,
1163 mount_t mp,
1164 mbuf_t mrest,
1165 int procnum,
1166 proc_t procp,
1167 kauth_cred_t cred,
1168 mbuf_t *mrp,
1169 mbuf_t *mdp,
1170 caddr_t *dposp,
1171 u_int64_t *xidp)
1172 {
1173 mbuf_t m, mrep, m2;
1174 struct nfsreq re, *rep;
1175 u_long *tl;
1176 int i;
1177 struct nfsmount *nmp;
1178 mbuf_t md, mheadend;
1179 char nickv[RPCX_NICKVERF];
1180 time_t waituntil;
1181 caddr_t dpos, cp2;
1182 int t1, error = 0, mrest_len, auth_len, auth_type;
1183 int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1184 int verf_len, verf_type;
1185 u_long xid;
1186 char *auth_str, *verf_str;
1187 NFSKERBKEY_T key; /* save session key */
1188 int nmsotype;
1189 struct timeval now;
1190
1191 if (mrp)
1192 *mrp = NULL;
1193 if (xidp)
1194 *xidp = 0;
1195 nmp = VFSTONFS(mp);
1196
1197 rep = &re;
1198
1199 if (vp)
1200 nmp = VFSTONFS(vnode_mount(vp));
1201 if (nmp == NULL ||
1202 (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1203 (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1204 mbuf_freem(mrest);
1205 return (ENXIO);
1206 }
1207 nmsotype = nmp->nm_sotype;
1208
1209 FSDBG_TOP(531, vp, procnum, nmp, rep);
1210
1211 rep->r_nmp = nmp;
1212 rep->r_vp = vp;
1213 rep->r_procp = procp;
1214 rep->r_procnum = procnum;
1215 microuptime(&now);
1216 rep->r_lastmsg = now.tv_sec -
1217 ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1218 i = 0;
1219 m = mrest;
1220 while (m) {
1221 i += mbuf_len(m);
1222 m = mbuf_next(m);
1223 }
1224 mrest_len = i;
1225
1226 /*
1227 * Get the RPC header with authorization.
1228 */
1229 kerbauth:
1230 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1231 if (!nmp) {
1232 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1233 mbuf_freem(mrest);
1234 return (ENXIO);
1235 }
1236 verf_str = auth_str = (char *)0;
1237 if (nmp->nm_flag & NFSMNT_KERB) {
1238 verf_str = nickv;
1239 verf_len = sizeof (nickv);
1240 auth_type = RPCAUTH_KERB4;
1241 bzero((caddr_t)key, sizeof (key));
1242 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1243 &auth_len, verf_str, verf_len)) {
1244 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1245 if (!nmp) {
1246 FSDBG_BOT(531, 2, vp, error, rep);
1247 mbuf_freem(mrest);
1248 return (ENXIO);
1249 }
1250 error = nfs_getauth(nmp, rep, cred, &auth_str,
1251 &auth_len, verf_str, &verf_len, key);
1252 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1253 if (!error && !nmp)
1254 error = ENXIO;
1255 if (error) {
1256 FSDBG_BOT(531, 2, vp, error, rep);
1257 mbuf_freem(mrest);
1258 return (error);
1259 }
1260 }
1261 } else {
1262 auth_type = RPCAUTH_UNIX;
1263 if (cred->cr_ngroups < 1)
1264 panic("nfsreq nogrps");
1265 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1266 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1267 5 * NFSX_UNSIGNED;
1268 }
1269 error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1270 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1271 if (auth_str)
1272 _FREE(auth_str, M_TEMP);
1273 if (error) {
1274 mbuf_freem(mrest);
1275 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1276 return (error);
1277 }
1278 if (xidp)
1279 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1280
1281 /*
1282 * For stream protocols, insert a Sun RPC Record Mark.
1283 */
1284 if (nmsotype == SOCK_STREAM) {
1285 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1286 if (error) {
1287 mbuf_freem(m);
1288 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1289 return (error);
1290 }
1291 *((u_long*)mbuf_data(m)) =
1292 htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1293 }
1294 rep->r_mreq = m;
1295 rep->r_xid = xid;
1296 tryagain:
1297 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1298 if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1299 rep->r_retry = nmp->nm_retry;
1300 else
1301 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1302 rep->r_rtt = rep->r_rexmit = 0;
1303 if (proct[procnum] > 0)
1304 rep->r_flags = R_TIMING;
1305 else
1306 rep->r_flags = 0;
1307 rep->r_mrep = NULL;
1308
1309 /*
1310 * Do the client side RPC.
1311 */
1312 OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1313 /*
1314 * Chain request into list of outstanding requests. Be sure
1315 * to put it LAST so timer finds oldest requests first.
1316 */
1317 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1318
1319 /*
1320 * If backing off another request or avoiding congestion, don't
1321 * send this one now but let timer do it. If not timing a request,
1322 * do it now.
1323 */
1324 if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1325 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1326 nmp->nm_sent < nmp->nm_cwnd)) {
1327 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1328
1329 if (connrequired)
1330 error = nfs_sndlock(rep);
1331
1332 /*
1333 * Set the R_SENT before doing the send in case another thread
1334 * processes the reply before the nfs_send returns here
1335 */
1336 if (!error) {
1337 if ((rep->r_flags & R_MUSTRESEND) == 0) {
1338 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1339 nmp->nm_cwnd);
1340 nmp->nm_sent += NFS_CWNDSCALE;
1341 rep->r_flags |= R_SENT;
1342 }
1343
1344 error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1345 if (!error)
1346 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1347 if (connrequired)
1348 nfs_sndunlock(rep);
1349 }
1350 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1351 if (error) {
1352 if (nmp)
1353 nmp->nm_sent -= NFS_CWNDSCALE;
1354 rep->r_flags &= ~R_SENT;
1355 }
1356 } else {
1357 rep->r_rtt = -1;
1358 }
1359
1360 /*
1361 * Wait for the reply from our send or the timer's.
1362 */
1363 if (!error || error == EPIPE)
1364 error = nfs_reply(rep);
1365
1366 /*
1367 * RPC done, unlink the request.
1368 */
1369 nfs_repdequeue(rep);
1370
1371 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1372
1373 /*
1374 * Decrement the outstanding request count.
1375 */
1376 if (rep->r_flags & R_SENT) {
1377 rep->r_flags &= ~R_SENT; /* paranoia */
1378 if (nmp) {
1379 FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1380 nmp->nm_sent -= NFS_CWNDSCALE;
1381 }
1382 }
1383
1384 /*
1385 * If there was a successful reply and a tprintf msg,
1386 * tprintf a response.
1387 */
1388 if (!error)
1389 nfs_up(nmp, procp, NFSSTA_TIMEO,
1390 (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1391 mrep = rep->r_mrep;
1392 md = rep->r_md;
1393 dpos = rep->r_dpos;
1394 if (!error && !nmp)
1395 error = ENXIO;
1396 if (error) {
1397 mbuf_freem(rep->r_mreq);
1398 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1399 return (error);
1400 }
1401
1402 /*
1403 * break down the rpc header and check if ok
1404 */
1405 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1406 if (*tl++ == rpc_msgdenied) {
1407 if (*tl == rpc_mismatch)
1408 error = EOPNOTSUPP;
1409 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1410 if (!failed_auth) {
1411 failed_auth++;
1412 error = mbuf_setnext(mheadend, NULL);
1413 mbuf_freem(mrep);
1414 mbuf_freem(rep->r_mreq);
1415 if (!error)
1416 goto kerbauth;
1417 printf("nfs_request: mbuf_setnext failed\n");
1418 } else
1419 error = EAUTH;
1420 } else
1421 error = EACCES;
1422 mbuf_freem(mrep);
1423 mbuf_freem(rep->r_mreq);
1424 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1425 return (error);
1426 }
1427
1428 /*
1429 * Grab any Kerberos verifier, otherwise just throw it away.
1430 */
1431 verf_type = fxdr_unsigned(int, *tl++);
1432 i = fxdr_unsigned(int, *tl);
1433 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1434 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1435 if (error)
1436 goto nfsmout;
1437 } else if (i > 0)
1438 nfsm_adv(nfsm_rndup(i));
1439 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1440 /* 0 == ok */
1441 if (*tl == 0) {
1442 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1443 if (*tl != 0) {
1444 error = fxdr_unsigned(int, *tl);
1445 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1446 error == NFSERR_TRYLATER) {
1447 mbuf_freem(mrep);
1448 error = 0;
1449 microuptime(&now);
1450 waituntil = now.tv_sec + trylater_delay;
1451 while (now.tv_sec < waituntil) {
1452 tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1453 microuptime(&now);
1454 }
1455 trylater_delay *= 2;
1456 if (trylater_delay > 60)
1457 trylater_delay = 60;
1458 goto tryagain;
1459 }
1460
1461 /*
1462 * If the File Handle was stale, invalidate the
1463 * lookup cache, just in case.
1464 */
1465 if ((error == ESTALE) && vp)
1466 cache_purge(vp);
1467 if (nmp->nm_flag & NFSMNT_NFSV3) {
1468 *mrp = mrep;
1469 *mdp = md;
1470 *dposp = dpos;
1471 error |= NFSERR_RETERR;
1472 } else {
1473 mbuf_freem(mrep);
1474 error &= ~NFSERR_RETERR;
1475 }
1476 mbuf_freem(rep->r_mreq);
1477 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1478 return (error);
1479 }
1480
1481 *mrp = mrep;
1482 *mdp = md;
1483 *dposp = dpos;
1484 mbuf_freem(rep->r_mreq);
1485 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1486 return (0);
1487 }
1488 mbuf_freem(mrep);
1489 error = EPROTONOSUPPORT;
1490 nfsmout:
1491 mbuf_freem(rep->r_mreq);
1492 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1493 return (error);
1494 }
1495
1496 #ifndef NFS_NOSERVER
1497 /*
1498 * Generate the rpc reply header
1499 * siz arg. is used to decide if adding a cluster is worthwhile
1500 */
1501 int
1502 nfs_rephead(
1503 int siz,
1504 struct nfsrv_descript *nd,
1505 struct nfssvc_sock *slp,
1506 int err,
1507 mbuf_t *mrq,
1508 mbuf_t *mbp,
1509 caddr_t *bposp)
1510 {
1511 u_long *tl;
1512 mbuf_t mreq;
1513 caddr_t bpos;
1514 mbuf_t mb, mb2;
1515 int error, mlen;
1516
1517 /*
1518 * If this is a big reply, use a cluster; otherwise
1519 * try to leave leading space for the lower level headers.
1520 */
1521 siz += RPC_REPLYSIZ;
1522 if (siz >= nfs_mbuf_minclsize) {
1523 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1524 } else {
1525 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1526 }
1527 if (error) {
1528 /* unable to allocate packet */
1529 /* XXX nfsstat? */
1530 return (error);
1531 }
1532 mb = mreq;
1533 tl = mbuf_data(mreq);
1534 mlen = 6 * NFSX_UNSIGNED;
1535 if (siz < nfs_mbuf_minclsize) {
1536 /* leave space for lower level headers */
1537 tl += 80/sizeof(*tl); /* XXX max_hdr? XXX */
1538 mbuf_setdata(mreq, tl, mlen);
1539 } else {
1540 mbuf_setlen(mreq, mlen);
1541 }
1542 bpos = ((caddr_t)tl) + mlen;
1543 *tl++ = txdr_unsigned(nd->nd_retxid);
1544 *tl++ = rpc_reply;
1545 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1546 *tl++ = rpc_msgdenied;
1547 if (err & NFSERR_AUTHERR) {
1548 *tl++ = rpc_autherr;
1549 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1550 mlen -= NFSX_UNSIGNED;
1551 mbuf_setlen(mreq, mlen);
1552 bpos -= NFSX_UNSIGNED;
1553 } else {
1554 *tl++ = rpc_mismatch;
1555 *tl++ = txdr_unsigned(RPC_VER2);
1556 *tl = txdr_unsigned(RPC_VER2);
1557 }
1558 } else {
1559 *tl++ = rpc_msgaccepted;
1560
1561 /*
1562 * For Kerberos authentication, we must send the nickname
1563 * verifier back, otherwise just RPCAUTH_NULL.
1564 */
1565 if (nd->nd_flag & ND_KERBFULL) {
1566 struct nfsuid *nuidp;
1567 struct timeval ktvin, ktvout;
1568 uid_t uid = kauth_cred_getuid(nd->nd_cr);
1569
1570 lck_rw_lock_shared(&slp->ns_rwlock);
1571 for (nuidp = NUIDHASH(slp, uid)->lh_first;
1572 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1573 if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1574 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1575 &nuidp->nu_haddr, nd->nd_nam2)))
1576 break;
1577 }
1578 if (nuidp) {
1579 ktvin.tv_sec =
1580 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1581 ktvin.tv_usec =
1582 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1583
1584 /*
1585 * Encrypt the timestamp in ecb mode using the
1586 * session key.
1587 */
1588 #if NFSKERB
1589 XXX
1590 #endif
1591
1592 *tl++ = rpc_auth_kerb;
1593 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1594 *tl = ktvout.tv_sec;
1595 nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1596 *tl++ = ktvout.tv_usec;
1597 *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1598 } else {
1599 *tl++ = 0;
1600 *tl++ = 0;
1601 }
1602 lck_rw_done(&slp->ns_rwlock);
1603 } else {
1604 *tl++ = 0;
1605 *tl++ = 0;
1606 }
1607 switch (err) {
1608 case EPROGUNAVAIL:
1609 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1610 break;
1611 case EPROGMISMATCH:
1612 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1613 nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1614 // XXX hard coded versions
1615 *tl++ = txdr_unsigned(2);
1616 *tl = txdr_unsigned(3);
1617 break;
1618 case EPROCUNAVAIL:
1619 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1620 break;
1621 case EBADRPC:
1622 *tl = txdr_unsigned(RPC_GARBAGE);
1623 break;
1624 default:
1625 *tl = 0;
1626 if (err != NFSERR_RETVOID) {
1627 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1628 if (err)
1629 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1630 else
1631 *tl = 0;
1632 }
1633 break;
1634 }
1635 }
1636
1637 if (mrq != NULL)
1638 *mrq = mreq;
1639 *mbp = mb;
1640 *bposp = bpos;
1641 if (err != 0 && err != NFSERR_RETVOID) {
1642 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1643 }
1644 return (0);
1645 }
1646
1647
1648 #endif /* NFS_NOSERVER */
1649
1650
1651 /*
1652 * From FreeBSD 1.58, a Matt Dillon fix...
1653 * Flag a request as being about to terminate.
1654 * The nm_sent count is decremented now to avoid deadlocks when the process
1655 * in soreceive() hasn't yet managed to send its own request.
1656 */
1657 static void
1658 nfs_softterm(struct nfsreq *rep)
1659 {
1660
1661 rep->r_flags |= R_SOFTTERM;
1662 if (rep->r_flags & R_SENT) {
1663 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1664 rep->r_nmp->nm_cwnd);
1665 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1666 rep->r_flags &= ~R_SENT;
1667 }
1668 }
1669
1670 void
1671 nfs_timer_funnel(void * arg)
1672 {
1673 (void) thread_funnel_set(kernel_flock, TRUE);
1674 nfs_timer(arg);
1675 (void) thread_funnel_set(kernel_flock, FALSE);
1676
1677 }
1678
1679 /*
1680 * Ensure rep isn't in use by the timer, then dequeue it.
1681 */
1682 static void
1683 nfs_repdequeue(struct nfsreq *rep)
1684 {
1685
1686 while ((rep->r_flags & R_BUSY)) {
1687 rep->r_flags |= R_WAITING;
1688 tsleep(rep, PSOCK, "repdeq", 0);
1689 }
1690 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1691 }
1692
1693 /*
1694 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
1695 * free()'d out from under it.
1696 */
1697 static void
1698 nfs_repbusy(struct nfsreq *rep)
1699 {
1700
1701 if ((rep->r_flags & R_BUSY))
1702 panic("rep locked");
1703 rep->r_flags |= R_BUSY;
1704 }
1705
1706 /*
1707 * Unbusy the nfsreq passed in and return the next nfsreq in the chain, busied.
1708 */
1709 static struct nfsreq *
1710 nfs_repnext(struct nfsreq *rep)
1711 {
1712 struct nfsreq * nextrep;
1713
1714 if (rep == NULL)
1715 return (NULL);
1716 /*
1717 * We need to get and busy the next req before signalling the
1718 * current one, otherwise wakeup() may block us and we'll race to
1719 * grab the next req.
1720 */
1721 nextrep = TAILQ_NEXT(rep, r_chain);
1722 if (nextrep != NULL)
1723 nfs_repbusy(nextrep);
1724 /* unbusy and signal. */
1725 rep->r_flags &= ~R_BUSY;
1726 if ((rep->r_flags & R_WAITING)) {
1727 rep->r_flags &= ~R_WAITING;
1728 wakeup(rep);
1729 }
1730 return (nextrep);
1731 }
1732
1733 /*
1734 * Nfs timer routine
1735 * Scan the nfsreq list and retransmit any requests that have timed out.
1736 * To avoid retransmission attempts on STREAM sockets (in the future) make
1737 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1738 */
1739 void
1740 nfs_timer(__unused void *arg)
1741 {
1742 struct nfsreq *rep;
1743 mbuf_t m;
1744 socket_t so;
1745 struct nfsmount *nmp;
1746 int timeo;
1747 int error;
1748 #ifndef NFS_NOSERVER
1749 struct nfssvc_sock *slp;
1750 u_quad_t cur_usec;
1751 #endif /* NFS_NOSERVER */
1752 int flags, rexmit, cwnd, sent;
1753 u_long xid;
1754 struct timeval now;
1755
1756 rep = TAILQ_FIRST(&nfs_reqq);
1757 if (rep != NULL)
1758 nfs_repbusy(rep);
1759 microuptime(&now);
1760 for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1761 nmp = rep->r_nmp;
1762 if (!nmp) /* unmounted */
1763 continue;
1764 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1765 continue;
1766 if (nfs_sigintr(nmp, rep, rep->r_procp))
1767 continue;
1768 if (nmp->nm_tprintf_initial_delay != 0 &&
1769 (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1770 rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1771 rep->r_lastmsg = now.tv_sec;
1772 nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1773 "not responding");
1774 rep->r_flags |= R_TPRINTFMSG;
1775 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1776 /* we're not yet completely mounted and */
1777 /* we can't complete an RPC, so we fail */
1778 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1779 nfs_softterm(rep);
1780 continue;
1781 }
1782 }
1783 if (rep->r_rtt >= 0) {
1784 rep->r_rtt++;
1785 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1786 timeo = nmp->nm_timeo;
1787 else
1788 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1789 /* ensure 62.5 ms floor */
1790 while (16 * timeo < hz)
1791 timeo *= 2;
1792 if (nmp->nm_timeouts > 0)
1793 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1794 if (rep->r_rtt <= timeo)
1795 continue;
1796 if (nmp->nm_timeouts < 8)
1797 nmp->nm_timeouts++;
1798 }
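/*
 * Illustrative note (not in the original source): with an assumed
 * hz of 100, the doubling loop above enforces timeo >= hz / 16, i.e.
 * the 62.5 ms floor, and each consecutive timeout then scales timeo
 * by nfs_backoff[] -- 2, 4, 8, ... up to 256x once nm_timeouts
 * reaches 8 -- before the retransmission logic below runs.
 */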
1799 /*
1800 * Check for too many retransmits. This is never true for
1801 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1802 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1803 */
1804 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1805 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1806 nfs_softterm(rep);
1807 continue;
1808 }
1809 if (nmp->nm_sotype != SOCK_DGRAM) {
1810 if (++rep->r_rexmit > NFS_MAXREXMIT)
1811 rep->r_rexmit = NFS_MAXREXMIT;
1812 continue;
1813 }
1814 if ((so = nmp->nm_so) == NULL)
1815 continue;
1816
1817 /*
1818 * If there is enough space and the window allows,
1819 * resend it.
1820 * Set r_rtt to -1 in case we fail to send it now.
1821 */
1822 rep->r_rtt = -1;
1823 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1824 (rep->r_flags & R_SENT) ||
1825 nmp->nm_sent < nmp->nm_cwnd) &&
1826 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1827 struct msghdr msg;
1828 /*
1829 * Iff first send, start timing
1830 * else turn timing off, back off the timer,
1831 * and divide congestion window by 2.
1832 * We update these *before* the send to avoid
1833 * racing against receiving the reply.
1834 * We save them so we can restore them on send error.
1835 */
1836 flags = rep->r_flags;
1837 rexmit = rep->r_rexmit;
1838 cwnd = nmp->nm_cwnd;
1839 sent = nmp->nm_sent;
1840 xid = rep->r_xid;
1841 if (rep->r_flags & R_SENT) {
1842 rep->r_flags &= ~R_TIMING;
1843 if (++rep->r_rexmit > NFS_MAXREXMIT)
1844 rep->r_rexmit = NFS_MAXREXMIT;
1845 nmp->nm_cwnd >>= 1;
1846 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1847 nmp->nm_cwnd = NFS_CWNDSCALE;
1848 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1849 } else {
1850 rep->r_flags |= R_SENT;
1851 nmp->nm_sent += NFS_CWNDSCALE;
1852 }
1853 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1854
1855 bzero(&msg, sizeof(msg));
1856 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1857 msg.msg_name = mbuf_data(nmp->nm_nam);
1858 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1859 }
1860 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1861
1862 FSDBG(535, xid, error, sent, cwnd);
1863
1864 if (error) {
1865 if (error == EWOULDBLOCK) {
1866 rep->r_flags = flags;
1867 rep->r_rexmit = rexmit;
1868 nmp->nm_cwnd = cwnd;
1869 nmp->nm_sent = sent;
1870 rep->r_xid = xid;
1871 }
1872 else {
1873 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1874 int clearerror;
1875 int optlen = sizeof(clearerror);
1876 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1877 }
1878 rep->r_flags = flags | R_RESENDERR;
1879 rep->r_rexmit = rexmit;
1880 nmp->nm_cwnd = cwnd;
1881 nmp->nm_sent = sent;
1882 if (flags & R_SENT)
1883 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1884 }
1885 } else
1886 rep->r_rtt = 0;
1887 }
1888 }
1889 microuptime(&now);
1890 #ifndef NFS_NOSERVER
1891 /*
1892 * Scan the write gathering queues for writes that need to be
1893 * completed now.
1894 */
1895 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1896 lck_mtx_lock(nfsd_mutex);
1897 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1898 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1899 nfsrv_wakenfsd(slp);
1900 }
1901 while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1902 if ((slp->ns_timestamp + 5) > now.tv_sec)
1903 break;
1904 TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1905 nfsrv_slpfree(slp);
1906 }
1907 lck_mtx_unlock(nfsd_mutex);
1908 #endif /* NFS_NOSERVER */
1909
1910 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1911 /*
1912 * We haven't called nfs_buf_freeup() in a little while.
1913 * So, see if we can free up any stale/unused bufs now.
1914 */
1915 nfs_buf_freeup(1);
1916 }
1917
1918 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1919
1920 }
1921
1922
1923 /*
1924 * Test for a termination condition pending on the process.
1925 * This is used to determine if we need to bail on a mount.
1926 * EIO is returned if there has been a soft timeout.
1927 * EINTR is returned if there is a signal pending that is not being ignored
1928 * and the mount is interruptible, or if we are a thread that is in the process
1929 * of cancellation (also SIGKILL posted).
1930 */
1931 int
1932 nfs_sigintr(
1933 struct nfsmount *nmp,
1934 struct nfsreq *rep,
1935 proc_t p)
1936 {
1937 sigset_t pending_sigs;
1938 int context_good = 0;
1939 struct nfsmount *repnmp;
1940 extern proc_t kernproc;
1941
1942 if (nmp == NULL)
1943 return (ENXIO);
1944 if (rep != NULL) {
1945 repnmp = rep->r_nmp;
1946 /* we've had a forced unmount. */
1947 if (repnmp == NULL)
1948 return (ENXIO);
1949 /* request has timed out on a 'soft' mount. */
1950 if (rep->r_flags & R_SOFTTERM)
1951 return (EIO);
1952 /*
1953 * We're in the process of a forced unmount and there's
1954 * been a timeout; we're dead, so fail the I/O.
1955 */
1956 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1957 (NFSSTA_FORCE|NFSSTA_TIMEO))
1958 return (EIO);
1959 /* Someone is unmounting us, go soft and mark it. */
1960 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1961 repnmp->nm_flag |= NFSMNT_SOFT;
1962 nmp->nm_state |= NFSSTA_FORCE;
1963 }
1964 /*
1965 * If the mount is hung and we've requested not to hang
1966 * on remote filesystems, then bail now.
1967 */
1968 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1969 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1970 return (EIO);
1971 }
1972 /* XXX: Is this valid?  This should probably be an assertion. */
1973 if (p == NULL)
1974 return (0);
1975
1976 /* If this thread belongs to the kernel task, the abort check is not needed. */
1977 if ((current_proc() != kernproc) && current_thread_aborted()) {
1978 return (EINTR);
1979 }
1980 /* mask off thread and process blocked signals. */
1981
1982 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1983 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1984 return (EINTR);
1985 return (0);
1986 }
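/*
 * Sketch of the typical caller pattern for nfs_sigintr(): poll it from
 * a wait loop so that a soft timeout (EIO) or a pending signal on an
 * interruptible mount (EINTR) breaks the wait.  The wait channel and
 * loop condition below are hypothetical placeholders.
 */
#if 0
	while (!done) {
		error = nfs_sigintr(nmp, rep, p);
		if (error)
			return (error);	/* EIO or EINTR: give up on the mount */
		tsleep((caddr_t)&chan, PZERO - 1, "nfswait", hz);
	}
#endif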
1987
1988 /*
1989 * Lock a socket against others.
1990 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1991 * and also to avoid race conditions between the processes with nfs requests
1992 * in progress when a reconnect is necessary.
1993 */
1994 int
1995 nfs_sndlock(rep)
1996 struct nfsreq *rep;
1997 {
1998 int *statep;
1999 proc_t p;
2000 int error, slpflag = 0, slptimeo = 0;
2001
2002 if (rep->r_nmp == NULL)
2003 return (ENXIO);
2004 statep = &rep->r_nmp->nm_state;
2005
2006 p = rep->r_procp;
2007 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2008 slpflag = PCATCH;
2009 while (*statep & NFSSTA_SNDLOCK) {
2010 error = nfs_sigintr(rep->r_nmp, rep, p);
2011 if (error)
2012 return (error);
2013 *statep |= NFSSTA_WANTSND;
2014 if (p != NULL && (proc_noremotehang(p)) != 0)
2015 slptimeo = hz;
2016 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2017 if (slpflag == PCATCH) {
2018 slpflag = 0;
2019 slptimeo = 2 * hz;
2020 }
2021 /*
2022 * Make sure while we slept that the mountpoint didn't go away.
2023 * nfs_sigintr and callers expect it intact.
2024 */
2025 if (!rep->r_nmp)
2026 return (ENXIO); /* don't have lock until out of loop */
2027 }
2028 *statep |= NFSSTA_SNDLOCK;
2029 return (0);
2030 }
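/*
 * Sketch of how the send lock brackets a transmit so that, on a STREAM
 * socket, one request's record cannot be interleaved with another's.
 * The transmit step is a stand-in for the real sock_sendmbuf() path.
 */
#if 0
	if ((error = nfs_sndlock(rep)))
		return (error);
	error = 0;	/* ... transmit the request, e.g. via sock_sendmbuf() ... */
	nfs_sndunlock(rep);
#endif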
2031
2032 /*
2033 * Unlock the socket send lock for others.
2034 */
2035 void
2036 nfs_sndunlock(rep)
2037 struct nfsreq *rep;
2038 {
2039 int *statep;
2040
2041 if (rep->r_nmp == NULL)
2042 return;
2043 statep = &rep->r_nmp->nm_state;
2044 if ((*statep & NFSSTA_SNDLOCK) == 0)
2045 panic("nfs sndunlock");
2046 *statep &= ~NFSSTA_SNDLOCK;
2047 if (*statep & NFSSTA_WANTSND) {
2048 *statep &= ~NFSSTA_WANTSND;
2049 wakeup((caddr_t)statep);
2050 }
2051 }
2052
2053 static int
2054 nfs_rcvlock(struct nfsreq *rep)
2055 {
2056 int *statep;
2057 int error, slpflag, slptimeo = 0;
2058
2059 /* make sure we still have our mountpoint */
2060 if (!rep->r_nmp) {
2061 if (rep->r_mrep != NULL)
2062 return (EALREADY);
2063 return (ENXIO);
2064 }
2065
2066 statep = &rep->r_nmp->nm_state;
2067 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2068 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2069 slpflag = PCATCH;
2070 else
2071 slpflag = 0;
2072 while (*statep & NFSSTA_RCVLOCK) {
2073 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2074 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2075 return (error);
2076 } else if (rep->r_mrep != NULL) {
2077 /*
2078 * Don't bother sleeping if the reply has already arrived.
2079 */
2080 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2081 return (EALREADY);
2082 }
2083 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2084 *statep |= NFSSTA_WANTRCV;
2085 /*
2086 * We need to poll if we're P_NOREMOTEHANG so that we
2087 * call nfs_sigintr periodically above.
2088 */
2089 if (rep->r_procp != NULL &&
2090 (proc_noremotehang(rep->r_procp)) != 0)
2091 slptimeo = hz;
2092 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2093 if (slpflag == PCATCH) {
2094 slpflag = 0;
2095 slptimeo = 2 * hz;
2096 }
2097 /*
2098 * Make sure while we slept that the mountpoint didn't go away.
2099 * nfs_sigintr and caller nfs_reply expect it intact.
2100 */
2101 if (!rep->r_nmp) {
2102 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2103 return (ENXIO); /* don't have lock until out of loop */
2104 }
2105 }
2106 /*
2107 * nfs_reply will handle it if reply already arrived.
2108 * (We may have slept or been preempted).
2109 */
2110 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2111 *statep |= NFSSTA_RCVLOCK;
2112 return (0);
2113 }
2114
2115 /*
2116 * Unlock the socket receive lock for others.
2117 */
2118 static void
2119 nfs_rcvunlock(struct nfsreq *rep)
2120 {
2121 int *statep;
2122
2123 if (rep->r_nmp == NULL)
2124 return;
2125 statep = &rep->r_nmp->nm_state;
2126
2127 FSDBG(533, statep, *statep, 0, 0);
2128 if ((*statep & NFSSTA_RCVLOCK) == 0)
2129 panic("nfs rcvunlock");
2130 *statep &= ~NFSSTA_RCVLOCK;
2131 if (*statep & NFSSTA_WANTRCV) {
2132 *statep &= ~NFSSTA_WANTRCV;
2133 wakeup((caddr_t)statep);
2134 }
2135 }
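/*
 * Sketch of the matching receive-lock pattern (nfs_reply works this
 * way): only one thread at a time drains the socket, and EALREADY
 * from nfs_rcvlock() means our reply was filled in while we waited,
 * so there is nothing left to receive.
 */
#if 0
	error = nfs_rcvlock(rep);
	if (error == EALREADY)
		return (0);	/* reply already arrived */
	if (error)
		return (error);
	/* ... receive from the socket, matching replies to requests by xid ... */
	nfs_rcvunlock(rep);
#endif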
2136
2137
2138 #ifndef NFS_NOSERVER
2139 /*
2140 * Socket upcall routine for the nfsd sockets.
2141 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2142 * Essentially, do as much as possible without blocking; otherwise punt
2143 * and it will be called again with MBUF_WAITOK from an nfsd.
2144 */
2145 void
2146 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2147 {
2148 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2149
2150 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2151 return;
2152
2153 lck_rw_lock_exclusive(&slp->ns_rwlock);
2154 nfsrv_rcv_locked(so, slp, waitflag);
2155 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2156 }
2157 void
2158 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2159 {
2160 mbuf_t m, mp, mhck, m2;
2161 int ns_flag=0, error;
2162 struct msghdr msg;
2163 size_t bytes_read;
2164
2165 if ((slp->ns_flag & SLP_VALID) == 0) {
2166 if (waitflag == MBUF_DONTWAIT)
2167 lck_rw_done(&slp->ns_rwlock);
2168 return;
2169 }
2170
2171 #ifdef notdef
2172 /*
2173 * Define this to test for nfsds handling this under heavy load.
2174 */
2175 if (waitflag == MBUF_DONTWAIT) {
2176 ns_flag = SLP_NEEDQ;
2177 goto dorecs;
2178 }
2179 #endif
2180 if (slp->ns_sotype == SOCK_STREAM) {
2181 /*
2182 * If there are already records on the queue, defer soreceive()
2183 * to an nfsd so that there is feedback to the TCP layer that
2184 * the nfs servers are heavily loaded.
2185 */
2186 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2187 ns_flag = SLP_NEEDQ;
2188 goto dorecs;
2189 }
2190
2191 /*
2192 * Do soreceive().
2193 */
2194 bytes_read = 1000000000; /* large cap: take whatever data is available */
2195 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2196 if (error || mp == NULL) {
2197 if (error == EWOULDBLOCK)
2198 ns_flag = SLP_NEEDQ;
2199 else
2200 ns_flag = SLP_DISCONN;
2201 goto dorecs;
2202 }
2203 m = mp;
2204 if (slp->ns_rawend) {
2205 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2206 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2207 slp->ns_cc += bytes_read;
2208 } else {
2209 slp->ns_raw = m;
2210 slp->ns_cc = bytes_read;
2211 }
2212 while ((m2 = mbuf_next(m)))
2213 m = m2;
2214 slp->ns_rawend = m;
2215
2216 /*
2217 * Now try and parse record(s) out of the raw stream data.
2218 */
2219 error = nfsrv_getstream(slp, waitflag);
2220 if (error) {
2221 if (error == EPERM)
2222 ns_flag = SLP_DISCONN;
2223 else
2224 ns_flag = SLP_NEEDQ;
2225 }
2226 } else {
2227 struct sockaddr_storage nam;
2228
2229 bzero(&msg, sizeof(msg));
2230 msg.msg_name = (caddr_t)&nam;
2231 msg.msg_namelen = sizeof(nam);
2232
2233 do {
2234 bytes_read = 1000000000; /* same large cap as the stream case */
2235 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2236 if (mp) {
2237 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2238 mbuf_setlen(mhck, nam.ss_len);
2239 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2240 m = mhck;
2241 if (mbuf_setnext(m, mp)) {
2242 /* trouble... just drop it */
2243 printf("nfsrv_rcv: mbuf_setnext failed\n");
2244 mbuf_free(mhck);
2245 m = mp;
2246 }
2247 } else {
2248 m = mp;
2249 }
2250 if (slp->ns_recend)
2251 mbuf_setnextpkt(slp->ns_recend, m);
2252 else
2253 slp->ns_rec = m;
2254 slp->ns_recend = m;
2255 mbuf_setnextpkt(m, NULL);
2256 }
2257 #if 0
2258 if (error) {
2259 /*
2260 * This may be needed in the future to support
2261 * non-byte-stream connection-oriented protocols
2262 * such as SCTP.
2263 */
2264 /*
2265 * This (slp->ns_sotype == SOCK_STREAM) should really
2266 * be a check for PR_CONNREQUIRED.
2267 */
2268 if ((slp->ns_sotype == SOCK_STREAM)
2269 && error != EWOULDBLOCK) {
2270 ns_flag = SLP_DISCONN;
2271 goto dorecs;
2272 }
2273 }
2274 #endif
2275 } while (mp);
2276 }
2277
2278 /*
2279 * Now try and process the request records, non-blocking.
2280 */
2281 dorecs:
2282 if (ns_flag)
2283 slp->ns_flag |= ns_flag;
2284 if (waitflag == MBUF_DONTWAIT) {
2285 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2286 lck_rw_done(&slp->ns_rwlock);
2287 if (wake && nfs_numnfsd) {
2288 lck_mtx_lock(nfsd_mutex);
2289 nfsrv_wakenfsd(slp);
2290 lck_mtx_unlock(nfsd_mutex);
2291 }
2292 }
2293 }
2294
2295 /*
2296 * Try and extract an RPC request from the mbuf data list received on a
2297 * stream socket. The "waitflag" argument indicates whether or not it
2298 * can sleep.
2299 */
2300 static int
2301 nfsrv_getstream(slp, waitflag)
2302 struct nfssvc_sock *slp;
2303 int waitflag;
2304 {
2305 mbuf_t m;
2306 char *cp1, *cp2, *mdata;
2307 int len, mlen, error;
2308 mbuf_t om, m2, recm;
2309 u_long recmark;
2310
2311 if (slp->ns_flag & SLP_GETSTREAM)
2312 panic("nfs getstream");
2313 slp->ns_flag |= SLP_GETSTREAM;
2314 for (;;) {
2315 if (slp->ns_reclen == 0) {
2316 if (slp->ns_cc < NFSX_UNSIGNED) {
2317 slp->ns_flag &= ~SLP_GETSTREAM;
2318 return (0);
2319 }
2320 m = slp->ns_raw;
2321 mdata = mbuf_data(m);
2322 mlen = mbuf_len(m);
2323 if (mlen >= NFSX_UNSIGNED) {
2324 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2325 mdata += NFSX_UNSIGNED;
2326 mlen -= NFSX_UNSIGNED;
2327 mbuf_setdata(m, mdata, mlen);
2328 } else {
2329 cp1 = (caddr_t)&recmark;
2330 cp2 = mdata;
2331 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2332 while (mlen == 0) {
2333 m = mbuf_next(m);
2334 cp2 = mbuf_data(m);
2335 mlen = mbuf_len(m);
2336 }
2337 *cp1++ = *cp2++;
2338 mlen--;
2339 mbuf_setdata(m, cp2, mlen);
2340 }
2341 }
2342 slp->ns_cc -= NFSX_UNSIGNED;
2343 recmark = ntohl(recmark);
2344 slp->ns_reclen = recmark & ~0x80000000;
2345 if (recmark & 0x80000000)
2346 slp->ns_flag |= SLP_LASTFRAG;
2347 else
2348 slp->ns_flag &= ~SLP_LASTFRAG;
2349 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2350 slp->ns_flag &= ~SLP_GETSTREAM;
2351 return (EPERM);
2352 }
2353 }
2354
2355 /*
2356 * Now get the record part.
2357 *
2358 * Note that slp->ns_reclen may be 0. Linux sometimes
2359 * generates 0-length RPCs.
2360 */
2361 recm = NULL;
2362 if (slp->ns_cc == slp->ns_reclen) {
2363 recm = slp->ns_raw;
2364 slp->ns_raw = slp->ns_rawend = NULL;
2365 slp->ns_cc = slp->ns_reclen = 0;
2366 } else if (slp->ns_cc > slp->ns_reclen) {
2367 len = 0;
2368 m = slp->ns_raw;
2369 mlen = mbuf_len(m);
2370 mdata = mbuf_data(m);
2371 om = NULL;
2372 while (len < slp->ns_reclen) {
2373 if ((len + mlen) > slp->ns_reclen) {
2374 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2375 slp->ns_flag &= ~SLP_GETSTREAM;
2376 return (EWOULDBLOCK);
2377 }
2378 if (om) {
2379 if (mbuf_setnext(om, m2)) {
2380 /* trouble... just drop it */
2381 printf("nfsrv_getstream: mbuf_setnext failed\n");
2382 mbuf_freem(m2);
2383 slp->ns_flag &= ~SLP_GETSTREAM;
2384 return (EWOULDBLOCK);
2385 }
2386 recm = slp->ns_raw;
2387 } else {
2388 recm = m2;
2389 }
2390 mdata += slp->ns_reclen - len;
2391 mlen -= slp->ns_reclen - len;
2392 mbuf_setdata(m, mdata, mlen);
2393 len = slp->ns_reclen;
2394 } else if ((len + mlen) == slp->ns_reclen) {
2395 om = m;
2396 len += mlen;
2397 m = mbuf_next(m);
2398 recm = slp->ns_raw;
2399 if (mbuf_setnext(om, NULL)) {
2400 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2401 slp->ns_flag &= ~SLP_GETSTREAM;
2402 return (EWOULDBLOCK);
2403 }
2404 mlen = mbuf_len(m);
2405 mdata = mbuf_data(m);
2406 } else {
2407 om = m;
2408 len += mlen;
2409 m = mbuf_next(m);
2410 mlen = mbuf_len(m);
2411 mdata = mbuf_data(m);
2412 }
2413 }
2414 slp->ns_raw = m;
2415 slp->ns_cc -= len;
2416 slp->ns_reclen = 0;
2417 } else {
2418 slp->ns_flag &= ~SLP_GETSTREAM;
2419 return (0);
2420 }
2421
2422 /*
2423 * Accumulate the fragments into a record.
2424 */
2425 if (slp->ns_frag == NULL) {
2426 slp->ns_frag = recm;
2427 } else {
2428 m = slp->ns_frag;
2429 while ((m2 = mbuf_next(m)))
2430 m = m2;
2431 if ((error = mbuf_setnext(m, recm)))
2432 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2433 }
2434 if (slp->ns_flag & SLP_LASTFRAG) {
2435 if (slp->ns_recend)
2436 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2437 else
2438 slp->ns_rec = slp->ns_frag;
2439 slp->ns_recend = slp->ns_frag;
2440 slp->ns_frag = NULL;
2441 }
2442 }
2443 }
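/*
 * For reference, the record mark nfsrv_getstream() parses is standard
 * RPC record marking (RFC 1831): a 4-byte big-endian word whose high
 * bit flags the last fragment of a record and whose low 31 bits give
 * the fragment length.  A stand-alone user-space sketch:
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static void
parse_recmark(const unsigned char hdr[4], uint32_t *lenp, int *lastp)
{
	uint32_t recmark;

	memcpy(&recmark, hdr, sizeof(recmark));
	recmark = ntohl(recmark);
	*lastp = (recmark & 0x80000000) != 0;	/* last-fragment bit */
	*lenp = recmark & ~0x80000000;		/* fragment length */
}
#endif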
2444
2445 /*
2446 * Parse an RPC header.
2447 */
2448 int
2449 nfsrv_dorec(slp, nfsd, ndp)
2450 struct nfssvc_sock *slp;
2451 struct nfsd *nfsd;
2452 struct nfsrv_descript **ndp;
2453 {
2454 mbuf_t m;
2455 mbuf_t nam;
2456 struct nfsrv_descript *nd;
2457 int error;
2458
2459 *ndp = NULL;
2460 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2461 return (ENOBUFS);
2462 MALLOC_ZONE(nd, struct nfsrv_descript *,
2463 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2464 if (!nd)
2465 return (ENOMEM);
2466 m = slp->ns_rec;
2467 slp->ns_rec = mbuf_nextpkt(m);
2468 if (slp->ns_rec)
2469 mbuf_setnextpkt(m, NULL);
2470 else
2471 slp->ns_recend = NULL;
2472 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2473 nam = m;
2474 m = mbuf_next(m);
2475 if ((error = mbuf_setnext(nam, NULL)))
2476 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2477 } else
2478 nam = NULL;
2479 nd->nd_md = nd->nd_mrep = m;
2480 nd->nd_nam2 = nam;
2481 nd->nd_dpos = mbuf_data(m);
2482 error = nfs_getreq(nd, nfsd, TRUE);
2483 if (error) {
2484 if (nam)
2485 mbuf_freem(nam);
2486 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2487 return (error);
2488 }
2489 *ndp = nd;
2490 nfsd->nfsd_nd = nd;
2491 return (0);
2492 }
2493
2494 /*
2495 * Parse an RPC request
2496 * - verify it
2497 * - fill in the cred struct.
2498 */
2499 int
2500 nfs_getreq(nd, nfsd, has_header)
2501 struct nfsrv_descript *nd;
2502 struct nfsd *nfsd;
2503 int has_header;
2504 {
2505 int len, i;
2506 u_long *tl;
2507 long t1;
2508 uio_t uiop;
2509 caddr_t dpos, cp2, cp;
2510 u_long nfsvers, auth_type;
2511 uid_t nickuid;
2512 int error = 0, ticklen;
2513 mbuf_t mrep, md;
2514 struct nfsuid *nuidp;
2515 uid_t user_id;
2516 gid_t group_id;
2517 int ngroups;
2518 struct ucred temp_cred;
2519 struct timeval tvin, tvout, now;
2520 char uio_buf[ UIO_SIZEOF(1) ];
2521 #if 0 /* until encrypted keys are implemented */
2522 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2523 #endif
2524
2525 nd->nd_cr = NULL;
2526
2527 mrep = nd->nd_mrep;
2528 md = nd->nd_md;
2529 dpos = nd->nd_dpos;
2530 if (has_header) {
2531 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2532 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2533 if (*tl++ != rpc_call) {
2534 mbuf_freem(mrep);
2535 return (EBADRPC);
2536 }
2537 } else
2538 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2539 nd->nd_repstat = 0;
2540 nd->nd_flag = 0;
2541 if (*tl++ != rpc_vers) {
2542 nd->nd_repstat = ERPCMISMATCH;
2543 nd->nd_procnum = NFSPROC_NOOP;
2544 return (0);
2545 }
2546 if (*tl != nfs_prog) {
2547 nd->nd_repstat = EPROGUNAVAIL;
2548 nd->nd_procnum = NFSPROC_NOOP;
2549 return (0);
2550 }
2551 tl++;
2552 nfsvers = fxdr_unsigned(u_long, *tl++);
2553 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2554 nd->nd_repstat = EPROGMISMATCH;
2555 nd->nd_procnum = NFSPROC_NOOP;
2556 return (0);
2557 }
2558 else if (nfsvers == NFS_VER3)
2559 nd->nd_flag = ND_NFSV3;
2560 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2561 if (nd->nd_procnum == NFSPROC_NULL)
2562 return (0);
2563 if ((nd->nd_procnum >= NFS_NPROCS) ||
2564 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2565 nd->nd_repstat = EPROCUNAVAIL;
2566 nd->nd_procnum = NFSPROC_NOOP;
2567 return (0);
2568 }
2569 if ((nd->nd_flag & ND_NFSV3) == 0)
2570 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2571 auth_type = *tl++;
2572 len = fxdr_unsigned(int, *tl++);
2573 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2574 mbuf_freem(mrep);
2575 return (EBADRPC);
2576 }
2577
2578 nd->nd_flag &= ~ND_KERBAUTH;
2579 /*
2580 * Handle auth_unix or auth_kerb.
2581 */
2582 if (auth_type == rpc_auth_unix) {
2583 len = fxdr_unsigned(int, *++tl);
2584 if (len < 0 || len > NFS_MAXNAMLEN) {
2585 mbuf_freem(mrep);
2586 return (EBADRPC);
2587 }
2588 bzero(&temp_cred, sizeof(temp_cred));
2589 nfsm_adv(nfsm_rndup(len));
2590 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2591 user_id = fxdr_unsigned(uid_t, *tl++);
2592 group_id = fxdr_unsigned(gid_t, *tl++);
2593 temp_cred.cr_groups[0] = group_id;
2594 len = fxdr_unsigned(int, *tl);
2595 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2596 mbuf_freem(mrep);
2597 return (EBADRPC);
2598 }
2599 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2600 for (i = 1; i <= len; i++)
2601 if (i < NGROUPS)
2602 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2603 else
2604 tl++;
2605 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2606 if (ngroups > 1)
2607 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2608 len = fxdr_unsigned(int, *++tl);
2609 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2610 mbuf_freem(mrep);
2611 return (EBADRPC);
2612 }
2613 temp_cred.cr_uid = user_id;
2614 temp_cred.cr_ngroups = ngroups;
2615 nd->nd_cr = kauth_cred_create(&temp_cred);
2616 if (nd->nd_cr == NULL) {
2617 nd->nd_repstat = ENOMEM;
2618 nd->nd_procnum = NFSPROC_NOOP;
2619 return (0);
2620 }
2621 if (len > 0)
2622 nfsm_adv(nfsm_rndup(len));
2623 } else if (auth_type == rpc_auth_kerb) {
2624 switch (fxdr_unsigned(int, *tl++)) {
2625 case RPCAKN_FULLNAME:
2626 ticklen = fxdr_unsigned(int, *tl);
2627 *((u_long *)nfsd->nfsd_authstr) = *tl;
2628 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2629 &uio_buf[0], sizeof(uio_buf));
2630 if (!uiop) {
2631 nd->nd_repstat = ENOMEM;
2632 nd->nd_procnum = NFSPROC_NOOP;
2633 return (0);
2634 }
2635
2636 // LP64todo - fix this
2637 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2638 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2639 mbuf_freem(mrep);
2640 return (EBADRPC);
2641 }
2642 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2643 // LP64todo - fix this
2644 nfsm_mtouio(uiop, uio_resid(uiop));
2645 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2646 if (*tl++ != rpc_auth_kerb ||
2647 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2648 printf("Bad kerb verifier\n");
2649 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2650 nd->nd_procnum = NFSPROC_NOOP;
2651 return (0);
2652 }
2653 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2654 tl = (u_long *)cp;
2655 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2656 printf("Not fullname kerb verifier\n");
2657 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2658 nd->nd_procnum = NFSPROC_NOOP;
2659 return (0);
2660 }
2661 cp += NFSX_UNSIGNED;
2662 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2663 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2664 nd->nd_flag |= ND_KERBFULL;
2665 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2666 break;
2667 case RPCAKN_NICKNAME:
2668 if (len != 2 * NFSX_UNSIGNED) {
2669 printf("Kerb nickname short\n");
2670 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2671 nd->nd_procnum = NFSPROC_NOOP;
2672 return (0);
2673 }
2674 nickuid = fxdr_unsigned(uid_t, *tl);
2675 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2676 if (*tl++ != rpc_auth_kerb ||
2677 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2678 printf("Kerb nick verifier bad\n");
2679 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2680 nd->nd_procnum = NFSPROC_NOOP;
2681 return (0);
2682 }
2683 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2684 tvin.tv_sec = *tl++;
2685 tvin.tv_usec = *tl;
2686
2687 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2688 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2689 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2690 (!nd->nd_nam2 ||
2691 netaddr_match(NU_NETFAM(nuidp),
2692 &nuidp->nu_haddr, nd->nd_nam2)))
2693 break;
2694 }
2695 if (!nuidp) {
2696 nd->nd_repstat =
2697 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2698 nd->nd_procnum = NFSPROC_NOOP;
2699 return (0);
2700 }
2701
2702 /*
2703 * Now, decrypt the timestamp using the session key
2704 * and validate it.
2705 */
2706 #if NFSKERB
2707 XXX
2708 #endif
2709
2710 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2711 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2712 microtime(&now);
2713 if (nuidp->nu_expire < now.tv_sec ||
2714 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2715 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2716 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2717 nuidp->nu_expire = 0;
2718 nd->nd_repstat =
2719 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2720 nd->nd_procnum = NFSPROC_NOOP;
2721 return (0);
2722 }
2723 bzero(&temp_cred, sizeof(temp_cred));
2724 ngroups = nuidp->nu_cr->cr_ngroups;
2725 for (i = 0; i < ngroups; i++)
2726 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2727 if (ngroups > 1)
2728 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2729
2730 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2731 temp_cred.cr_ngroups = ngroups;
2732 nd->nd_cr = kauth_cred_create(&temp_cred);
2733 if (!nd->nd_cr) {
2734 nd->nd_repstat = ENOMEM;
2735 nd->nd_procnum = NFSPROC_NOOP;
2736 return (0);
2737 }
2738 nd->nd_flag |= ND_KERBNICK;
2739 }
2740 } else {
2741 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2742 nd->nd_procnum = NFSPROC_NOOP;
2743 return (0);
2744 }
2745
2746 nd->nd_md = md;
2747 nd->nd_dpos = dpos;
2748 return (0);
2749 nfsmout:
2750 if (nd->nd_cr)
2751 kauth_cred_rele(nd->nd_cr);
2752 return (error);
2753 }
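/*
 * For reference, the AUTH_UNIX credential body that nfs_getreq() walks
 * above has the following XDR layout (RFC 1057); every length is a
 * 4-byte big-endian word and the machine name is padded to a 4-byte
 * boundary.  Shown as a sketch, not a wire-usable struct:
 */
#if 0
	struct auth_unix_body {
		uint32_t stamp;		/* arbitrary caller-chosen stamp */
		/* opaque machinename<255>: length word + padded bytes */
		uint32_t uid;		/* caller's effective uid */
		uint32_t gid;		/* caller's effective gid */
		/* uint32_t gids<16>: count word + that many gids */
	};
#endif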
2754
2755 /*
2756 * Search for a sleeping nfsd and wake it up.
2757 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2758 * running nfsds will go look for the work in the nfssvc_sock list.
2759 * Note: Must be called with nfsd_mutex held.
2760 */
2761 void
2762 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2763 {
2764 struct nfsd *nd;
2765
2766 if ((slp->ns_flag & SLP_VALID) == 0)
2767 return;
2768
2769 lck_rw_lock_exclusive(&slp->ns_rwlock);
2770
2771 if (nfsd_waiting) {
2772 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2773 if (nd->nfsd_flag & NFSD_WAITING) {
2774 nd->nfsd_flag &= ~NFSD_WAITING;
2775 if (nd->nfsd_slp)
2776 panic("nfsd wakeup");
2777 slp->ns_sref++;
2778 nd->nfsd_slp = slp;
2779 lck_rw_done(&slp->ns_rwlock);
2780 wakeup((caddr_t)nd);
2781 return;
2782 }
2783 }
2784 }
2785
2786 slp->ns_flag |= SLP_DOREC;
2787
2788 lck_rw_done(&slp->ns_rwlock);
2789
2790 nfsd_head_flag |= NFSD_CHECKSLP;
2791 }
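/*
 * Sketch of the nfsd side of this handshake (the real loop lives in
 * the nfssvc code): an idle nfsd marks itself NFSD_WAITING and sleeps
 * on its own descriptor, and nfsrv_wakenfsd() above hands it a socket
 * via nfsd_slp and wakes it.  Simplified, error handling omitted.
 */
#if 0
	lck_mtx_lock(nfsd_mutex);
	while ((nd->nfsd_slp == NULL) && !(nfsd_head_flag & NFSD_CHECKSLP)) {
		nd->nfsd_flag |= NFSD_WAITING;
		nfsd_waiting++;
		msleep((caddr_t)nd, nfsd_mutex, PSOCK | PCATCH, "nfsd", NULL);
		nfsd_waiting--;
	}
	lck_mtx_unlock(nfsd_mutex);
	/* ... service requests from nd->nfsd_slp ... */
#endif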
2792 #endif /* NFS_NOSERVER */
2793
2794 static int
2795 nfs_msg(proc_t p,
2796 const char *server,
2797 const char *msg,
2798 int error)
2799 {
2800 tpr_t tpr;
2801
2802 if (p)
2803 tpr = tprintf_open(p);
2804 else
2805 tpr = NULL;
2806 if (error)
2807 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2808 error);
2809 else
2810 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2811 tprintf_close(tpr);
2812 return (0);
2813 }
2814
2815 void
2816 nfs_down(nmp, proc, error, flags, msg)
2817 struct nfsmount *nmp;
2818 proc_t proc;
2819 int error, flags;
2820 const char *msg;
2821 {
2822 if (nmp == NULL)
2823 return;
2824 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2825 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2826 nmp->nm_state |= NFSSTA_TIMEO;
2827 }
2828 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2829 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2830 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2831 }
2832 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2833 }
2834
2835 void
2836 nfs_up(nmp, proc, flags, msg)
2837 struct nfsmount *nmp;
2838 proc_t proc;
2839 int flags;
2840 const char *msg;
2841 {
2842 if (nmp == NULL)
2843 return;
2844 if (msg)
2845 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2846 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2847 nmp->nm_state &= ~NFSSTA_TIMEO;
2848 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2849 }
2850 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2851 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2852 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2853 }
2854 }
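/*
 * Sketch of how nfs_down()/nfs_up() bracket an unresponsive-server
 * episode so that VQ_NOTRESP is signalled exactly once per transition
 * (the NFSSTA_TIMEO state bit suppresses duplicates).  The message
 * strings here are illustrative.
 */
#if 0
	/* a request has timed out: */
	nfs_down(nmp, proc, error, NFSSTA_TIMEO, "not responding");
	/* ... later, a reply finally arrives: */
	nfs_up(nmp, proc, NFSSTA_TIMEO, "is alive again");
#endif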
2855