]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_socket.c
1f4fda19e576592c72b2d928476a4e13f2409682
[apple/xnu.git] / bsd / nfs / nfs_socket.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1991, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
66 */
67
68 /*
69 * Socket operations for use by nfs
70 */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc.h>
75 #include <sys/kauth.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/kpi_mbuf.h>
79 #include <sys/malloc.h>
80 #include <sys/vnode.h>
81 #include <sys/domain.h>
82 #include <sys/protosw.h>
83 #include <sys/socket.h>
84 #include <sys/syslog.h>
85 #include <sys/tprintf.h>
86 #include <sys/uio_internal.h>
87 #include <libkern/OSAtomic.h>
88
89 #include <sys/time.h>
90 #include <kern/clock.h>
91 #include <kern/task.h>
92 #include <kern/thread.h>
93 #include <sys/user.h>
94
95 #include <netinet/in.h>
96 #include <netinet/tcp.h>
97
98 #include <nfs/rpcv2.h>
99 #include <nfs/nfsproto.h>
100 #include <nfs/nfs.h>
101 #include <nfs/xdr_subs.h>
102 #include <nfs/nfsm_subs.h>
103 #include <nfs/nfsmount.h>
104 #include <nfs/nfsnode.h>
105 #include <nfs/nfsrtt.h>
106
107 #include <sys/kdebug.h>
108
109 #define FSDBG(A, B, C, D, E) \
110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
111 (int)(B), (int)(C), (int)(D), (int)(E), 0)
112 #define FSDBG_TOP(A, B, C, D, E) \
113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
114 (int)(B), (int)(C), (int)(D), (int)(E), 0)
115 #define FSDBG_BOT(A, B, C, D, E) \
116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
117 (int)(B), (int)(C), (int)(D), (int)(E), 0)
118
119 /*
120 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
121 * Use the mean and mean deviation of rtt for the appropriate type of rpc
122 * for the frequent rpcs and a default for the others.
123 * The justification for doing "other" this way is that these rpcs
124 * happen so infrequently that timer est. would probably be stale.
125 * Also, since many of these rpcs are
126 * non-idempotent, a conservative timeout is desired.
127 * getattr, lookup - A+2D
128 * read, write - A+4D
129 * other - nm_timeo
130 */
131 #define NFS_RTO(n, t) \
132 ((t) == 0 ? (n)->nm_timeo : \
133 ((t) < 3 ? \
134 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
135 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
136 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
137 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
138 /*
139 * External data, mostly RPC constants in XDR form
140 */
141 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
142 rpc_msgaccepted, rpc_call, rpc_autherr,
143 rpc_auth_kerb;
144 extern u_long nfs_prog;
145 extern struct nfsstats nfsstats;
146 extern int nfsv3_procid[NFS_NPROCS];
147 extern int nfs_ticks;
148 extern u_long nfs_xidwrap;
149
150 /*
151 * Defines which timer to use for the procnum.
152 * 0 - default
153 * 1 - getattr
154 * 2 - lookup
155 * 3 - read
156 * 4 - write
157 */
158 static int proct[NFS_NPROCS] = {
159 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
160 };
161
162 /*
163 * There is a congestion window for outstanding rpcs maintained per mount
164 * point. The cwnd size is adjusted in roughly the way that:
165 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
166 * SIGCOMM '88". ACM, August 1988.
167 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
168 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
169 * of rpcs is in progress.
170 * (The sent count and cwnd are scaled for integer arith.)
171 * Variants of "slow start" were tried and were found to be too much of a
172 * performance hit (ave. rtt 3 times larger),
173 * I suspect due to the large rtt that nfs rpcs have.
174 */
175 #define NFS_CWNDSCALE 256
176 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
177 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
178 int nfsrtton = 0;
179 struct nfsrtt nfsrtt;
180
181 static int nfs_rcvlock(struct nfsreq *);
182 static void nfs_rcvunlock(struct nfsreq *);
183 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
184 static int nfs_reconnect(struct nfsreq *rep);
185 static void nfs_repdequeue(struct nfsreq *rep);
186
187 /* XXX */
188 boolean_t current_thread_aborted(void);
189 kern_return_t thread_terminate(thread_t);
190
191 #ifndef NFS_NOSERVER
192 static int nfsrv_getstream(struct nfssvc_sock *,int);
193
194 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
195 struct nfssvc_sock *slp,
196 proc_t procp,
197 mbuf_t *mreqp) = {
198 nfsrv_null,
199 nfsrv_getattr,
200 nfsrv_setattr,
201 nfsrv_lookup,
202 nfsrv3_access,
203 nfsrv_readlink,
204 nfsrv_read,
205 nfsrv_write,
206 nfsrv_create,
207 nfsrv_mkdir,
208 nfsrv_symlink,
209 nfsrv_mknod,
210 nfsrv_remove,
211 nfsrv_rmdir,
212 nfsrv_rename,
213 nfsrv_link,
214 nfsrv_readdir,
215 nfsrv_readdirplus,
216 nfsrv_statfs,
217 nfsrv_fsinfo,
218 nfsrv_pathconf,
219 nfsrv_commit,
220 nfsrv_noop
221 };
222 #endif /* NFS_NOSERVER */
223
224
225 /*
226 * attempt to bind a socket to a reserved port
227 */
228 static int
229 nfs_bind_resv(struct nfsmount *nmp)
230 {
231 socket_t so = nmp->nm_so;
232 struct sockaddr_in sin;
233 int error;
234 u_short tport;
235
236 if (!so)
237 return (EINVAL);
238
239 sin.sin_len = sizeof (struct sockaddr_in);
240 sin.sin_family = AF_INET;
241 sin.sin_addr.s_addr = INADDR_ANY;
242 tport = IPPORT_RESERVED - 1;
243 sin.sin_port = htons(tport);
244
245 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
246 (--tport > IPPORT_RESERVED / 2))
247 sin.sin_port = htons(tport);
248 return (error);
249 }
250
251 /*
252 * variables for managing the nfs_bind_resv_thread
253 */
254 int nfs_resv_mounts = 0;
255 static int nfs_bind_resv_thread_state = 0;
256 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
257 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
258 lck_grp_t *nfs_bind_resv_lck_grp;
259 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
260 lck_attr_t *nfs_bind_resv_lck_attr;
261 lck_mtx_t *nfs_bind_resv_mutex;
262 struct nfs_bind_resv_request {
263 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
264 struct nfsmount *brr_nmp;
265 int brr_error;
266 };
267 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
268
/*
 * Kernel thread that services reserved-port bind requests queued by
 * nfs_bind_resv_nopriv().  Runs for as long as there are mounts that
 * need reserved ports (nfs_resv_mounts > 0); when the count drops to
 * zero it resets the state to INITTED and terminates itself.
 */
static void
nfs_bind_resv_thread(void)
{
	struct nfs_bind_resv_request *brreq;

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;

	while (nfs_resv_mounts > 0) {
		lck_mtx_lock(nfs_bind_resv_mutex);
		/* drain all queued requests before going back to sleep */
		while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
			TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
			/* drop the mutex across the (potentially slow) bind */
			lck_mtx_unlock(nfs_bind_resv_mutex);
			brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
			/* requester is tsleep()ing on &brreq; wake it */
			wakeup(brreq);
			lck_mtx_lock(nfs_bind_resv_mutex);
		}
		/*
		 * PDROP: msleep releases nfs_bind_resv_mutex before sleeping
		 * and does NOT reacquire it on wakeup — the loop re-locks at
		 * the top of the next iteration.
		 */
		msleep((caddr_t)&nfs_bind_resv_request_queue,
			nfs_bind_resv_mutex, PSOCK | PDROP,
			"nfs_bind_resv_request_queue", 0);
	}

	/* no more reserved-port mounts: mark thread as no longer running */
	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
	(void) thread_terminate(current_thread());
}
296
297 int
298 nfs_bind_resv_thread_wake(void)
299 {
300 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
301 return (EIO);
302 wakeup(&nfs_bind_resv_request_queue);
303 return (0);
304 }
305
306 /*
307 * underprivileged procs call this to request nfs_bind_resv_thread
308 * to perform the reserved port binding for them.
309 */
310 static int
311 nfs_bind_resv_nopriv(struct nfsmount *nmp)
312 {
313 struct nfs_bind_resv_request brreq;
314 int error;
315
316 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
317 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
318 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
319 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
320 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
321 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
322 TAILQ_INIT(&nfs_bind_resv_request_queue);
323 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
324 }
325 kernel_thread(kernel_task, nfs_bind_resv_thread);
326 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
327 }
328
329 brreq.brr_nmp = nmp;
330 brreq.brr_error = 0;
331
332 lck_mtx_lock(nfs_bind_resv_mutex);
333 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
334 lck_mtx_unlock(nfs_bind_resv_mutex);
335
336 error = nfs_bind_resv_thread_wake();
337 if (error) {
338 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
339 /* Note: we might be able to simply restart the thread */
340 return (error);
341 }
342
343 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
344
345 return (brreq.brr_error);
346 }
347
/*
 * Initialize sockets and congestion for a new NFS connection.
 * Creates the socket, optionally binds to a reserved port, connects
 * (unless NFSMNT_NOCONN), sets receive/send timeouts, sizes the socket
 * buffers, and resets the mount's RTT/congestion-window state.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(
	struct nfsmount *nmp,
	__unused struct nfsreq *rep)
{
	socket_t so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct timeval timeo;

	nmp->nm_so = 0;
	saddr = mbuf_data(nmp->nm_nam);
	/* create a socket matching the mount's address family/type/protocol */
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
		nmp->nm_soproto, 0, 0, &nmp->nm_so);
	if (error) {
		goto bad;
	}
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		proc_t p;
		/*
		 * sobind() requires current_proc() to have superuser privs.
		 * If this bind is part of a reconnect, and the current proc
		 * doesn't have superuser privs, we hand the sobind() off to
		 * a kernel thread to process.
		 */
		if ((nmp->nm_state & NFSSTA_MOUNTED) &&
		    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
			/* request nfs_bind_resv_thread() to do bind */
			error = nfs_bind_resv_nopriv(nmp);
		} else {
			error = nfs_bind_resv(nmp);
		}
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		struct timeval tv;
		tv.tv_sec = 2;
		tv.tv_usec = 0;
		/* non-blocking connect; completion is polled in 2s slices below */
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && error != EINPROGRESS) {
			goto bad;
		}

		while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
			/* abort the wait if the request has been interrupted */
			if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
				goto bad;
			}
		}
	}

	/*
	 * Always time out on receive, this allows us to reconnect the
	 * socket to deal with network changes.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = 2;
	/* NOTE(review): the error from this setsockopt is overwritten by the
	 * SO_SNDTIMEO call below and neither is checked — presumably
	 * intentional best-effort; confirm before changing. */
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
		timeo.tv_sec = 5;
	} else {
		timeo.tv_sec = 0;	/* hard, non-interruptible mounts: no send timeout */
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));

	/* size socket buffers: room for 3 requests out, readahead+1 replies in */
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else {
		int proto;
		int on = 1;

		sock_gettype(so, NULL, NULL, &proto);
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");

		// Assume that SOCK_STREAM always requires a connection
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

		if (proto == IPPROTO_TCP) {
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
		}

		/* streams also carry a 4-byte RPC record mark per message */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	}

	if (sndreserve > NFS_MAXSOCKBUF)
		sndreserve = NFS_MAXSOCKBUF;
	if (rcvreserve > NFS_MAXSOCKBUF)
		rcvreserve = NFS_MAXSOCKBUF;
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
	if (error) {
		goto bad;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
	if (error) {
		goto bad;
	}

	sock_nointerrupt(so, 1);

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}
490
/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
static int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	/* retry the connect until it succeeds or a fatal condition occurs */
	while ((error = nfs_connect(nmp, rep))) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		if (error == EIO)
			return (EIO);
		/* mark the server as unresponsive and log the condition */
		nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
			"can not connect");
		rep->r_flags |= R_TPRINTFMSG;
		if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
			/* we're not yet completely mounted and */
			/* we can't reconnect, so we fail */
			return (error);
		}
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
			return (error);
		/* pause until the next lbolt tick before retrying */
		tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp)
			rp->r_flags |= R_MUSTRESEND;
	}
	return (0);
}
536
537 /*
538 * NFS disconnect. Clean up and unlink.
539 */
540 void
541 nfs_disconnect(struct nfsmount *nmp)
542 {
543 socket_t so;
544
545 if (nmp->nm_so) {
546 so = nmp->nm_so;
547 nmp->nm_so = 0;
548 sock_shutdown(so, 2);
549 sock_close(so);
550 }
551 }
552
/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 *
 * Consumes (frees) "top" in all cases, either via sock_sendmbuf() or
 * explicitly on the early-error paths.
 */
int
nfs_send(so, nam, top, rep)
	socket_t so;
	mbuf_t nam;
	mbuf_t top;
	struct nfsreq *rep;
{
	struct sockaddr *sendnam;
	int error, error2, sotype, flags;
	u_long xidqueued = 0;		/* xid of rep if it was on the request queue */
	struct nfsreq *rp;
	char savenametolog[MAXPATHLEN];
	struct msghdr msg;

	if (rep) {
		/* bail early if the request has already been interrupted */
		error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
		if (error) {
			mbuf_freem(top);
			return (error);
		}
		/* no socket: mark for resend and report success (caller retries) */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			mbuf_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		/* remember the xid so we can sanity-check the queue after blocking */
		TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
			if (rp == rep)
				break;
		if (rp)
			xidqueued = rp->r_xid;
	}
	sock_gettype(so, NULL, &sotype, NULL);
	/* connected (or stream) sockets need no destination address */
	if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
		(nam == 0))
		sendnam = (struct sockaddr *)0;
	else
		sendnam = mbuf_data(nam);

	if (sotype == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	/*
	 * Save the name here in case mount point goes away if we block.
	 * The name is using local stack and is large, but don't
	 * want to block if we malloc.
	 * NOTE(review): strncpy does not guarantee NUL-termination when the
	 * source is MAXPATHLEN-1 chars or longer — relies on f_mntfromname
	 * being shorter; confirm before reusing this pattern.
	 */
	if (rep)
		strncpy(savenametolog,
			vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
			MAXPATHLEN - 1);
	bzero(&msg, sizeof(msg));
	msg.msg_name = (caddr_t)sendnam;
	msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
	/* sock_sendmbuf consumes "top" regardless of outcome */
	error = sock_sendmbuf(so, &msg, top, flags, NULL);

	if (error) {
		if (rep) {
			if (xidqueued) {
				/* verify the request did not vanish while we blocked */
				TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
					if (rp == rep && rp->r_xid == xidqueued)
						break;
				if (!rp)
					panic("nfs_send: error %d xid %x gone",
						error, xidqueued);
			}
			log(LOG_INFO, "nfs send error %d for server %s\n",
				error, savenametolog);
			/*
			 * Deal with errors for the client side.
			 */
			error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
			if (error2) {
				error = error2;
			} else {
				rep->r_flags |= R_MUSTRESEND;
			}
		} else
			log(LOG_INFO, "nfsd send error %d\n", error);

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 * Anything other than these fatal errors is swallowed so the
		 * caller will retry the request.
		 */
		if (error != EINTR && error != ERESTART && error != EIO &&
			error != EWOULDBLOCK && error != EPIPE) {
			error = 0;
		}
	}
	return (error);
}
658
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 * small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 *
 * On success *mp holds the assembled message; on error *mp is NULL and
 * any partial data has been freed.  May trigger resends and reconnects.
 */
static int
nfs_receive(struct nfsreq *rep, mbuf_t *mp)
{
	socket_t so;
	struct iovec_32 aio;
	mbuf_t m, mlast;
	u_long len, fraglen;
	int error, error2, sotype;
	proc_t p = current_proc();	/* XXX */
	struct msghdr msg;
	size_t rcvlen;
	int lastfragment;

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
			nfs_sndunlock(rep);
			if (error)
				return (error);
			/* reply already arrived while we were locking */
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			/* socket gone — try to reconnect, then start over */
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(rep);
				return (error);
			}
			goto tryagain;
		}
		/* resend the request as long as it is flagged R_MUSTRESEND */
		while (rep->r_flags & R_MUSTRESEND) {
			error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
			if (!error) {
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
				error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			}
			/*
			 * we also hold rcv lock so rep is still
			 * legit this point
			 */
			if (error) {
				if (error == EINTR || error == ERESTART ||
					(error = nfs_reconnect(rep))) {
					nfs_sndunlock(rep);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(rep);
		if (sotype == SOCK_STREAM) {
			error = 0;
			len = 0;
			lastfragment = 0;
			mlast = NULL;
			/* read RPC record fragments until the last-fragment bit */
			while (!error && !lastfragment) {
				/* first read the 4-byte record mark */
				aio.iov_base = (uintptr_t) &fraglen;
				aio.iov_len = sizeof(u_long);
				bzero(&msg, sizeof(msg));
				msg.msg_iov = (struct iovec *) &aio;
				msg.msg_iovlen = 1;
				do {
					error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */
						goto shutout;
					if (error == EWOULDBLOCK && rep) {
						error2 = nfs_sigintr(rep->r_nmp, rep, p);
						if (error2)
							error = error2;
					}
				} while (error == EWOULDBLOCK);
				if (!error && rcvlen < aio.iov_len) {
					/* only log a message if we got a partial word */
					if (rcvlen != 0)
						log(LOG_INFO,
							"short receive (%d/%d) from nfs server %s\n",
							rcvlen, sizeof(u_long),
							vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
				}
				if (error)
					goto errout;
				/* high bit of the record mark flags the final fragment */
				lastfragment = ntohl(fraglen) & 0x80000000;
				fraglen = ntohl(fraglen) & ~0x80000000;
				len += fraglen;
				/*
				 * This is SERIOUS! We are out of sync with the sender
				 * and forcing a disconnect/reconnect is all I can do.
				 */
				if (len > NFS_MAXPACKET) {
					log(LOG_ERR, "%s (%d) from nfs server %s\n",
						"impossible RPC record length", len,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EFBIG;
					goto errout;
				}

				/* now read the fragment payload itself */
				m = NULL;
				do {
					rcvlen = fraglen;
					error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */ {
						goto shutout;
					}
				} while (error == EWOULDBLOCK || error == EINTR ||
					error == ERESTART);

				if (!error && fraglen > rcvlen) {
					log(LOG_INFO,
						"short receive (%d/%d) from nfs server %s\n",
						rcvlen, fraglen,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
					mbuf_freem(m);
				}
				if (!error) {
					/* append this fragment to the assembled chain in *mp */
					if (!*mp) {
						*mp = m;
						mlast = m;
					} else {
						error = mbuf_setnext(mlast, m);
						if (error) {
							printf("nfs_receive: mbuf_setnext failed %d\n", error);
							mbuf_freem(m);
						}
					}
					/* advance mlast to the tail for the next append */
					while (mbuf_next(mlast))
						mlast = mbuf_next(mlast);
				}
			}
		} else {
			/* connected non-stream (e.g. SOCK_SEQPACKET): one message */
			bzero(&msg, sizeof(msg));
			do {
				rcvlen = 100000000;
				error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
				if (!rep->r_nmp) /* if unmounted then bailout */ {
					goto shutout;
				}
				if (error == EWOULDBLOCK && rep) {
					error2 = nfs_sigintr(rep->r_nmp, rep, p);
					if (error2) {
						return (error2);
					}
				}
			} while (error == EWOULDBLOCK);

			if ((msg.msg_flags & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len = rcvlen;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			/* non-fatal stream error: drop partial data and reconnect */
			mbuf_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
					"receive error %d from nfs server %s\n", error,
					vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(rep);
			}
		}
	} else {
		/*
		 * We could have failed while rebinding the datagram socket
		 * so we need to attempt to rebind here.
		 */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				nfs_sndunlock(rep);
			}
			if (error)
				return (error);
			if (!rep->r_nmp) /* if unmounted then bailout */
				return (ENXIO);
			so = rep->r_nmp->nm_so;
		}
		bzero(&msg, sizeof(msg));
		len = 0;
		do {
			rcvlen = 1000000;
			error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
			if (!rep->r_nmp) /* if unmounted then bailout */
				goto shutout;
			if (error) {
				error2 = nfs_sigintr(rep->r_nmp, rep, p);
				if (error2) {
					error = error2;
					goto shutout;
				}
			}
			/* Reconnect for all errors. We may be receiving
			 * soft/hard/blocking errors because of a network
			 * change.
			 * XXX: we should rate limit or delay this
			 * to once every N attempts or something.
			 * although TCP doesn't seem to.
			 */
			if (error) {
				error2 = nfs_sndlock(rep);
				if (!error2) {
					error2 = nfs_reconnect(rep);
					if (error2)
						error = error2;
					else if (!rep->r_nmp) /* if unmounted then bailout */
						error = ENXIO;
					else
						so = rep->r_nmp->nm_so;
					nfs_sndunlock(rep);
				} else {
					error = error2;
				}
			}
		} while (error == EWOULDBLOCK);
	}
shutout:
	/* on any error, make sure no partial message escapes to the caller */
	if (error) {
		mbuf_freem(*mp);
		*mp = NULL;
	}
	return (error);
}
924
925 /*
926 * Implement receipt of reply on a socket.
927 * We must search through the list of received datagrams matching them
928 * with outstanding requests using the xid, until ours is found.
929 */
930 /* ARGSUSED */
931 int
932 nfs_reply(myrep)
933 struct nfsreq *myrep;
934 {
935 struct nfsreq *rep;
936 struct nfsmount *nmp = myrep->r_nmp;
937 long t1;
938 mbuf_t mrep, md;
939 u_long rxid, *tl;
940 caddr_t dpos, cp2;
941 int error;
942
943 /*
944 * Loop around until we get our own reply
945 */
946 for (;;) {
947 /*
948 * Lock against other receivers so that I don't get stuck in
949 * sbwait() after someone else has received my reply for me.
950 * Also necessary for connection based protocols to avoid
951 * race conditions during a reconnect.
952 * If nfs_rcvlock() returns EALREADY, that means that
953 * the reply has already been recieved by another
954 * process and we can return immediately. In this
955 * case, the lock is not taken to avoid races with
956 * other processes.
957 */
958 error = nfs_rcvlock(myrep);
959 if (error == EALREADY)
960 return (0);
961 if (error)
962 return (error);
963
964 /*
965 * If we slept after putting bits otw, then reply may have
966 * arrived. In which case returning is required, or we
967 * would hang trying to nfs_receive an already received reply.
968 */
969 if (myrep->r_mrep != NULL) {
970 nfs_rcvunlock(myrep);
971 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
972 return (0);
973 }
974 /*
975 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
976 * is still intact by checks done in nfs_rcvlock.
977 */
978 error = nfs_receive(myrep, &mrep);
979 /*
980 * Bailout asap if nfsmount struct gone (unmounted).
981 */
982 if (!myrep->r_nmp) {
983 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
984 if (mrep)
985 mbuf_freem(mrep);
986 return (ENXIO);
987 }
988 if (error) {
989 FSDBG(530, myrep->r_xid, myrep, nmp, error);
990 nfs_rcvunlock(myrep);
991
992 /* Bailout asap if nfsmount struct gone (unmounted). */
993 if (!myrep->r_nmp) {
994 if (mrep)
995 mbuf_freem(mrep);
996 return (ENXIO);
997 }
998
999 /*
1000 * Ignore routing errors on connectionless protocols??
1001 */
1002 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1003 if (nmp->nm_so) {
1004 int clearerror;
1005 int optlen = sizeof(clearerror);
1006 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1007 }
1008 continue;
1009 }
1010 if (mrep)
1011 mbuf_freem(mrep);
1012 return (error);
1013 }
1014
1015 /*
1016 * We assume all is fine, but if we did not have an error
1017 * and mrep is 0, better not dereference it. nfs_receive
1018 * calls soreceive which carefully sets error=0 when it got
1019 * errors on sbwait (tsleep). In most cases, I assume that's
1020 * so we could go back again. In tcp case, EPIPE is returned.
1021 * In udp, case nfs_receive gets back here with no error and no
1022 * mrep. Is the right fix to have soreceive check for process
1023 * aborted after sbwait and return something non-zero? Should
1024 * nfs_receive give an EPIPE? Too risky to play with those
1025 * two this late in game for a shutdown problem. Instead,
1026 * just check here and get out. (ekn)
1027 */
1028 if (!mrep) {
1029 nfs_rcvunlock(myrep);
1030 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1031 return (ENXIO); /* sounds good */
1032 }
1033
1034 /*
1035 * Get the xid and check that it is an rpc reply
1036 */
1037 md = mrep;
1038 dpos = mbuf_data(md);
1039 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1040 rxid = *tl++;
1041 if (*tl != rpc_reply) {
1042 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1043 mbuf_freem(mrep);
1044 nfsmout:
1045 if (nmp->nm_state & NFSSTA_RCVLOCK)
1046 nfs_rcvunlock(myrep);
1047 continue;
1048 }
1049
1050 /*
1051 * Loop through the request list to match up the reply
1052 * Iff no match, just drop the datagram
1053 */
1054 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1055 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1056 /* Found it.. */
1057 rep->r_mrep = mrep;
1058 rep->r_md = md;
1059 rep->r_dpos = dpos;
1060 /*
1061 * If we're tracking the round trip time
1062 * then we update the circular log here
1063 * with the stats from our current request.
1064 */
1065 if (nfsrtton) {
1066 struct rttl *rt;
1067
1068 rt = &nfsrtt.rttl[nfsrtt.pos];
1069 rt->proc = rep->r_procnum;
1070 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1071 rt->sent = nmp->nm_sent;
1072 rt->cwnd = nmp->nm_cwnd;
1073 if (proct[rep->r_procnum] == 0)
1074 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1075 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1076 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1077 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1078 microtime(&rt->tstamp); // XXX unused
1079 if (rep->r_flags & R_TIMING)
1080 rt->rtt = rep->r_rtt;
1081 else
1082 rt->rtt = 1000000;
1083 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1084 }
1085 /*
1086 * Update congestion window.
1087 * Do the additive increase of
1088 * one rpc/rtt.
1089 */
1090 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1091 nmp->nm_cwnd);
1092 if (nmp->nm_cwnd <= nmp->nm_sent) {
1093 nmp->nm_cwnd +=
1094 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1095 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1096 if (nmp->nm_cwnd > NFS_MAXCWND)
1097 nmp->nm_cwnd = NFS_MAXCWND;
1098 }
1099 if (rep->r_flags & R_SENT) {
1100 rep->r_flags &= ~R_SENT;
1101 nmp->nm_sent -= NFS_CWNDSCALE;
1102 }
1103 /*
1104 * Update rtt using a gain of 0.125 on the mean
1105 * and a gain of 0.25 on the deviation.
1106 */
1107 if (rep->r_flags & R_TIMING) {
1108 /*
1109 * Since the timer resolution of
1110 * NFS_HZ is so course, it can often
1111 * result in r_rtt == 0. Since
1112 * r_rtt == N means that the actual
1113 * rtt is between N+dt and N+2-dt ticks,
1114 * add 1.
1115 */
1116 if (proct[rep->r_procnum] == 0)
1117 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1118 t1 = rep->r_rtt + 1;
1119 t1 -= (NFS_SRTT(rep) >> 3);
1120 NFS_SRTT(rep) += t1;
1121 if (t1 < 0)
1122 t1 = -t1;
1123 t1 -= (NFS_SDRTT(rep) >> 2);
1124 NFS_SDRTT(rep) += t1;
1125 }
1126 nmp->nm_timeouts = 0;
1127 break;
1128 }
1129 }
1130 nfs_rcvunlock(myrep);
1131 /*
1132 * If not matched to a request, drop it.
1133 * If it's mine, get out.
1134 */
1135 if (rep == 0) {
1136 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1137 mbuf_freem(mrep);
1138 } else if (rep == myrep) {
1139 if (rep->r_mrep == NULL)
1140 panic("nfs_reply: nil r_mrep");
1141 return (0);
1142 }
1143 FSDBG(530, myrep->r_xid, myrep, rep,
1144 rep ? rep->r_xid : myrep->r_flags);
1145 }
1146 }
1147
1148 /*
1149 * nfs_request - goes something like this
1150 * - fill in request struct
1151 * - links it into list
1152 * - calls nfs_send() for first transmit
1153 * - calls nfs_receive() to get reply
1154 * - break down rpc header and return with nfs reply pointed to
1155 * by mrep or error
1156 * nb: always frees up mreq mbuf list
1157 */
/*
 * nfs_request:
 *   Build an RPC request from 'mrest', transmit it, wait for the matching
 *   reply, and parse the RPC-level header of that reply.
 *
 *   vp/mp     - vnode or mount identifying the target nfsmount
 *   mrest     - mbuf chain holding the NFS procedure arguments
 *               (always consumed: freed on every error path, or linked
 *               into the request that is freed before return)
 *   procnum   - NFS procedure number
 *   procp/cred- calling process and credential
 *   mrp/mdp/dposp - on success, the reply mbuf chain and parse position
 *   xidp      - if non-NULL, receives the 64-bit xid (wrap count in
 *               the high 32 bits)
 *
 *   Returns 0 on success, an errno on failure.  NFSERR_RETERR may be
 *   OR'd into the error for NFSv3 so the caller can still parse the
 *   reply (post-op attributes).
 *
 *   NOTE: the nfsm_dissect/nfsm_adv macros below can branch to the
 *   'nfsmout' label on a malformed reply.
 */
int
nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
	vnode_t vp;
	mount_t mp;
	mbuf_t mrest;
	int procnum;
	proc_t procp;
	kauth_cred_t cred;
	mbuf_t *mrp;
	mbuf_t *mdp;
	caddr_t *dposp;
	u_int64_t *xidp;
{
	mbuf_t m, mrep, m2;
	struct nfsreq re, *rep;		/* request record lives on our stack */
	u_long *tl;
	int i;
	struct nfsmount *nmp;
	mbuf_t md, mheadend;
	char nickv[RPCX_NICKVERF];
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
	int verf_len, verf_type;
	u_long xid;
	char *auth_str, *verf_str;
	NFSKERBKEY_T key;		/* save session key */
	int nmsotype;
	struct timeval now;

	if (mrp)
		*mrp = NULL;
	if (xidp)
		*xidp = 0;
	nmp = VFSTONFS(mp);

	rep = &re;

	if (vp)
		nmp = VFSTONFS(vnode_mount(vp));
	/* bail if unmounted, or force-unmounting and already timed out */
	if (nmp == NULL ||
	    (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		mbuf_freem(mrest);
		return (ENXIO);
	}
	nmsotype = nmp->nm_sotype;

	FSDBG_TOP(531, vp, procnum, nmp, rep);

	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;
	microuptime(&now);
	/* backdate r_lastmsg so the first "not responding" message isn't delayed */
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	/* total up the length of the argument mbuf chain */
	i = 0;
	m = mrest;
	while (m) {
		i += mbuf_len(m);
		m = mbuf_next(m);
	}
	mrest_len = i;

	/*
	 * Get the RPC header with authorization.
	 */
kerbauth:
	/* re-fetch nmp: the mount may have gone away while we slept/authed */
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (!nmp) {
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		mbuf_freem(mrest);
		return (ENXIO);
	}
	verf_str = auth_str = (char *)0;
	if (nmp->nm_flag & NFSMNT_KERB) {
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)key, sizeof (key));
		/* fall back to full auth if the nickname auth failed (or previously failed) */
		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
			&auth_len, verf_str, verf_len)) {
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!nmp) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (ENXIO);
			}
			error = nfs_getauth(nmp, rep, cred, &auth_str,
			    &auth_len, verf_str, &verf_len, key);
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!error && !nmp)
				error = ENXIO;
			if (error) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (error);
			}
		}
	} else {
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		/* AUTH_UNIX length: uid/gid plus up to nm_numgrps supplementary groups */
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
	if (auth_str)
		_FREE(auth_str, M_TEMP);
	if (error) {
		mbuf_freem(mrest);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}
	if (xidp)
		*xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmsotype == SOCK_STREAM) {
		error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
		if (error) {
			mbuf_freem(m);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}
		/* high bit = last fragment, low 31 bits = fragment length */
		*((u_long*)mbuf_data(m)) =
			htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	/* only time procedures that have an RTT class (proct entry) */
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
		   (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		   nmp->nm_sent < nmp->nm_cwnd)) {
		int connrequired = (nmp->nm_sotype == SOCK_STREAM);

		if (connrequired)
			error = nfs_sndlock(rep);

		/*
		 * Set the R_SENT before doing the send in case another thread
		 * processes the reply before the nfs_send returns here
		 */
		if (!error) {
			if ((rep->r_flags & R_MUSTRESEND) == 0) {
				FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
				      nmp->nm_cwnd);
				nmp->nm_sent += NFS_CWNDSCALE;
				rep->r_flags |= R_SENT;
			}

			/* send a copy so the original is kept for retransmits */
			error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
			if (!error)
				error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (connrequired)
				nfs_sndunlock(rep);
		}
		nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
		if (error) {
			/* send failed: give back the congestion-window slot */
			if (nmp)
				nmp->nm_sent -= NFS_CWNDSCALE;
			rep->r_flags &= ~R_SENT;
		}
	} else {
		rep->r_rtt = -1;	/* not sent yet; timer will send it */
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	nfs_repdequeue(rep);

	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		if (nmp) {
			FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
			nmp->nm_sent -= NFS_CWNDSCALE;
		}
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error)
		nfs_up(nmp, procp, NFSSTA_TIMEO,
		    (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (!error && !nmp)
		error = ENXIO;
	if (error) {
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
			/* one retry with fresh full Kerberos credentials */
			if (!failed_auth) {
				failed_auth++;
				/* detach the old header from the args before rebuilding */
				error = mbuf_setnext(mheadend, NULL);
				mbuf_freem(mrep);
				mbuf_freem(rep->r_mreq);
				if (!error)
					goto kerbauth;
				printf("nfs_request: mbuf_setnext failed\n");
			} else
				error = EAUTH;
		} else
			error = EACCES;
		mbuf_freem(mrep);
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * Grab any Kerberos verifier, otherwise just throw it away.
	 */
	verf_type = fxdr_unsigned(int, *tl++);
	i = fxdr_unsigned(int, *tl);
	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
		if (error)
			goto nfsmout;
	} else if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* skip verifier body */
	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			/* NFSv3 JUKEBOX-style "try later": back off and resend */
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				mbuf_freem(mrep);
				error = 0;
				microuptime(&now);
				waituntil = now.tv_sec + trylater_delay;
				while (now.tv_sec < waituntil) {
					tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
					microuptime(&now);
				}
				/* exponential backoff, capped at 60 seconds */
				trylater_delay *= 2;
				if (trylater_delay > 60)
					trylater_delay = 60;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if ((error == ESTALE) && vp)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				/* v3: hand back the reply so caller can parse post-op attrs */
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else {
				mbuf_freem(mrep);
				error &= ~NFSERR_RETERR;
			}
			mbuf_freem(rep->r_mreq);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}

		/* success: return the reply positioned just past the RPC header */
		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
		return (0);
	}
	mbuf_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	mbuf_freem(rep->r_mreq);
	FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
	return (error);
}
1493
1494 #ifndef NFS_NOSERVER
1495 /*
1496 * Generate the rpc reply header
1497 * siz arg. is used to decide if adding a cluster is worthwhile
1498 */
/*
 * nfs_rephead:
 *   Build the RPC reply header for an NFS server response.
 *
 *   siz   - expected size of the reply body; used only to decide whether
 *           to allocate a cluster mbuf up front
 *   nd    - request descriptor (supplies xid, flags, credentials)
 *   slp   - server socket (its nuid hash is consulted for Kerberos)
 *   err   - RPC/NFS-level error to encode (0, NFSERR_RETVOID, an
 *           NFSERR_AUTHERR-tagged auth error, ERPCMISMATCH, etc.)
 *   mrq   - if non-NULL, receives the head of the reply chain
 *   mbp   - receives the current mbuf for subsequent nfsm_build calls
 *   bposp - receives the current build position within *mbp
 *
 *   Returns 0 on success or an errno if mbuf allocation fails.
 *   NOTE: nfsm_build below may branch via the usual nfsm error macros.
 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
	int siz;
	struct nfsrv_descript *nd;
	struct nfssvc_sock *slp;
	int err;
	mbuf_t *mrq;
	mbuf_t *mbp;
	caddr_t *bposp;
{
	u_long *tl;
	mbuf_t mreq;
	caddr_t bpos;
	mbuf_t mb, mb2;		/* mb2 is used by the nfsm_build macro */
	int error, mlen;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mreq);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX nfsstat? */
		return (error);
	}
	mb = mreq;
	tl = mbuf_data(mreq);
	mlen = 6 * NFSX_UNSIGNED;	/* xid, reply, stat, 2 verf words, result word */
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl += 80/sizeof(*tl);	/* XXX max_hdr? XXX */
		mbuf_setdata(mreq, tl, mlen);
	} else {
		mbuf_setlen(mreq, mlen);
	}
	bpos = ((caddr_t)tl) + mlen;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			/* auth-error reply is one word shorter than the default layout */
			mlen -= NFSX_UNSIGNED;
			mbuf_setlen(mreq, mlen);
			bpos -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			/* lowest and highest supported RPC version (both VER2) */
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/*
		 * For Kerberos authentication, we must send the nickname
		 * verifier back, otherwise just RPCAUTH_NULL.
		 */
		if (nd->nd_flag & ND_KERBFULL) {
			struct nfsuid *nuidp;
			struct timeval ktvin, ktvout;
			uid_t uid = kauth_cred_getuid(nd->nd_cr);

			/* look up the cached session entry for this uid/address */
			lck_rw_lock_shared(&slp->ns_rwlock);
			for (nuidp = NUIDHASH(slp, uid)->lh_first;
			    nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
				    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
				    &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (nuidp) {
				ktvin.tv_sec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
				ktvin.tv_usec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_usec);

				/*
				 * Encrypt the timestamp in ecb mode using the
				 * session key.
				 */
#if NFSKERB
				XXX
#endif

				/*
				 * NOTE(review): ktvout is only filled in by the
				 * (disabled) NFSKERB encryption block above, and
				 * tv_sec is written before nfsm_build advances tl;
				 * this ordering looks suspect — confirm against a
				 * build with NFSKERB enabled.
				 */
				*tl++ = rpc_auth_kerb;
				*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
				*tl = ktvout.tv_sec;
				nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
				*tl++ = ktvout.tv_usec;
				*tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
			} else {
				/* no session entry: RPCAUTH_NULL verifier */
				*tl++ = 0;
				*tl++ = 0;
			}
			lck_rw_done(&slp->ns_rwlock);
		} else {
			/* RPCAUTH_NULL verifier: flavor 0, length 0 */
			*tl++ = 0;
			*tl++ = 0;
		}
		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
			// XXX hard coded versions
			*tl++ = txdr_unsigned(2);
			*tl = txdr_unsigned(3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;	/* RPC_SUCCESS */
			if (err != NFSERR_RETVOID) {
				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	if (mrq != NULL)
		*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != NFSERR_RETVOID) {
		OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
	}
	return (0);
}
1644
1645
1646 #endif /* NFS_NOSERVER */
1647
1648
1649 /*
1650 * From FreeBSD 1.58, a Matt Dillon fix...
1651 * Flag a request as being about to terminate.
1652 * The nm_sent count is decremented now to avoid deadlocks when the process
1653 * in soreceive() hasn't yet managed to send its own request.
1654 */
1655 static void
1656 nfs_softterm(struct nfsreq *rep)
1657 {
1658
1659 rep->r_flags |= R_SOFTTERM;
1660 if (rep->r_flags & R_SENT) {
1661 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1662 rep->r_nmp->nm_cwnd);
1663 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1664 rep->r_flags &= ~R_SENT;
1665 }
1666 }
1667
1668 void
1669 nfs_timer_funnel(void * arg)
1670 {
1671 (void) thread_funnel_set(kernel_flock, TRUE);
1672 nfs_timer(arg);
1673 (void) thread_funnel_set(kernel_flock, FALSE);
1674
1675 }
1676
1677 /*
1678 * Ensure rep isn't in use by the timer, then dequeue it.
1679 */
1680 static void
1681 nfs_repdequeue(struct nfsreq *rep)
1682 {
1683
1684 while ((rep->r_flags & R_BUSY)) {
1685 rep->r_flags |= R_WAITING;
1686 tsleep(rep, PSOCK, "repdeq", 0);
1687 }
1688 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1689 }
1690
1691 /*
1692 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1693 * free()'d out from under it.
1694 */
1695 static void
1696 nfs_repbusy(struct nfsreq *rep)
1697 {
1698
1699 if ((rep->r_flags & R_BUSY))
1700 panic("rep locked");
1701 rep->r_flags |= R_BUSY;
1702 }
1703
1704 /*
1705 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1706 */
1707 static struct nfsreq *
1708 nfs_repnext(struct nfsreq *rep)
1709 {
1710 struct nfsreq * nextrep;
1711
1712 if (rep == NULL)
1713 return (NULL);
1714 /*
1715 * We need to get and busy the next req before signalling the
1716 * current one, otherwise wakeup() may block us and we'll race to
1717 * grab the next req.
1718 */
1719 nextrep = TAILQ_NEXT(rep, r_chain);
1720 if (nextrep != NULL)
1721 nfs_repbusy(nextrep);
1722 /* unbusy and signal. */
1723 rep->r_flags &= ~R_BUSY;
1724 if ((rep->r_flags & R_WAITING)) {
1725 rep->r_flags &= ~R_WAITING;
1726 wakeup(rep);
1727 }
1728 return (nextrep);
1729 }
1730
1731 /*
1732 * Nfs timer routine
1733 * Scan the nfsreq list and retransmit any requests that have timed out
1734 * To avoid retransmission attempts on STREAM sockets (in the future) make
1735 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1736 */
/*
 * nfs_timer:
 *   Periodic housekeeping, rescheduled every nfs_ticks via
 *   nfs_timer_funnel().  Walks the outstanding request queue and
 *   retransmits UDP requests that have timed out (with exponential
 *   backoff and congestion-window halving), emits "not responding"
 *   messages, soft-terminates requests that exceeded their retry
 *   budget, wakes nfsds for gathered writes, reaps dead server
 *   sockets, and periodically frees stale nfs bufs.
 */
void
nfs_timer(__unused void *arg)
{
	struct nfsreq *rep;
	mbuf_t m;
	socket_t so;
	struct nfsmount *nmp;
	int timeo;
	int error;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */
	int flags, rexmit, cwnd, sent;
	u_long xid;
	struct timeval now;

	/* walk the queue using the busy/next protocol so entries can't be freed under us */
	rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);
	microuptime(&now);
	for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
		nmp = rep->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		/* skip requests that already have a reply or are terminating */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp))
			continue;
		/* rate-limited "server not responding" console/tprintf message */
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
				"not responding");
			rep->r_flags |= R_TPRINTFMSG;
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(rep);
				continue;
			}
		}
		if (rep->r_rtt >= 0) {
			/* request is in flight: advance its timer and check for timeout */
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			/* ensure 62.5 ms floor */
			while (16 * timeo < hz)
				timeo *= 2;
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < 8)
				nmp->nm_timeouts++;
		}
		/*
		 * Check for too many retransmits. This is never true for
		 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
		 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
		 */
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
			nfs_softterm(rep);
			continue;
		}
		/* stream sockets: count the timeout but let reconnect logic resend */
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}
		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		   (rep->r_flags & R_SENT) ||
		   nmp->nm_sent < nmp->nm_cwnd) &&
		   (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
			struct msghdr msg;
			/*
			 * Iff first send, start timing
			 * else turn timing off, backoff timer
			 * and divide congestion window by 2.
			 *
			 * We update these *before* the send to avoid
			 * racing against receiving the reply.
			 * We save them so we can restore them on send error.
			 */
			flags = rep->r_flags;
			rexmit = rep->r_rexmit;
			cwnd = nmp->nm_cwnd;
			sent = nmp->nm_sent;
			xid = rep->r_xid;
			if (rep->r_flags & R_SENT) {
				rep->r_flags &= ~R_TIMING;
				if (++rep->r_rexmit > NFS_MAXREXMIT)
					rep->r_rexmit = NFS_MAXREXMIT;
				nmp->nm_cwnd >>= 1;
				if (nmp->nm_cwnd < NFS_CWNDSCALE)
					nmp->nm_cwnd = NFS_CWNDSCALE;
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
			} else {
				rep->r_flags |= R_SENT;
				nmp->nm_sent += NFS_CWNDSCALE;
			}
			FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);

			bzero(&msg, sizeof(msg));
			/* unconnected socket: must supply the destination address */
			if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
				msg.msg_name = mbuf_data(nmp->nm_nam);
				msg.msg_namelen = mbuf_len(nmp->nm_nam);
			}
			error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);

			FSDBG(535, xid, error, sent, cwnd);

			if (error) {
				if (error == EWOULDBLOCK) {
					/* transient: restore pre-send state, try again later */
					rep->r_flags = flags;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					rep->r_xid = xid;
				}
				else {
					/* clear any lingering socket error, mark resend failure */
					if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
						int clearerror;
						int optlen = sizeof(clearerror);
						sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
					}
					rep->r_flags = flags | R_RESENDERR;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					if (flags & R_SENT)
						OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
				}
			} else
				rep->r_rtt = 0;	/* sent OK: restart the round-trip timer */
		}
	}
	microuptime(&now);
#ifndef NFS_NOSERVER
	/*
	 * Scan the write gathering queues for writes that need to be
	 * completed now.
	 */
	cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
	lck_mtx_lock(nfsd_mutex);
	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
	    if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
		nfsrv_wakenfsd(slp);
	}
	/* reap server sockets that have been dead for at least 5 seconds */
	while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
		if ((slp->ns_timestamp + 5) > now.tv_sec)
			break;
		TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
		nfsrv_slpfree(slp);
	}
	lck_mtx_unlock(nfsd_mutex);
#endif /* NFS_NOSERVER */

	if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
		/*
		 * We haven't called nfs_buf_freeup() in a little while.
		 * So, see if we can free up any stale/unused bufs now.
		 */
		nfs_buf_freeup(1);
	}

	/* reschedule ourselves */
	timeout(nfs_timer_funnel, (void *)0, nfs_ticks);

}
1919
1920
1921 /*
1922 * Test for a termination condition pending on the process.
1923 * This is used to determine if we need to bail on a mount.
1924 * EIO is returned if there has been a soft timeout.
1925 * EINTR is returned if there is a signal pending that is not being ignored
1926 * and the mount is interruptable, or if we are a thread that is in the process
1927 * of cancellation (also SIGKILL posted).
1928 */
1929 int
1930 nfs_sigintr(nmp, rep, p)
1931 struct nfsmount *nmp;
1932 struct nfsreq *rep;
1933 proc_t p;
1934 {
1935 sigset_t pending_sigs;
1936 int context_good = 0;
1937 struct nfsmount *repnmp;
1938 extern proc_t kernproc;
1939
1940 if (nmp == NULL)
1941 return (ENXIO);
1942 if (rep != NULL) {
1943 repnmp = rep->r_nmp;
1944 /* we've had a forced unmount. */
1945 if (repnmp == NULL)
1946 return (ENXIO);
1947 /* request has timed out on a 'soft' mount. */
1948 if (rep->r_flags & R_SOFTTERM)
1949 return (EIO);
1950 /*
1951 * We're in the progress of a force unmount and there's
1952 * been a timeout we're dead and fail IO.
1953 */
1954 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1955 (NFSSTA_FORCE|NFSSTA_TIMEO))
1956 return (EIO);
1957 /* Someone is unmounting us, go soft and mark it. */
1958 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1959 repnmp->nm_flag |= NFSMNT_SOFT;
1960 nmp->nm_state |= NFSSTA_FORCE;
1961 }
1962 /*
1963 * If the mount is hung and we've requested not to hang
1964 * on remote filesystems, then bail now.
1965 */
1966 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1967 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1968 return (EIO);
1969 }
1970 /* XXX: is this valid? this probably should be an assertion. */
1971 if (p == NULL)
1972 return (0);
1973
1974 /* Is this thread belongs to kernel task; then abort check is not needed */
1975 if ((current_proc() != kernproc) && current_thread_aborted()) {
1976 return (EINTR);
1977 }
1978 /* mask off thread and process blocked signals. */
1979
1980 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1981 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1982 return (EINTR);
1983 return (0);
1984 }
1985
1986 /*
1987 * Lock a socket against others.
1988 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1989 * and also to avoid race conditions between the processes with nfs requests
1990 * in progress when a reconnect is necessary.
1991 */
/*
 * nfs_sndlock:
 *   Acquire the per-mount send lock (NFSSTA_SNDLOCK), sleeping while
 *   another thread holds it.  Each wakeup re-checks for pending signals
 *   (nfs_sigintr) and for the mount disappearing under us.
 *
 *   Returns 0 with the lock held, or an errno (EINTR/EIO/ENXIO) without it.
 */
int
nfs_sndlock(rep)
	struct nfsreq *rep;
{
	int *statep;
	proc_t p;
	int error, slpflag = 0, slptimeo = 0;

	if (rep->r_nmp == NULL)
		return (ENXIO);
	statep = &rep->r_nmp->nm_state;

	p = rep->r_procp;
	/* interruptible mounts catch signals while sleeping */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	while (*statep & NFSSTA_SNDLOCK) {
		error = nfs_sigintr(rep->r_nmp, rep, p);
		if (error)
			return (error);
		*statep |= NFSSTA_WANTSND;
		/* poll (timeout) if noremotehang so we keep re-running nfs_sigintr */
		if (p != NULL && (proc_noremotehang(p)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		/* after one interruptible sleep, fall back to a plain timed sleep */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and callers expect it intact.
		 */
		if (!rep->r_nmp)
			return (ENXIO); /* don't have lock until out of loop */
	}
	*statep |= NFSSTA_SNDLOCK;
	return (0);
}
2029
2030 /*
2031 * Unlock the stream socket for others.
2032 */
2033 void
2034 nfs_sndunlock(rep)
2035 struct nfsreq *rep;
2036 {
2037 int *statep;
2038
2039 if (rep->r_nmp == NULL)
2040 return;
2041 statep = &rep->r_nmp->nm_state;
2042 if ((*statep & NFSSTA_SNDLOCK) == 0)
2043 panic("nfs sndunlock");
2044 *statep &= ~NFSSTA_SNDLOCK;
2045 if (*statep & NFSSTA_WANTSND) {
2046 *statep &= ~NFSSTA_WANTSND;
2047 wakeup((caddr_t)statep);
2048 }
2049 }
2050
/*
 * nfs_rcvlock:
 *   Acquire the per-mount receive lock (NFSSTA_RCVLOCK), sleeping while
 *   another thread holds it.  Returns EALREADY if the reply for this
 *   request arrives while we wait (no lock needed then), an errno from
 *   nfs_sigintr, ENXIO if the mount vanishes, or 0 with the lock held.
 */
static int
nfs_rcvlock(struct nfsreq *rep)
{
	int *statep;
	int error, slpflag, slptimeo = 0;

	/* make sure we still have our mountpoint */
	if (!rep->r_nmp) {
		if (rep->r_mrep != NULL)
			return (EALREADY);
		return (ENXIO);
	}

	statep = &rep->r_nmp->nm_state;
	FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
	/* interruptible mounts catch signals while sleeping */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;
	while (*statep & NFSSTA_RCVLOCK) {
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
			return (error);
		} else if (rep->r_mrep != NULL) {
			/*
			 * Don't bother sleeping if reply already arrived
			 */
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
			return (EALREADY);
		}
		FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
		*statep |= NFSSTA_WANTRCV;
		/*
		 * We need to poll if we're P_NOREMOTEHANG so that we
		 * call nfs_sigintr periodically above.
		 */
		if (rep->r_procp != NULL &&
		    (proc_noremotehang(rep->r_procp)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		/* after one interruptible sleep, fall back to a plain timed sleep */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and caller nfs_reply expect it intact.
		 */
		if (!rep->r_nmp) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
			return (ENXIO); /* don't have lock until out of loop */
		}
	}
	/*
	 * nfs_reply will handle it if reply already arrived.
	 * (We may have slept or been preempted).
	 */
	FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
	*statep |= NFSSTA_RCVLOCK;
	return (0);
}
2112
2113 /*
2114 * Unlock the stream socket for others.
2115 */
2116 static void
2117 nfs_rcvunlock(struct nfsreq *rep)
2118 {
2119 int *statep;
2120
2121 if (rep->r_nmp == NULL)
2122 return;
2123 statep = &rep->r_nmp->nm_state;
2124
2125 FSDBG(533, statep, *statep, 0, 0);
2126 if ((*statep & NFSSTA_RCVLOCK) == 0)
2127 panic("nfs rcvunlock");
2128 *statep &= ~NFSSTA_RCVLOCK;
2129 if (*statep & NFSSTA_WANTRCV) {
2130 *statep &= ~NFSSTA_WANTRCV;
2131 wakeup((caddr_t)statep);
2132 }
2133 }
2134
2135
2136 #ifndef NFS_NOSERVER
2137 /*
2138 * Socket upcall routine for the nfsd sockets.
2139 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2140 * Essentially do as much as possible non-blocking, else punt and it will
2141 * be called with MBUF_WAITOK from an nfsd.
2142 */
2143 void
2144 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2145 {
2146 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2147
2148 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2149 return;
2150
2151 lck_rw_lock_exclusive(&slp->ns_rwlock);
2152 nfsrv_rcv_locked(so, slp, waitflag);
2153 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2154 }
/*
 * Pull incoming data off an nfsd server socket (ns_rwlock held exclusive).
 *
 * Stream sockets: append received mbufs to the raw data chain (ns_raw /
 * ns_rawend / ns_cc) and try to parse complete RPC records out of it via
 * nfsrv_getstream().  Datagram sockets: each receive is a complete record;
 * prepend the sender's address as an MBUF_TYPE_SONAME mbuf and queue it on
 * the record list (ns_rec / ns_recend).
 *
 * Locking: when waitflag == MBUF_DONTWAIT (socket upcall path) this routine
 * DROPS ns_rwlock before returning and may wake an nfsd; otherwise the
 * caller retains the lock.
 */
void
nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
{
	mbuf_t m, mp, mhck, m2;
	int ns_flag=0, error;
	struct msghdr msg;
	size_t bytes_read;

	/* Socket was invalidated; just honor the lock-drop contract and leave. */
	if ((slp->ns_flag & SLP_VALID) == 0) {
		if (waitflag == MBUF_DONTWAIT)
			lck_rw_done(&slp->ns_rwlock);
		return;
	}

#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == MBUF_DONTWAIT) {
		ns_flag = SLP_NEEDQ;
		goto dorecs;
	}
#endif
	if (slp->ns_sotype == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
			ns_flag = SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 */
		/* large cap: take everything currently buffered on the socket */
		bytes_read = 1000000000;
		error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
		if (error || mp == NULL) {
			/* EWOULDBLOCK = retry from an nfsd; anything else = peer gone */
			if (error == EWOULDBLOCK)
				ns_flag = SLP_NEEDQ;
			else
				ns_flag = SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		/* Append new data to the raw chain and update the byte count. */
		if (slp->ns_rawend) {
			if ((error = mbuf_setnext(slp->ns_rawend, m)))
				panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
			slp->ns_cc += bytes_read;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = bytes_read;
		}
		/* Walk to the last mbuf so ns_rawend stays the chain tail. */
		while ((m2 = mbuf_next(m)))
			m = m2;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			/* EPERM = bogus record mark, drop connection; else requeue */
			if (error == EPERM)
				ns_flag = SLP_DISCONN;
			else
				ns_flag = SLP_NEEDQ;
		}
	} else {
		/* Datagram socket: drain all pending packets in a loop. */
		struct sockaddr_storage nam;

		bzero(&msg, sizeof(msg));
		msg.msg_name = (caddr_t)&nam;
		msg.msg_namelen = sizeof(nam);

		do {
			bytes_read = 1000000000;
			error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
			if (mp) {
				/*
				 * Copy the sender's address into an MBUF_TYPE_SONAME
				 * mbuf and chain it in front of the data so the nfsd
				 * knows where to send the reply.
				 */
				if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
					mbuf_setlen(mhck, nam.ss_len);
					bcopy(&nam, mbuf_data(mhck), nam.ss_len);
					m = mhck;
					if (mbuf_setnext(m, mp)) {
						/* trouble... just drop it */
						printf("nfsrv_rcv: mbuf_setnext failed\n");
						mbuf_free(mhck);
						m = mp;
					}
				} else {
					m = mp;
				}
				/* Queue the record (one packet per record) at the tail. */
				if (slp->ns_recend)
					mbuf_setnextpkt(slp->ns_recend, m);
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				mbuf_setnextpkt(m, NULL);
			}
#if 0
			if (error) {
				/*
				 * This may be needed in the future to support
				 * non-byte-stream connection-oriented protocols
				 * such as SCTP.
				 */
				/*
				 * This (slp->ns_sotype == SOCK_STREAM) should really
				 * be a check for PR_CONNREQUIRED.
				 */
				if ((slp->ns_sotype == SOCK_STREAM)
					&& error != EWOULDBLOCK) {
					ns_flag = SLP_DISCONN;
					goto dorecs;
				}
			}
#endif
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (ns_flag)
		slp->ns_flag |= ns_flag;
	if (waitflag == MBUF_DONTWAIT) {
		/* Decide whether to wake an nfsd before releasing the lock. */
		int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
		lck_rw_done(&slp->ns_rwlock);
		if (wake && nfs_numnfsd) {
			lck_mtx_lock(nfsd_mutex);
			nfsrv_wakenfsd(slp);
			lck_mtx_unlock(nfsd_mutex);
		}
	}
}
2292
2293 /*
2294 * Try and extract an RPC request from the mbuf data list received on a
2295 * stream socket. The "waitflag" argument indicates whether or not it
2296 * can sleep.
2297 */
/*
 * Parse RPC records out of the raw stream data (slp->ns_raw / ns_cc)
 * accumulated from a stream socket.  Each record is preceded by a 4-byte
 * RPC record mark: high bit = last-fragment flag, low 31 bits = fragment
 * length.  Completed fragments collect on slp->ns_frag; when the last
 * fragment arrives the whole record moves to the ns_rec packet queue.
 *
 * Returns 0 on success/no-more-data, EPERM for an invalid record length
 * (caller disconnects), or EWOULDBLOCK when an mbuf operation could not
 * complete (caller requeues).  SLP_GETSTREAM serializes re-entry.
 */
static int
nfsrv_getstream(slp, waitflag)
	struct nfssvc_sock *slp;
	int waitflag;
{
	mbuf_t m;
	char *cp1, *cp2, *mdata;
	int len, mlen, error;
	mbuf_t om, m2, recm;
	u_long recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		/* ns_reclen == 0 means we still need to read a record mark. */
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				/* Not even a full record mark buffered yet. */
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			mdata = mbuf_data(m);
			mlen = mbuf_len(m);
			if (mlen >= NFSX_UNSIGNED) {
				/* Record mark is contiguous in the first mbuf. */
				bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
				mdata += NFSX_UNSIGNED;
				mlen -= NFSX_UNSIGNED;
				mbuf_setdata(m, mdata, mlen);
			} else {
				/* Record mark straddles mbufs: gather it byte by byte. */
				cp1 = (caddr_t)&recmark;
				cp2 = mdata;
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (mlen == 0) {
						m = mbuf_next(m);
						cp2 = mbuf_data(m);
						mlen = mbuf_len(m);
					}
					*cp1++ = *cp2++;
					mlen--;
					mbuf_setdata(m, cp2, mlen);
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			/* Low 31 bits: fragment length; high bit: last fragment. */
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
				/* Implausible length: stream is corrupt, disconnect. */
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Exact fit: the whole raw chain is this fragment. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/* More data buffered than this fragment: split the chain. */
			len = 0;
			m = slp->ns_raw;
			mlen = mbuf_len(m);
			mdata = mbuf_data(m);
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + mlen) > slp->ns_reclen) {
					/* Fragment ends mid-mbuf: copy the head portion out. */
					if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					if (om) {
						if (mbuf_setnext(om, m2)) {
							/* trouble... just drop it */
							printf("nfsrv_getstream: mbuf_setnext failed\n");
							mbuf_freem(m2);
							slp->ns_flag &= ~SLP_GETSTREAM;
							return (EWOULDBLOCK);
						}
						recm = slp->ns_raw;
					} else {
						recm = m2;
					}
					/* Advance the raw chain past the copied bytes. */
					mdata += slp->ns_reclen - len;
					mlen -= slp->ns_reclen - len;
					mbuf_setdata(m, mdata, mlen);
					len = slp->ns_reclen;
				} else if ((len + mlen) == slp->ns_reclen) {
					/* Fragment ends exactly at an mbuf boundary: cut here. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					recm = slp->ns_raw;
					if (mbuf_setnext(om, NULL)) {
						printf("nfsrv_getstream: mbuf_setnext failed 2\n");
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				} else {
					/* Whole mbuf belongs to this fragment; keep walking. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Fragment not fully buffered yet; wait for more data. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		if (slp->ns_frag == NULL) {
			slp->ns_frag = recm;
		} else {
			m = slp->ns_frag;
			while ((m2 = mbuf_next(m)))
				m = m2;
			if ((error = mbuf_setnext(m, recm)))
				panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
		}
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Record complete: move it to the ns_rec packet queue. */
			if (slp->ns_recend)
				mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
2442
2443 /*
2444 * Parse an RPC header.
2445 */
2446 int
2447 nfsrv_dorec(slp, nfsd, ndp)
2448 struct nfssvc_sock *slp;
2449 struct nfsd *nfsd;
2450 struct nfsrv_descript **ndp;
2451 {
2452 mbuf_t m;
2453 mbuf_t nam;
2454 struct nfsrv_descript *nd;
2455 int error;
2456
2457 *ndp = NULL;
2458 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2459 return (ENOBUFS);
2460 MALLOC_ZONE(nd, struct nfsrv_descript *,
2461 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2462 if (!nd)
2463 return (ENOMEM);
2464 m = slp->ns_rec;
2465 slp->ns_rec = mbuf_nextpkt(m);
2466 if (slp->ns_rec)
2467 mbuf_setnextpkt(m, NULL);
2468 else
2469 slp->ns_recend = NULL;
2470 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2471 nam = m;
2472 m = mbuf_next(m);
2473 if ((error = mbuf_setnext(nam, NULL)))
2474 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2475 } else
2476 nam = NULL;
2477 nd->nd_md = nd->nd_mrep = m;
2478 nd->nd_nam2 = nam;
2479 nd->nd_dpos = mbuf_data(m);
2480 error = nfs_getreq(nd, nfsd, TRUE);
2481 if (error) {
2482 if (nam)
2483 mbuf_freem(nam);
2484 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2485 return (error);
2486 }
2487 *ndp = nd;
2488 nfsd->nfsd_nd = nd;
2489 return (0);
2490 }
2491
2492 /*
2493 * Parse an RPC request
2494 * - verify it
2495 * - fill in the cred struct.
2496 */
/*
 * Parse and verify the RPC call header of a request record and build the
 * credential (nd->nd_cr) from its auth data.  On protocol-level problems
 * it sets nd_repstat/nd_procnum (NFSPROC_NOOP) and returns 0 so the
 * caller sends an RPC-level error reply; on unrecoverable parse errors
 * it frees the request mbufs and returns EBADRPC.
 */
int
nfs_getreq(nd, nfsd, has_header)
	struct nfsrv_descript *nd;
	struct nfsd *nfsd;
	int has_header;
{
	int len, i;
	u_long *tl;
	long t1;		/* scratch used by the nfsm_* dissect macros */
	uio_t uiop;
	caddr_t dpos, cp2, cp;	/* dpos/cp2 also used implicitly by nfsm_* macros */
	u_long nfsvers, auth_type;
	uid_t nickuid;
	int error = 0, ticklen;
	mbuf_t mrep, md;
	struct nfsuid *nuidp;
	uid_t user_id;
	gid_t group_id;
	int ngroups;
	struct ucred temp_cred;
	struct timeval tvin, tvout, now;
	char uio_buf[ UIO_SIZEOF(1) ];
#if 0 /* until encrypted keys are implemented */
	NFSKERBKEYSCHED_T keys; /* stores key schedule */
#endif

	/* No credential yet; one is created only on successful auth parsing. */
	nd->nd_cr = NULL;

	mrep = nd->nd_mrep;
	md = nd->nd_md;
	dpos = nd->nd_dpos;
	if (has_header) {
		/* Full header: xid + call direction precede the common fields. */
		nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
		if (*tl++ != rpc_call) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	/* RPC version must be 2. */
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Program number must be the NFS program. */
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_long, *tl++);
	if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	else if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/* Reject out-of-range procedures (v2 has fewer procedures than v3). */
	if ((nd->nd_procnum >= NFS_NPROCS) ||
		(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Map v2 procedure numbers onto the v3 numbering used internally. */
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		mbuf_freem(mrep);
		return (EBADRPC);
	}

	nd->nd_flag &= ~ND_KERBAUTH;
	/*
	 * Handle auth_unix or auth_kerb.
	 */
	if (auth_type == rpc_auth_unix) {
		/* AUTH_UNIX: stamp skipped via *++tl; len = machine name length. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		bzero(&temp_cred, sizeof(temp_cred));
		/* Skip over the machine name; only uid/gids matter here. */
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
		user_id = fxdr_unsigned(uid_t, *tl++);
		group_id = fxdr_unsigned(gid_t, *tl++);
		temp_cred.cr_groups[0] = group_id;
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		/* +2 words: presumably the verifier flavor/length pair — TODO confirm */
		nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
		/* Copy at most NGROUPS-1 supplementary gids; skip any excess. */
		for (i = 1; i <= len; i++)
			if (i < NGROUPS)
				temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
		if (ngroups > 1)
			nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
		/* Verifier body length; skipped below after cred creation. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		temp_cred.cr_uid = user_id;
		temp_cred.cr_ngroups = ngroups;
		nd->nd_cr = kauth_cred_create(&temp_cred);
		if (nd->nd_cr == NULL) {
			nd->nd_repstat = ENOMEM;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else if (auth_type == rpc_auth_kerb) {
		switch (fxdr_unsigned(int, *tl++)) {
		case RPCAKN_FULLNAME:
			/* Full-name: stash the ticket for an nfsd to authenticate later. */
			ticklen = fxdr_unsigned(int, *tl);
			*((u_long *)nfsd->nfsd_authstr) = *tl;
			uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
				&uio_buf[0], sizeof(uio_buf));
			if (!uiop) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			// LP64todo - fix this
			nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
			if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
				mbuf_freem(mrep);
				return (EBADRPC);
			}
			uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
			// LP64todo - fix this
			nfsm_mtouio(uiop, uio_resid(uiop));
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
				printf("Bad kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
			tl = (u_long *)cp;
			if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
				printf("Not fullname kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			cp += NFSX_UNSIGNED;
			bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
			nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
			/* Defer actual ticket validation to an nfsd thread. */
			nd->nd_flag |= ND_KERBFULL;
			nfsd->nfsd_flag |= NFSD_NEEDAUTH;
			break;
		case RPCAKN_NICKNAME:
			/* Nickname: look up a previously established session. */
			if (len != 2 * NFSX_UNSIGNED) {
				printf("Kerb nickname short\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nickuid = fxdr_unsigned(uid_t, *tl);
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
				printf("Kerb nick verifier bad\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
			tvin.tv_sec = *tl++;
			tvin.tv_usec = *tl;

			/* Find a cached session matching the nickname uid (and address). */
			for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
				nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
					(!nd->nd_nam2 ||
					 netaddr_match(NU_NETFAM(nuidp),
						&nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (!nuidp) {
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			/*
			 * Now, decrypt the timestamp using the session key
			 * and validate it.
			 */
#if NFSKERB
			XXX
#endif

			/* NOTE(review): with NFSKERB off, tvout is never assigned from
			 * tvin before this check — decryption is unimplemented here. */
			tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
			tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
			microtime(&now);
			/* Reject expired sessions and non-monotonic timestamps (replay). */
			if (nuidp->nu_expire < now.tv_sec ||
				nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
				(nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
				 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
				nuidp->nu_expire = 0;
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			/* Clone the cached credential for this request. */
			bzero(&temp_cred, sizeof(temp_cred));
			ngroups = nuidp->nu_cr->cr_ngroups;
			for (i = 0; i < ngroups; i++)
				temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
			if (ngroups > 1)
				nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);

			temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
			temp_cred.cr_ngroups = ngroups;
			nd->nd_cr = kauth_cred_create(&temp_cred);
			if (!nd->nd_cr) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nd->nd_flag |= ND_KERBNICK;
		};
	} else {
		/* Unknown auth flavor. */
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	/* Hand the parse position back so the caller can decode arguments. */
	nd->nd_md = md;
	nd->nd_dpos = dpos;
	return (0);
nfsmout:
	/* nfsm_* macros jump here on dissect failure with mrep already freed. */
	if (IS_VALID_CRED(nd->nd_cr))
		kauth_cred_unref(&nd->nd_cr);
	return (error);
}
2752
2753 /*
2754 * Search for a sleeping nfsd and wake it up.
2755 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2756 * running nfsds will go look for the work in the nfssvc_sock list.
2757 * Note: Must be called with nfsd_mutex held.
2758 */
2759 void
2760 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2761 {
2762 struct nfsd *nd;
2763
2764 if ((slp->ns_flag & SLP_VALID) == 0)
2765 return;
2766
2767 lck_rw_lock_exclusive(&slp->ns_rwlock);
2768
2769 if (nfsd_waiting) {
2770 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2771 if (nd->nfsd_flag & NFSD_WAITING) {
2772 nd->nfsd_flag &= ~NFSD_WAITING;
2773 if (nd->nfsd_slp)
2774 panic("nfsd wakeup");
2775 slp->ns_sref++;
2776 nd->nfsd_slp = slp;
2777 lck_rw_done(&slp->ns_rwlock);
2778 wakeup((caddr_t)nd);
2779 return;
2780 }
2781 }
2782 }
2783
2784 slp->ns_flag |= SLP_DOREC;
2785
2786 lck_rw_done(&slp->ns_rwlock);
2787
2788 nfsd_head_flag |= NFSD_CHECKSLP;
2789 }
2790 #endif /* NFS_NOSERVER */
2791
2792 static int
2793 nfs_msg(proc_t p,
2794 const char *server,
2795 const char *msg,
2796 int error)
2797 {
2798 tpr_t tpr;
2799
2800 if (p)
2801 tpr = tprintf_open(p);
2802 else
2803 tpr = NULL;
2804 if (error)
2805 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2806 error);
2807 else
2808 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2809 tprintf_close(tpr);
2810 return (0);
2811 }
2812
2813 void
2814 nfs_down(nmp, proc, error, flags, msg)
2815 struct nfsmount *nmp;
2816 proc_t proc;
2817 int error, flags;
2818 const char *msg;
2819 {
2820 if (nmp == NULL)
2821 return;
2822 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2823 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2824 nmp->nm_state |= NFSSTA_TIMEO;
2825 }
2826 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2827 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2828 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2829 }
2830 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2831 }
2832
2833 void
2834 nfs_up(nmp, proc, flags, msg)
2835 struct nfsmount *nmp;
2836 proc_t proc;
2837 int flags;
2838 const char *msg;
2839 {
2840 if (nmp == NULL)
2841 return;
2842 if (msg)
2843 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2844 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2845 nmp->nm_state &= ~NFSSTA_TIMEO;
2846 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2847 }
2848 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2849 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2850 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2851 }
2852 }
2853