bsd/nfs/nfs_socket.c

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1989, 1991, 1993, 1995
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * This code is derived from software contributed to Berkeley by
  28  * Rick Macklem at The University of Guelph.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted provided that the following conditions
  32  * are met:
  33  * 1. Redistributions of source code must retain the above copyright
  34  *    notice, this list of conditions and the following disclaimer.
  35  * 2. Redistributions in binary form must reproduce the above copyright
  36  *    notice, this list of conditions and the following disclaimer in the
  37  *    documentation and/or other materials provided with the distribution.
  38  * 3. All advertising materials mentioning features or use of this software
  39  *    must display the following acknowledgement:
  40  *      This product includes software developed by the University of
  41  *      California, Berkeley and its contributors.
  42  * 4. Neither the name of the University nor the names of its contributors
  43  *    may be used to endorse or promote products derived from this software
  44  *    without specific prior written permission.
  45  *
  46  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  56  * SUCH DAMAGE.
  57  *
  58  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
  59  * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
  60  */
  61
  62 /*
  63  * Socket operations for use by nfs
  64  */
  65
  66 #include <sys/param.h>
  67 #include <sys/systm.h>
  68 #include <sys/proc.h>
  69 #include <sys/kauth.h>
  70 #include <sys/mount_internal.h>
  71 #include <sys/kernel.h>
  72 #include <sys/kpi_mbuf.h>
  73 #include <sys/malloc.h>
  74 #include <sys/vnode.h>
  75 #include <sys/domain.h>
  76 #include <sys/protosw.h>
  77 #include <sys/socket.h>
  78 #include <sys/syslog.h>
  79 #include <sys/tprintf.h>
  80 #include <sys/uio_internal.h>
  81 #include <libkern/OSAtomic.h>
  82
  83 #include <sys/time.h>
  84 #include <kern/clock.h>
  85 #include <kern/task.h>
  86 #include <kern/thread.h>
  87 #include <sys/user.h>
  88
  89 #include <netinet/in.h>
  90 #include <netinet/tcp.h>
  91
  92 #include <nfs/rpcv2.h>
  93 #include <nfs/nfsproto.h>
  94 #include <nfs/nfs.h>
  95 #include <nfs/xdr_subs.h>
  96 #include <nfs/nfsm_subs.h>
  97 #include <nfs/nfsmount.h>
  98 #include <nfs/nfsnode.h>
  99 #include <nfs/nfsrtt.h>
 100
 101 #include <sys/kdebug.h>
  102
      /*
       * Kernel trace wrappers.  Each expands to KERNEL_DEBUG() with the event
       * code FSDBG_CODE(DBG_FSRW, A) and the four arguments B..E cast to int.
       * FSDBG tags the event DBG_FUNC_NONE, FSDBG_TOP tags it DBG_FUNC_START
       * and FSDBG_BOT tags it DBG_FUNC_END.
       */
  103 #define FSDBG(A, B, C, D, E) \
  104         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
  105                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  106 #define FSDBG_TOP(A, B, C, D, E) \
  107         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
  108                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  109 #define FSDBG_BOT(A, B, C, D, E) \
  110         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
  111                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
 112
 113 /*
 114  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
 115  * Use the mean and mean deviation of rtt for the appropriate type of rpc
 116  * for the frequent rpcs and a default for the others.
 117  * The justification for doing "other" this way is that these rpcs
 118  * happen so infrequently that timer est. would probably be stale.
 119  * Also, since many of these rpcs are
 120  * non-idempotent, a conservative timeout is desired.
 121  * getattr, lookup - A+2D
 122  * read, write     - A+4D
 123  * other           - nm_timeo
 124  */
 125 #define NFS_RTO(n, t) \
 126         ((t) == 0 ? (n)->nm_timeo : \
 127          ((t) < 3 ? \
 128           (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
 129           ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
 130 #define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
 131 #define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
  132 /*
  133  * External data, mostly RPC constants in XDR form
       *
       * These are extern declarations only; no definition of them appears in
       * this part of the file.
  134  */
  135 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
  136         rpc_msgaccepted, rpc_call, rpc_autherr,
  137         rpc_auth_kerb;
  138 extern u_long nfs_prog;
  139 extern struct nfsstats nfsstats;
  140 extern int nfsv3_procid[NFS_NPROCS];
  141 extern int nfs_ticks;
  142 extern u_long nfs_xidwrap;
 143
  144 /*
  145  * Defines which timer to use for the procnum.
  146  * 0 - default
  147  * 1 - getattr
  148  * 2 - lookup
  149  * 3 - read
  150  * 4 - write
       *
       * The table is indexed by procedure number (NFS_SRTT/NFS_SDRTT index it
       * with r_procnum).  A non-zero entry t selects slot t - 1 of the mount's
       * nm_srtt[] / nm_sdrtt[] arrays; 0 means no per-procedure timer.
  151  */
  152 static int proct[NFS_NPROCS] = {
  153         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
  154 };
 155
  156 /*
  157  * There is a congestion window for outstanding rpcs maintained per mount
  158  * point. The cwnd size is adjusted in roughly the way that:
  159  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
  160  * SIGCOMM '88". ACM, August 1988.
  161  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
  162  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
  163  * of rpcs is in progress.
  164  * (The sent count and cwnd are scaled for integer arith.)
  165  * Variants of "slow start" were tried and were found to be too much of a
  166  * performance hit (ave. rtt 3 times larger),
  167  * I suspect due to the large rtt that nfs rpcs have.
  168  */
  169 #define NFS_CWNDSCALE   256     /* integer scale factor for cwnd / sent count */
  170 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)    /* upper bound: 32 scaled units */
      /*
       * NOTE(review): nfs_backoff, nfsrtton and nfsrtt are not referenced in
       * this part of the file.  nfs_backoff is presumably a table of
       * retransmit backoff multipliers and nfsrtton/nfsrtt presumably control
       * and hold rtt logging; confirm against their users.
       */
  171 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
  172 int nfsrtton = 0;
  173 struct nfsrtt nfsrtt;
  174
      /* Forward declarations of helpers private to this file. */
  175 static int      nfs_rcvlock(struct nfsreq *);
  176 static void     nfs_rcvunlock(struct nfsreq *);
  177 static int      nfs_receive(struct nfsreq *rep, mbuf_t *mp);
  178 static int      nfs_reconnect(struct nfsreq *rep);
  179 static void     nfs_repdequeue(struct nfsreq *rep);
  180
  181 /* XXX: prototypes for external routines, declared here by hand */
  182 boolean_t       current_thread_aborted(void);
  183 kern_return_t   thread_terminate(thread_t);
 184
  185 #ifndef NFS_NOSERVER
  186 static int      nfsrv_getstream(struct nfssvc_sock *,int);
  187
      /*
       * Server-side dispatch table, compiled only when NFS_NOSERVER is not
       * defined.  It holds NFS_NPROCS handler pointers; each handler receives
       * the request descriptor, the server socket, a proc and a pointer to an
       * mbuf, and returns an int.
       * NOTE(review): the entry order presumably follows the procedure
       * numbering used to index the table; confirm against nfsproto.h.
       */
  188 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
  189                                     struct nfssvc_sock *slp,
  190                                     proc_t procp,
  191                                     mbuf_t *mreqp) = {
  192         nfsrv_null,
  193         nfsrv_getattr,
  194         nfsrv_setattr,
  195         nfsrv_lookup,
  196         nfsrv3_access,
  197         nfsrv_readlink,
  198         nfsrv_read,
  199         nfsrv_write,
  200         nfsrv_create,
  201         nfsrv_mkdir,
  202         nfsrv_symlink,
  203         nfsrv_mknod,
  204         nfsrv_remove,
  205         nfsrv_rmdir,
  206         nfsrv_rename,
  207         nfsrv_link,
  208         nfsrv_readdir,
  209         nfsrv_readdirplus,
  210         nfsrv_statfs,
  211         nfsrv_fsinfo,
  212         nfsrv_pathconf,
  213         nfsrv_commit,
  214         nfsrv_noop
  215 };
  216 #endif /* NFS_NOSERVER */
 217
 218
 219 /*
 220  * attempt to bind a socket to a reserved port
 221  */
 222 static int
 223 nfs_bind_resv(struct nfsmount *nmp)
 224 {
 225         socket_t so = nmp->nm_so;
 226         struct sockaddr_in sin;
 227         int error;
 228         u_short tport;
 229
 230         if (!so)
 231                 return (EINVAL);
 232
 233         sin.sin_len = sizeof (struct sockaddr_in);
 234         sin.sin_family = AF_INET;
 235         sin.sin_addr.s_addr = INADDR_ANY;
 236         tport = IPPORT_RESERVED - 1;
 237         sin.sin_port = htons(tport);
 238
 239         while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
 240                (--tport > IPPORT_RESERVED / 2))
 241                 sin.sin_port = htons(tport);
 242         return (error);
 243 }
 244
  245 /*
  246  * variables for managing the nfs_bind_resv_thread
       *
       * nfs_bind_resv_thread_state is 0 until the lock and queue have been
       * set up, INITTED once they exist, and RUNNING while the thread is
       * considered started.  The thread keeps servicing requests only while
       * nfs_resv_mounts is greater than zero.
  247  */
  248 int nfs_resv_mounts = 0;
  249 static int nfs_bind_resv_thread_state = 0;
  250 #define NFS_BIND_RESV_THREAD_STATE_INITTED      1
  251 #define NFS_BIND_RESV_THREAD_STATE_RUNNING      2
  252 lck_grp_t *nfs_bind_resv_lck_grp;
  253 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
  254 lck_attr_t *nfs_bind_resv_lck_attr;
  255 lck_mtx_t *nfs_bind_resv_mutex;         /* taken around request queue updates */
      /* One pending bind request; the requester sleeps on its address. */
  256 struct nfs_bind_resv_request {
  257         TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;   /* queue linkage */
  258         struct nfsmount *brr_nmp;       /* mount whose socket is to be bound */
  259         int brr_error;                  /* result of nfs_bind_resv() */
  260 };
  261 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
 262
  263 /*
  264  * thread to handle any reserved port bind requests
       *
       * Marks itself RUNNING, then for as long as nfs_resv_mounts is positive
       * drains nfs_bind_resv_request_queue: each request is unlinked under
       * nfs_bind_resv_mutex, bound via nfs_bind_resv() with the mutex
       * released, its result stored in brr_error, and its address passed to
       * wakeup().  With the queue empty it sleeps on the queue head.  When the
       * loop ends the state is set back to INITTED and the thread terminates
       * itself.
  265  */
  266 static void
  267 nfs_bind_resv_thread(void)
  268 {
  269         struct nfs_bind_resv_request *brreq;
  270
  271         nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
  272
  273         while (nfs_resv_mounts > 0) {
  274                 lck_mtx_lock(nfs_bind_resv_mutex);
  275                 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
  276                         TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
                              /* do the bind itself without holding the queue mutex */
  277                         lck_mtx_unlock(nfs_bind_resv_mutex);
  278                         brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
  279                         wakeup(brreq);
  280                         lck_mtx_lock(nfs_bind_resv_mutex);
  281                 }
                      /*
                       * Queue is empty: sleep on the queue head.  PDROP is passed,
                       * and the mutex is taken again at the top of the outer loop.
                       */
  282                 msleep((caddr_t)&nfs_bind_resv_request_queue,
  283                                 nfs_bind_resv_mutex, PSOCK | PDROP,
  284                                 "nfs_bind_resv_request_queue", 0);
  285         }
  286
  287         nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
  288         (void) thread_terminate(current_thread());
  289 }
  290
      /*
       * Wake the bind thread by issuing a wakeup on the request queue head.
       * Returns EIO if the thread state is below RUNNING, 0 otherwise.
       */
  291 int
  292 nfs_bind_resv_thread_wake(void)
  293 {
  294         if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
  295                 return (EIO);
  296         wakeup(&nfs_bind_resv_request_queue);
  297         return (0);
  298 }
 299
 300 /*
 301  * underprivileged procs call this to request nfs_bind_resv_thread
 302  * to perform the reserved port binding for them.
 303  */
 304 static int
 305 nfs_bind_resv_nopriv(struct nfsmount *nmp)
 306 {
 307         struct nfs_bind_resv_request brreq;
 308         int error;
 309
 310         if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
 311                 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
 312                         nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
 313                         lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
 314                         nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
 315                         nfs_bind_resv_lck_attr = lck_attr_alloc_init();
 316                         nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
 317                         TAILQ_INIT(&nfs_bind_resv_request_queue);
 318                         nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
 319                 }
 320                 kernel_thread(kernel_task, nfs_bind_resv_thread);
 321                 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
 322         }
 323
 324         brreq.brr_nmp = nmp;
 325         brreq.brr_error = 0;
 326
 327         lck_mtx_lock(nfs_bind_resv_mutex);
 328         TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
 329         lck_mtx_unlock(nfs_bind_resv_mutex);
 330
 331         error = nfs_bind_resv_thread_wake();
 332         if (error) {
 333                 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
 334                 /* Note: we might be able to simply restart the thread */
 335                 return (error);
 336         }
 337
 338         tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
 339
 340         return (brreq.brr_error);
 341 }
 342
 343 /*
 344  * Initialize sockets and congestion for a new NFS connection.
 345  * We do not free the sockaddr if error.
 346  */
 347 int
 348 nfs_connect(
 349         struct nfsmount *nmp,
 350         __unused struct nfsreq *rep)
 351 {
 352         socket_t so;
 353         int error, rcvreserve, sndreserve;
 354         struct sockaddr *saddr;
 355         struct timeval timeo;
 356
 357         nmp->nm_so = 0;
 358         saddr = mbuf_data(nmp->nm_nam);
 359         error = sock_socket(saddr->sa_family, nmp->nm_sotype,
 360                                                 nmp->nm_soproto, 0, 0, &nmp->nm_so);
 361         if (error) {
 362                 goto bad;
 363         }
 364         so = nmp->nm_so;
 365
 366         /*
 367          * Some servers require that the client port be a reserved port number.
 368          */
 369         if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
 370                 proc_t p;
 371                 /*
 372                  * sobind() requires current_proc() to have superuser privs.
 373                  * If this bind is part of a reconnect, and the current proc
 374                  * doesn't have superuser privs, we hand the sobind() off to
 375                  * a kernel thread to process.
 376                  */
 377                 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
 378                     (p = current_proc()) && suser(kauth_cred_get(), 0)) {
 379                         /* request nfs_bind_resv_thread() to do bind */
 380                         error = nfs_bind_resv_nopriv(nmp);
 381                 } else {
 382                         error = nfs_bind_resv(nmp);
 383                 }
 384                 if (error)
 385                         goto bad;
 386         }
 387
 388         /*
 389          * Protocols that do not require connections may be optionally left
 390          * unconnected for servers that reply from a port other than NFS_PORT.
 391          */
 392         if (nmp->nm_flag & NFSMNT_NOCONN) {
 393                 if (nmp->nm_sotype == SOCK_STREAM) {
 394                         error = ENOTCONN;
 395                         goto bad;
 396                 }
 397         } else {
 398                 struct timeval  tv;
 399                 tv.tv_sec = 2;
 400                 tv.tv_usec = 0;
 401                 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
 402                 if (error && error != EINPROGRESS) {
 403                         goto bad;
 404                 }
 405
 406                 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
 407                         if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
 408                                 goto bad;
 409                         }
 410                 }
 411         }
 412
 413         /*
 414          * Always time out on recieve, this allows us to reconnect the
 415          * socket to deal with network changes.
 416          */
 417         timeo.tv_usec = 0;
 418         timeo.tv_sec = 2;
 419         error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
 420         if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
 421                 timeo.tv_sec = 5;
 422         } else {
 423                 timeo.tv_sec = 0;
 424         }
 425         error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
 426
 427         if (nmp->nm_sotype == SOCK_DGRAM) {
 428                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
 429                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
 430                         (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 431         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
 432                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
 433                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
 434                         (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 435         } else {
 436                 int proto;
 437                 int on = 1;
 438
 439                 sock_gettype(so, NULL, NULL, &proto);
 440                 if (nmp->nm_sotype != SOCK_STREAM)
 441                         panic("nfscon sotype");
 442
 443                 // Assume that SOCK_STREAM always requires a connection
 444                 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 445
 446                 if (proto == IPPROTO_TCP) {
 447                         sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
 448                 }
 449
 450                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
 451                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
 452                                 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 453         }
 454
 455         if (sndreserve > NFS_MAXSOCKBUF)
 456                 sndreserve = NFS_MAXSOCKBUF;
 457         if (rcvreserve > NFS_MAXSOCKBUF)
 458                 rcvreserve = NFS_MAXSOCKBUF;
 459         error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
 460         if (error) {
 461                 goto bad;
 462         }
 463         error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
 464         if (error) {
 465                 goto bad;
 466         }
 467
 468         sock_nointerrupt(so, 1);
 469
 470         /* Initialize other non-zero congestion variables */
 471         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
 472                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
 473         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
 474                 nmp->nm_sdrtt[3] = 0;
 475         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
 476         nmp->nm_sent = 0;
 477         FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
 478         nmp->nm_timeouts = 0;
 479         return (0);
 480
 481 bad:
 482         nfs_disconnect(nmp);
 483         return (error);
 484 }
 485
  486 /*
  487  * Reconnect routine:
  488  * Called when a connection is broken on a reliable protocol.
  489  * - clean up the old socket
  490  * - nfs_connect() again
  491  * - set R_MUSTRESEND for all outstanding requests on mount point
  492  * If this fails the mount point is DEAD!
  493  * nb: Must be called with the nfs_sndlock() set on the mount point.
       *
       * Returns 0 once reconnected.  Returns EINTR if nfs_connect() fails with
       * EINTR or ERESTART, EIO if it fails with EIO, the connect error if the
       * mount is not yet flagged NFSSTA_MOUNTED, or the error reported by
       * nfs_sigintr().  Any other connect failure is retried after a sleep.
  494  */
  495 static int
  496 nfs_reconnect(struct nfsreq *rep)
  497 {
  498         struct nfsreq *rp;
  499         struct nfsmount *nmp = rep->r_nmp;
  500         int error;
  501
  502         nfs_disconnect(nmp);
  503         while ((error = nfs_connect(nmp, rep))) {
  504                 if (error == EINTR || error == ERESTART)
  505                         return (EINTR);
  506                 if (error == EIO)
  507                         return (EIO);
                      /* report the server as down; flag rep as having logged a message */
  508                 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
  509                         "can not connect");
  510                 rep->r_flags |= R_TPRINTFMSG;
  511                 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
  512                         /* we're not yet completely mounted and */
  513                         /* we can't reconnect, so we fail */
  514                         return (error);
  515                 }
  516                 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
  517                         return (error);
                      /* wait on lbolt before the next connect attempt */
  518                 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
  519         }
  520
  521         /*
  522          * Loop through outstanding request list and fix up all requests
  523          * on old socket.
  524          */
  525         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
  526                 if (rp->r_nmp == nmp)
  527                         rp->r_flags |= R_MUSTRESEND;
  528         }
  529         return (0);
  530 }
 531
  532 /*
  533  * NFS disconnect. Clean up and unlink.
       *
       * If the mount has a socket, clears nmp->nm_so first and then shuts the
       * socket down and closes it.  Does nothing when nm_so is already 0.
  534  */
  535 void
  536 nfs_disconnect(struct nfsmount *nmp)
  537 {
  538         socket_t so;
  539
  540         if (nmp->nm_so) {
  541                 so = nmp->nm_so;
                      /* detach from the mount before tearing the socket down */
  542                 nmp->nm_so = 0;
  543                 sock_shutdown(so, 2);   /* 2: presumably SHUT_RDWR -- confirm */
  544                 sock_close(so);
  545         }
  546 }
 547
 548 /*
 549  * This is the nfs send routine. For connection based socket types, it
 550  * must be called with an nfs_sndlock() on the socket.
 551  * "rep == NULL" indicates that it has been called from a server.
 552  * For the client side:
 553  * - return EINTR if the RPC is terminated, 0 otherwise
 554  * - set R_MUSTRESEND if the send fails for any reason
 555  * - do any cleanup required by recoverable socket errors (???)
 556  * For the server side:
 557  * - return EINTR or ERESTART if interrupted by a signal
 558  * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 559  * - do any cleanup required by recoverable socket errors (???)
 560  */
 561 int
 562 nfs_send(so, nam, top, rep)
 563         socket_t so;
 564         mbuf_t nam;
 565         mbuf_t top;
 566         struct nfsreq *rep;
 567 {
 568         struct sockaddr *sendnam;
 569         int error, error2, sotype, flags;
 570         u_long xidqueued = 0;
 571         struct nfsreq *rp;
 572         char savenametolog[MAXPATHLEN];
 573         struct msghdr msg;
 574
 575         if (rep) {
 576                 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
 577                 if (error) {
 578                         mbuf_freem(top);
 579                         return (error);
 580                 }
 581                 if ((so = rep->r_nmp->nm_so) == NULL) {
 582                         rep->r_flags |= R_MUSTRESEND;
 583                         mbuf_freem(top);
 584                         return (0);
 585                 }
 586                 rep->r_flags &= ~R_MUSTRESEND;
 587                 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
 588                         if (rp == rep)
 589                                 break;
 590                 if (rp)
 591                         xidqueued = rp->r_xid;
 592         }
 593         sock_gettype(so, NULL, &sotype, NULL);
 594         if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
 595             (nam == 0))
 596                 sendnam = (struct sockaddr *)0;
 597         else
 598                 sendnam = mbuf_data(nam);
 599
 600         if (sotype == SOCK_SEQPACKET)
 601                 flags = MSG_EOR;
 602         else
 603                 flags = 0;
 604
 605         /*
 606          * Save the name here in case mount point goes away if we block.
 607          * The name is using local stack and is large, but don't
 608          * want to block if we malloc.
 609          */
 610         if (rep)
 611                 strncpy(savenametolog,
 612                         vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
 613                         MAXPATHLEN - 1);
 614         bzero(&msg, sizeof(msg));
 615         msg.msg_name = (caddr_t)sendnam;
 616         msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
 617         error = sock_sendmbuf(so, &msg, top, flags, NULL);
 618
 619         if (error) {
 620                 if (rep) {
 621                         if (xidqueued) {
 622                                 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
 623                                         if (rp == rep && rp->r_xid == xidqueued)
 624                                                 break;
 625                                 if (!rp)
 626                                         panic("nfs_send: error %d xid %x gone",
 627                                               error, xidqueued);
 628                         }
 629                         log(LOG_INFO, "nfs send error %d for server %s\n",
 630                             error, savenametolog);
 631                         /*
 632                          * Deal with errors for the client side.
 633                          */
 634                         error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
 635                         if (error2) {
 636                                 error = error2;
 637                         } else {
 638                                 rep->r_flags |= R_MUSTRESEND;
 639                         }
 640                 } else
 641                         log(LOG_INFO, "nfsd send error %d\n", error);
 642
 643                 /*
 644                  * Handle any recoverable (soft) socket errors here. (???)
 645                  */
 646                 if (error != EINTR && error != ERESTART && error != EIO &&
 647                         error != EWOULDBLOCK && error != EPIPE) {
 648                         error = 0;
 649                 }
 650         }
 651         return (error);
 652 }
 653
 654 /*
 655  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 656  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 657  * Mark and consolidate the data into a new mbuf list.
 658  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 659  *     small mbufs.
 660  * For SOCK_STREAM we must be very careful to read an entire record once
 661  * we have read any of it, even if the system call has been interrupted.
 662  */
 663 static int
 664 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
 665 {
 666         socket_t so;
 667         struct iovec_32 aio;
 668         mbuf_t m, mlast;
 669         u_long len, fraglen;
 670         int error, error2, sotype;
 671         proc_t p = current_proc();      /* XXX */
 672         struct msghdr msg;
 673         size_t rcvlen;
 674         int lastfragment;
 675
 676         /*
 677          * Set up arguments for soreceive()
 678          */
 679         *mp = NULL;
 680         sotype = rep->r_nmp->nm_sotype;
 681
 682         /*
 683          * For reliable protocols, lock against other senders/receivers
 684          * in case a reconnect is necessary.
 685          * For SOCK_STREAM, first get the Record Mark to find out how much
 686          * more there is to get.
 687          * We must lock the socket against other receivers
 688          * until we have an entire rpc request/reply.
 689          */
 690         if (sotype != SOCK_DGRAM) {
 691                 error = nfs_sndlock(rep);
 692                 if (error)
 693                         return (error);
 694 tryagain:
 695                 /*
 696                  * Check for fatal errors and resending request.
 697                  */
 698                 /*
 699                  * Ugh: If a reconnect attempt just happened, nm_so
 700                  * would have changed. NULL indicates a failed
 701                  * attempt that has essentially shut down this
 702                  * mount point.
 703                  */
 704                 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
 705                         nfs_sndunlock(rep);
 706                         if (error)
 707                                 return (error);
 708                         return (EINTR);
 709                 }
 710                 so = rep->r_nmp->nm_so;
 711                 if (!so) {
 712                         error = nfs_reconnect(rep);
 713                         if (error) {
 714                                 nfs_sndunlock(rep);
 715                                 return (error);
 716                         }
 717                         goto tryagain;
 718                 }
 719                 while (rep->r_flags & R_MUSTRESEND) {
 720                         error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
 721                         if (!error) {
 722                                 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
 723                                 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
 724                         }
 725                         /*
 726                          * we also hold rcv lock so rep is still
 727                          * legit this point
 728                          */
 729                         if (error) {
 730                                 if (error == EINTR || error == ERESTART ||
 731                                     (error = nfs_reconnect(rep))) {
 732                                         nfs_sndunlock(rep);
 733                                         return (error);
 734                                 }
 735                                 goto tryagain;
 736                         }
 737                 }
 738                 nfs_sndunlock(rep);
 739                 if (sotype == SOCK_STREAM) {
 740                         error = 0;
 741                         len = 0;
 742                         lastfragment = 0;
 743                         mlast = NULL;
 744                         while (!error && !lastfragment) {
 745                                 aio.iov_base = (uintptr_t) &fraglen;
 746                                 aio.iov_len = sizeof(u_long);
 747                                 bzero(&msg, sizeof(msg));
 748                                 msg.msg_iov = (struct iovec *) &aio;
 749                                 msg.msg_iovlen = 1;
 750                                 do {
 751                                    error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
 752                                    if (!rep->r_nmp) /* if unmounted then bailout */
 753                                         goto shutout;
 754                                    if (error == EWOULDBLOCK && rep) {
 755                                         error2 = nfs_sigintr(rep->r_nmp, rep, p);
 756                                         if (error2)
 757                                                 error = error2;
 758                                    }
 759                                 } while (error == EWOULDBLOCK);
 760                                 if (!error && rcvlen < aio.iov_len) {
 761                                     /* only log a message if we got a partial word */
 762                                     if (rcvlen != 0)
 763                                             log(LOG_INFO,
 764                                                  "short receive (%d/%d) from nfs server %s\n",
 765                                                  rcvlen, sizeof(u_long),
 766                                                  vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 767                                     error = EPIPE;
 768                                 }
 769                                 if (error)
 770                                         goto errout;
 771                                 lastfragment = ntohl(fraglen) & 0x80000000;
 772                                 fraglen = ntohl(fraglen) & ~0x80000000;
 773                                 len += fraglen;
 774                                 /*
 775                                  * This is SERIOUS! We are out of sync with the sender
 776                                  * and forcing a disconnect/reconnect is all I can do.
 777                                  */
 778                                 if (len > NFS_MAXPACKET) {
 779                                     log(LOG_ERR, "%s (%d) from nfs server %s\n",
 780                                         "impossible RPC record length", len,
 781                                         vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 782                                     error = EFBIG;
 783                                     goto errout;
 784                                 }
 785
 786                                 m = NULL;
 787                                 do {
 788                                     rcvlen = fraglen;
 789                                     error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
 790                                     if (!rep->r_nmp) /* if unmounted then bailout */ {
 791                                         goto shutout;
 792                                     }
 793                                 } while (error == EWOULDBLOCK || error == EINTR ||
 794                                          error == ERESTART);
 795
 796                                 if (!error && fraglen > rcvlen) {
 797                                     log(LOG_INFO,
 798                                         "short receive (%d/%d) from nfs server %s\n",
 799                                         rcvlen, fraglen,
 800                                         vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 801                                     error = EPIPE;
 802                                     mbuf_freem(m);
 803                                 }
 804                                 if (!error) {
 805                                         if (!*mp) {
 806                                                 *mp = m;
 807                                                 mlast = m;
 808                                         } else {
 809                                                 error = mbuf_setnext(mlast, m);
 810                                                 if (error) {
 811                                                         printf("nfs_receive: mbuf_setnext failed %d\n", error);
 812                                                         mbuf_freem(m);
 813                                                 }
 814                                         }
 815                                         while (mbuf_next(mlast))
 816                                                 mlast = mbuf_next(mlast);
 817                                 }
 818                         }
 819                 } else {
 820                         bzero(&msg, sizeof(msg));
 821                         do {
 822                             rcvlen = 100000000;
 823                             error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
 824                             if (!rep->r_nmp) /* if unmounted then bailout */ {
 825                                 goto shutout;
 826                             }
 827                             if (error == EWOULDBLOCK && rep) {
 828                                 error2 = nfs_sigintr(rep->r_nmp, rep, p);
 829                                 if (error2) {
 830                                         return (error2);
 831                                 }
 832                             }
 833                         } while (error == EWOULDBLOCK);
 834
 835                         if ((msg.msg_flags & MSG_EOR) == 0)
 836                                 printf("Egad!!\n");
 837                         if (!error && *mp == NULL)
 838                                 error = EPIPE;
 839                         len = rcvlen;
 840                 }
 841 errout:
 842                 if (error && error != EINTR && error != ERESTART) {
 843                         mbuf_freem(*mp);
 844                         *mp = NULL;
 845                         if (error != EPIPE)
 846                                 log(LOG_INFO,
 847                                     "receive error %d from nfs server %s\n", error,
 848                                     vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 849                         error = nfs_sndlock(rep);
 850                         if (!error) {
 851                                 error = nfs_reconnect(rep);
 852                                 if (!error)
 853                                         goto tryagain;
 854                                 nfs_sndunlock(rep);
 855                         }
 856                 }
 857         } else {
 858                 /*
 859                  * We could have failed while rebinding the datagram socket
 860                  * so we need to attempt to rebind here.
 861                  */
 862                 if ((so = rep->r_nmp->nm_so) == NULL) {
 863                         error = nfs_sndlock(rep);
 864                         if (!error) {
 865                                 error = nfs_reconnect(rep);
 866                                 nfs_sndunlock(rep);
 867                         }
 868                         if (error)
 869                                 return (error);
 870                         if (!rep->r_nmp) /* if unmounted then bailout */
 871                                 return (ENXIO);
 872                         so = rep->r_nmp->nm_so;
 873                 }
 874                 bzero(&msg, sizeof(msg));
 875                 len = 0;
 876                 do {
 877                         rcvlen = 1000000;
 878                         error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
 879                         if (!rep->r_nmp) /* if unmounted then bailout */
 880                                 goto shutout;
 881                         if (error) {
 882                                 error2 = nfs_sigintr(rep->r_nmp, rep, p);
 883                                 if (error2) {
 884                                         error = error2;
 885                                         goto shutout;
 886                                 }
 887                         }
 888                         /* Reconnect for all errors.  We may be receiving
 889                          * soft/hard/blocking errors because of a network
 890                          * change.
 891                          * XXX: we should rate limit or delay this
 892                          * to once every N attempts or something.
 893                          * although TCP doesn't seem to.
 894                          */
 895                         if (error) {
 896                                 error2 = nfs_sndlock(rep);
 897                                 if (!error2) {
 898                                         error2 = nfs_reconnect(rep);
 899                                         if (error2)
 900                                                 error = error2;
 901                                         else if (!rep->r_nmp) /* if unmounted then bailout */
 902                                                 error = ENXIO;
 903                                         else
 904                                                 so = rep->r_nmp->nm_so;
 905                                         nfs_sndunlock(rep);
 906                                 } else {
 907                                         error = error2;
 908                                 }
 909                         }
 910                 } while (error == EWOULDBLOCK);
 911         }
 912 shutout:
 913         if (error) {
 914                 mbuf_freem(*mp);
 915                 *mp = NULL;
 916         }
 917         return (error);
 918 }
 919
 920 /*
 921  * Implement receipt of reply on a socket.
 922  * We must search through the list of received datagrams matching them
 923  * with outstanding requests using the xid, until ours is found.
      *
      * myrep is the request whose reply the caller is waiting for.  Each pass
      * of the loop takes the receive lock, pulls one reply off the socket with
      * nfs_receive() and attaches it to whichever request on nfs_reqq has the
      * same xid and no reply yet; the loop ends once myrep itself has a reply.
      *
      * Returns:
      *   0      myrep has its reply: nfs_rcvlock() returned EALREADY,
      *          myrep->r_mrep was already set once the lock was held, or this
      *          call matched the reply to myrep.
      *   ENXIO  myrep->r_nmp was NULL after nfs_receive(), or nfs_receive()
      *          returned neither an error nor an mbuf.
      *   other  the error from nfs_rcvlock(), or an error from nfs_receive()
      *          that NFSIGNORE_SOERROR() does not ask us to ignore.
 924  */
 925 /* ARGSUSED */
 926 int
 927 nfs_reply(myrep)
 928         struct nfsreq *myrep;
 929 {
 930         struct nfsreq *rep;
 931         struct nfsmount *nmp = myrep->r_nmp;
 932         long t1;
 933         mbuf_t mrep, md;
 934         u_long rxid, *tl;
 935         caddr_t dpos, cp2;
 936         int error;
             /*
              * NOTE(review): cp2 and the nfsmout label are not referenced by any
              * statement visible in this function; presumably nfsm_dissect()
              * expands to code that uses them (along with t1, md, dpos, mrep and
              * error) -- confirm against the macro definition before renaming
              * or removing any of these.
              */
 937
 938         /*
 939          * Loop around until we get our own reply
 940          */
 941         for (;;) {
 942                 /*
 943                  * Lock against other receivers so that I don't get stuck in
 944                  * sbwait() after someone else has received my reply for me.
 945                  * Also necessary for connection based protocols to avoid
 946                  * race conditions during a reconnect.
 947                  * If nfs_rcvlock() returns EALREADY, that means that
 948                  * the reply has already been received by another
 949                  * process and we can return immediately.  In this
 950                  * case, the lock is not taken to avoid races with
 951                  * other processes.
 952                  */
 953                 error = nfs_rcvlock(myrep);
 954                 if (error == EALREADY)
 955                         return (0);
 956                 if (error)
 957                         return (error);
 958
 959                 /*
 960                  * If we slept after putting bits otw, then reply may have
 961                  * arrived.  In which case returning is required, or we
 962                  * would hang trying to nfs_receive an already received reply.
 963                  */
 964                 if (myrep->r_mrep != NULL) {
 965                         nfs_rcvunlock(myrep);
 966                         FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
 967                         return (0);
 968                 }
 969                 /*
 970                  * Get the next Rpc reply off the socket. Assume myrep->r_nmp
 971                  * is still intact by checks done in nfs_rcvlock.
 972                  */
 973                 error = nfs_receive(myrep, &mrep);
 974                 /*
 975                  * Bailout asap if nfsmount struct gone (unmounted).
                      * Note that this return does not call nfs_rcvunlock().
                      * NOTE(review): presumably that is deliberate because the
                      * lock flag (NFSSTA_RCVLOCK, tested at nfsmout below) lives
                      * in the nfsmount's nm_state, which is gone -- confirm.
 976                  */
 977                 if (!myrep->r_nmp) {
 978                         FSDBG(530, myrep->r_xid, myrep, nmp, -2);
 979                         if (mrep)
 980                                 mbuf_freem(mrep);
 981                         return (ENXIO);
 982                 }
 983                 if (error) {
 984                         FSDBG(530, myrep->r_xid, myrep, nmp, error);
 985                         nfs_rcvunlock(myrep);
 986
 987                         /* Bailout asap if nfsmount struct gone (unmounted). */
                             /*
                              * Same test as the one made right after nfs_receive();
                              * only FSDBG() and nfs_rcvunlock() have run in between.
                              */
 988                         if (!myrep->r_nmp) {
 989                                 if (mrep)
 990                                         mbuf_freem(mrep);
 991                                 return (ENXIO);
 992                         }
 993
 994                         /*
 995                          * Ignore routing errors on connectionless protocols??
                              * When NFSIGNORE_SOERROR() is true the error is not
                              * returned: SO_ERROR is read from the socket (both the
                              * value and sock_getsockopt()'s return are discarded)
                              * and we go around the loop to wait again.
                              * NOTE(review): reading SO_ERROR presumably clears the
                              * pending socket error, as "clearerror" suggests --
                              * confirm.  mrep is not freed on this path.
 996                          */
 997                         if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
 998                                 if (nmp->nm_so) {
 999                                         int clearerror;
1000                                         int optlen = sizeof(clearerror);
1001                                         sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1002                                 }
1003                                 continue;
1004                         }
1005                         if (mrep)
1006                                 mbuf_freem(mrep);
1007                         return (error);
1008                 }
1009
1010                 /*
1011                  * We assume all is fine, but if we did not have an error
1012                  * and mrep is 0, better not dereference it. nfs_receive
1013                  * calls soreceive which carefully sets error=0 when it got
1014                  * errors on sbwait (tsleep). In most cases, I assume that's
1015                  * so we could go back again. In tcp case, EPIPE is returned.
1016                  * In the udp case, nfs_receive gets back here with no error and
1017                  * no mrep. Is the right fix to have soreceive check for process
1018                  * aborted after sbwait and return something non-zero? Should
1019                  * nfs_receive give an EPIPE?  Too risky to play with those
1020                  * two this late in game for a shutdown problem. Instead,
1021                  * just check here and get out. (ekn)
1022                  */
1023                 if (!mrep) {
1024                         nfs_rcvunlock(myrep);
1025                         FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1026                         return (ENXIO); /* sounds good */
1027                 }
1028
1029                 /*
1030                  * Get the xid and check that it is an rpc reply
                      * The first two words of the reply are read: the xid, then
                      * the message type.  rxid is kept exactly as read and is
                      * compared to r_xid below with no byte swapping done here.
                      * NOTE(review): a failed nfsm_dissect() presumably jumps to
                      * nfsmout -- confirm against the macro.
1031                  */
1032                 md = mrep;
1033                 dpos = mbuf_data(md);
1034                 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1035                 rxid = *tl++;
1036                 if (*tl != rpc_reply) {
1037                         OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1038                         mbuf_freem(mrep);
                             /*
                              * Drop path: release the receive lock if nm_state says
                              * it is held, then go around the loop for another
                              * reply.  The label lives inside this if body, so a
                              * goto to it skips the rpcinvalid count and the free
                              * above.
                              */
1039 nfsmout:
1040                         if (nmp->nm_state & NFSSTA_RCVLOCK)
1041                                 nfs_rcvunlock(myrep);
1042                         continue;
1043                 }
1044
1045                 /*
1046                  * Loop through the request list to match up the reply
1047                  * Iff no match, just drop the datagram
                      * A request matches when it has no reply yet (r_mrep is
                      * NULL) and its r_xid equals rxid.  The code after the loop
                      * treats rep == 0 as "nothing matched".
1048                  */
1049                 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1050                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1051                                 /* Found it.. */
                                     /*
                                      * Hand the reply to the request, along with
                                      * the current parse position (md/dpos), which
                                      * is past the two words dissected above.
                                      */
1052                                 rep->r_mrep = mrep;
1053                                 rep->r_md = md;
1054                                 rep->r_dpos = dpos;
1055                                 /*
1056                                  * If we're tracking the round trip time
1057                                  * then we update the circular log here
1058                                  * with the stats from our current request.
                                      * Requests without R_TIMING are logged with
                                      * an rtt of 1000000.  The log position wraps
                                      * at NFSRTTLOGSIZ.  A proct[] entry of zero
                                      * for the procedure is treated as fatal.
1059                                  */
1060                                 if (nfsrtton) {
1061                                         struct rttl *rt;
1062
1063                                         rt = &nfsrtt.rttl[nfsrtt.pos];
1064                                         rt->proc = rep->r_procnum;
1065                                         rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1066                                         rt->sent = nmp->nm_sent;
1067                                         rt->cwnd = nmp->nm_cwnd;
1068                                         if (proct[rep->r_procnum] == 0)
1069                                                 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1070                                         rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1071                                         rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1072                                         rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1073                                         microtime(&rt->tstamp); // XXX unused
1074                                         if (rep->r_flags & R_TIMING)
1075                                                 rt->rtt = rep->r_rtt;
1076                                         else
1077                                                 rt->rtt = 1000000;
1078                                         nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1079                                 }
1080                                 /*
1081                                  * Update congestion window.
1082                                  * Do the additive increase of
1083                                  * one rpc/rtt.
                                      * The window only grows while nm_cwnd is
                                      * not above nm_sent, and is clamped to
                                      * NFS_MAXCWND.
1084                                  */
1085                                 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1086                                       nmp->nm_cwnd);
1087                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
1088                                         nmp->nm_cwnd +=
1089                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
1090                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1091                                         if (nmp->nm_cwnd > NFS_MAXCWND)
1092                                                 nmp->nm_cwnd = NFS_MAXCWND;
1093                                 }
                                     /*
                                      * If the request was counted in nm_sent,
                                      * clear R_SENT and take its NFS_CWNDSCALE
                                      * back out of nm_sent.
                                      */
1094                                 if (rep->r_flags & R_SENT) {
1095                                     rep->r_flags &= ~R_SENT;
1096                                     nmp->nm_sent -= NFS_CWNDSCALE;
1097                                }
1098                                 /*
1099                                  * Update rtt using a gain of 0.125 on the mean
1100                                  * and a gain of 0.25 on the deviation.
1101                                  */
1102                                 if (rep->r_flags & R_TIMING) {
1103                                         /*
1104                                          * Since the timer resolution of
1105                                          * NFS_HZ is so coarse, it can often
1106                                          * result in r_rtt == 0. Since
1107                                          * r_rtt == N means that the actual
1108                                          * rtt is between N+dt and N+2-dt ticks,
1109                                          * add 1.
1110                                          */
1111                                         if (proct[rep->r_procnum] == 0)
1112                                                 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1113                                         t1 = rep->r_rtt + 1;
1114                                         t1 -= (NFS_SRTT(rep) >> 3);
1115                                         NFS_SRTT(rep) += t1;
1116                                         if (t1 < 0)
1117                                                 t1 = -t1;
1118                                         t1 -= (NFS_SDRTT(rep) >> 2);
1119                                         NFS_SDRTT(rep) += t1;
1120                                 }
                                     /* A reply was matched: reset nm_timeouts. */
1121                                 nmp->nm_timeouts = 0;
1122                                 break;
1123                         }
1124                 }
1125                 nfs_rcvunlock(myrep);
1126                 /*
1127                  * If not matched to a request, drop it.
1128                  * If it's mine, get out.
                      * Otherwise the reply now hangs off another request's
                      * r_mrep and we loop again to wait for our own.
1129                  */
1130                 if (rep == 0) {
1131                         OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1132                         mbuf_freem(mrep);
1133                 } else if (rep == myrep) {
1134                         if (rep->r_mrep == NULL)
1135                                 panic("nfs_reply: nil r_mrep");
1136                         return (0);
1137                 }
1138                 FSDBG(530, myrep->r_xid, myrep, rep,
1139                       rep ? rep->r_xid : myrep->r_flags);
1140         }
1141 }
1142
1143 /*
1144  * nfs_request - goes something like this
1145  *      - fill in request struct
1146  *      - links it into list
1147  *      - calls nfs_send() for first transmit
1148  *      - calls nfs_reply() to wait for the reply
1149  *      - break down rpc header and return with nfs reply pointed to
1150  *        by mrep or error
1151  * nb: always frees up mreq mbuf list
1152  */
1153 int
1154 nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
1155         vnode_t vp;
1156         mount_t mp;
1157         mbuf_t mrest;
1158         int procnum;
1159         proc_t procp;
1160         kauth_cred_t cred;
1161         mbuf_t *mrp;
1162         mbuf_t *mdp;
1163         caddr_t *dposp;
1164         u_int64_t *xidp;
1165 {
1166         mbuf_t m, mrep, m2;
1167         struct nfsreq re, *rep;
1168         u_long *tl;
1169         int i;
1170         struct nfsmount *nmp;
1171         mbuf_t md, mheadend;
1172         char nickv[RPCX_NICKVERF];
1173         time_t waituntil;
1174         caddr_t dpos, cp2;
1175         int t1, error = 0, mrest_len, auth_len, auth_type;
1176         int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1177         int verf_len, verf_type;
1178         u_long xid;
1179         char *auth_str, *verf_str;
1180         NFSKERBKEY_T key;               /* save session key */
1181         int nmsotype;
1182         struct timeval now;
1183
1184         if (mrp)
1185                 *mrp = NULL;
1186         if (xidp)
1187                 *xidp = 0;
1188         nmp = VFSTONFS(mp);
1189
1190         rep = &re;
1191
1192         if (vp)
1193                 nmp = VFSTONFS(vnode_mount(vp));
1194         if (nmp == NULL ||
1195             (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1196             (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1197                 mbuf_freem(mrest);
1198                 return (ENXIO);
1199         }
1200         nmsotype = nmp->nm_sotype;
1201
1202         FSDBG_TOP(531, vp, procnum, nmp, rep);
1203
1204         rep->r_nmp = nmp;
1205         rep->r_vp = vp;
1206         rep->r_procp = procp;
1207         rep->r_procnum = procnum;
1208         microuptime(&now);
1209         rep->r_lastmsg = now.tv_sec -
1210             ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1211         i = 0;
1212         m = mrest;
1213         while (m) {
1214                 i += mbuf_len(m);
1215                 m = mbuf_next(m);
1216         }
1217         mrest_len = i;
1218
1219         /*
1220          * Get the RPC header with authorization.
1221          */
1222 kerbauth:
1223         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1224         if (!nmp) {
1225                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1226                 mbuf_freem(mrest);
1227                 return (ENXIO);
1228         }
1229         verf_str = auth_str = (char *)0;
1230         if (nmp->nm_flag & NFSMNT_KERB) {
1231                 verf_str = nickv;
1232                 verf_len = sizeof (nickv);
1233                 auth_type = RPCAUTH_KERB4;
1234                 bzero((caddr_t)key, sizeof (key));
1235                 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1236                         &auth_len, verf_str, verf_len)) {
1237                         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1238                         if (!nmp) {
1239                                 FSDBG_BOT(531, 2, vp, error, rep);
1240                                 mbuf_freem(mrest);
1241                                 return (ENXIO);
1242                         }
1243                         error = nfs_getauth(nmp, rep, cred, &auth_str,
1244                                 &auth_len, verf_str, &verf_len, key);
1245                         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1246                         if (!error && !nmp)
1247                                 error = ENXIO;
1248                         if (error) {
1249                                 FSDBG_BOT(531, 2, vp, error, rep);
1250                                 mbuf_freem(mrest);
1251                                 return (error);
1252                         }
1253                 }
1254         } else {
1255                 auth_type = RPCAUTH_UNIX;
1256                 if (cred->cr_ngroups < 1)
1257                         panic("nfsreq nogrps");
1258                 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1259                         nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1260                         5 * NFSX_UNSIGNED;
1261         }
1262         error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1263              auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1264         if (auth_str)
1265                 _FREE(auth_str, M_TEMP);
1266         if (error) {
1267                 mbuf_freem(mrest);
1268                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1269                 return (error);
1270         }
1271         if (xidp)
1272                 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1273
1274         /*
1275          * For stream protocols, insert a Sun RPC Record Mark.
1276          */
1277         if (nmsotype == SOCK_STREAM) {
1278                 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1279                 if (error) {
1280                         mbuf_freem(m);
1281                         FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1282                         return (error);
1283                 }
1284                 *((u_long*)mbuf_data(m)) =
1285                         htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1286         }
1287         rep->r_mreq = m;
1288         rep->r_xid = xid;
1289 tryagain:
1290         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1291         if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1292                 rep->r_retry = nmp->nm_retry;
1293         else
1294                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
1295         rep->r_rtt = rep->r_rexmit = 0;
1296         if (proct[procnum] > 0)
1297                 rep->r_flags = R_TIMING;
1298         else
1299                 rep->r_flags = 0;
1300         rep->r_mrep = NULL;
1301
1302         /*
1303          * Do the client side RPC.
1304          */
1305         OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1306         /*
1307          * Chain request into list of outstanding requests. Be sure
1308          * to put it LAST so timer finds oldest requests first.
1309          */
1310         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1311
1312         /*
1313          * If backing off another request or avoiding congestion, don't
1314          * send this one now but let timer do it. If not timing a request,
1315          * do it now.
1316          */
1317         if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1318                            (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1319                            nmp->nm_sent < nmp->nm_cwnd)) {
1320                 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1321
1322                 if (connrequired)
1323                         error = nfs_sndlock(rep);
1324
1325                 /*
1326                  * Set the R_SENT before doing the send in case another thread
1327                  * processes the reply before the nfs_send returns here
1328                  */
1329                 if (!error) {
1330                         if ((rep->r_flags & R_MUSTRESEND) == 0) {
1331                                 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1332                                       nmp->nm_cwnd);
1333                                 nmp->nm_sent += NFS_CWNDSCALE;
1334                                 rep->r_flags |= R_SENT;
1335                         }
1336
1337                         error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1338                         if (!error)
1339                                 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1340                         if (connrequired)
1341                                 nfs_sndunlock(rep);
1342                 }
1343                 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1344                 if (error) {
1345                         if (nmp)
1346                                 nmp->nm_sent -= NFS_CWNDSCALE;
1347                         rep->r_flags &= ~R_SENT;
1348                 }
1349         } else {
1350                 rep->r_rtt = -1;
1351         }
1352
1353         /*
1354          * Wait for the reply from our send or the timer's.
1355          */
1356         if (!error || error == EPIPE)
1357                 error = nfs_reply(rep);
1358
1359         /*
1360          * RPC done, unlink the request.
1361          */
1362         nfs_repdequeue(rep);
1363
1364         nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1365
1366         /*
1367          * Decrement the outstanding request count.
1368          */
1369         if (rep->r_flags & R_SENT) {
1370                 rep->r_flags &= ~R_SENT;        /* paranoia */
1371                 if (nmp) {
1372                         FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1373                         nmp->nm_sent -= NFS_CWNDSCALE;
1374                 }
1375         }
1376
1377         /*
1378          * If there was a successful reply and a tprintf msg.
1379          * tprintf a response.
1380          */
1381         if (!error)
1382                 nfs_up(nmp, procp, NFSSTA_TIMEO,
1383                         (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1384         mrep = rep->r_mrep;
1385         md = rep->r_md;
1386         dpos = rep->r_dpos;
1387         if (!error && !nmp)
1388                 error = ENXIO;
1389         if (error) {
1390                 mbuf_freem(rep->r_mreq);
1391                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1392                 return (error);
1393         }
1394
1395         /*
1396          * break down the rpc header and check if ok
1397          */
1398         nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1399         if (*tl++ == rpc_msgdenied) {
1400                 if (*tl == rpc_mismatch)
1401                         error = EOPNOTSUPP;
1402                 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1403                         if (!failed_auth) {
1404                                 failed_auth++;
1405                                 error = mbuf_setnext(mheadend, NULL);
1406                                 mbuf_freem(mrep);
1407                                 mbuf_freem(rep->r_mreq);
1408                                 if (!error)
1409                                         goto kerbauth;
1410                                 printf("nfs_request: mbuf_setnext failed\n");
1411                         } else
1412                                 error = EAUTH;
1413                 } else
1414                         error = EACCES;
1415                 mbuf_freem(mrep);
1416                 mbuf_freem(rep->r_mreq);
1417                 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1418                 return (error);
1419         }
1420
1421         /*
1422          * Grab any Kerberos verifier, otherwise just throw it away.
1423          */
1424         verf_type = fxdr_unsigned(int, *tl++);
1425         i = fxdr_unsigned(int, *tl);
1426         if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1427                 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1428                 if (error)
1429                         goto nfsmout;
1430         } else if (i > 0)
1431                 nfsm_adv(nfsm_rndup(i));
1432         nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1433         /* 0 == ok */
1434         if (*tl == 0) {
1435                 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1436                 if (*tl != 0) {
1437                         error = fxdr_unsigned(int, *tl);
1438                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1439                                 error == NFSERR_TRYLATER) {
1440                                 mbuf_freem(mrep);
1441                                 error = 0;
1442                                 microuptime(&now);
1443                                 waituntil = now.tv_sec + trylater_delay;
1444                                 while (now.tv_sec < waituntil) {
1445                                         tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1446                                         microuptime(&now);
1447                                 }
1448                                 trylater_delay *= 2;
1449                                 if (trylater_delay > 60)
1450                                         trylater_delay = 60;
1451                                 goto tryagain;
1452                         }
1453
1454                         /*
1455                          * If the File Handle was stale, invalidate the
1456                          * lookup cache, just in case.
1457                          */
1458                         if ((error == ESTALE) && vp)
1459                                 cache_purge(vp);
1460                         if (nmp->nm_flag & NFSMNT_NFSV3) {
1461                                 *mrp = mrep;
1462                                 *mdp = md;
1463                                 *dposp = dpos;
1464                                 error |= NFSERR_RETERR;
1465                         } else {
1466                                 mbuf_freem(mrep);
1467                                 error &= ~NFSERR_RETERR;
1468                         }
1469                         mbuf_freem(rep->r_mreq);
1470                         FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1471                         return (error);
1472                 }
1473
1474                 *mrp = mrep;
1475                 *mdp = md;
1476                 *dposp = dpos;
1477                 mbuf_freem(rep->r_mreq);
1478                 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1479                 return (0);
1480         }
1481         mbuf_freem(mrep);
1482         error = EPROTONOSUPPORT;
1483 nfsmout:
1484         mbuf_freem(rep->r_mreq);
1485         FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1486         return (error);
1487 }
1488
1489 #ifndef NFS_NOSERVER
1490 /*
1491  * Generate the rpc reply header
1492  * siz arg. is used to decide if adding a cluster is worthwhile
      *
      * Arguments, as used below:
      *   siz   - caller's size estimate for the reply; RPC_REPLYSIZ is added
      *           and the total is compared with nfs_mbuf_minclsize to choose
      *           between mbuf_getpacket() and mbuf_gethdr().
      *   nd    - request descriptor; nd_retxid, nd_flag, nd_cr and nd_nam2
      *           are read.
      *   slp   - server socket; only touched for ND_KERBFULL requests, where
      *           ns_rwlock is held shared while the uid hash is searched.
      *   err   - status to encode.  ERPCMISMATCH, or any value with
      *           NFSERR_AUTHERR set, yields a "message denied" reply.
      *           EPROGUNAVAIL, EPROGMISMATCH, EPROCUNAVAIL and EBADRPC map to
      *           the corresponding RPC_* accept status.  Anything else is an
      *           accepted reply followed (unless err == NFSERR_RETVOID) by
      *           one status word: 0, or nfsrv_errmap(nd, err) if err != 0.
      *   mrq   - if non-NULL, receives the head mbuf of the reply.
      *   mbp   - receives mb, the mbuf currently being filled.
      *   bposp - receives bpos, the current build position.
      *
      * Returns 0 on success, or the mbuf allocation error, in which case none
      * of the output arguments are written.  nfsstats.srvrpc_errs is
      * incremented when err is non-zero and not NFSERR_RETVOID.
      *
      * NOTE(review): mb2 is never referenced directly in this function;
      * presumably nfsm_build() uses mb, mb2 and bpos (the macro is not
      * visible here) -- confirm before removing anything.
1493  */
1494 int
1495 nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
1496         int siz;
1497         struct nfsrv_descript *nd;
1498         struct nfssvc_sock *slp;
1499         int err;
1500         mbuf_t *mrq;
1501         mbuf_t *mbp;
1502         caddr_t *bposp;
1503 {
1504         u_long *tl;
1505         mbuf_t mreq;
1506         caddr_t bpos;
1507         mbuf_t mb, mb2;
1508         int error, mlen;
1509
1510         /*
1511          * If this is a big reply, use a cluster else
1512          * try and leave leading space for the lower level headers.
1513          */
1514         siz += RPC_REPLYSIZ;
1515         if (siz >= nfs_mbuf_minclsize) {
1516                 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1517         } else {
1518                 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1519         }
1520         if (error) {
1521                 /* unable to allocate packet */
1522                 /* XXX nfsstat? */
1523                 return (error);
1524         }
1525         mb = mreq;
1526         tl = mbuf_data(mreq);
             /* the fixed part of the header written below is six XDR words */
1527         mlen = 6 * NFSX_UNSIGNED;
1528         if (siz < nfs_mbuf_minclsize) {
1529                 /* leave space for lower level headers */
1530                 tl += 80/sizeof(*tl);  /* XXX max_hdr? XXX */
1531                 mbuf_setdata(mreq, tl, mlen);
1532         } else {
1533                 mbuf_setlen(mreq, mlen);
1534         }
1535         bpos = ((caddr_t)tl) + mlen;
1536         *tl++ = txdr_unsigned(nd->nd_retxid);
1537         *tl++ = rpc_reply;
1538         if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1539                 *tl++ = rpc_msgdenied;
1540                 if (err & NFSERR_AUTHERR) {
1541                         *tl++ = rpc_autherr;
1542                         *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
                             /*
                              * This form uses only five words, so give back
                              * the sixth: shrink the mbuf length and pull the
                              * build position back by one word.
                              */
1543                         mlen -= NFSX_UNSIGNED;
1544                         mbuf_setlen(mreq, mlen);
1545                         bpos -= NFSX_UNSIGNED;
1546                 } else {
1547                         *tl++ = rpc_mismatch;
1548                         *tl++ = txdr_unsigned(RPC_VER2);
1549                         *tl = txdr_unsigned(RPC_VER2);
1550                 }
1551         } else {
1552                 *tl++ = rpc_msgaccepted;
1553
1554                 /*
1555                  * For Kerberos authentication, we must send the nickname
1556                  * verifier back, otherwise just RPCAUTH_NULL.
1557                  */
1558                 if (nd->nd_flag & ND_KERBFULL) {
1559                     struct nfsuid *nuidp;
1560                     struct timeval ktvin, ktvout;
1561                     uid_t uid = kauth_cred_getuid(nd->nd_cr);
1562
                         /*
                          * Look for a cached nfsuid entry with the same uid
                          * and, when the request carries an address
                          * (nd_nam2), a matching host address.
                          */
1563                     lck_rw_lock_shared(&slp->ns_rwlock);
1564                     for (nuidp = NUIDHASH(slp, uid)->lh_first;
1565                         nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1566                         if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1567                             (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1568                              &nuidp->nu_haddr, nd->nd_nam2)))
1569                             break;
1570                     }
1571                     if (nuidp) {
1572                         ktvin.tv_sec =
1573                             txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1574                         ktvin.tv_usec =
1575                             txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1576
1577                         /*
1578                          * Encrypt the timestamp in ecb mode using the
1579                          * session key.
1580                          */
                             /*
                              * NOTE(review): the encryption step is only the
                              * NFSKERB placeholder below.  In the code visible
                              * here ktvout is never assigned, yet its fields
                              * are copied into the reply, and ktvin is
                              * assigned but never read.
                              */
1581 #if NFSKERB
1582                         XXX
1583 #endif
1584
1585                         *tl++ = rpc_auth_kerb;
1586                         *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1587                         *tl = ktvout.tv_sec;
                             /*
                              * Three more words: tv_usec, the uid, and the
                              * slot for the accept status stored via *tl in
                              * the switch below.
                              */
1588                         nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1589                         *tl++ = ktvout.tv_usec;
1590                         *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1591                     } else {
                             /* no entry found: write two zero words instead */
1592                         *tl++ = 0;
1593                         *tl++ = 0;
1594                     }
1595                     lck_rw_done(&slp->ns_rwlock);
1596                 } else {
1597                         *tl++ = 0;
1598                         *tl++ = 0;
1599                 }
                     /* tl now points at the accept-status word */
1600                 switch (err) {
1601                 case EPROGUNAVAIL:
1602                         *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1603                         break;
1604                 case EPROGMISMATCH:
1605                         *tl = txdr_unsigned(RPC_PROGMISMATCH);
1606                         nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1607                         // XXX hard coded versions
1608                         *tl++ = txdr_unsigned(2);
1609                         *tl = txdr_unsigned(3);
1610                         break;
1611                 case EPROCUNAVAIL:
1612                         *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1613                         break;
1614                 case EBADRPC:
1615                         *tl = txdr_unsigned(RPC_GARBAGE);
1616                         break;
1617                 default:
                             /*
                              * Accept status 0, then (unless the caller asked
                              * for a void reply) one more word holding the
                              * mapped error or 0.
                              */
1618                         *tl = 0;
1619                         if (err != NFSERR_RETVOID) {
1620                                 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1621                                 if (err)
1622                                     *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1623                                 else
1624                                     *tl = 0;
1625                         }
1626                         break;
1627                 }
1628         }
1629
             /* hand the reply chain and the build cursor back to the caller */
1630         if (mrq != NULL)
1631                 *mrq = mreq;
1632         *mbp = mb;
1633         *bposp = bpos;
1634         if (err != 0 && err != NFSERR_RETVOID) {
1635                 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1636         }
1637         return (0);
1638 }
1639
1640
1641 #endif /* NFS_NOSERVER */
1642
1643
1644 /*
1645  * From FreeBSD 1.58, a Matt Dillon fix...
1646  * Flag a request as being about to terminate.
1647  * The nm_sent count is decremented now to avoid deadlocks when the process
1648  * in soreceive() hasn't yet managed to send its own request.
      *
      * Sets R_SOFTTERM on rep.  If the request was marked R_SENT, its share
      * (NFS_CWNDSCALE) is subtracted from the mount's nm_sent and R_SENT is
      * cleared.  rep->r_nmp is dereferenced without a NULL check on that
      * path.
1649  */
1650 static void
1651 nfs_softterm(struct nfsreq *rep)
1652 {
1653
1654         rep->r_flags |= R_SOFTTERM;
1655         if (rep->r_flags & R_SENT) {
1656                 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1657                       rep->r_nmp->nm_cwnd);
1658                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1659                 rep->r_flags &= ~R_SENT;
1660         }
1661 }
1662
     /*
      * Wrapper around nfs_timer(): calls
      * thread_funnel_set(kernel_flock, TRUE) before and
      * thread_funnel_set(kernel_flock, FALSE) after.  This is the function
      * that nfs_timer() hands to timeout() to re-arm itself.  arg is passed
      * through unchanged (nfs_timer() declares it __unused).
      */
1663 void
1664 nfs_timer_funnel(void * arg)
1665 {
1666         (void) thread_funnel_set(kernel_flock, TRUE);
1667         nfs_timer(arg);
1668         (void) thread_funnel_set(kernel_flock, FALSE);
1669
1670 }
1671
1672 /*
1673  * Ensure rep isn't in use by the timer, then dequeue it.
      *
      * While R_BUSY is set (see nfs_repbusy()), mark the request R_WAITING
      * and sleep on rep; nfs_repnext() clears both flags and issues the
      * matching wakeup(rep).  Once the request is no longer busy it is
      * removed from the global nfs_reqq list.
1674  */
1675 static void
1676 nfs_repdequeue(struct nfsreq *rep)
1677 {
1678
1679         while ((rep->r_flags & R_BUSY)) {
1680                 rep->r_flags |= R_WAITING;
1681                 tsleep(rep, PSOCK, "repdeq", 0);
1682         }
1683         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1684 }
1685
1686 /*
1687  * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1688  * free()'d out from under it.
      *
      * Panics with "rep locked" if the request is already marked R_BUSY;
      * otherwise sets R_BUSY.  The flag is cleared again in nfs_repnext().
1689  */
1690 static void
1691 nfs_repbusy(struct nfsreq *rep)
1692 {
1693
1694         if ((rep->r_flags & R_BUSY))
1695                 panic("rep locked");
1696         rep->r_flags |= R_BUSY;
1697 }
1698
1699 /*
1700  * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
      *
      * Returns NULL when rep is NULL or when rep was the last entry on the
      * list.  If a thread marked rep R_WAITING (see nfs_repdequeue()), that
      * flag is cleared and the sleeper is woken.
1701  */
1702 static struct nfsreq *
1703 nfs_repnext(struct nfsreq *rep)
1704 {
1705         struct nfsreq * nextrep;
1706
1707         if (rep == NULL)
1708                 return (NULL);
1709         /*
1710          * We need to get and busy the next req before signalling the
1711          * current one, otherwise wakeup() may block us and we'll race to
1712          * grab the next req.
1713          */
1714         nextrep = TAILQ_NEXT(rep, r_chain);
1715         if (nextrep != NULL)
1716                 nfs_repbusy(nextrep);
1717         /* unbusy and signal. */
1718         rep->r_flags &= ~R_BUSY;
1719         if ((rep->r_flags & R_WAITING)) {
1720                 rep->r_flags &= ~R_WAITING;
1721                 wakeup(rep);
1722         }
1723         return (nextrep);
1724 }
1725
1726 /*
1727  * Nfs timer routine
1728  * Scan the nfsreq list and retransmit any requests that have timed out
1729  * To avoid retransmission attempts on STREAM sockets (in the future) make
1730  * sure to set the r_retry field to 0 (implies nm_retry == 0).
      *
      * Each pass:
      *   1. walks nfs_reqq, keeping the current entry marked busy via
      *      nfs_repbusy()/nfs_repnext();
      *   2. (server builds only) wakes an nfsd for every server socket whose
      *      write-gathering deadline ns_wgtime has passed;
      *   3. calls nfs_buf_freeup() if nfsbuffreeuptimestamp is at least 30
      *      seconds old;
      *   4. re-arms itself through timeout(nfs_timer_funnel, ..., nfs_ticks).
      * The argument is unused.
1731  */
1732 void
1733 nfs_timer(__unused void *arg)
1734 {
1735         struct nfsreq *rep;
1736         mbuf_t m;
1737         socket_t so;
1738         struct nfsmount *nmp;
1739         int timeo;
1740         int error;
1741 #ifndef NFS_NOSERVER
1742         struct nfssvc_sock *slp;
1743         u_quad_t cur_usec;
1744 #endif /* NFS_NOSERVER */
1745         int flags, rexmit, cwnd, sent;
1746         u_long xid;
1747         struct timeval now;
1748
1749         rep = TAILQ_FIRST(&nfs_reqq);
1750         if (rep != NULL)
1751                 nfs_repbusy(rep);
1752         microuptime(&now);
1753         for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1754                 nmp = rep->r_nmp;
1755                 if (!nmp) /* unmounted */
1756                     continue;
                     /* skip requests that already have a reply or were terminated */
1757                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1758                         continue;
1759                 if (nfs_sigintr(nmp, rep, rep->r_procp))
1760                         continue;
                     /*
                      * After more than two retransmits, or a failed resend,
                      * report "not responding", but not more often than
                      * once per nm_tprintf_delay seconds for this request.
                      */
1761                 if (nmp->nm_tprintf_initial_delay != 0 &&
1762                     (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1763                     rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1764                         rep->r_lastmsg = now.tv_sec;
1765                         nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1766                                 "not responding");
1767                         rep->r_flags |= R_TPRINTFMSG;
1768                         if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1769                                 /* we're not yet completely mounted and */
1770                                 /* we can't complete an RPC, so we fail */
1771                                 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1772                                 nfs_softterm(rep);
1773                                 continue;
1774                         }
1775                 }
                     /*
                      * r_rtt >= 0 means the request is being timed: count
                      * this pass and compare against the timeout.  A value
                      * of -1 (set below before a resend attempt) skips this
                      * check, so the resend is attempted on this pass.
                      */
1776                 if (rep->r_rtt >= 0) {
1777                         rep->r_rtt++;
1778                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1779                                 timeo = nmp->nm_timeo;
1780                         else
1781                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1782                         /* ensure 62.5 ms floor */
1783                         while (16 * timeo < hz)
1784                             timeo *= 2;
                             /* scale by the backoff table; nm_timeouts is capped at 8 below */
1785                         if (nmp->nm_timeouts > 0)
1786                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1787                         if (rep->r_rtt <= timeo)
1788                                 continue;
1789                         if (nmp->nm_timeouts < 8)
1790                                 nmp->nm_timeouts++;
1791                 }
1792                 /*
1793                  * Check for too many retransmits.  This is never true for
1794                  * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1795                  * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1796                  */
1797                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
1798                         OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1799                         nfs_softterm(rep);
1800                         continue;
1801                 }
                     /*
                      * Non-datagram sockets: only count the timeout (capped
                      * at NFS_MAXREXMIT); nothing is resent from here.
                      */
1802                 if (nmp->nm_sotype != SOCK_DGRAM) {
1803                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1804                                 rep->r_rexmit = NFS_MAXREXMIT;
1805                         continue;
1806                 }
1807                 if ((so = nmp->nm_so) == NULL)
1808                         continue;
1809
1810                 /*
1811                  * If there is enough space and the window allows..
1812                  *      Resend it
1813                  * Set r_rtt to -1 in case we fail to send it now.
1814                  */
1815                 rep->r_rtt = -1;
1816                 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1817                     (rep->r_flags & R_SENT) ||
1818                     nmp->nm_sent < nmp->nm_cwnd) &&
1819                    (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1820                         struct msghdr   msg;
1821                         /*
1822                          * Iff first send, start timing
1823                          * else turn timing off, backoff timer
1824                          * and divide congestion window by 2.
1825                          * We update these *before* the send to avoid
1826                          * racing against receiving the reply.
1827                          * We save them so we can restore them on send error.
1828                          */
1829                         flags = rep->r_flags;
1830                         rexmit = rep->r_rexmit;
1831                         cwnd = nmp->nm_cwnd;
1832                         sent = nmp->nm_sent;
1833                         xid = rep->r_xid;
1834                         if (rep->r_flags & R_SENT) {
1835                                 rep->r_flags &= ~R_TIMING;
1836                                 if (++rep->r_rexmit > NFS_MAXREXMIT)
1837                                         rep->r_rexmit = NFS_MAXREXMIT;
1838                                 nmp->nm_cwnd >>= 1;
1839                                 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1840                                         nmp->nm_cwnd = NFS_CWNDSCALE;
1841                                 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1842                         } else {
1843                                 rep->r_flags |= R_SENT;
1844                                 nmp->nm_sent += NFS_CWNDSCALE;
1845                         }
1846                         FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1847
                             /* only NFSMNT_NOCONN mounts supply a destination address */
1848                         bzero(&msg, sizeof(msg));
1849                         if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1850                                 msg.msg_name = mbuf_data(nmp->nm_nam);
1851                                 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1852                         }
1853                         error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1854
1855                         FSDBG(535, xid, error, sent, cwnd);
1856
1857                         if (error) {
1858                                 if (error == EWOULDBLOCK) {
                                             /* nothing went out: put everything back as it was */
1859                                         rep->r_flags = flags;
1860                                         rep->r_rexmit = rexmit;
1861                                         nmp->nm_cwnd = cwnd;
1862                                         nmp->nm_sent = sent;
1863                                         rep->r_xid = xid;
1864                                 }
1865                                 else {
                                             /*
                                              * Any other error: the SO_ERROR
                                              * value is fetched and discarded
                                              * (presumably to clear it on the
                                              * socket), state is restored, and
                                              * R_RESENDERR is recorded.
                                              */
1866                                         if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1867                                                 int clearerror;
1868                                                 int optlen = sizeof(clearerror);
1869                                                 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1870                                         }
1871                                         rep->r_flags  = flags | R_RESENDERR;
1872                                         rep->r_rexmit = rexmit;
1873                                         nmp->nm_cwnd = cwnd;
1874                                         nmp->nm_sent = sent;
                                             /* undo the rpcretries bump made above for a retransmit */
1875                                         if (flags & R_SENT)
1876                                                 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1877                                 }
1878                         } else
1879                                 rep->r_rtt = 0;
1880                 }
1881         }
1882         microuptime(&now);
1883 #ifndef NFS_NOSERVER
1884         /*
1885          * Scan the write gathering queues for writes that need to be
1886          * completed now.
1887          */
1888         cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1889         lck_mtx_lock(nfsd_mutex);
1890         TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1891             if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1892                 nfsrv_wakenfsd(slp);
1893         }
1894         lck_mtx_unlock(nfsd_mutex);
1895 #endif /* NFS_NOSERVER */
1896
1897         if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1898                 /*
1899                  * We haven't called nfs_buf_freeup() in a little while.
1900                  * So, see if we can free up any stale/unused bufs now.
1901                  */
1902                 nfs_buf_freeup(1);
1903         }
1904
             /* schedule the next pass */
1905         timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1906
1907 }
1908
1909
1910 /*
1911  * Test for a termination condition pending on the process.
1912  * This is used to determine if we need to bail on a mount.
1913  * EIO is returned if there has been a soft timeout.
1914  * EINTR is returned if there is a signal pending that is not being ignored
1915  * and the mount is interruptible, or if we are a thread that is in the process
1916  * of cancellation (also SIGKILL posted).
      *
      * Further results, in the order they are checked below:
      *   ENXIO - nmp is NULL, or rep is given and rep->r_nmp is NULL.
      *   EIO   - rep is given and: it is marked R_SOFTTERM; or its mount has
      *           both NFSSTA_FORCE and NFSSTA_TIMEO set; or p asked not to
      *           hang on remote filesystems and the mount is in NFSSTA_TIMEO.
      *   0     - p is NULL, or nothing above and no EINTR condition applies.
      *
      * Side effect: when rep's mount point has MNTK_FRCUNMOUNT set, the mount
      * is switched to soft (NFSMNT_SOFT) and NFSSTA_FORCE is recorded.
1917  */
1918 int
1919 nfs_sigintr(nmp, rep, p)
1920         struct nfsmount *nmp;
1921         struct nfsreq *rep;
1922         proc_t p;
1923 {
1924         sigset_t pending_sigs;
1925         int context_good = 0;   /* not referenced anywhere below */
1926         struct nfsmount *repnmp;
1927         extern proc_t kernproc;
1928
1929         if (nmp == NULL)
1930                 return (ENXIO);
1931         if (rep != NULL) {
1932                 repnmp = rep->r_nmp;
1933                 /* we've had a forced unmount. */
1934                 if (repnmp == NULL)
1935                         return (ENXIO);
1936                 /* request has timed out on a 'soft' mount. */
1937                 if (rep->r_flags & R_SOFTTERM)
1938                         return (EIO);
1939                 /*
1940                  * A forced unmount is in progress and there has been a
1941                  * timeout: treat the mount as dead and fail the I/O.
1942                  */
1943                 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1944                    (NFSSTA_FORCE|NFSSTA_TIMEO))
1945                         return (EIO);
1946                 /* Someone is unmounting us, go soft and mark it. */
                     /*
                      * NOTE(review): NFSMNT_SOFT is set through repnmp but
                      * NFSSTA_FORCE through nmp.  The callers visible in
                      * this part of the file pass rep->r_nmp as nmp, so both
                      * name the same mount there -- confirm for other callers.
                      */
1947                 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1948                         repnmp->nm_flag |= NFSMNT_SOFT;
1949                         nmp->nm_state |= NFSSTA_FORCE;
1950                 }
1951                 /*
1952                  * If the mount is hung and we've requested not to hang
1953                  * on remote filesystems, then bail now.
1954                  */
1955                 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1956                     (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1957                         return (EIO);
1958         }
1959         /* XXX: is this valid?  this probably should be an assertion. */
1960         if (p == NULL)
1961                 return (0);
1962
1963         /* Threads belonging to the kernel task skip the abort check. */
1964         if ((current_proc() != kernproc) && current_thread_aborted()) {
1965                 return (EINTR);
1966         }
1967         /* mask off thread and process blocked signals. */
1968
             /* pending signals only interrupt when the mount is NFSMNT_INT */
1969         pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1970         if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1971                 return (EINTR);
1972         return (0);
1973 }
1974
1975 /*
1976  * Lock a socket against others.
1977  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1978  * and also to avoid race conditions between the processes with nfs requests
1979  * in progress when a reconnect is necessary.
      *
      * The lock is the NFSSTA_SNDLOCK bit in the mount's nm_state; waiters set
      * NFSSTA_WANTSND and sleep on the address of nm_state.
      *
      * Returns 0 with the lock held, ENXIO if the request has no mount
      * (initially or after a sleep), or whatever nfs_sigintr() reports.
      *
      * Sleep policy: on an NFSMNT_INT mount the first sleep uses PCATCH;
      * after it, PCATCH is dropped and the timeout becomes 2 * hz.  For
      * processes that asked not to hang on remote filesystems the timeout is
      * set to hz before each sleep so nfs_sigintr() is re-checked
      * periodically.
1980  */
1981 int
1982 nfs_sndlock(rep)
1983         struct nfsreq *rep;
1984 {
1985         int *statep;
1986         proc_t p;
1987         int error, slpflag = 0, slptimeo = 0;
1988
1989         if (rep->r_nmp == NULL)
1990                 return (ENXIO);
1991         statep = &rep->r_nmp->nm_state;
1992
1993         p = rep->r_procp;
1994         if (rep->r_nmp->nm_flag & NFSMNT_INT)
1995                 slpflag = PCATCH;
1996         while (*statep & NFSSTA_SNDLOCK) {
1997                 error = nfs_sigintr(rep->r_nmp, rep, p);
1998                 if (error)
1999                         return (error);
2000                 *statep |= NFSSTA_WANTSND;
2001                 if (p != NULL && (proc_noremotehang(p)) != 0)
2002                         slptimeo = hz;
2003                 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2004                 if (slpflag == PCATCH) {
2005                         slpflag = 0;
2006                         slptimeo = 2 * hz;
2007                 }
2008                 /*
2009                  * Make sure while we slept that the mountpoint didn't go away.
2010                  * nfs_sigintr and callers expect it intact.
2011                  */
2012                 if (!rep->r_nmp)
2013                         return (ENXIO); /* don't have lock until out of loop */
2014         }
2015         *statep |= NFSSTA_SNDLOCK;
2016         return (0);
2017 }
2018
2019 /*
2020  * Unlock the stream socket for others.
      *
      * Clears NFSSTA_SNDLOCK in the mount's nm_state and, if a waiter set
      * NFSSTA_WANTSND, clears that too and wakes sleepers on nm_state.
      * Does nothing when the request no longer has a mount; panics with
      * "nfs sndunlock" if the lock bit was not set.
2021  */
2022 void
2023 nfs_sndunlock(rep)
2024         struct nfsreq *rep;
2025 {
2026         int *statep;
2027
2028         if (rep->r_nmp == NULL)
2029                 return;
2030         statep = &rep->r_nmp->nm_state;
2031         if ((*statep & NFSSTA_SNDLOCK) == 0)
2032                 panic("nfs sndunlock");
2033         *statep &= ~NFSSTA_SNDLOCK;
2034         if (*statep & NFSSTA_WANTSND) {
2035                 *statep &= ~NFSSTA_WANTSND;
2036                 wakeup((caddr_t)statep);
2037         }
2038 }
2039
     /*
      * Take the receive-side lock (NFSSTA_RCVLOCK in the mount's nm_state)
      * on behalf of rep.  Waiters set NFSSTA_WANTRCV and sleep on the address
      * of nm_state.
      *
      * Returns:
      *   0        - lock acquired.
      *   EALREADY - a reply (r_mrep) is already attached to rep, so there is
      *              no need to wait for the lock.
      *   ENXIO    - the request has no mount, initially (and no reply yet)
      *              or after a sleep.
      *   other    - whatever nfs_sigintr() reported.
      *
      * Sleep policy matches nfs_sndlock(): PCATCH on the first sleep of an
      * NFSMNT_INT mount, then a plain sleep with a 2 * hz timeout.
      */
2040 static int
2041 nfs_rcvlock(struct nfsreq *rep)
2042 {
2043         int *statep;
2044         int error, slpflag, slptimeo = 0;
2045
2046         /* make sure we still have our mountpoint */
2047         if (!rep->r_nmp) {
2048                 if (rep->r_mrep != NULL)
2049                         return (EALREADY);
2050                 return (ENXIO);
2051         }
2052
2053         statep = &rep->r_nmp->nm_state;
2054         FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2055         if (rep->r_nmp->nm_flag & NFSMNT_INT)
2056                 slpflag = PCATCH;
2057         else
2058                 slpflag = 0;
2059         while (*statep & NFSSTA_RCVLOCK) {
2060                 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2061                         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2062                         return (error);
2063                 } else if (rep->r_mrep != NULL) {
2064                         /*
2065                          * Don't bother sleeping if reply already arrived
2066                          */
2067                         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2068                         return (EALREADY);
2069                 }
2070                 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2071                 *statep |= NFSSTA_WANTRCV;
2072                 /*
2073                  * We need to poll if we're P_NOREMOTEHANG so that we
2074                  * call nfs_sigintr periodically above.
2075                  */
2076                 if (rep->r_procp != NULL &&
2077                     (proc_noremotehang(rep->r_procp)) != 0)
2078                         slptimeo = hz;
2079                 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2080                 if (slpflag == PCATCH) {
2081                         slpflag = 0;
2082                         slptimeo = 2 * hz;
2083                 }
2084                 /*
2085                  * Make sure while we slept that the mountpoint didn't go away.
2086                  * nfs_sigintr and caller nfs_reply expect it intact.
2087                  */
2088                 if (!rep->r_nmp)  {
2089                         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2090                         return (ENXIO); /* don't have lock until out of loop */
2091                 }
2092         }
2093         /*
2094          * nfs_reply will handle it if reply already arrived.
2095          * (We may have slept or been preempted).
2096          */
2097         FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2098         *statep |= NFSSTA_RCVLOCK;
2099         return (0);
2100 }
2101
2102 /*
2103  * Release the receive lock taken by nfs_rcvlock().
      *
      * Clears NFSSTA_RCVLOCK in the mount's nm_state and, if a waiter set
      * NFSSTA_WANTRCV, clears that too and wakes sleepers on nm_state.
      * Does nothing when the request no longer has a mount; panics with
      * "nfs rcvunlock" if the lock bit was not set.
2104  */
2105 static void
2106 nfs_rcvunlock(struct nfsreq *rep)
2107 {
2108         int *statep;
2109
2110         if (rep->r_nmp == NULL)
2111                 return;
2112         statep = &rep->r_nmp->nm_state;
2113
2114         FSDBG(533, statep, *statep, 0, 0);
2115         if ((*statep & NFSSTA_RCVLOCK) == 0)
2116                 panic("nfs rcvunlock");
2117         *statep &= ~NFSSTA_RCVLOCK;
2118         if (*statep & NFSSTA_WANTRCV) {
2119                 *statep &= ~NFSSTA_WANTRCV;
2120                 wakeup((caddr_t)statep);
2121         }
2122 }
2123
2124
2125 #ifndef NFS_NOSERVER
2126 /*
2127  * Socket upcall routine for the nfsd sockets.
2128  * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2129  * Essentially do as much as possible non-blocking, else punt and it will
2130  * be called with MBUF_WAITOK from an nfsd.
      *
      * Returns immediately when nfs_numnfsd is zero or the socket is not
      * marked SLP_VALID (both tested before any lock is taken).  Otherwise
      * ns_rwlock is acquired exclusively and the work is handed to
      * nfsrv_rcv_locked(); this function never releases the lock itself.
2131  */
2132 void
2133 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2134 {
2135         struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2136
2137         if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2138                 return;
2139
2140         lck_rw_lock_exclusive(&slp->ns_rwlock);
2141         nfsrv_rcv_locked(so, slp, waitflag);
2142         /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2143 }
/*
 * Drain pending data from a server socket and queue it on the nfssvc_sock.
 *
 * SOCK_STREAM sockets: the received mbuf chain is appended to the raw
 * byte stream (ns_raw/ns_rawend, length in ns_cc) and nfsrv_getstream()
 * is called to carve records out of it.
 * Other socket types: every received packet is appended to the record
 * queue (ns_rec/ns_recend), preceded by an MBUF_TYPE_SONAME mbuf holding
 * the sender's address when one could be obtained.
 *
 * Any condition that needs an nfsd's attention is recorded in
 * slp->ns_flag as SLP_NEEDQ or SLP_DISCONN.
 *
 * Locking: when waitflag is MBUF_DONTWAIT, slp->ns_rwlock is released on
 * every return path and, if work is pending and nfs_numnfsd is non-zero,
 * nfsrv_wakenfsd() is called with nfsd_mutex held.  For any other
 * waitflag this function does not touch the lock.
 */
2144 void
2145 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2146 {
2147         mbuf_t m, mp, mhck, m2;
2148         int ns_flag=0, error;
2149         struct msghdr   msg;
2150         size_t bytes_read;
2151
             /*
              * An invalidated socket has nothing to receive, but the
              * MBUF_DONTWAIT lock-drop contract still has to be honoured.
              */
2152         if ((slp->ns_flag & SLP_VALID) == 0) {
2153                 if (waitflag == MBUF_DONTWAIT)
2154                         lck_rw_done(&slp->ns_rwlock);
2155                 return;
2156         }
2157
2158 #ifdef notdef
2159         /*
2160          * Define this to test for nfsds handling this under heavy load.
2161          */
2162         if (waitflag == MBUF_DONTWAIT) {
2163                 ns_flag = SLP_NEEDQ;
2164                 goto dorecs;
2165         }
2166 #endif
2167         if (slp->ns_sotype == SOCK_STREAM) {
2168                 /*
2169                  * If there are already records on the queue, defer soreceive()
2170                  * to an nfsd so that there is feedback to the TCP layer that
2171                  * the nfs servers are heavily loaded.
2172                  */
2173                 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2174                         ns_flag = SLP_NEEDQ;
2175                         goto dorecs;
2176                 }
2177
2178                 /*
2179                  * Do soreceive().
2180                  */
                     /*
                      * bytes_read goes in as an upper bound on how much to take.
                      * NOTE(review): presumably sock_receivembuf() rewrites it with
                      * the amount actually returned, since it is added to ns_cc
                      * below -- confirm against the KPI.
                      */
2181                 bytes_read = 1000000000;
2182                 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
                     /*
                      * EWOULDBLOCK just means "try again later" (SLP_NEEDQ).  Any
                      * other error, or success with no mbuf at all, is treated as
                      * a disconnect.
                      */
2183                 if (error || mp == NULL) {
2184                         if (error == EWOULDBLOCK)
2185                                 ns_flag = SLP_NEEDQ;
2186                         else
2187                                 ns_flag = SLP_DISCONN;
2188                         goto dorecs;
2189                 }
                     /* Append the new chain to the raw stream and account for its bytes. */
2190                 m = mp;
2191                 if (slp->ns_rawend) {
2192                         if ((error = mbuf_setnext(slp->ns_rawend, m)))
2193                                 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2194                         slp->ns_cc += bytes_read;
2195                 } else {
2196                         slp->ns_raw = m;
2197                         slp->ns_cc = bytes_read;
2198                 }
                     /* Walk to the last mbuf of what was just received; it is the new tail. */
2199                 while ((m2 = mbuf_next(m)))
2200                         m = m2;
2201                 slp->ns_rawend = m;
2202
2203                 /*
2204                  * Now try and parse record(s) out of the raw stream data.
2205                  */
2206                 error = nfsrv_getstream(slp, waitflag);
                     /* EPERM (bad record mark) drops the connection; other errors ask for a retry. */
2207                 if (error) {
2208                         if (error == EPERM)
2209                                 ns_flag = SLP_DISCONN;
2210                         else
2211                                 ns_flag = SLP_NEEDQ;
2212                 }
2213         } else {
2214                 struct sockaddr_storage nam;
2215
                     /* Have the receive call store the sender's address in nam. */
2216                 bzero(&msg, sizeof(msg));
2217                 msg.msg_name = (caddr_t)&nam;
2218                 msg.msg_namelen = sizeof(nam);
2219
                     /*
                      * Keep receiving until a call yields no mbuf.  The error
                      * returned by sock_receivembuf() is not examined here; the
                      * only handling for it sits in the disabled "#if 0" block.
                      */
2220                 do {
2221                         bytes_read = 1000000000;
2222                         error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2223                         if (mp) {
                                     /*
                                      * Prefix the packet with an MBUF_TYPE_SONAME mbuf
                                      * holding a copy of the sender address.  Note the
                                      * allocation uses MBUF_WAITOK regardless of waitflag.
                                      * If no address mbuf can be obtained or linked, the
                                      * packet is queued without one.
                                      */
2224                                 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2225                                         mbuf_setlen(mhck, nam.ss_len);
2226                                         bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2227                                         m = mhck;
2228                                         if (mbuf_setnext(m, mp)) {
2229                                                 /* trouble... just drop it */
2230                                                 printf("nfsrv_rcv: mbuf_setnext failed\n");
2231                                                 mbuf_free(mhck);
2232                                                 m = mp;
2233                                         }
2234                                 } else {
2235                                         m = mp;
2236                                 }
                                     /* Append the packet to the tail of the record queue. */
2237                                 if (slp->ns_recend)
2238                                         mbuf_setnextpkt(slp->ns_recend, m);
2239                                 else
2240                                         slp->ns_rec = m;
2241                                 slp->ns_recend = m;
2242                                 mbuf_setnextpkt(m, NULL);
2243                         }
2244 #if 0
2245                         if (error) {
2246                                 /*
2247                                  * This may be needed in the future to support
2248                                  * non-byte-stream connection-oriented protocols
2249                                  * such as SCTP.
2250                                  */
2251                                 /*
2252                                  * This (slp->ns_sotype == SOCK_STREAM) should really
2253                                  * be a check for PR_CONNREQUIRED.
2254                                  */
2255                                 if ((slp->ns_sotype == SOCK_STREAM)
2256                                         && error != EWOULDBLOCK) {
2257                                         ns_flag = SLP_DISCONN;
2258                                         goto dorecs;
2259                                 }
2260                         }
2261 #endif
2262                 } while (mp);
2263         }
2264
2265         /*
2266          * Now try and process the request records, non-blocking.
2267          */
2268 dorecs:
2269         if (ns_flag)
2270                 slp->ns_flag |= ns_flag;
2271         if (waitflag == MBUF_DONTWAIT) {
                     /*
                      * Decide whether a wakeup is needed while the socket lock is
                      * still held, then drop it before nfsd_mutex is taken.
                      */
2272                 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2273                 lck_rw_done(&slp->ns_rwlock);
2274                 if (wake && nfs_numnfsd) {
2275                         lck_mtx_lock(nfsd_mutex);
2276                         nfsrv_wakenfsd(slp);
2277                         lck_mtx_unlock(nfsd_mutex);
2278                 }
2279         }
2280 }
2281
2282 /*
2283  * Try and extract an RPC request from the mbuf data list received on a
2284  * stream socket. The "waitflag" argument indicates whether or not it
2285  * can sleep.
 *
 * The raw stream (slp->ns_raw, slp->ns_cc bytes) is a sequence of
 * fragments, each preceded by a 4-byte record mark.  Complete fragments
 * are moved to slp->ns_frag; when a fragment flagged as the last one is
 * seen, the accumulated chain is appended to the record queue
 * (slp->ns_rec/ns_recend).  Parsing state survives between calls in
 * slp->ns_reclen and the SLP_LASTFRAG flag.
 *
 * Returns 0 when the buffered data has been consumed as far as possible,
 * EPERM when a record mark announces a length outside
 * [NFS_MINPACKET, NFS_MAXPACKET], and EWOULDBLOCK when an mbuf copy or
 * link operation fails.  SLP_GETSTREAM is held in ns_flag for the
 * duration of the call as a re-entry guard (re-entry panics).
2286  */
2287 static int
2288 nfsrv_getstream(slp, waitflag)
2289         struct nfssvc_sock *slp;
2290         int waitflag;
2291 {
2292         mbuf_t m;
2293         char *cp1, *cp2, *mdata;
2294         int len, mlen, error;
2295         mbuf_t om, m2, recm;
2296         u_long recmark;
2297
2298         if (slp->ns_flag & SLP_GETSTREAM)
2299                 panic("nfs getstream");
2300         slp->ns_flag |= SLP_GETSTREAM;
2301         for (;;) {
                 /* ns_reclen == 0: no fragment in progress, so a record mark comes next. */
2302             if (slp->ns_reclen == 0) {
                     /* Not even a full record mark buffered yet; wait for more data. */
2303                 if (slp->ns_cc < NFSX_UNSIGNED) {
2304                         slp->ns_flag &= ~SLP_GETSTREAM;
2305                         return (0);
2306                 }
2307                 m = slp->ns_raw;
2308                 mdata = mbuf_data(m);
2309                 mlen = mbuf_len(m);
2310                 if (mlen >= NFSX_UNSIGNED) {
                             /* Mark is contiguous in the first mbuf: copy it and trim it off. */
2311                         bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2312                         mdata += NFSX_UNSIGNED;
2313                         mlen -= NFSX_UNSIGNED;
2314                         mbuf_setdata(m, mdata, mlen);
2315                 } else {
                             /*
                              * Mark straddles mbufs: gather it one byte at a time,
                              * stepping over emptied mbufs and trimming each byte
                              * that has been consumed.
                              */
2316                         cp1 = (caddr_t)&recmark;
2317                         cp2 = mdata;
2318                         while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2319                                 while (mlen == 0) {
2320                                         m = mbuf_next(m);
2321                                         cp2 = mbuf_data(m);
2322                                         mlen = mbuf_len(m);
2323                                 }
2324                                 *cp1++ = *cp2++;
2325                                 mlen--;
2326                                 mbuf_setdata(m, cp2, mlen);
2327                         }
2328                 }
2329                 slp->ns_cc -= NFSX_UNSIGNED;
                     /* Top bit of the mark is the last-fragment flag; the rest is the fragment length. */
2330                 recmark = ntohl(recmark);
2331                 slp->ns_reclen = recmark & ~0x80000000;
2332                 if (recmark & 0x80000000)
2333                         slp->ns_flag |= SLP_LASTFRAG;
2334                 else
2335                         slp->ns_flag &= ~SLP_LASTFRAG;
                     /* Implausible fragment length: the caller treats EPERM as a disconnect. */
2336                 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2337                         slp->ns_flag &= ~SLP_GETSTREAM;
2338                         return (EPERM);
2339                 }
2340             }
2341
2342             /*
2343              * Now get the record part.
2344              *
2345              * Note that slp->ns_reclen may be 0.  Linux sometimes
2346              * generates 0-length RPCs
              *
              * NOTE(review): the range check above rejects any length below
              * NFS_MINPACKET, so a 0 length only gets this far if
              * NFS_MINPACKET is 0 -- confirm against its definition.
2347              */
2348             recm = NULL;
2349             if (slp->ns_cc == slp->ns_reclen) {
                     /* Exactly one fragment is buffered: take the whole raw chain. */
2350                 recm = slp->ns_raw;
2351                 slp->ns_raw = slp->ns_rawend = NULL;
2352                 slp->ns_cc = slp->ns_reclen = 0;
2353             } else if (slp->ns_cc > slp->ns_reclen) {
                     /*
                      * More than one fragment is buffered: walk the chain until
                      * ns_reclen bytes are covered.  "om" trails "m" as the last
                      * mbuf that lies wholly inside the fragment.
                      */
2354                 len = 0;
2355                 m = slp->ns_raw;
2356                 mlen = mbuf_len(m);
2357                 mdata = mbuf_data(m);
2358                 om = NULL;
2359                 while (len < slp->ns_reclen) {
2360                         if ((len + mlen) > slp->ns_reclen) {
                                     /*
                                      * Fragment ends inside this mbuf: copy the leading
                                      * part for the fragment and trim it off the original,
                                      * which stays at the head of the raw stream.
                                      */
2361                                 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2362                                         slp->ns_flag &= ~SLP_GETSTREAM;
2363                                         return (EWOULDBLOCK);
2364                                 }
2365                                 if (om) {
2366                                         if (mbuf_setnext(om, m2)) {
2367                                                 /* trouble... just drop it */
2368                                                 printf("nfsrv_getstream: mbuf_setnext failed\n");
2369                                                 mbuf_freem(m2);
2370                                                 slp->ns_flag &= ~SLP_GETSTREAM;
2371                                                 return (EWOULDBLOCK);
2372                                         }
2373                                         recm = slp->ns_raw;
2374                                 } else {
2375                                         recm = m2;
2376                                 }
2377                                 mdata += slp->ns_reclen - len;
2378                                 mlen -= slp->ns_reclen - len;
2379                                 mbuf_setdata(m, mdata, mlen);
2380                                 len = slp->ns_reclen;
2381                         } else if ((len + mlen) == slp->ns_reclen) {
                                     /* Fragment ends exactly at this mbuf's end: cut the chain after it. */
2382                                 om = m;
2383                                 len += mlen;
2384                                 m = mbuf_next(m);
2385                                 recm = slp->ns_raw;
2386                                 if (mbuf_setnext(om, NULL)) {
2387                                         printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2388                                         slp->ns_flag &= ~SLP_GETSTREAM;
2389                                         return (EWOULDBLOCK);
2390                                 }
2391                                 mlen = mbuf_len(m);
2392                                 mdata = mbuf_data(m);
2393                         } else {
                                     /* This mbuf lies wholly inside the fragment: keep walking. */
2394                                 om = m;
2395                                 len += mlen;
2396                                 m = mbuf_next(m);
2397                                 mlen = mbuf_len(m);
2398                                 mdata = mbuf_data(m);
2399                         }
2400                 }
                     /* Whatever follows the fragment becomes the new head of the raw stream. */
2401                 slp->ns_raw = m;
2402                 slp->ns_cc -= len;
2403                 slp->ns_reclen = 0;
2404             } else {
                     /* Fragment not fully received yet; ns_reclen is kept for the next call. */
2405                 slp->ns_flag &= ~SLP_GETSTREAM;
2406                 return (0);
2407             }
2408
2409             /*
2410              * Accumulate the fragments into a record.
2411              */
2412             if (slp->ns_frag == NULL) {
2413                 slp->ns_frag = recm;
2414             } else {
2415                 m = slp->ns_frag;
2416                 while ((m2 = mbuf_next(m)))
2417                     m = m2;
2418                 if ((error = mbuf_setnext(m, recm)))
2419                     panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2420             }
                 /* Last fragment seen: the record is complete, so queue it for the nfsds. */
2421             if (slp->ns_flag & SLP_LASTFRAG) {
2422                 if (slp->ns_recend)
2423                     mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2424                 else
2425                     slp->ns_rec = slp->ns_frag;
2426                 slp->ns_recend = slp->ns_frag;
2427                 slp->ns_frag = NULL;
2428             }
2429         }
2430 }
2431
2432 /*
2433  * Parse an RPC header.
2434  */
2435 int
2436 nfsrv_dorec(slp, nfsd, ndp)
2437         struct nfssvc_sock *slp;
2438         struct nfsd *nfsd;
2439         struct nfsrv_descript **ndp;
2440 {
2441         mbuf_t m;
2442         mbuf_t nam;
2443         struct nfsrv_descript *nd;
2444         int error;
2445
2446         *ndp = NULL;
2447         if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2448                 return (ENOBUFS);
2449         MALLOC_ZONE(nd, struct nfsrv_descript *,
2450                         sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2451         if (!nd)
2452                 return (ENOMEM);
2453         m = slp->ns_rec;
2454         slp->ns_rec = mbuf_nextpkt(m);
2455         if (slp->ns_rec)
2456                 mbuf_setnextpkt(m, NULL);
2457         else
2458                 slp->ns_recend = NULL;
2459         if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2460                 nam = m;
2461                 m = mbuf_next(m);
2462                 if ((error = mbuf_setnext(nam, NULL)))
2463                         panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2464         } else
2465                 nam = NULL;
2466         nd->nd_md = nd->nd_mrep = m;
2467         nd->nd_nam2 = nam;
2468         nd->nd_dpos = mbuf_data(m);
2469         error = nfs_getreq(nd, nfsd, TRUE);
2470         if (error) {
2471                 if (nam)
2472                         mbuf_freem(nam);
2473                 FREE_ZONE((caddr_t)nd,  sizeof *nd, M_NFSRVDESC);
2474                 return (error);
2475         }
2476         *ndp = nd;
2477         nfsd->nfsd_nd = nd;
2478         return (0);
2479 }
2480
2481 /*
2482  * Parse an RPC request
2483  * - verify it
2484  * - fill in the cred struct.
 *
 * nd->nd_mrep/nd_md/nd_dpos give the request mbufs and the parse
 * position.  When has_header is non-zero the xid and message direction
 * are still in front of the call body and are consumed here.
 *
 * Return value:
 *   0        - parsed.  This includes "soft" failures, which are reported
 *              through nd->nd_repstat with nd->nd_procnum forced to
 *              NFSPROC_NOOP rather than through the return value.
 *   EBADRPC  - malformed request; mrep has been freed here.
 *   other    - value of "error" on arrival at nfsmout, after any
 *              credential already created has been released.
 *
 * NOTE(review): nfsmout is never reached by code visible in this
 * function, so the nfsm_dissect/nfsm_adv/nfsm_mtouio macros (defined
 * elsewhere) presumably jump there with "error" set, and presumably use
 * the locals mrep/md/dpos/t1/cp2 implicitly -- confirm against their
 * definitions before touching those locals.
2485  */
2486 int
2487 nfs_getreq(nd, nfsd, has_header)
2488         struct nfsrv_descript *nd;
2489         struct nfsd *nfsd;
2490         int has_header;
2491 {
2492         int len, i;
2493         u_long *tl;
2494         long t1;
2495         uio_t uiop;
2496         caddr_t dpos, cp2, cp;
2497         u_long nfsvers, auth_type;
2498         uid_t nickuid;
2499         int error = 0, ticklen;
2500         mbuf_t mrep, md;
2501         struct nfsuid *nuidp;
2502         uid_t user_id;
2503         gid_t group_id;
2504         int ngroups;
2505         struct ucred temp_cred;
2506         struct timeval tvin, tvout, now;
2507         char uio_buf[ UIO_SIZEOF(1) ];
2508 #if 0                           /* until encrypted keys are implemented */
2509         NFSKERBKEYSCHED_T keys; /* stores key schedule */
2510 #endif
2511
             /* No credential yet; nfsmout only releases one that was created. */
2512         nd->nd_cr = NULL;
2513
2514         mrep = nd->nd_mrep;
2515         md = nd->nd_md;
2516         dpos = nd->nd_dpos;
             /*
              * With a header, 10 words are dissected and the first two (xid,
              * message direction) are consumed here; without one, 8 words.
              * Either way the window ends two words past the credential length.
              */
2517         if (has_header) {
2518                 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2519                 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2520                 if (*tl++ != rpc_call) {
2521                         mbuf_freem(mrep);
2522                         return (EBADRPC);
2523                 }
2524         } else
2525                 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2526         nd->nd_repstat = 0;
2527         nd->nd_flag = 0;
             /*
              * RPC version / program / NFS version / procedure mismatches are
              * not parse errors: they return 0 with nd_repstat set and the
              * procedure forced to NFSPROC_NOOP.
              */
2528         if (*tl++ != rpc_vers) {
2529                 nd->nd_repstat = ERPCMISMATCH;
2530                 nd->nd_procnum = NFSPROC_NOOP;
2531                 return (0);
2532         }
2533         if (*tl != nfs_prog) {
2534                 nd->nd_repstat = EPROGUNAVAIL;
2535                 nd->nd_procnum = NFSPROC_NOOP;
2536                 return (0);
2537         }
2538         tl++;
2539         nfsvers = fxdr_unsigned(u_long, *tl++);
2540         if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2541                 nd->nd_repstat = EPROGMISMATCH;
2542                 nd->nd_procnum = NFSPROC_NOOP;
2543                 return (0);
2544         }
2545         else if (nfsvers == NFS_VER3)
2546                 nd->nd_flag = ND_NFSV3;
2547         nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
             /* The NULL procedure returns before any credential is looked at. */
2548         if (nd->nd_procnum == NFSPROC_NULL)
2549                 return (0);
2550         if ((nd->nd_procnum >= NFS_NPROCS) ||
2551                 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2552                 nd->nd_repstat = EPROCUNAVAIL;
2553                 nd->nd_procnum = NFSPROC_NOOP;
2554                 return (0);
2555         }
             /* Version 2 procedure numbers are translated through nfsv3_procid[]. */
2556         if ((nd->nd_flag & ND_NFSV3) == 0)
2557                 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
             /*
              * auth_type is kept as the raw wire word (no fxdr conversion) and
              * compared below against rpc_auth_unix / rpc_auth_kerb.
              * len is the decoded length of the credential body.
              */
2558         auth_type = *tl++;
2559         len = fxdr_unsigned(int, *tl++);
2560         if (len < 0 || len > RPCAUTH_MAXSIZ) {
2561                 mbuf_freem(mrep);
2562                 return (EBADRPC);
2563         }
2564
2565         nd->nd_flag &= ~ND_KERBAUTH;
2566         /*
2567          * Handle auth_unix or auth_kerb.
2568          */
2569         if (auth_type == rpc_auth_unix) {
                     /*
                      * Skip one word (the stamp in the AUTH_UNIX wire layout) and
                      * read the length of the string that follows; the string
                      * itself is stepped over unread by nfsm_adv().
                      */
2570                 len = fxdr_unsigned(int, *++tl);
2571                 if (len < 0 || len > NFS_MAXNAMLEN) {
2572                         mbuf_freem(mrep);
2573                         return (EBADRPC);
2574                 }
2575                 bzero(&temp_cred, sizeof(temp_cred));
2576                 nfsm_adv(nfsm_rndup(len));
                     /* uid, gid, and the count of supplementary gids. */
2577                 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2578                 user_id = fxdr_unsigned(uid_t, *tl++);
2579                 group_id = fxdr_unsigned(gid_t, *tl++);
2580                 temp_cred.cr_groups[0] = group_id;
2581                 len = fxdr_unsigned(int, *tl);
2582                 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2583                         mbuf_freem(mrep);
2584                         return (EBADRPC);
2585                 }
                     /*
                      * The gid list plus two further words, the second of which is
                      * read below as the verifier length.  Slot 0 already holds the
                      * primary gid; gids that do not fit below NGROUPS are read and
                      * discarded.
                      */
2586                 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2587                 for (i = 1; i <= len; i++)
2588                     if (i < NGROUPS)
2589                         temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2590                     else
2591                         tl++;
2592                 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
                     /* Note: the sort covers cr_groups[0], i.e. the primary gid, too. */
2593                 if (ngroups > 1)
2594                     nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
                     /* Skip one word (verifier flavor) and read the verifier length. */
2595                 len = fxdr_unsigned(int, *++tl);
2596                 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2597                         mbuf_freem(mrep);
2598                         return (EBADRPC);
2599                 }
2600                 temp_cred.cr_uid = user_id;
2601                 temp_cred.cr_ngroups = ngroups;
2602                 nd->nd_cr = kauth_cred_create(&temp_cred);
2603                 if (nd->nd_cr == NULL) {
2604                         nd->nd_repstat = ENOMEM;
2605                         nd->nd_procnum = NFSPROC_NOOP;
2606                         return (0);
2607                 }
                     /* Step over the verifier body; its contents are not examined. */
2608                 if (len > 0)
2609                         nfsm_adv(nfsm_rndup(len));
2610         } else if (auth_type == rpc_auth_kerb) {
                     /*
                      * NOTE(review): this switch has no default case.  A name type
                      * other than FULLNAME/NICKNAME falls out of it and the
                      * function returns 0 with nd_cr still NULL and nd_repstat
                      * still 0 -- confirm callers cope with that.
                      */
2611                 switch (fxdr_unsigned(int, *tl++)) {
2612                 case RPCAKN_FULLNAME:
                             /*
                              * Full-name credential: stash the ticket in
                              * nfsd->nfsd_authstr and the verifier in
                              * nfsd->nfsd_verfstr, then flag the nfsd as needing
                              * authentication (NFSD_NEEDAUTH).  No credential is
                              * created on this path.
                              */
2613                         ticklen = fxdr_unsigned(int, *tl);
                             /* The first word of authstr receives the ticket length word as it came off the wire. */
2614                         *((u_long *)nfsd->nfsd_authstr) = *tl;
2615                         uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2616                                                 &uio_buf[0], sizeof(uio_buf));
2617                         if (!uiop) {
2618                                 nd->nd_repstat = ENOMEM;
2619                                 nd->nd_procnum = NFSPROC_NOOP;
2620                                 return (0);
2621                         }
2622
2623                         // LP64todo - fix this
2624                         nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
                             /*
                              * The ticket must fit inside the credential body of "len"
                              * bytes.  NOTE(review): ticklen is not checked for being
                              * negative before it is used here.
                              */
2625                         if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2626                                 mbuf_freem(mrep);
2627                                 return (EBADRPC);
2628                         }
                             /*
                              * NOTE(review): the iovec is RPCAUTH_MAXSIZ - 4 bytes and
                              * the copy length is uio_resid(uiop), which presumably is
                              * that same size rather than the ticket size computed
                              * above -- confirm; the LP64todo markers suggest this
                              * spot is known to be unfinished.
                              */
2629                         uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2630                         // LP64todo - fix this
2631                         nfsm_mtouio(uiop, uio_resid(uiop));
                             /* Verifier must be kerberos flavored and 4 words long. */
2632                         nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2633                         if (*tl++ != rpc_auth_kerb ||
2634                                 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2635                                 printf("Bad kerb verifier\n");
2636                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2637                                 nd->nd_procnum = NFSPROC_NOOP;
2638                                 return (0);
2639                         }
2640                         nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2641                         tl = (u_long *)cp;
2642                         if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2643                                 printf("Not fullname kerb verifier\n");
2644                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2645                                 nd->nd_procnum = NFSPROC_NOOP;
2646                                 return (0);
2647                         }
                             /* Keep the three words that follow the name-type word. */
2648                         cp += NFSX_UNSIGNED;
2649                         bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2650                         nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2651                         nd->nd_flag |= ND_KERBFULL;
2652                         nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2653                         break;
2654                 case RPCAKN_NICKNAME:
                             /*
                              * Nickname credential: the body is exactly two words,
                              * the second being the nickname uid used to look up a
                              * cached nfsuid entry for this socket.
                              */
2655                         if (len != 2 * NFSX_UNSIGNED) {
2656                                 printf("Kerb nickname short\n");
2657                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2658                                 nd->nd_procnum = NFSPROC_NOOP;
2659                                 return (0);
2660                         }
2661                         nickuid = fxdr_unsigned(uid_t, *tl);
2662                         nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2663                         if (*tl++ != rpc_auth_kerb ||
2664                                 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2665                                 printf("Kerb nick verifier bad\n");
2666                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2667                                 nd->nd_procnum = NFSPROC_NOOP;
2668                                 return (0);
2669                         }
2670                         nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
                             /* tvin is filled in here but not read again in the code compiled below. */
2671                         tvin.tv_sec = *tl++;
2672                         tvin.tv_usec = *tl;
2673
                             /*
                              * Find a cached entry with this uid; when the request
                              * carries a sender address (nd_nam2) it must match too.
                              */
2674                         for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2675                             nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2676                                 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2677                                     (!nd->nd_nam2 ||
2678                                      netaddr_match(NU_NETFAM(nuidp),
2679                                       &nuidp->nu_haddr, nd->nd_nam2)))
2680                                         break;
2681                         }
2682                         if (!nuidp) {
2683                                 nd->nd_repstat =
2684                                         (NFSERR_AUTHERR|AUTH_REJECTCRED);
2685                                 nd->nd_procnum = NFSPROC_NOOP;
2686                                 return (0);
2687                         }
2688
2689                         /*
2690                          * Now, decrypt the timestamp using the session key
2691                          * and validate it.
2692                          */
2693 #if NFSKERB
2694                         XXX
2695 #endif
2696
                             /*
                              * NOTE(review): tvout is read here without having been
                              * assigned anywhere in this function -- the decryption
                              * step that would fill it in is only the "XXX"
                              * placeholder above.  The timestamp comparison below
                              * therefore works on indeterminate values.
                              */
2697                         tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2698                         tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2699                         microtime(&now);
                             /*
                              * Reject when the cached entry has expired or its stored
                              * timestamp is later than tvout; the entry is expired
                              * (nu_expire = 0) as a side effect.
                              */
2700                         if (nuidp->nu_expire < now.tv_sec ||
2701                             nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2702                             (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2703                              nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2704                                 nuidp->nu_expire = 0;
2705                                 nd->nd_repstat =
2706                                     (NFSERR_AUTHERR|AUTH_REJECTVERF);
2707                                 nd->nd_procnum = NFSPROC_NOOP;
2708                                 return (0);
2709                         }
                             /* Build the request credential from the cached entry's uid and groups. */
2710                         bzero(&temp_cred, sizeof(temp_cred));
2711                         ngroups = nuidp->nu_cr->cr_ngroups;
2712                         for (i = 0; i < ngroups; i++)
2713                                 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2714                         if (ngroups > 1)
2715                                 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2716
2717                         temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2718                         temp_cred.cr_ngroups = ngroups;
2719                         nd->nd_cr = kauth_cred_create(&temp_cred);
2720                         if (!nd->nd_cr) {
2721                                 nd->nd_repstat = ENOMEM;
2722                                 nd->nd_procnum = NFSPROC_NOOP;
2723                                 return (0);
2724                         }
2725                         nd->nd_flag |= ND_KERBNICK;
2726                 };
2727         } else {
                     /* Any other authentication flavor is rejected. */
2728                 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2729                 nd->nd_procnum = NFSPROC_NOOP;
2730                 return (0);
2731         }
2732
             /* Publish the advanced parse position back to the descriptor. */
2733         nd->nd_md = md;
2734         nd->nd_dpos = dpos;
2735         return (0);
2736 nfsmout:
             /*
              * Error exit: drop the credential if one was created.  nd->nd_cr
              * itself is left pointing at the released credential.
              */
2737         if (nd->nd_cr)
2738                 kauth_cred_rele(nd->nd_cr);
2739         return (error);
2740 }
2741
2742 /*
2743  * Search for a sleeping nfsd and wake it up.
2744  * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2745  * running nfsds will go look for the work in the nfssvc_sock list.
2746  * Note: Must be called with nfsd_mutex held.
2747  */
2748 void
2749 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2750 {
2751         struct nfsd *nd;
2752
2753         if ((slp->ns_flag & SLP_VALID) == 0)
2754                 return;
2755
2756         lck_rw_lock_exclusive(&slp->ns_rwlock);
2757
2758         if (nfsd_waiting) {
2759                 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2760                         if (nd->nfsd_flag & NFSD_WAITING) {
2761                                 nd->nfsd_flag &= ~NFSD_WAITING;
2762                                 if (nd->nfsd_slp)
2763                                         panic("nfsd wakeup");
2764                                 slp->ns_sref++;
2765                                 nd->nfsd_slp = slp;
2766                                 lck_rw_done(&slp->ns_rwlock);
2767                                 wakeup((caddr_t)nd);
2768                                 return;
2769                         }
2770                 }
2771         }
2772
2773         slp->ns_flag |= SLP_DOREC;
2774
2775         lck_rw_done(&slp->ns_rwlock);
2776
2777         nfsd_head_flag |= NFSD_CHECKSLP;
2778 }
2779 #endif /* NFS_NOSERVER */
2780
2781 static int
2782 nfs_msg(proc_t p,
2783         const char *server,
2784         const char *msg,
2785         int error)
2786 {
2787         tpr_t tpr;
2788
2789         if (p)
2790                 tpr = tprintf_open(p);
2791         else
2792                 tpr = NULL;
2793         if (error)
2794                 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2795                     error);
2796         else
2797                 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2798         tprintf_close(tpr);
2799         return (0);
2800 }
2801
2802 void
2803 nfs_down(nmp, proc, error, flags, msg)
2804         struct nfsmount *nmp;
2805         proc_t proc;
2806         int error, flags;
2807         const char *msg;
2808 {
2809         if (nmp == NULL)
2810                 return;
2811         if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2812                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2813                 nmp->nm_state |= NFSSTA_TIMEO;
2814         }
2815         if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2816                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2817                 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2818         }
2819         nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2820 }
2821
2822 void
2823 nfs_up(nmp, proc, flags, msg)
2824         struct nfsmount *nmp;
2825         proc_t proc;
2826         int flags;
2827         const char *msg;
2828 {
2829         if (nmp == NULL)
2830                 return;
2831         if (msg)
2832                 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2833         if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2834                 nmp->nm_state &= ~NFSSTA_TIMEO;
2835                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2836         }
2837         if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2838                 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2839                 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2840         }
2841 }
2842