/*
 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/kpi_mbuf.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/uio_internal.h>
#include <libkern/OSAtomic.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <sys/user.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfsrtt.h>

#include <sys/kdebug.h>

#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
/*
 * Estimate rto for an nfs rpc sent via an unreliable datagram.
 * Use the mean and mean deviation of rtt for the appropriate type of rpc
 * for the frequent rpcs and a default for the others.
 * The justification for doing "other" this way is that these rpcs
 * happen so infrequently that timer estimates would probably be stale.
 * Also, since many of these rpcs are
 * non-idempotent, a conservative timeout is desired.
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
#define	NFS_RTO(n, t) \
	((t) == 0 ? (n)->nm_timeo : \
	 ((t) < 3 ? \
	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
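/*
 * A worked reading of NFS_RTO (matching the table above): nm_srtt is kept
 * scaled by 8 and nm_sdrtt by 4 (see the smoothing in nfs_reply), so for
 * timers 1 and 2 the expression is roughly ((8A >> 2) + 4D) >> 1 = A + 2D,
 * and for timers 3 and 4 it is (8A >> 3) + 4D = A + 4D, where A is the
 * smoothed rtt and D the smoothed deviation.
 */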
/*
 * External data, mostly RPC constants in XDR form
 */
extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
	rpc_msgaccepted, rpc_call, rpc_autherr,
	rpc_auth_kerb;
extern u_long nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;
extern u_long nfs_xidwrap;

/*
 * Defines which timer to use for the procnum.
 * 0 - default
 * 1 - getattr
 * 2 - lookup
 * 3 - read
 * 4 - write
 */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
};

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
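/*
 * Note on the scaling: one in-flight rpc counts as NFS_CWNDSCALE (256) in
 * nm_sent, so NFS_MAXCWND corresponds to at most 32 outstanding rpcs.
 * nfs_backoff is indexed by nm_timeouts (capped at 8 in nfs_timer) and
 * multiplies the retransmit timeout, giving exponential backoff up to 256x.
 */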
int	nfsrtton = 0;
struct nfsrtt nfsrtt;

static int	nfs_rcvlock(struct nfsreq *);
static void	nfs_rcvunlock(struct nfsreq *);
static int	nfs_receive(struct nfsreq *rep, mbuf_t *mp);
static int	nfs_reconnect(struct nfsreq *rep);
static void	nfs_repdequeue(struct nfsreq *rep);

/* XXX */
boolean_t	current_thread_aborted(void);
kern_return_t	thread_terminate(thread_t);

#ifndef NFS_NOSERVER
static int	nfsrv_getstream(struct nfssvc_sock *, int);

int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
				struct nfssvc_sock *slp,
				proc_t procp,
				mbuf_t *mreqp) = {
	nfsrv_null,
	nfsrv_getattr,
	nfsrv_setattr,
	nfsrv_lookup,
	nfsrv3_access,
	nfsrv_readlink,
	nfsrv_read,
	nfsrv_write,
	nfsrv_create,
	nfsrv_mkdir,
	nfsrv_symlink,
	nfsrv_mknod,
	nfsrv_remove,
	nfsrv_rmdir,
	nfsrv_rename,
	nfsrv_link,
	nfsrv_readdir,
	nfsrv_readdirplus,
	nfsrv_statfs,
	nfsrv_fsinfo,
	nfsrv_pathconf,
	nfsrv_commit,
	nfsrv_noop
};
#endif /* NFS_NOSERVER */


/*
 * attempt to bind a socket to a reserved port
 */
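/*
 * (The loop below starts at IPPORT_RESERVED - 1, i.e. port 1023, and walks
 * downward past ports already in use, giving up once it reaches
 * IPPORT_RESERVED / 2, so only ports 513-1023 are tried.)
 */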
static int
nfs_bind_resv(struct nfsmount *nmp)
{
	socket_t so = nmp->nm_so;
	struct sockaddr_in sin;
	int error;
	u_short tport;

	if (!so)
		return (EINVAL);

	sin.sin_len = sizeof (struct sockaddr_in);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	tport = IPPORT_RESERVED - 1;
	sin.sin_port = htons(tport);

	while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
	       (--tport > IPPORT_RESERVED / 2))
		sin.sin_port = htons(tport);
	return (error);
}

/*
 * variables for managing the nfs_bind_resv_thread
 */
int nfs_resv_mounts = 0;
static int nfs_bind_resv_thread_state = 0;
#define NFS_BIND_RESV_THREAD_STATE_INITTED	1
#define NFS_BIND_RESV_THREAD_STATE_RUNNING	2
lck_grp_t *nfs_bind_resv_lck_grp;
lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
lck_attr_t *nfs_bind_resv_lck_attr;
lck_mtx_t *nfs_bind_resv_mutex;
struct nfs_bind_resv_request {
	TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
	struct nfsmount *brr_nmp;
	int brr_error;
};
static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;

/*
 * thread to handle any reserved port bind requests
 */
static void
nfs_bind_resv_thread(void)
{
	struct nfs_bind_resv_request *brreq;

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;

	while (nfs_resv_mounts > 0) {
		lck_mtx_lock(nfs_bind_resv_mutex);
		while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
			TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
			lck_mtx_unlock(nfs_bind_resv_mutex);
			brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
			wakeup(brreq);
			lck_mtx_lock(nfs_bind_resv_mutex);
		}
		msleep((caddr_t)&nfs_bind_resv_request_queue,
			nfs_bind_resv_mutex, PSOCK | PDROP,
			"nfs_bind_resv_request_queue", 0);
	}

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
	(void) thread_terminate(current_thread());
}

int
nfs_bind_resv_thread_wake(void)
{
	if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
		return (EIO);
	wakeup(&nfs_bind_resv_request_queue);
	return (0);
}

/*
 * underprivileged procs call this to request nfs_bind_resv_thread
 * to perform the reserved port binding for them.
 */
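/*
 * (Handoff pattern: the request is allocated on the caller's stack, queued
 * under nfs_bind_resv_mutex, and the caller tsleep()s on &brreq until the
 * thread has filled in brr_error and called wakeup(brreq).)
 */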
static int
nfs_bind_resv_nopriv(struct nfsmount *nmp)
{
	struct nfs_bind_resv_request brreq;
	int error;

	if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
		if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
			nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
			lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
			nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
			nfs_bind_resv_lck_attr = lck_attr_alloc_init();
			nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
			TAILQ_INIT(&nfs_bind_resv_request_queue);
			nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
		}
		kernel_thread(kernel_task, nfs_bind_resv_thread);
		nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
	}

	brreq.brr_nmp = nmp;
	brreq.brr_error = 0;

	lck_mtx_lock(nfs_bind_resv_mutex);
	TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
	lck_mtx_unlock(nfs_bind_resv_mutex);

	error = nfs_bind_resv_thread_wake();
	if (error) {
		TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
		/* Note: we might be able to simply restart the thread */
		return (error);
	}

	tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);

	return (brreq.brr_error);
}

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr on error.
 */
int
nfs_connect(
	struct nfsmount *nmp,
	__unused struct nfsreq *rep)
{
	socket_t so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct timeval timeo;

	nmp->nm_so = 0;
	saddr = mbuf_data(nmp->nm_nam);
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
			nmp->nm_soproto, 0, 0, &nmp->nm_so);
	if (error) {
		goto bad;
	}
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		proc_t p;
		/*
		 * sobind() requires current_proc() to have superuser privs.
		 * If this bind is part of a reconnect, and the current proc
		 * doesn't have superuser privs, we hand the sobind() off to
		 * a kernel thread to process.
		 */
		if ((nmp->nm_state & NFSSTA_MOUNTED) &&
		    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
			/* request nfs_bind_resv_thread() to do bind */
			error = nfs_bind_resv_nopriv(nmp);
		} else {
			error = nfs_bind_resv(nmp);
		}
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		struct timeval tv;
		tv.tv_sec = 2;
		tv.tv_usec = 0;
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && error != EINPROGRESS) {
			goto bad;
		}

		while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
			if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
				goto bad;
			}
		}
	}

	/*
	 * Always time out on receive; this allows us to reconnect the
	 * socket to deal with network changes.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = 2;
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
		timeo.tv_sec = 5;
	} else {
		timeo.tv_sec = 0;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));

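	/*
	 * (Buffer sizing below: room for ~3 writes in the send buffer and
	 * readahead+1 reads (minimum 2) in the receive buffer, each padded
	 * by NFS_MAXPKTHDR; the stream case adds sizeof(u_long) per request
	 * for the RPC record mark, and both are clamped to NFS_MAXSOCKBUF.)
	 */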
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else {
		int proto;
		int on = 1;

		sock_gettype(so, NULL, NULL, &proto);
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");

		// Assume that SOCK_STREAM always requires a connection
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

		if (proto == IPPROTO_TCP) {
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
		}

		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	}

	if (sndreserve > NFS_MAXSOCKBUF)
		sndreserve = NFS_MAXSOCKBUF;
	if (rcvreserve > NFS_MAXSOCKBUF)
		rcvreserve = NFS_MAXSOCKBUF;
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
	if (error) {
		goto bad;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
	if (error) {
		goto bad;
	}

	sock_nointerrupt(so, 1);

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
static int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep))) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		if (error == EIO)
			return (EIO);
		nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
			"can not connect");
		rep->r_flags |= R_TPRINTFMSG;
		if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
			/* we're not yet completely mounted and */
			/* we can't reconnect, so we fail */
			return (error);
		}
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
			return (error);
		tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp)
			rp->r_flags |= R_MUSTRESEND;
	}
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	socket_t so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = 0;
		sock_shutdown(so, 2);
		sock_close(so);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(so, nam, top, rep)
	socket_t so;
	mbuf_t nam;
	mbuf_t top;
	struct nfsreq *rep;
{
	struct sockaddr *sendnam;
	int error, error2, sotype, flags;
	u_long xidqueued = 0;
	struct nfsreq *rp;
	char savenametolog[MAXPATHLEN];
	struct msghdr msg;

	if (rep) {
		error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
		if (error) {
			mbuf_freem(top);
			return (error);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			mbuf_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
			if (rp == rep)
				break;
		if (rp)
			xidqueued = rp->r_xid;
	}
	sock_gettype(so, NULL, &sotype, NULL);
	if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
	    (nam == 0))
		sendnam = (struct sockaddr *)0;
	else
		sendnam = mbuf_data(nam);

	if (sotype == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	/*
	 * Save the name here in case the mount point goes away while we block.
	 * The name lives on the local stack and is large, but we don't
	 * want to risk blocking in a malloc.
	 */
	if (rep)
		strncpy(savenametolog,
			vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
			MAXPATHLEN - 1);
	bzero(&msg, sizeof(msg));
	msg.msg_name = (caddr_t)sendnam;
	msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
	error = sock_sendmbuf(so, &msg, top, flags, NULL);

	if (error) {
		if (rep) {
			if (xidqueued) {
				TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
					if (rp == rep && rp->r_xid == xidqueued)
						break;
				if (!rp)
					panic("nfs_send: error %d xid %x gone",
						error, xidqueued);
			}
			log(LOG_INFO, "nfs send error %d for server %s\n",
				error, savenametolog);
			/*
			 * Deal with errors for the client side.
			 */
			error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
			if (error2) {
				error = error2;
			} else {
				rep->r_flags |= R_MUSTRESEND;
			}
		} else
			log(LOG_INFO, "nfsd send error %d\n", error);

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART && error != EIO &&
		    error != EWOULDBLOCK && error != EPIPE) {
			error = 0;
		}
	}
	return (error);
}

/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
static int
nfs_receive(struct nfsreq *rep, mbuf_t *mp)
{
	socket_t so;
	struct iovec_32 aio;
	mbuf_t m, mlast;
	u_long len, fraglen;
	int error, error2, sotype;
	proc_t p = current_proc();	/* XXX */
	struct msghdr msg;
	size_t rcvlen;
	int lastfragment;

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
			nfs_sndunlock(rep);
			if (error)
				return (error);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(rep);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
			if (!error) {
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
				error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			}
			/*
			 * we also hold rcv lock so rep is still
			 * legit at this point
			 */
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep))) {
					nfs_sndunlock(rep);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(rep);
		if (sotype == SOCK_STREAM) {
			error = 0;
			len = 0;
			lastfragment = 0;
			mlast = NULL;
			while (!error && !lastfragment) {
				aio.iov_base = (uintptr_t) &fraglen;
				aio.iov_len = sizeof(u_long);
				bzero(&msg, sizeof(msg));
				msg.msg_iov = (struct iovec *) &aio;
				msg.msg_iovlen = 1;
				do {
					error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */
						goto shutout;
					if (error == EWOULDBLOCK && rep) {
						error2 = nfs_sigintr(rep->r_nmp, rep, p);
						if (error2)
							error = error2;
					}
				} while (error == EWOULDBLOCK);
				if (!error && rcvlen < aio.iov_len) {
					/* only log a message if we got a partial word */
					if (rcvlen != 0)
						log(LOG_INFO,
							"short receive (%d/%d) from nfs server %s\n",
							rcvlen, sizeof(u_long),
							vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
				}
				if (error)
					goto errout;
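				/*
				 * Decode the record mark just read: the high
				 * bit flags the last fragment of the record,
				 * the low 31 bits give this fragment's length.
				 */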
				lastfragment = ntohl(fraglen) & 0x80000000;
				fraglen = ntohl(fraglen) & ~0x80000000;
				len += fraglen;
				/*
				 * This is SERIOUS! We are out of sync with the sender
				 * and forcing a disconnect/reconnect is all I can do.
				 */
				if (len > NFS_MAXPACKET) {
					log(LOG_ERR, "%s (%d) from nfs server %s\n",
						"impossible RPC record length", len,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EFBIG;
					goto errout;
				}

				m = NULL;
				do {
					rcvlen = fraglen;
					error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */ {
						goto shutout;
					}
				} while (error == EWOULDBLOCK || error == EINTR ||
					 error == ERESTART);

				if (!error && fraglen > rcvlen) {
					log(LOG_INFO,
						"short receive (%d/%d) from nfs server %s\n",
						rcvlen, fraglen,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
					mbuf_freem(m);
				}
				if (!error) {
					if (!*mp) {
						*mp = m;
						mlast = m;
					} else {
						error = mbuf_setnext(mlast, m);
						if (error) {
							printf("nfs_receive: mbuf_setnext failed %d\n", error);
							mbuf_freem(m);
						}
					}
					while (mbuf_next(mlast))
						mlast = mbuf_next(mlast);
				}
			}
		} else {
			bzero(&msg, sizeof(msg));
			do {
				rcvlen = 100000000;
				error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
				if (!rep->r_nmp) /* if unmounted then bailout */ {
					goto shutout;
				}
				if (error == EWOULDBLOCK && rep) {
					error2 = nfs_sigintr(rep->r_nmp, rep, p);
					if (error2) {
						return (error2);
					}
				}
			} while (error == EWOULDBLOCK);

			if ((msg.msg_flags & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len = rcvlen;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			mbuf_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
					"receive error %d from nfs server %s\n", error,
					vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(rep);
			}
		}
	} else {
		/*
		 * We could have failed while rebinding the datagram socket
		 * so we need to attempt to rebind here.
		 */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				nfs_sndunlock(rep);
			}
			if (error)
				return (error);
			if (!rep->r_nmp) /* if unmounted then bailout */
				return (ENXIO);
			so = rep->r_nmp->nm_so;
		}
		bzero(&msg, sizeof(msg));
		len = 0;
		do {
			rcvlen = 1000000;
			error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
			if (!rep->r_nmp) /* if unmounted then bailout */
				goto shutout;
			if (error) {
				error2 = nfs_sigintr(rep->r_nmp, rep, p);
				if (error2) {
					error = error2;
					goto shutout;
				}
			}
			/* Reconnect for all errors. We may be receiving
			 * soft/hard/blocking errors because of a network
			 * change.
			 * XXX: we should rate limit or delay this
			 * to once every N attempts or something.
			 * although TCP doesn't seem to.
			 */
			if (error) {
				error2 = nfs_sndlock(rep);
				if (!error2) {
					error2 = nfs_reconnect(rep);
					if (error2)
						error = error2;
					else if (!rep->r_nmp) /* if unmounted then bailout */
						error = ENXIO;
					else
						so = rep->r_nmp->nm_so;
					nfs_sndunlock(rep);
				} else {
					error = error2;
				}
			}
		} while (error == EWOULDBLOCK);
	}
shutout:
	if (error) {
		mbuf_freem(*mp);
		*mp = NULL;
	}
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
/* ARGSUSED */
int
nfs_reply(myrep)
	struct nfsreq *myrep;
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	long t1;
	mbuf_t mrep, md;
	u_long rxid, *tl;
	caddr_t dpos, cp2;
	int error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 * If nfs_rcvlock() returns EALREADY, that means that
		 * the reply has already been received by another
		 * process and we can return immediately. In this
		 * case, the lock is not taken to avoid races with
		 * other processes.
		 */
		error = nfs_rcvlock(myrep);
		if (error == EALREADY)
			return (0);
		if (error)
			return (error);

		/*
		 * If we slept after putting bits otw, then reply may have
		 * arrived. In which case returning is required, or we
		 * would hang trying to nfs_receive an already received reply.
		 */
		if (myrep->r_mrep != NULL) {
			nfs_rcvunlock(myrep);
			FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
			return (0);
		}
		/*
		 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
		 * is still intact by checks done in nfs_rcvlock.
		 */
		error = nfs_receive(myrep, &mrep);
		/*
		 * Bailout asap if nfsmount struct gone (unmounted).
		 */
		if (!myrep->r_nmp) {
			FSDBG(530, myrep->r_xid, myrep, nmp, -2);
			if (mrep)
				mbuf_freem(mrep);
			return (ENXIO);
		}
		if (error) {
			FSDBG(530, myrep->r_xid, myrep, nmp, error);
			nfs_rcvunlock(myrep);

			/* Bailout asap if nfsmount struct gone (unmounted). */
			if (!myrep->r_nmp) {
				if (mrep)
					mbuf_freem(mrep);
				return (ENXIO);
			}

			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
				if (nmp->nm_so) {
					int clearerror;
					int optlen = sizeof(clearerror);
					sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
				}
				continue;
			}
			if (mrep)
				mbuf_freem(mrep);
			return (error);
		}

		/*
		 * We assume all is fine, but if we did not have an error
		 * and mrep is 0, better not dereference it. nfs_receive
		 * calls soreceive which carefully sets error=0 when it got
		 * errors on sbwait (tsleep). In most cases, I assume that's
		 * so we could go back again. In tcp case, EPIPE is returned.
		 * In udp, case nfs_receive gets back here with no error and no
		 * mrep. Is the right fix to have soreceive check for process
		 * aborted after sbwait and return something non-zero? Should
		 * nfs_receive give an EPIPE? Too risky to play with those
		 * two this late in game for a shutdown problem. Instead,
		 * just check here and get out. (ekn)
		 */
		if (!mrep) {
			nfs_rcvunlock(myrep);
			FSDBG(530, myrep->r_xid, myrep, nmp, -3);
			return (ENXIO); /* sounds good */
		}

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		md = mrep;
		dpos = mbuf_data(md);
		nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
			mbuf_freem(mrep);
nfsmout:
			if (nmp->nm_state & NFSSTA_RCVLOCK)
				nfs_rcvunlock(myrep);
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = mrep;
				rep->r_md = md;
				rep->r_dpos = dpos;
				/*
				 * If we're tracking the round trip time
				 * then we update the circular log here
				 * with the stats from our current request.
				 */
				if (nfsrtton) {
					struct rttl *rt;

					rt = &nfsrtt.rttl[nfsrtt.pos];
					rt->proc = rep->r_procnum;
					rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
					rt->sent = nmp->nm_sent;
					rt->cwnd = nmp->nm_cwnd;
					if (proct[rep->r_procnum] == 0)
						panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
					rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
					rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
					rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
					microtime(&rt->tstamp); // XXX unused
					if (rep->r_flags & R_TIMING)
						rt->rtt = rep->r_rtt;
					else
						rt->rtt = 1000000;
					nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
				}
				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
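				/*
				 * (E.g. with a full window, each reply adds
				 * roughly NFS_CWNDSCALE^2 / nm_cwnd to nm_cwnd,
				 * so after ~cwnd replies, about one rtt, the
				 * window grows by one scaled rpc; the +cwnd/2
				 * term just rounds the division.)
				 */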
				FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
					nmp->nm_cwnd);
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
						(NFS_CWNDSCALE * NFS_CWNDSCALE +
						(nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_SENT;
					nmp->nm_sent -= NFS_CWNDSCALE;
				}
				/*
				 * Update rtt using a gain of 0.125 on the mean
				 * and a gain of 0.25 on the deviation.
				 */
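				/*
				 * (This is Jacobson-style smoothing on scaled
				 * values: NFS_SRTT holds 8x the mean and
				 * NFS_SDRTT 4x the deviation, so adding the
				 * error t1 after subtracting SRTT>>3 or
				 * SDRTT>>2 applies gains of 1/8 and 1/4.)
				 */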
				if (rep->r_flags & R_TIMING) {
					/*
					 * Since the timer resolution of
					 * NFS_HZ is so coarse, it can often
					 * result in r_rtt == 0. Since
					 * r_rtt == N means that the actual
					 * rtt is between N+dt and N+2-dt ticks,
					 * add 1.
					 */
					if (proct[rep->r_procnum] == 0)
						panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
					t1 = rep->r_rtt + 1;
					t1 -= (NFS_SRTT(rep) >> 3);
					NFS_SRTT(rep) += t1;
					if (t1 < 0)
						t1 = -t1;
					t1 -= (NFS_SDRTT(rep) >> 2);
					NFS_SDRTT(rep) += t1;
				}
				nmp->nm_timeouts = 0;
				break;
			}
		}
		nfs_rcvunlock(myrep);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
			mbuf_freem(mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfs_reply: nil r_mrep");
			return (0);
		}
		FSDBG(530, myrep->r_xid, myrep, rep,
			rep ? rep->r_xid : myrep->r_flags);
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
	vnode_t vp;
	mount_t mp;
	mbuf_t mrest;
	int procnum;
	proc_t procp;
	kauth_cred_t cred;
	mbuf_t *mrp;
	mbuf_t *mdp;
	caddr_t *dposp;
	u_int64_t *xidp;
{
	mbuf_t m, mrep, m2;
	struct nfsreq re, *rep;
	u_long *tl;
	int i;
	struct nfsmount *nmp;
	mbuf_t md, mheadend;
	char nickv[RPCX_NICKVERF];
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
	int verf_len, verf_type;
	u_long xid;
	char *auth_str, *verf_str;
	NFSKERBKEY_T key;		/* save session key */
	int nmsotype;
	struct timeval now;

	if (mrp)
		*mrp = NULL;
	if (xidp)
		*xidp = 0;
	nmp = VFSTONFS(mp);

	rep = &re;

	if (vp)
		nmp = VFSTONFS(vnode_mount(vp));
	if (nmp == NULL ||
	    (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		mbuf_freem(mrest);
		return (ENXIO);
	}
	nmsotype = nmp->nm_sotype;

	FSDBG_TOP(531, vp, procnum, nmp, rep);

	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;
	microuptime(&now);
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	i = 0;
	m = mrest;
	while (m) {
		i += mbuf_len(m);
		m = mbuf_next(m);
	}
	mrest_len = i;

	/*
	 * Get the RPC header with authorization.
	 */
kerbauth:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (!nmp) {
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		mbuf_freem(mrest);
		return (ENXIO);
	}
	verf_str = auth_str = (char *)0;
	if (nmp->nm_flag & NFSMNT_KERB) {
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)key, sizeof (key));
		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
						   &auth_len, verf_str, verf_len)) {
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!nmp) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (ENXIO);
			}
			error = nfs_getauth(nmp, rep, cred, &auth_str,
					    &auth_len, verf_str, &verf_len, key);
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!error && !nmp)
				error = ENXIO;
			if (error) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (error);
			}
		}
	} else {
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
	if (auth_str)
		_FREE(auth_str, M_TEMP);
	if (error) {
		mbuf_freem(mrest);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}
	if (xidp)
		*xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
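	/*
	 * (The record mark is one u_long prepended to the request: the high
	 * bit set marks this as the final fragment, and the remaining 31
	 * bits carry the length of the RPC message that follows.)
	 */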
	if (nmsotype == SOCK_STREAM) {
		error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
		if (error) {
			mbuf_freem(m);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}
		*((u_long*)mbuf_data(m)) =
			htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
				  (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
				  nmp->nm_sent < nmp->nm_cwnd)) {
		int connrequired = (nmp->nm_sotype == SOCK_STREAM);

		if (connrequired)
			error = nfs_sndlock(rep);

		/*
		 * Set the R_SENT before doing the send in case another thread
		 * processes the reply before the nfs_send returns here
		 */
		if (!error) {
			if ((rep->r_flags & R_MUSTRESEND) == 0) {
				FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
					nmp->nm_cwnd);
				nmp->nm_sent += NFS_CWNDSCALE;
				rep->r_flags |= R_SENT;
			}

			error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
			if (!error)
				error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (connrequired)
				nfs_sndunlock(rep);
		}
		nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
		if (error) {
			if (nmp)
				nmp->nm_sent -= NFS_CWNDSCALE;
			rep->r_flags &= ~R_SENT;
		}
	} else {
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	nfs_repdequeue(rep);

	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		if (nmp) {
			FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
			nmp->nm_sent -= NFS_CWNDSCALE;
		}
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error)
		nfs_up(nmp, procp, NFSSTA_TIMEO,
			(rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (!error && !nmp)
		error = ENXIO;
	if (error) {
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
			if (!failed_auth) {
				failed_auth++;
				error = mbuf_setnext(mheadend, NULL);
				mbuf_freem(mrep);
				mbuf_freem(rep->r_mreq);
				if (!error)
					goto kerbauth;
				printf("nfs_request: mbuf_setnext failed\n");
			} else
				error = EAUTH;
		} else
			error = EACCES;
		mbuf_freem(mrep);
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * Grab any Kerberos verifier, otherwise just throw it away.
	 */
	verf_type = fxdr_unsigned(int, *tl++);
	i = fxdr_unsigned(int, *tl);
	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
		if (error)
			goto nfsmout;
	} else if (i > 0)
		nfsm_adv(nfsm_rndup(i));
	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				mbuf_freem(mrep);
				error = 0;
				microuptime(&now);
				waituntil = now.tv_sec + trylater_delay;
				while (now.tv_sec < waituntil) {
					tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
					microuptime(&now);
				}
				trylater_delay *= 2;
				if (trylater_delay > 60)
					trylater_delay = 60;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if ((error == ESTALE) && vp)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else {
				mbuf_freem(mrep);
				error &= ~NFSERR_RETERR;
			}
			mbuf_freem(rep->r_mreq);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}

		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
		return (0);
	}
	mbuf_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	mbuf_freem(rep->r_mreq);
	FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
	return (error);
}

#ifndef NFS_NOSERVER
/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
	int siz;
	struct nfsrv_descript *nd;
	struct nfssvc_sock *slp;
	int err;
	mbuf_t *mrq;
	mbuf_t *mbp;
	caddr_t *bposp;
{
	u_long *tl;
	mbuf_t mreq;
	caddr_t bpos;
	mbuf_t mb, mb2;
	int error, mlen;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mreq);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX nfsstat? */
		return (error);
	}
	mb = mreq;
	tl = mbuf_data(mreq);
	mlen = 6 * NFSX_UNSIGNED;
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl += 80 / sizeof(*tl);	/* XXX max_hdr? XXX */
		mbuf_setdata(mreq, tl, mlen);
	} else {
		mbuf_setlen(mreq, mlen);
	}
	bpos = ((caddr_t)tl) + mlen;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mlen -= NFSX_UNSIGNED;
			mbuf_setlen(mreq, mlen);
			bpos -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/*
		 * For Kerberos authentication, we must send the nickname
		 * verifier back, otherwise just RPCAUTH_NULL.
		 */
		if (nd->nd_flag & ND_KERBFULL) {
			struct nfsuid *nuidp;
			struct timeval ktvin, ktvout;
			uid_t uid = kauth_cred_getuid(nd->nd_cr);

			lck_rw_lock_shared(&slp->ns_rwlock);
			for (nuidp = NUIDHASH(slp, uid)->lh_first;
			     nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
				    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
				     &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (nuidp) {
				ktvin.tv_sec =
					txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
				ktvin.tv_usec =
					txdr_unsigned(nuidp->nu_timestamp.tv_usec);

				/*
				 * Encrypt the timestamp in ecb mode using the
				 * session key.
				 */
#if NFSKERB
				XXX
#endif

				*tl++ = rpc_auth_kerb;
				*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
				*tl = ktvout.tv_sec;
				nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
				*tl++ = ktvout.tv_usec;
				*tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
			} else {
				*tl++ = 0;
				*tl++ = 0;
			}
			lck_rw_done(&slp->ns_rwlock);
		} else {
			*tl++ = 0;
			*tl++ = 0;
		}
		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
			// XXX hard coded versions
			*tl++ = txdr_unsigned(2);
			*tl = txdr_unsigned(3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	if (mrq != NULL)
		*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != NFSERR_RETVOID) {
		OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
	}
	return (0);
}


#endif /* NFS_NOSERVER */


/*
 * From FreeBSD 1.58, a Matt Dillon fix...
 * Flag a request as being about to terminate.
 * The nm_sent count is decremented now to avoid deadlocks when the process
 * in soreceive() hasn't yet managed to send its own request.
 */
static void
nfs_softterm(struct nfsreq *rep)
{

	rep->r_flags |= R_SOFTTERM;
	if (rep->r_flags & R_SENT) {
		FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
			rep->r_nmp->nm_cwnd);
		rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
		rep->r_flags &= ~R_SENT;
	}
}

void
nfs_timer_funnel(void * arg)
{
	(void) thread_funnel_set(kernel_flock, TRUE);
	nfs_timer(arg);
	(void) thread_funnel_set(kernel_flock, FALSE);

}

/*
 * Ensure rep isn't in use by the timer, then dequeue it.
 */
static void
nfs_repdequeue(struct nfsreq *rep)
{

	while ((rep->r_flags & R_BUSY)) {
		rep->r_flags |= R_WAITING;
		tsleep(rep, PSOCK, "repdeq", 0);
	}
	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
}

/*
 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
 * free()'d out from under it.
 */
static void
nfs_repbusy(struct nfsreq *rep)
{

	if ((rep->r_flags & R_BUSY))
		panic("rep locked");
	rep->r_flags |= R_BUSY;
}

/*
 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
 */
static struct nfsreq *
nfs_repnext(struct nfsreq *rep)
{
	struct nfsreq * nextrep;

	if (rep == NULL)
		return (NULL);
	/*
	 * We need to get and busy the next req before signalling the
	 * current one, otherwise wakeup() may block us and we'll race to
	 * grab the next req.
	 */
	nextrep = TAILQ_NEXT(rep, r_chain);
	if (nextrep != NULL)
		nfs_repbusy(nextrep);
	/* unbusy and signal. */
	rep->r_flags &= ~R_BUSY;
	if ((rep->r_flags & R_WAITING)) {
		rep->r_flags &= ~R_WAITING;
		wakeup(rep);
	}
	return (nextrep);
}

/*
 * Nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out
 * To avoid retransmission attempts on STREAM sockets (in the future) make
 * sure to set the r_retry field to 0 (implies nm_retry == 0).
 */
void
nfs_timer(__unused void *arg)
{
	struct nfsreq *rep;
	mbuf_t m;
	socket_t so;
	struct nfsmount *nmp;
	int timeo;
	int error;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */
	int flags, rexmit, cwnd, sent;
	u_long xid;
	struct timeval now;

	rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);
	microuptime(&now);
	for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
		nmp = rep->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp))
			continue;
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
				"not responding");
			rep->r_flags |= R_TPRINTFMSG;
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(rep);
				continue;
			}
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			/* ensure 62.5 ms floor */
			while (16 * timeo < hz)
				timeo *= 2;
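			/*
			 * (timeo is in ticks; since hz ticks == 1 second,
			 * the doubling loop enforces timeo >= hz/16 ticks,
			 * i.e. the 62.5 ms floor noted above.)
			 */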
1793 if (nmp->nm_timeouts > 0)
1794 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1795 if (rep->r_rtt <= timeo)
1796 continue;
1797 if (nmp->nm_timeouts < 8)
1798 nmp->nm_timeouts++;
1799 }
1800 /*
1801 * Check for too many retransmits. This is never true for
1802 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1803 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1804 */
1805 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1806 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1807 nfs_softterm(rep);
1808 continue;
1809 }
1810 if (nmp->nm_sotype != SOCK_DGRAM) {
1811 if (++rep->r_rexmit > NFS_MAXREXMIT)
1812 rep->r_rexmit = NFS_MAXREXMIT;
1813 continue;
1814 }
1815 if ((so = nmp->nm_so) == NULL)
1816 continue;
1817
1818 /*
1819 * If there is enough space and the window allows..
1820 * Resend it
1821 * Set r_rtt to -1 in case we fail to send it now.
1822 */
1823 rep->r_rtt = -1;
1824 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1825 (rep->r_flags & R_SENT) ||
1826 nmp->nm_sent < nmp->nm_cwnd) &&
1827 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1828 struct msghdr msg;
1829 /*
1830 * Iff first send, start timing
1831 * else turn timing off, backoff timer
1832 * and divide congestion window by 2.
1833 * We update these *before* the send to avoid
1834 * racing against receiving the reply.
1835 * We save them so we can restore them on send error.
1836 */
1837 flags = rep->r_flags;
1838 rexmit = rep->r_rexmit;
1839 cwnd = nmp->nm_cwnd;
1840 sent = nmp->nm_sent;
1841 xid = rep->r_xid;
1842 if (rep->r_flags & R_SENT) {
1843 rep->r_flags &= ~R_TIMING;
1844 if (++rep->r_rexmit > NFS_MAXREXMIT)
1845 rep->r_rexmit = NFS_MAXREXMIT;
1846 nmp->nm_cwnd >>= 1;
1847 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1848 nmp->nm_cwnd = NFS_CWNDSCALE;
1849 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1850 } else {
1851 rep->r_flags |= R_SENT;
1852 nmp->nm_sent += NFS_CWNDSCALE;
1853 }
1854 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1855
1856 bzero(&msg, sizeof(msg));
1857 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1858 msg.msg_name = mbuf_data(nmp->nm_nam);
1859 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1860 }
1861 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1862
1863 FSDBG(535, xid, error, sent, cwnd);
1864
1865 if (error) {
1866 if (error == EWOULDBLOCK) {
1867 rep->r_flags = flags;
1868 rep->r_rexmit = rexmit;
1869 nmp->nm_cwnd = cwnd;
1870 nmp->nm_sent = sent;
1871 rep->r_xid = xid;
1872 }
1873 else {
1874 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1875 int clearerror;
1876 int optlen = sizeof(clearerror);
1877 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1878 }
1879 rep->r_flags = flags | R_RESENDERR;
1880 rep->r_rexmit = rexmit;
1881 nmp->nm_cwnd = cwnd;
1882 nmp->nm_sent = sent;
1883 if (flags & R_SENT)
1884 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1885 }
1886 } else
1887 rep->r_rtt = 0;
1888 }
1889 }
1890 microuptime(&now);
1891 #ifndef NFS_NOSERVER
1892 /*
1893 * Scan the write gathering queues for writes that need to be
1894 * completed now.
1895 */
1896 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1897 lck_mtx_lock(nfsd_mutex);
1898 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1899 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1900 nfsrv_wakenfsd(slp);
1901 }
1902 while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1903 if ((slp->ns_timestamp + 5) > now.tv_sec)
1904 break;
1905 TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1906 nfsrv_slpfree(slp);
1907 }
1908 lck_mtx_unlock(nfsd_mutex);
1909 #endif /* NFS_NOSERVER */
1910
1911 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1912 /*
1913 * We haven't called nfs_buf_freeup() in a little while.
1914 * So, see if we can free up any stale/unused bufs now.
1915 */
1916 nfs_buf_freeup(1);
1917 }
1918
1919 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1920
1921 }
1922
1923
1924 /*
1925 * Test for a termination condition pending on the process.
1926 * This is used to determine if we need to bail on a mount.
1927 * EIO is returned if there has been a soft timeout.
1928 * EINTR is returned if there is a signal pending that is not being ignored
1929 * and the mount is interruptable, or if we are a thread that is in the process
1930 * of cancellation (also SIGKILL posted).
1931 */
1932 int
1933 nfs_sigintr(nmp, rep, p)
1934 struct nfsmount *nmp;
1935 struct nfsreq *rep;
1936 proc_t p;
1937 {
1938 sigset_t pending_sigs;
1939 int context_good = 0;
1940 struct nfsmount *repnmp;
1941 extern proc_t kernproc;
1942
1943 if (nmp == NULL)
1944 return (ENXIO);
1945 if (rep != NULL) {
1946 repnmp = rep->r_nmp;
1947 /* we've had a forced unmount. */
1948 if (repnmp == NULL)
1949 return (ENXIO);
1950 /* request has timed out on a 'soft' mount. */
1951 if (rep->r_flags & R_SOFTTERM)
1952 return (EIO);
1953 /*
1954 * If we're in the midst of a forced unmount and there's been
1955 * a timeout, we're dead, so fail the I/O.
1956 */
1957 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1958 (NFSSTA_FORCE|NFSSTA_TIMEO))
1959 return (EIO);
1960 /* Someone is unmounting us, go soft and mark it. */
1961 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1962 repnmp->nm_flag |= NFSMNT_SOFT;
1963 nmp->nm_state |= NFSSTA_FORCE;
1964 }
1965 /*
1966 * If the mount is hung and we've requested not to hang
1967 * on remote filesystems, then bail now.
1968 */
1969 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1970 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1971 return (EIO);
1972 }
1973 /* XXX: is this valid? this probably should be an assertion. */
1974 if (p == NULL)
1975 return (0);
1976
1977 /* If this thread belongs to the kernel task, the abort check is not needed. */
1978 if ((current_proc() != kernproc) && current_thread_aborted()) {
1979 return (EINTR);
1980 }
1981 /* Mask off signals that are blocked by the thread and the process. */
1982
1983 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1984 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1985 return (EINTR);
1986 return (0);
1987 }
1988
1989 /*
1990 * Lock a socket against others.
1991 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1992 * and also to avoid race conditions between the processes with nfs requests
1993 * in progress when a reconnect is necessary.
1994 */
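/*
 * Callers pair this with nfs_sndunlock(), e.g. (an illustrative
 * sketch, assuming the usual send path):
 *
 *	if ((error = nfs_sndlock(rep)))
 *		return (error);
 *	error = nfs_send(so, nam, m, rep);
 *	nfs_sndunlock(rep);
 */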
1995 int
1996 nfs_sndlock(rep)
1997 struct nfsreq *rep;
1998 {
1999 int *statep;
2000 proc_t p;
2001 int error, slpflag = 0, slptimeo = 0;
2002
2003 if (rep->r_nmp == NULL)
2004 return (ENXIO);
2005 statep = &rep->r_nmp->nm_state;
2006
2007 p = rep->r_procp;
2008 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2009 slpflag = PCATCH;
2010 while (*statep & NFSSTA_SNDLOCK) {
2011 error = nfs_sigintr(rep->r_nmp, rep, p);
2012 if (error)
2013 return (error);
2014 *statep |= NFSSTA_WANTSND;
2015 if (p != NULL && (proc_noremotehang(p)) != 0)
2016 slptimeo = hz;
2017 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
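/*
 * After one interruptible (PCATCH) sleep, fall back to timed
 * sleeps so the nfs_sigintr check at the top of the loop still
 * runs periodically.
 */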
2018 if (slpflag == PCATCH) {
2019 slpflag = 0;
2020 slptimeo = 2 * hz;
2021 }
2022 /*
2023 * Make sure while we slept that the mountpoint didn't go away.
2024 * nfs_sigintr and callers expect it intact.
2025 */
2026 if (!rep->r_nmp)
2027 return (ENXIO); /* don't have lock until out of loop */
2028 }
2029 *statep |= NFSSTA_SNDLOCK;
2030 return (0);
2031 }
2032
2033 /*
2034 * Unlock the stream socket for others.
2035 */
2036 void
2037 nfs_sndunlock(rep)
2038 struct nfsreq *rep;
2039 {
2040 int *statep;
2041
2042 if (rep->r_nmp == NULL)
2043 return;
2044 statep = &rep->r_nmp->nm_state;
2045 if ((*statep & NFSSTA_SNDLOCK) == 0)
2046 panic("nfs sndunlock");
2047 *statep &= ~NFSSTA_SNDLOCK;
2048 if (*statep & NFSSTA_WANTSND) {
2049 *statep &= ~NFSSTA_WANTSND;
2050 wakeup((caddr_t)statep);
2051 }
2052 }
2053
2054 static int
2055 nfs_rcvlock(struct nfsreq *rep)
2056 {
2057 int *statep;
2058 int error, slpflag, slptimeo = 0;
2059
2060 /* make sure we still have our mountpoint */
2061 if (!rep->r_nmp) {
2062 if (rep->r_mrep != NULL)
2063 return (EALREADY);
2064 return (ENXIO);
2065 }
2066
2067 statep = &rep->r_nmp->nm_state;
2068 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2069 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2070 slpflag = PCATCH;
2071 else
2072 slpflag = 0;
2073 while (*statep & NFSSTA_RCVLOCK) {
2074 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2075 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2076 return (error);
2077 } else if (rep->r_mrep != NULL) {
2078 /*
2079 * Don't bother sleeping if reply already arrived
2080 */
2081 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2082 return (EALREADY);
2083 }
2084 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2085 *statep |= NFSSTA_WANTRCV;
2086 /*
2087 * We need to poll if we're P_NOREMOTEHANG so that the
2088 * nfs_sigintr check at the top of the loop runs periodically.
2089 */
2090 if (rep->r_procp != NULL &&
2091 (proc_noremotehang(rep->r_procp)) != 0)
2092 slptimeo = hz;
2093 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2094 if (slpflag == PCATCH) {
2095 slpflag = 0;
2096 slptimeo = 2 * hz;
2097 }
2098 /*
2099 * Make sure while we slept that the mountpoint didn't go away.
2100 * nfs_sigintr and caller nfs_reply expect it intact.
2101 */
2102 if (!rep->r_nmp) {
2103 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2104 return (ENXIO); /* don't have lock until out of loop */
2105 }
2106 }
2107 /*
2108 * nfs_reply will handle it if reply already arrived.
2109 * (We may have slept or been preempted).
2110 */
2111 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2112 *statep |= NFSSTA_RCVLOCK;
2113 return (0);
2114 }
2115
2116 /*
2117 * Unlock the receive side of the socket for others.
2118 */
2119 static void
2120 nfs_rcvunlock(struct nfsreq *rep)
2121 {
2122 int *statep;
2123
2124 if (rep->r_nmp == NULL)
2125 return;
2126 statep = &rep->r_nmp->nm_state;
2127
2128 FSDBG(533, statep, *statep, 0, 0);
2129 if ((*statep & NFSSTA_RCVLOCK) == 0)
2130 panic("nfs rcvunlock");
2131 *statep &= ~NFSSTA_RCVLOCK;
2132 if (*statep & NFSSTA_WANTRCV) {
2133 *statep &= ~NFSSTA_WANTRCV;
2134 wakeup((caddr_t)statep);
2135 }
2136 }
2137
2138
2139 #ifndef NFS_NOSERVER
2140 /*
2141 * Socket upcall routine for the nfsd sockets.
2142 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2143 * Essentially do as much as possible non-blocking; otherwise punt,
2144 * and this routine will be called again with MBUF_WAITOK from an nfsd.
2145 */
2146 void
2147 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2148 {
2149 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2150
2151 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2152 return;
2153
2154 lck_rw_lock_exclusive(&slp->ns_rwlock);
2155 nfsrv_rcv_locked(so, slp, waitflag);
2156 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2157 }
2158 void
2159 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2160 {
2161 mbuf_t m, mp, mhck, m2;
2162 int ns_flag=0, error;
2163 struct msghdr msg;
2164 size_t bytes_read;
2165
2166 if ((slp->ns_flag & SLP_VALID) == 0) {
2167 if (waitflag == MBUF_DONTWAIT)
2168 lck_rw_done(&slp->ns_rwlock);
2169 return;
2170 }
2171
2172 #ifdef notdef
2173 /*
2174 * Define this to test for nfsds handling this under heavy load.
2175 */
2176 if (waitflag == MBUF_DONTWAIT) {
2177 ns_flag = SLP_NEEDQ;
2178 goto dorecs;
2179 }
2180 #endif
2181 if (slp->ns_sotype == SOCK_STREAM) {
2182 /*
2183 * If there are already records on the queue, defer soreceive()
2184 * to an nfsd so that there is feedback to the TCP layer that
2185 * the nfs servers are heavily loaded.
2186 */
2187 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2188 ns_flag = SLP_NEEDQ;
2189 goto dorecs;
2190 }
2191
2192 /*
2193 * Do soreceive().
2194 */
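/*
 * The huge bytes_read value effectively asks for "everything
 * currently queued"; sock_receivembuf() rewrites it with the
 * number of bytes actually received.
 */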
2195 bytes_read = 1000000000;
2196 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2197 if (error || mp == NULL) {
2198 if (error == EWOULDBLOCK)
2199 ns_flag = SLP_NEEDQ;
2200 else
2201 ns_flag = SLP_DISCONN;
2202 goto dorecs;
2203 }
2204 m = mp;
2205 if (slp->ns_rawend) {
2206 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2207 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2208 slp->ns_cc += bytes_read;
2209 } else {
2210 slp->ns_raw = m;
2211 slp->ns_cc = bytes_read;
2212 }
2213 while ((m2 = mbuf_next(m)))
2214 m = m2;
2215 slp->ns_rawend = m;
2216
2217 /*
2218 * Now try and parse record(s) out of the raw stream data.
2219 */
2220 error = nfsrv_getstream(slp, waitflag);
2221 if (error) {
2222 if (error == EPERM)
2223 ns_flag = SLP_DISCONN;
2224 else
2225 ns_flag = SLP_NEEDQ;
2226 }
2227 } else {
2228 struct sockaddr_storage nam;
2229
2230 bzero(&msg, sizeof(msg));
2231 msg.msg_name = (caddr_t)&nam;
2232 msg.msg_namelen = sizeof(nam);
2233
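/*
 * Datagram socket: pull one packet at a time.  Each sender's
 * address is prepended to its record as an MBUF_TYPE_SONAME
 * mbuf below so that the nfsd can address its reply.
 */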
2234 do {
2235 bytes_read = 1000000000;
2236 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2237 if (mp) {
2238 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2239 mbuf_setlen(mhck, nam.ss_len);
2240 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2241 m = mhck;
2242 if (mbuf_setnext(m, mp)) {
2243 /* trouble... just drop it */
2244 printf("nfsrv_rcv: mbuf_setnext failed\n");
2245 mbuf_free(mhck);
2246 m = mp;
2247 }
2248 } else {
2249 m = mp;
2250 }
2251 if (slp->ns_recend)
2252 mbuf_setnextpkt(slp->ns_recend, m);
2253 else
2254 slp->ns_rec = m;
2255 slp->ns_recend = m;
2256 mbuf_setnextpkt(m, NULL);
2257 }
2258 #if 0
2259 if (error) {
2260 /*
2261 * This may be needed in the future to support
2262 * non-byte-stream connection-oriented protocols
2263 * such as SCTP.
2264 */
2265 /*
2266 * This (slp->ns_sotype == SOCK_STREAM) should really
2267 * be a check for PR_CONNREQUIRED.
2268 */
2269 if ((slp->ns_sotype == SOCK_STREAM)
2270 && error != EWOULDBLOCK) {
2271 ns_flag = SLP_DISCONN;
2272 goto dorecs;
2273 }
2274 }
2275 #endif
2276 } while (mp);
2277 }
2278
2279 /*
2280 * Now try and process the request records, non-blocking.
2281 */
2282 dorecs:
2283 if (ns_flag)
2284 slp->ns_flag |= ns_flag;
2285 if (waitflag == MBUF_DONTWAIT) {
2286 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2287 lck_rw_done(&slp->ns_rwlock);
2288 if (wake && nfs_numnfsd) {
2289 lck_mtx_lock(nfsd_mutex);
2290 nfsrv_wakenfsd(slp);
2291 lck_mtx_unlock(nfsd_mutex);
2292 }
2293 }
2294 }
2295
2296 /*
2297 * Try and extract an RPC request from the mbuf data list received on a
2298 * stream socket. The "waitflag" argument indicates whether or not it
2299 * can sleep.
2300 */
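/*
 * For reference: stream records use RPC record marking (RFC 1831).
 * Each fragment is preceded by a 4-byte mark whose low 31 bits give
 * the fragment length and whose high bit is set on the last
 * fragment; e.g. a mark of 0x80000064 introduces a final fragment
 * of 100 bytes.
 */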
2301 static int
2302 nfsrv_getstream(slp, waitflag)
2303 struct nfssvc_sock *slp;
2304 int waitflag;
2305 {
2306 mbuf_t m;
2307 char *cp1, *cp2, *mdata;
2308 int len, mlen, error;
2309 mbuf_t om, m2, recm;
2310 u_long recmark;
2311
2312 if (slp->ns_flag & SLP_GETSTREAM)
2313 panic("nfs getstream");
2314 slp->ns_flag |= SLP_GETSTREAM;
2315 for (;;) {
2316 if (slp->ns_reclen == 0) {
2317 if (slp->ns_cc < NFSX_UNSIGNED) {
2318 slp->ns_flag &= ~SLP_GETSTREAM;
2319 return (0);
2320 }
2321 m = slp->ns_raw;
2322 mdata = mbuf_data(m);
2323 mlen = mbuf_len(m);
2324 if (mlen >= NFSX_UNSIGNED) {
2325 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2326 mdata += NFSX_UNSIGNED;
2327 mlen -= NFSX_UNSIGNED;
2328 mbuf_setdata(m, mdata, mlen);
2329 } else {
2330 cp1 = (caddr_t)&recmark;
2331 cp2 = mdata;
2332 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2333 while (mlen == 0) {
2334 m = mbuf_next(m);
2335 cp2 = mbuf_data(m);
2336 mlen = mbuf_len(m);
2337 }
2338 *cp1++ = *cp2++;
2339 mlen--;
2340 mbuf_setdata(m, cp2, mlen);
2341 }
2342 }
2343 slp->ns_cc -= NFSX_UNSIGNED;
2344 recmark = ntohl(recmark);
2345 slp->ns_reclen = recmark & ~0x80000000;
2346 if (recmark & 0x80000000)
2347 slp->ns_flag |= SLP_LASTFRAG;
2348 else
2349 slp->ns_flag &= ~SLP_LASTFRAG;
2350 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2351 slp->ns_flag &= ~SLP_GETSTREAM;
2352 return (EPERM);
2353 }
2354 }
2355
2356 /*
2357 * Now get the record part.
2358 *
2359 * Note that slp->ns_reclen may be 0. Linux sometimes
2360 * generates 0-length RPCs.
2361 */
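/*
 * Three cases below: ns_cc == ns_reclen takes the whole raw chain
 * as the record; ns_cc > ns_reclen splits the chain at the record
 * boundary; ns_cc < ns_reclen means the record is still incomplete,
 * so return and wait for more data.
 */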
2362 recm = NULL;
2363 if (slp->ns_cc == slp->ns_reclen) {
2364 recm = slp->ns_raw;
2365 slp->ns_raw = slp->ns_rawend = NULL;
2366 slp->ns_cc = slp->ns_reclen = 0;
2367 } else if (slp->ns_cc > slp->ns_reclen) {
2368 len = 0;
2369 m = slp->ns_raw;
2370 mlen = mbuf_len(m);
2371 mdata = mbuf_data(m);
2372 om = NULL;
2373 while (len < slp->ns_reclen) {
2374 if ((len + mlen) > slp->ns_reclen) {
2375 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2376 slp->ns_flag &= ~SLP_GETSTREAM;
2377 return (EWOULDBLOCK);
2378 }
2379 if (om) {
2380 if (mbuf_setnext(om, m2)) {
2381 /* trouble... just drop it */
2382 printf("nfsrv_getstream: mbuf_setnext failed\n");
2383 mbuf_freem(m2);
2384 slp->ns_flag &= ~SLP_GETSTREAM;
2385 return (EWOULDBLOCK);
2386 }
2387 recm = slp->ns_raw;
2388 } else {
2389 recm = m2;
2390 }
2391 mdata += slp->ns_reclen - len;
2392 mlen -= slp->ns_reclen - len;
2393 mbuf_setdata(m, mdata, mlen);
2394 len = slp->ns_reclen;
2395 } else if ((len + mlen) == slp->ns_reclen) {
2396 om = m;
2397 len += mlen;
2398 m = mbuf_next(m);
2399 recm = slp->ns_raw;
2400 if (mbuf_setnext(om, NULL)) {
2401 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2402 slp->ns_flag &= ~SLP_GETSTREAM;
2403 return (EWOULDBLOCK);
2404 }
2405 mlen = mbuf_len(m);
2406 mdata = mbuf_data(m);
2407 } else {
2408 om = m;
2409 len += mlen;
2410 m = mbuf_next(m);
2411 mlen = mbuf_len(m);
2412 mdata = mbuf_data(m);
2413 }
2414 }
2415 slp->ns_raw = m;
2416 slp->ns_cc -= len;
2417 slp->ns_reclen = 0;
2418 } else {
2419 slp->ns_flag &= ~SLP_GETSTREAM;
2420 return (0);
2421 }
2422
2423 /*
2424 * Accumulate the fragments into a record.
2425 */
2426 if (slp->ns_frag == NULL) {
2427 slp->ns_frag = recm;
2428 } else {
2429 m = slp->ns_frag;
2430 while ((m2 = mbuf_next(m)))
2431 m = m2;
2432 if ((error = mbuf_setnext(m, recm)))
2433 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2434 }
2435 if (slp->ns_flag & SLP_LASTFRAG) {
2436 if (slp->ns_recend)
2437 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2438 else
2439 slp->ns_rec = slp->ns_frag;
2440 slp->ns_recend = slp->ns_frag;
2441 slp->ns_frag = NULL;
2442 }
2443 }
2444 }
2445
2446 /*
2447 * Dequeue the next request record from a server socket and parse its RPC header.
2448 */
2449 int
2450 nfsrv_dorec(slp, nfsd, ndp)
2451 struct nfssvc_sock *slp;
2452 struct nfsd *nfsd;
2453 struct nfsrv_descript **ndp;
2454 {
2455 mbuf_t m;
2456 mbuf_t nam;
2457 struct nfsrv_descript *nd;
2458 int error;
2459
2460 *ndp = NULL;
2461 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2462 return (ENOBUFS);
2463 MALLOC_ZONE(nd, struct nfsrv_descript *,
2464 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2465 if (!nd)
2466 return (ENOMEM);
2467 m = slp->ns_rec;
2468 slp->ns_rec = mbuf_nextpkt(m);
2469 if (slp->ns_rec)
2470 mbuf_setnextpkt(m, NULL);
2471 else
2472 slp->ns_recend = NULL;
2473 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2474 nam = m;
2475 m = mbuf_next(m);
2476 if ((error = mbuf_setnext(nam, NULL)))
2477 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2478 } else
2479 nam = NULL;
2480 nd->nd_md = nd->nd_mrep = m;
2481 nd->nd_nam2 = nam;
2482 nd->nd_dpos = mbuf_data(m);
2483 error = nfs_getreq(nd, nfsd, TRUE);
2484 if (error) {
2485 if (nam)
2486 mbuf_freem(nam);
2487 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2488 return (error);
2489 }
2490 *ndp = nd;
2491 nfsd->nfsd_nd = nd;
2492 return (0);
2493 }
2494
2495 /*
2496 * Parse an RPC request
2497 * - verify it
2498 * - fill in the cred struct.
2499 */
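/*
 * For reference, the call header dissected below is laid out per
 * RFC 1831: xid, message type (CALL), RPC version (2), program,
 * program version, procedure, then a credential and a verifier,
 * each an opaque body prefixed by its flavor and length.
 */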
2500 int
2501 nfs_getreq(nd, nfsd, has_header)
2502 struct nfsrv_descript *nd;
2503 struct nfsd *nfsd;
2504 int has_header;
2505 {
2506 int len, i;
2507 u_long *tl;
2508 long t1;
2509 uio_t uiop;
2510 caddr_t dpos, cp2, cp;
2511 u_long nfsvers, auth_type;
2512 uid_t nickuid;
2513 int error = 0, ticklen;
2514 mbuf_t mrep, md;
2515 struct nfsuid *nuidp;
2516 uid_t user_id;
2517 gid_t group_id;
2518 int ngroups;
2519 struct ucred temp_cred;
2520 struct timeval tvin, tvout, now;
2521 char uio_buf[ UIO_SIZEOF(1) ];
2522 #if 0 /* until encrypted keys are implemented */
2523 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2524 #endif
2525
2526 nd->nd_cr = NULL;
2527
2528 mrep = nd->nd_mrep;
2529 md = nd->nd_md;
2530 dpos = nd->nd_dpos;
2531 if (has_header) {
2532 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2533 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2534 if (*tl++ != rpc_call) {
2535 mbuf_freem(mrep);
2536 return (EBADRPC);
2537 }
2538 } else
2539 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2540 nd->nd_repstat = 0;
2541 nd->nd_flag = 0;
2542 if (*tl++ != rpc_vers) {
2543 nd->nd_repstat = ERPCMISMATCH;
2544 nd->nd_procnum = NFSPROC_NOOP;
2545 return (0);
2546 }
2547 if (*tl != nfs_prog) {
2548 nd->nd_repstat = EPROGUNAVAIL;
2549 nd->nd_procnum = NFSPROC_NOOP;
2550 return (0);
2551 }
2552 tl++;
2553 nfsvers = fxdr_unsigned(u_long, *tl++);
2554 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2555 nd->nd_repstat = EPROGMISMATCH;
2556 nd->nd_procnum = NFSPROC_NOOP;
2557 return (0);
2558 }
2559 else if (nfsvers == NFS_VER3)
2560 nd->nd_flag = ND_NFSV3;
2561 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2562 if (nd->nd_procnum == NFSPROC_NULL)
2563 return (0);
2564 if ((nd->nd_procnum >= NFS_NPROCS) ||
2565 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2566 nd->nd_repstat = EPROCUNAVAIL;
2567 nd->nd_procnum = NFSPROC_NOOP;
2568 return (0);
2569 }
2570 if ((nd->nd_flag & ND_NFSV3) == 0)
2571 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2572 auth_type = *tl++;
2573 len = fxdr_unsigned(int, *tl++);
2574 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2575 mbuf_freem(mrep);
2576 return (EBADRPC);
2577 }
2578
2579 nd->nd_flag &= ~ND_KERBAUTH;
2580 /*
2581 * Handle auth_unix or auth_kerb.
2582 */
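/*
 * An AUTH_UNIX credential body (RFC 1831, Appendix A) carries a
 * stamp, the machine name (skipped below), a uid, a gid, and up
 * to RPCAUTH_UNIXGIDS supplemental gids.
 */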
2583 if (auth_type == rpc_auth_unix) {
2584 len = fxdr_unsigned(int, *++tl);
2585 if (len < 0 || len > NFS_MAXNAMLEN) {
2586 mbuf_freem(mrep);
2587 return (EBADRPC);
2588 }
2589 bzero(&temp_cred, sizeof(temp_cred));
2590 nfsm_adv(nfsm_rndup(len));
2591 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2592 user_id = fxdr_unsigned(uid_t, *tl++);
2593 group_id = fxdr_unsigned(gid_t, *tl++);
2594 temp_cred.cr_groups[0] = group_id;
2595 len = fxdr_unsigned(int, *tl);
2596 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2597 mbuf_freem(mrep);
2598 return (EBADRPC);
2599 }
2600 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2601 for (i = 1; i <= len; i++)
2602 if (i < NGROUPS)
2603 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2604 else
2605 tl++;
2606 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2607 if (ngroups > 1)
2608 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2609 len = fxdr_unsigned(int, *++tl);
2610 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2611 mbuf_freem(mrep);
2612 return (EBADRPC);
2613 }
2614 temp_cred.cr_uid = user_id;
2615 temp_cred.cr_ngroups = ngroups;
2616 nd->nd_cr = kauth_cred_create(&temp_cred);
2617 if (nd->nd_cr == NULL) {
2618 nd->nd_repstat = ENOMEM;
2619 nd->nd_procnum = NFSPROC_NOOP;
2620 return (0);
2621 }
2622 if (len > 0)
2623 nfsm_adv(nfsm_rndup(len));
2624 } else if (auth_type == rpc_auth_kerb) {
2625 switch (fxdr_unsigned(int, *tl++)) {
2626 case RPCAKN_FULLNAME:
2627 ticklen = fxdr_unsigned(int, *tl);
2628 *((u_long *)nfsd->nfsd_authstr) = *tl;
2629 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2630 &uio_buf[0], sizeof(uio_buf));
2631 if (!uiop) {
2632 nd->nd_repstat = ENOMEM;
2633 nd->nd_procnum = NFSPROC_NOOP;
2634 return (0);
2635 }
2636
2637 // LP64todo - fix this
2638 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2639 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2640 mbuf_freem(mrep);
2641 return (EBADRPC);
2642 }
2643 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2644 // LP64todo - fix this
2645 nfsm_mtouio(uiop, uio_resid(uiop));
2646 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2647 if (*tl++ != rpc_auth_kerb ||
2648 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2649 printf("Bad kerb verifier\n");
2650 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2651 nd->nd_procnum = NFSPROC_NOOP;
2652 return (0);
2653 }
2654 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2655 tl = (u_long *)cp;
2656 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2657 printf("Not fullname kerb verifier\n");
2658 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2659 nd->nd_procnum = NFSPROC_NOOP;
2660 return (0);
2661 }
2662 cp += NFSX_UNSIGNED;
2663 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2664 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2665 nd->nd_flag |= ND_KERBFULL;
2666 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2667 break;
2668 case RPCAKN_NICKNAME:
2669 if (len != 2 * NFSX_UNSIGNED) {
2670 printf("Kerb nickname short\n");
2671 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2672 nd->nd_procnum = NFSPROC_NOOP;
2673 return (0);
2674 }
2675 nickuid = fxdr_unsigned(uid_t, *tl);
2676 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2677 if (*tl++ != rpc_auth_kerb ||
2678 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2679 printf("Kerb nick verifier bad\n");
2680 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2681 nd->nd_procnum = NFSPROC_NOOP;
2682 return (0);
2683 }
2684 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2685 tvin.tv_sec = *tl++;
2686 tvin.tv_usec = *tl;
2687
2688 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2689 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2690 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2691 (!nd->nd_nam2 ||
2692 netaddr_match(NU_NETFAM(nuidp),
2693 &nuidp->nu_haddr, nd->nd_nam2)))
2694 break;
2695 }
2696 if (!nuidp) {
2697 nd->nd_repstat =
2698 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2699 nd->nd_procnum = NFSPROC_NOOP;
2700 return (0);
2701 }
2702
2703 /*
2704 * Now, decrypt the timestamp using the session key
2705 * and validate it.
2706 */
2707 #if NFSKERB
2708 XXX
2709 #endif
2710
2711 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2712 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2713 microtime(&now);
2714 if (nuidp->nu_expire < now.tv_sec ||
2715 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2716 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2717 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2718 nuidp->nu_expire = 0;
2719 nd->nd_repstat =
2720 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2721 nd->nd_procnum = NFSPROC_NOOP;
2722 return (0);
2723 }
2724 bzero(&temp_cred, sizeof(temp_cred));
2725 ngroups = nuidp->nu_cr->cr_ngroups;
2726 for (i = 0; i < ngroups; i++)
2727 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2728 if (ngroups > 1)
2729 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2730
2731 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2732 temp_cred.cr_ngroups = ngroups;
2733 nd->nd_cr = kauth_cred_create(&temp_cred);
2734 if (!nd->nd_cr) {
2735 nd->nd_repstat = ENOMEM;
2736 nd->nd_procnum = NFSPROC_NOOP;
2737 return (0);
2738 }
2739 nd->nd_flag |= ND_KERBNICK;
2740 }
2741 } else {
2742 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2743 nd->nd_procnum = NFSPROC_NOOP;
2744 return (0);
2745 }
2746
2747 nd->nd_md = md;
2748 nd->nd_dpos = dpos;
2749 return (0);
2750 nfsmout:
2751 if (nd->nd_cr)
2752 kauth_cred_rele(nd->nd_cr);
2753 return (error);
2754 }
2755
2756 /*
2757 * Search for a sleeping nfsd and wake it up.
2758 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2759 * running nfsds will go look for the work in the nfssvc_sock list.
2760 * Note: Must be called with nfsd_mutex held.
2761 */
2762 void
2763 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2764 {
2765 struct nfsd *nd;
2766
2767 if ((slp->ns_flag & SLP_VALID) == 0)
2768 return;
2769
2770 lck_rw_lock_exclusive(&slp->ns_rwlock);
2771
2772 if (nfsd_waiting) {
2773 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2774 if (nd->nfsd_flag & NFSD_WAITING) {
2775 nd->nfsd_flag &= ~NFSD_WAITING;
2776 if (nd->nfsd_slp)
2777 panic("nfsd wakeup");
2778 slp->ns_sref++;
2779 nd->nfsd_slp = slp;
2780 lck_rw_done(&slp->ns_rwlock);
2781 wakeup((caddr_t)nd);
2782 return;
2783 }
2784 }
2785 }
2786
2787 slp->ns_flag |= SLP_DOREC;
2788
2789 lck_rw_done(&slp->ns_rwlock);
2790
2791 nfsd_head_flag |= NFSD_CHECKSLP;
2792 }
2793 #endif /* NFS_NOSERVER */
2794
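/*
 * Report an NFS server condition to the user: tprintf() the message
 * to the controlling terminal of "p" (if any) and to the system log.
 */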
2795 static int
2796 nfs_msg(proc_t p,
2797 const char *server,
2798 const char *msg,
2799 int error)
2800 {
2801 tpr_t tpr;
2802
2803 if (p)
2804 tpr = tprintf_open(p);
2805 else
2806 tpr = NULL;
2807 if (error)
2808 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2809 error);
2810 else
2811 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2812 tprintf_close(tpr);
2813 return (0);
2814 }
2815
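/*
 * Mark an NFS mount as unresponsive: post VQ_NOTRESP and/or
 * VQ_NOTRESPLOCK the first time the corresponding timeout flag is
 * set, then log a message for the user.
 */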
2816 void
2817 nfs_down(nmp, proc, error, flags, msg)
2818 struct nfsmount *nmp;
2819 proc_t proc;
2820 int error, flags;
2821 const char *msg;
2822 {
2823 if (nmp == NULL)
2824 return;
2825 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2826 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2827 nmp->nm_state |= NFSSTA_TIMEO;
2828 }
2829 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2830 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2831 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2832 }
2833 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2834 }
2835
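/*
 * Mark an NFS mount as responsive again: log the message (if any),
 * then clear whichever timeout flags were set and post the matching
 * "server OK" vfs events.
 */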
2836 void
2837 nfs_up(nmp, proc, flags, msg)
2838 struct nfsmount *nmp;
2839 proc_t proc;
2840 int flags;
2841 const char *msg;
2842 {
2843 if (nmp == NULL)
2844 return;
2845 if (msg)
2846 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2847 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2848 nmp->nm_state &= ~NFSSTA_TIMEO;
2849 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2850 }
2851 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2852 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2853 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2854 }
2855 }
2856