]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_socket.c
f0ca1838efbb8125ccda5acb4d25857a3b2248e1
[apple/xnu.git] / bsd / nfs / nfs_socket.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1991, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
66 */
67
68 /*
69 * Socket operations for use by nfs
70 */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc.h>
75 #include <sys/kauth.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/kpi_mbuf.h>
79 #include <sys/malloc.h>
80 #include <sys/vnode.h>
81 #include <sys/domain.h>
82 #include <sys/protosw.h>
83 #include <sys/socket.h>
84 #include <sys/syslog.h>
85 #include <sys/tprintf.h>
86 #include <sys/uio_internal.h>
87 #include <libkern/OSAtomic.h>
88
89 #include <sys/time.h>
90 #include <kern/clock.h>
91 #include <kern/task.h>
92 #include <kern/thread.h>
93 #include <sys/user.h>
94
95 #include <netinet/in.h>
96 #include <netinet/tcp.h>
97
98 #include <nfs/rpcv2.h>
99 #include <nfs/nfsproto.h>
100 #include <nfs/nfs.h>
101 #include <nfs/xdr_subs.h>
102 #include <nfs/nfsm_subs.h>
103 #include <nfs/nfsmount.h>
104 #include <nfs/nfsnode.h>
105 #include <nfs/nfsrtt.h>
106
107 #include <sys/kdebug.h>
108
109 #define FSDBG(A, B, C, D, E) \
110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
111 (int)(B), (int)(C), (int)(D), (int)(E), 0)
112 #define FSDBG_TOP(A, B, C, D, E) \
113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
114 (int)(B), (int)(C), (int)(D), (int)(E), 0)
115 #define FSDBG_BOT(A, B, C, D, E) \
116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
117 (int)(B), (int)(C), (int)(D), (int)(E), 0)
118
119 /*
120 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
121 * Use the mean and mean deviation of rtt for the appropriate type of rpc
122 * for the frequent rpcs and a default for the others.
123 * The justification for doing "other" this way is that these rpcs
124 * happen so infrequently that timer est. would probably be stale.
125 * Also, since many of these rpcs are
126 * non-idempotent, a conservative timeout is desired.
127 * getattr, lookup - A+2D
128 * read, write - A+4D
129 * other - nm_timeo
130 */
131 #define NFS_RTO(n, t) \
132 ((t) == 0 ? (n)->nm_timeo : \
133 ((t) < 3 ? \
134 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
135 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
136 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
137 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
138 /*
139 * External data, mostly RPC constants in XDR form
140 */
141 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
142 rpc_msgaccepted, rpc_call, rpc_autherr,
143 rpc_auth_kerb;
144 extern u_long nfs_prog;
145 extern struct nfsstats nfsstats;
146 extern int nfsv3_procid[NFS_NPROCS];
147 extern int nfs_ticks;
148 extern u_long nfs_xidwrap;
149
150 /*
151 * Defines which timer to use for the procnum.
152 * 0 - default
153 * 1 - getattr
154 * 2 - lookup
155 * 3 - read
156 * 4 - write
157 */
/* Timer class for each RPC procedure number (see table above):
 * 0 = default, 1 = getattr, 2 = lookup, 3 = read, 4 = write. */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
};
161
162 /*
163 * There is a congestion window for outstanding rpcs maintained per mount
164 * point. The cwnd size is adjusted in roughly the way that:
165 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
166 * SIGCOMM '88". ACM, August 1988.
167 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
168 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
169 * of rpcs is in progress.
170 * (The sent count and cwnd are scaled for integer arith.)
171 * Variants of "slow start" were tried and were found to be too much of a
172 * performance hit (ave. rtt 3 times larger),
173 * I suspect due to the large rtt that nfs rpcs have.
174 */
175 #define NFS_CWNDSCALE 256
176 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
177 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
178 int nfsrtton = 0;
179 struct nfsrtt nfsrtt;
180
181 static int nfs_rcvlock(struct nfsreq *);
182 static void nfs_rcvunlock(struct nfsreq *);
183 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
184 static int nfs_reconnect(struct nfsreq *rep);
185 static void nfs_repdequeue(struct nfsreq *rep);
186
187 /* XXX */
188 boolean_t current_thread_aborted(void);
189 kern_return_t thread_terminate(thread_t);
190
191 #ifndef NFS_NOSERVER
192 static int nfsrv_getstream(struct nfssvc_sock *,int);
193
/*
 * NFS server-side dispatch table: one handler per RPC procedure
 * number, terminated by a no-op handler.
 */
int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
			    struct nfssvc_sock *slp,
			    proc_t procp,
			    mbuf_t *mreqp) = {
	nfsrv_null,
	nfsrv_getattr,
	nfsrv_setattr,
	nfsrv_lookup,
	nfsrv3_access,
	nfsrv_readlink,
	nfsrv_read,
	nfsrv_write,
	nfsrv_create,
	nfsrv_mkdir,
	nfsrv_symlink,
	nfsrv_mknod,
	nfsrv_remove,
	nfsrv_rmdir,
	nfsrv_rename,
	nfsrv_link,
	nfsrv_readdir,
	nfsrv_readdirplus,
	nfsrv_statfs,
	nfsrv_fsinfo,
	nfsrv_pathconf,
	nfsrv_commit,
	nfsrv_noop
};
222 #endif /* NFS_NOSERVER */
223
224
225 /*
226 * attempt to bind a socket to a reserved port
227 */
228 static int
229 nfs_bind_resv(struct nfsmount *nmp)
230 {
231 socket_t so = nmp->nm_so;
232 struct sockaddr_in sin;
233 int error;
234 u_short tport;
235
236 if (!so)
237 return (EINVAL);
238
239 sin.sin_len = sizeof (struct sockaddr_in);
240 sin.sin_family = AF_INET;
241 sin.sin_addr.s_addr = INADDR_ANY;
242 tport = IPPORT_RESERVED - 1;
243 sin.sin_port = htons(tport);
244
245 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
246 (--tport > IPPORT_RESERVED / 2))
247 sin.sin_port = htons(tport);
248 return (error);
249 }
250
251 /*
252 * variables for managing the nfs_bind_resv_thread
253 */
254 int nfs_resv_mounts = 0;
255 static int nfs_bind_resv_thread_state = 0;
256 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
257 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
258 lck_grp_t *nfs_bind_resv_lck_grp;
259 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
260 lck_attr_t *nfs_bind_resv_lck_attr;
261 lck_mtx_t *nfs_bind_resv_mutex;
262 struct nfs_bind_resv_request {
263 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
264 struct nfsmount *brr_nmp;
265 int brr_error;
266 };
267 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
268
269 /*
270 * thread to handle any reserved port bind requests
271 */
/*
 * thread to handle any reserved port bind requests
 */
static void
nfs_bind_resv_thread(void)
{
	struct nfs_bind_resv_request *brreq;

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;

	/* Service bind requests for as long as any mounts need them. */
	while (nfs_resv_mounts > 0) {
		lck_mtx_lock(nfs_bind_resv_mutex);
		/*
		 * Drain the request queue, dropping the mutex around the
		 * (potentially blocking) bind itself.
		 */
		while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
			TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
			lck_mtx_unlock(nfs_bind_resv_mutex);
			brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
			/* requester is tsleep()ing on brreq in nfs_bind_resv_nopriv() */
			wakeup(brreq);
			lck_mtx_lock(nfs_bind_resv_mutex);
		}
		/* PDROP: the mutex is released while we sleep for more work */
		msleep((caddr_t)&nfs_bind_resv_request_queue,
		    nfs_bind_resv_mutex, PSOCK | PDROP,
		    "nfs_bind_resv_request_queue", 0);
	}

	/* back to INITTED so a later mount can restart the thread */
	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
	(void) thread_terminate(current_thread());
}
296
297 int
298 nfs_bind_resv_thread_wake(void)
299 {
300 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
301 return (EIO);
302 wakeup(&nfs_bind_resv_request_queue);
303 return (0);
304 }
305
306 /*
307 * underprivileged procs call this to request nfs_bind_resv_thread
308 * to perform the reserved port binding for them.
309 */
310 static int
311 nfs_bind_resv_nopriv(struct nfsmount *nmp)
312 {
313 struct nfs_bind_resv_request brreq;
314 int error;
315
316 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
317 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
318 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
319 lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
320 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
321 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
322 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
323 TAILQ_INIT(&nfs_bind_resv_request_queue);
324 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
325 }
326 kernel_thread(kernel_task, nfs_bind_resv_thread);
327 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
328 }
329
330 brreq.brr_nmp = nmp;
331 brreq.brr_error = 0;
332
333 lck_mtx_lock(nfs_bind_resv_mutex);
334 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
335 lck_mtx_unlock(nfs_bind_resv_mutex);
336
337 error = nfs_bind_resv_thread_wake();
338 if (error) {
339 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
340 /* Note: we might be able to simply restart the thread */
341 return (error);
342 }
343
344 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
345
346 return (brreq.brr_error);
347 }
348
349 /*
350 * Initialize sockets and congestion for a new NFS connection.
351 * We do not free the sockaddr if error.
352 */
/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 *
 * Creates the socket, optionally binds it to a reserved port,
 * connects it (unless NFSMNT_NOCONN), sets receive/send timeouts
 * and buffer sizes, and seeds the mount's RTT/congestion-window
 * state.  On any failure, tears the socket down via nfs_disconnect()
 * and returns the error.
 */
int
nfs_connect(
	struct nfsmount *nmp,
	__unused struct nfsreq *rep)
{
	socket_t so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct timeval timeo;

	nmp->nm_so = 0;
	saddr = mbuf_data(nmp->nm_nam);
	/* create a socket matching the mount's address family/type/protocol */
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
			nmp->nm_soproto, 0, 0, &nmp->nm_so);
	if (error) {
		goto bad;
	}
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		proc_t p;
		/*
		 * sobind() requires current_proc() to have superuser privs.
		 * If this bind is part of a reconnect, and the current proc
		 * doesn't have superuser privs, we hand the sobind() off to
		 * a kernel thread to process.
		 */
		if ((nmp->nm_state & NFSSTA_MOUNTED) &&
		    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
			/* request nfs_bind_resv_thread() to do bind */
			error = nfs_bind_resv_nopriv(nmp);
		} else {
			error = nfs_bind_resv(nmp);
		}
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			/* stream sockets must be connected */
			error = ENOTCONN;
			goto bad;
		}
	} else {
		struct timeval tv;
		tv.tv_sec = 2;
		tv.tv_usec = 0;
		/* start a non-blocking connect, then poll for completion,
		 * checking for interruption between 2-second waits */
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && error != EINPROGRESS) {
			goto bad;
		}

		while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
			if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
				goto bad;
			}
		}
	}

	/*
	 * Always time out on receive, this allows us to reconnect the
	 * socket to deal with network changes.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = 2;
	/* NOTE(review): errors from these two timeout setsockopt calls are
	 * not checked; presumably a failed timeout setting is treated as
	 * non-fatal — confirm that is intentional. */
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
		timeo.tv_sec = 5;
	} else {
		timeo.tv_sec = 0;	/* hard, non-interruptible mounts: no send timeout */
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));

	/* size socket buffers from the mount's I/O sizes and readahead */
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else {
		int proto;
		int on = 1;

		sock_gettype(so, NULL, NULL, &proto);
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");

		// Assume that SOCK_STREAM always requires a connection
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

		if (proto == IPPROTO_TCP) {
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
		}

		/* streams need extra room for the RPC record mark (u_long) */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	}

	/* clamp reservations to the socket-buffer ceiling */
	if (sndreserve > NFS_MAXSOCKBUF)
		sndreserve = NFS_MAXSOCKBUF;
	if (rcvreserve > NFS_MAXSOCKBUF)
		rcvreserve = NFS_MAXSOCKBUF;
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
	if (error) {
		goto bad;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
	if (error) {
		goto bad;
	}

	sock_nointerrupt(so, 1);

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}
491
492 /*
493 * Reconnect routine:
494 * Called when a connection is broken on a reliable protocol.
495 * - clean up the old socket
496 * - nfs_connect() again
497 * - set R_MUSTRESEND for all outstanding requests on mount point
498 * If this fails the mount point is DEAD!
499 * nb: Must be called with the nfs_sndlock() set on the mount point.
500 */
501 static int
502 nfs_reconnect(struct nfsreq *rep)
503 {
504 struct nfsreq *rp;
505 struct nfsmount *nmp = rep->r_nmp;
506 int error;
507
508 nfs_disconnect(nmp);
509 while ((error = nfs_connect(nmp, rep))) {
510 if (error == EINTR || error == ERESTART)
511 return (EINTR);
512 if (error == EIO)
513 return (EIO);
514 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
515 "can not connect");
516 rep->r_flags |= R_TPRINTFMSG;
517 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
518 /* we're not yet completely mounted and */
519 /* we can't reconnect, so we fail */
520 return (error);
521 }
522 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
523 return (error);
524 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
525 }
526
527 /*
528 * Loop through outstanding request list and fix up all requests
529 * on old socket.
530 */
531 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
532 if (rp->r_nmp == nmp)
533 rp->r_flags |= R_MUSTRESEND;
534 }
535 return (0);
536 }
537
538 /*
539 * NFS disconnect. Clean up and unlink.
540 */
541 void
542 nfs_disconnect(struct nfsmount *nmp)
543 {
544 socket_t so;
545
546 if (nmp->nm_so) {
547 so = nmp->nm_so;
548 nmp->nm_so = 0;
549 sock_shutdown(so, 2);
550 sock_close(so);
551 }
552 }
553
554 /*
555 * This is the nfs send routine. For connection based socket types, it
556 * must be called with an nfs_sndlock() on the socket.
557 * "rep == NULL" indicates that it has been called from a server.
558 * For the client side:
559 * - return EINTR if the RPC is terminated, 0 otherwise
560 * - set R_MUSTRESEND if the send fails for any reason
561 * - do any cleanup required by recoverable socket errors (???)
562 * For the server side:
563 * - return EINTR or ERESTART if interrupted by a signal
564 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
565 * - do any cleanup required by recoverable socket errors (???)
566 */
567 int
568 nfs_send(so, nam, top, rep)
569 socket_t so;
570 mbuf_t nam;
571 mbuf_t top;
572 struct nfsreq *rep;
573 {
574 struct sockaddr *sendnam;
575 int error, error2, sotype, flags;
576 u_long xidqueued = 0;
577 struct nfsreq *rp;
578 char savenametolog[MAXPATHLEN];
579 struct msghdr msg;
580
581 if (rep) {
582 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
583 if (error) {
584 mbuf_freem(top);
585 return (error);
586 }
587 if ((so = rep->r_nmp->nm_so) == NULL) {
588 rep->r_flags |= R_MUSTRESEND;
589 mbuf_freem(top);
590 return (0);
591 }
592 rep->r_flags &= ~R_MUSTRESEND;
593 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
594 if (rp == rep)
595 break;
596 if (rp)
597 xidqueued = rp->r_xid;
598 }
599 sock_gettype(so, NULL, &sotype, NULL);
600 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
601 (nam == 0))
602 sendnam = (struct sockaddr *)0;
603 else
604 sendnam = mbuf_data(nam);
605
606 if (sotype == SOCK_SEQPACKET)
607 flags = MSG_EOR;
608 else
609 flags = 0;
610
611 /*
612 * Save the name here in case mount point goes away if we block.
613 * The name is using local stack and is large, but don't
614 * want to block if we malloc.
615 */
616 if (rep)
617 strncpy(savenametolog,
618 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
619 MAXPATHLEN - 1);
620 bzero(&msg, sizeof(msg));
621 msg.msg_name = (caddr_t)sendnam;
622 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
623 error = sock_sendmbuf(so, &msg, top, flags, NULL);
624
625 if (error) {
626 if (rep) {
627 if (xidqueued) {
628 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
629 if (rp == rep && rp->r_xid == xidqueued)
630 break;
631 if (!rp)
632 panic("nfs_send: error %d xid %x gone",
633 error, xidqueued);
634 }
635 log(LOG_INFO, "nfs send error %d for server %s\n",
636 error, savenametolog);
637 /*
638 * Deal with errors for the client side.
639 */
640 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
641 if (error2) {
642 error = error2;
643 } else {
644 rep->r_flags |= R_MUSTRESEND;
645 }
646 } else
647 log(LOG_INFO, "nfsd send error %d\n", error);
648
649 /*
650 * Handle any recoverable (soft) socket errors here. (???)
651 */
652 if (error != EINTR && error != ERESTART && error != EIO &&
653 error != EWOULDBLOCK && error != EPIPE) {
654 error = 0;
655 }
656 }
657 return (error);
658 }
659
660 /*
661 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
662 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
663 * Mark and consolidate the data into a new mbuf list.
664 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
665 * small mbufs.
666 * For SOCK_STREAM we must be very careful to read an entire record once
667 * we have read any of it, even if the system call has been interrupted.
668 */
static int
nfs_receive(struct nfsreq *rep, mbuf_t *mp)
{
	socket_t so;
	struct iovec_32 aio;
	mbuf_t m, mlast;
	u_long len, fraglen;
	int error, error2, sotype;
	proc_t p = current_proc();	/* XXX */
	struct msghdr msg;
	size_t rcvlen;
	int lastfragment;

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
			nfs_sndunlock(rep);
			if (error)
				return (error);
			/* reply already arrived while we slept */
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			/* socket is gone: reconnect before receiving */
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(rep);
				return (error);
			}
			goto tryagain;
		}
		/* resend our request if it was marked R_MUSTRESEND */
		while (rep->r_flags & R_MUSTRESEND) {
			error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
			if (!error) {
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
				error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			}
			/*
			 * we also hold rcv lock so rep is still
			 * legit this point
			 */
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep))) {
					nfs_sndunlock(rep);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(rep);
		if (sotype == SOCK_STREAM) {
			error = 0;
			len = 0;
			lastfragment = 0;
			mlast = NULL;
			/* read RPC record fragments until the final one */
			while (!error && !lastfragment) {
				/* first read the 4-byte record mark */
				aio.iov_base = (uintptr_t) &fraglen;
				aio.iov_len = sizeof(u_long);
				bzero(&msg, sizeof(msg));
				msg.msg_iov = (struct iovec *) &aio;
				msg.msg_iovlen = 1;
				do {
					error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */
						goto shutout;
					/* retry timeouts unless the request was interrupted */
					if (error == EWOULDBLOCK && rep) {
						error2 = nfs_sigintr(rep->r_nmp, rep, p);
						if (error2)
							error = error2;
					}
				} while (error == EWOULDBLOCK);
				if (!error && rcvlen < aio.iov_len) {
					/* only log a message if we got a partial word */
					if (rcvlen != 0)
						log(LOG_INFO,
						     "short receive (%d/%d) from nfs server %s\n",
						     rcvlen, sizeof(u_long),
						     vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
				}
				if (error)
					goto errout;
				/* high bit of the record mark flags the last fragment */
				lastfragment = ntohl(fraglen) & 0x80000000;
				fraglen = ntohl(fraglen) & ~0x80000000;
				len += fraglen;
				/*
				 * This is SERIOUS! We are out of sync with the sender
				 * and forcing a disconnect/reconnect is all I can do.
				 */
				if (len > NFS_MAXPACKET) {
					log(LOG_ERR, "%s (%d) from nfs server %s\n",
					    "impossible RPC record length", len,
					    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EFBIG;
					goto errout;
				}

				/* now read the fragment data itself */
				m = NULL;
				do {
					rcvlen = fraglen;
					error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */ {
						goto shutout;
					}
				} while (error == EWOULDBLOCK || error == EINTR ||
					 error == ERESTART);

				if (!error && fraglen > rcvlen) {
					/* partial fragment: the stream is broken */
					log(LOG_INFO,
					    "short receive (%d/%d) from nfs server %s\n",
					    rcvlen, fraglen,
					    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
					mbuf_freem(m);
				}
				if (!error) {
					/* append this fragment to the reply chain */
					if (!*mp) {
						*mp = m;
						mlast = m;
					} else {
						error = mbuf_setnext(mlast, m);
						if (error) {
							printf("nfs_receive: mbuf_setnext failed %d\n", error);
							mbuf_freem(m);
						}
					}
					/* advance mlast to the tail of the chain */
					while (mbuf_next(mlast))
						mlast = mbuf_next(mlast);
				}
			}
		} else {
			/* connected non-stream socket (e.g. SOCK_SEQPACKET) */
			bzero(&msg, sizeof(msg));
			do {
				rcvlen = 100000000;
				error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
				if (!rep->r_nmp) /* if unmounted then bailout */ {
					goto shutout;
				}
				if (error == EWOULDBLOCK && rep) {
					error2 = nfs_sigintr(rep->r_nmp, rep, p);
					if (error2) {
						/* NOTE(review): returns directly, bypassing the
						 * shutout cleanup; presumably *mp is still NULL
						 * on this path — verify */
						return (error2);
					}
				}
			} while (error == EWOULDBLOCK);

			if ((msg.msg_flags & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len = rcvlen;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			/* unrecoverable stream error: force disconnect/reconnect
			 * and retry the whole receive */
			mbuf_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n", error,
				    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(rep);
			}
		}
	} else {
		/*
		 * We could have failed while rebinding the datagram socket
		 * so we need to attempt to rebind here.
		 */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				nfs_sndunlock(rep);
			}
			if (error)
				return (error);
			if (!rep->r_nmp) /* if unmounted then bailout */
				return (ENXIO);
			so = rep->r_nmp->nm_so;
		}
		bzero(&msg, sizeof(msg));
		len = 0;
		do {
			rcvlen = 1000000;
			error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
			if (!rep->r_nmp) /* if unmounted then bailout */
				goto shutout;
			if (error) {
				/* a signalled request wins over the socket error */
				error2 = nfs_sigintr(rep->r_nmp, rep, p);
				if (error2) {
					error = error2;
					goto shutout;
				}
			}
			/* Reconnect for all errors. We may be receiving
			 * soft/hard/blocking errors because of a network
			 * change.
			 * XXX: we should rate limit or delay this
			 * to once every N attempts or something.
			 * although TCP doesn't seem to.
			 */
			if (error) {
				error2 = nfs_sndlock(rep);
				if (!error2) {
					error2 = nfs_reconnect(rep);
					if (error2)
						error = error2;
					else if (!rep->r_nmp) /* if unmounted then bailout */
						error = ENXIO;
					else
						so = rep->r_nmp->nm_so;
					nfs_sndunlock(rep);
				} else {
					error = error2;
				}
			}
		} while (error == EWOULDBLOCK);
	}
shutout:
	/* on error, discard any partially-assembled reply */
	if (error) {
		mbuf_freem(*mp);
		*mp = NULL;
	}
	return (error);
}
925
926 /*
927 * Implement receipt of reply on a socket.
928 * We must search through the list of received datagrams matching them
929 * with outstanding requests using the xid, until ours is found.
930 */
931 /* ARGSUSED */
932 int
933 nfs_reply(myrep)
934 struct nfsreq *myrep;
935 {
936 struct nfsreq *rep;
937 struct nfsmount *nmp = myrep->r_nmp;
938 long t1;
939 mbuf_t mrep, md;
940 u_long rxid, *tl;
941 caddr_t dpos, cp2;
942 int error;
943
944 /*
945 * Loop around until we get our own reply
946 */
947 for (;;) {
948 /*
949 * Lock against other receivers so that I don't get stuck in
950 * sbwait() after someone else has received my reply for me.
951 * Also necessary for connection based protocols to avoid
952 * race conditions during a reconnect.
953 * If nfs_rcvlock() returns EALREADY, that means that
954 * the reply has already been recieved by another
955 * process and we can return immediately. In this
956 * case, the lock is not taken to avoid races with
957 * other processes.
958 */
959 error = nfs_rcvlock(myrep);
960 if (error == EALREADY)
961 return (0);
962 if (error)
963 return (error);
964
965 /*
966 * If we slept after putting bits otw, then reply may have
967 * arrived. In which case returning is required, or we
968 * would hang trying to nfs_receive an already received reply.
969 */
970 if (myrep->r_mrep != NULL) {
971 nfs_rcvunlock(myrep);
972 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
973 return (0);
974 }
975 /*
976 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
977 * is still intact by checks done in nfs_rcvlock.
978 */
979 error = nfs_receive(myrep, &mrep);
980 /*
981 * Bailout asap if nfsmount struct gone (unmounted).
982 */
983 if (!myrep->r_nmp) {
984 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
985 if (mrep)
986 mbuf_freem(mrep);
987 return (ENXIO);
988 }
989 if (error) {
990 FSDBG(530, myrep->r_xid, myrep, nmp, error);
991 nfs_rcvunlock(myrep);
992
993 /* Bailout asap if nfsmount struct gone (unmounted). */
994 if (!myrep->r_nmp) {
995 if (mrep)
996 mbuf_freem(mrep);
997 return (ENXIO);
998 }
999
1000 /*
1001 * Ignore routing errors on connectionless protocols??
1002 */
1003 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1004 if (nmp->nm_so) {
1005 int clearerror;
1006 int optlen = sizeof(clearerror);
1007 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1008 }
1009 continue;
1010 }
1011 if (mrep)
1012 mbuf_freem(mrep);
1013 return (error);
1014 }
1015
1016 /*
1017 * We assume all is fine, but if we did not have an error
1018 * and mrep is 0, better not dereference it. nfs_receive
1019 * calls soreceive which carefully sets error=0 when it got
1020 * errors on sbwait (tsleep). In most cases, I assume that's
1021 * so we could go back again. In tcp case, EPIPE is returned.
1022 * In udp, case nfs_receive gets back here with no error and no
1023 * mrep. Is the right fix to have soreceive check for process
1024 * aborted after sbwait and return something non-zero? Should
1025 * nfs_receive give an EPIPE? Too risky to play with those
1026 * two this late in game for a shutdown problem. Instead,
1027 * just check here and get out. (ekn)
1028 */
1029 if (!mrep) {
1030 nfs_rcvunlock(myrep);
1031 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1032 return (ENXIO); /* sounds good */
1033 }
1034
1035 /*
1036 * Get the xid and check that it is an rpc reply
1037 */
1038 md = mrep;
1039 dpos = mbuf_data(md);
1040 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1041 rxid = *tl++;
1042 if (*tl != rpc_reply) {
1043 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1044 mbuf_freem(mrep);
1045 nfsmout:
1046 if (nmp->nm_state & NFSSTA_RCVLOCK)
1047 nfs_rcvunlock(myrep);
1048 continue;
1049 }
1050
1051 /*
1052 * Loop through the request list to match up the reply
1053 * Iff no match, just drop the datagram
1054 */
1055 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1056 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1057 /* Found it.. */
1058 rep->r_mrep = mrep;
1059 rep->r_md = md;
1060 rep->r_dpos = dpos;
1061 /*
1062 * If we're tracking the round trip time
1063 * then we update the circular log here
1064 * with the stats from our current request.
1065 */
1066 if (nfsrtton) {
1067 struct rttl *rt;
1068
1069 rt = &nfsrtt.rttl[nfsrtt.pos];
1070 rt->proc = rep->r_procnum;
1071 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1072 rt->sent = nmp->nm_sent;
1073 rt->cwnd = nmp->nm_cwnd;
1074 if (proct[rep->r_procnum] == 0)
1075 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1076 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1077 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1078 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1079 microtime(&rt->tstamp); // XXX unused
1080 if (rep->r_flags & R_TIMING)
1081 rt->rtt = rep->r_rtt;
1082 else
1083 rt->rtt = 1000000;
1084 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1085 }
1086 /*
1087 * Update congestion window.
1088 * Do the additive increase of
1089 * one rpc/rtt.
1090 */
1091 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1092 nmp->nm_cwnd);
1093 if (nmp->nm_cwnd <= nmp->nm_sent) {
1094 nmp->nm_cwnd +=
1095 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1096 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1097 if (nmp->nm_cwnd > NFS_MAXCWND)
1098 nmp->nm_cwnd = NFS_MAXCWND;
1099 }
1100 if (rep->r_flags & R_SENT) {
1101 rep->r_flags &= ~R_SENT;
1102 nmp->nm_sent -= NFS_CWNDSCALE;
1103 }
1104 /*
1105 * Update rtt using a gain of 0.125 on the mean
1106 * and a gain of 0.25 on the deviation.
1107 */
1108 if (rep->r_flags & R_TIMING) {
1109 /*
1110 * Since the timer resolution of
1111 * NFS_HZ is so course, it can often
1112 * result in r_rtt == 0. Since
1113 * r_rtt == N means that the actual
1114 * rtt is between N+dt and N+2-dt ticks,
1115 * add 1.
1116 */
1117 if (proct[rep->r_procnum] == 0)
1118 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1119 t1 = rep->r_rtt + 1;
1120 t1 -= (NFS_SRTT(rep) >> 3);
1121 NFS_SRTT(rep) += t1;
1122 if (t1 < 0)
1123 t1 = -t1;
1124 t1 -= (NFS_SDRTT(rep) >> 2);
1125 NFS_SDRTT(rep) += t1;
1126 }
1127 nmp->nm_timeouts = 0;
1128 break;
1129 }
1130 }
1131 nfs_rcvunlock(myrep);
1132 /*
1133 * If not matched to a request, drop it.
1134 * If it's mine, get out.
1135 */
1136 if (rep == 0) {
1137 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1138 mbuf_freem(mrep);
1139 } else if (rep == myrep) {
1140 if (rep->r_mrep == NULL)
1141 panic("nfs_reply: nil r_mrep");
1142 return (0);
1143 }
1144 FSDBG(530, myrep->r_xid, myrep, rep,
1145 rep ? rep->r_xid : myrep->r_flags);
1146 }
1147 }
1148
1149 /*
1150 * nfs_request - goes something like this
1151 * - fill in request struct
1152 * - links it into list
1153 * - calls nfs_send() for first transmit
1154 * - calls nfs_receive() to get reply
1155 * - break down rpc header and return with nfs reply pointed to
1156 * by mrep or error
1157 * nb: always frees up mreq mbuf list
1158 */
int
nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
	vnode_t vp;		/* vnode the request is for (may be NULL if mp given) */
	mount_t mp;		/* mount point, used only when vp is NULL */
	mbuf_t mrest;		/* request body mbuf chain; always consumed */
	int procnum;		/* NFS procedure number */
	proc_t procp;		/* requesting process, for signal/tprintf checks */
	kauth_cred_t cred;	/* credentials used to build the RPC auth header */
	mbuf_t *mrp;		/* OUT: reply mbuf chain (on success) */
	mbuf_t *mdp;		/* OUT: current mbuf of the reply parse position */
	caddr_t *dposp;		/* OUT: current byte offset of the reply parse */
	u_int64_t *xidp;	/* OUT: 64-bit xid (low 32 = xid, high 32 = wrap count) */
{
	mbuf_t m, mrep, m2;
	struct nfsreq re, *rep;	/* request record lives on this stack frame */
	u_long *tl;
	int i;
	struct nfsmount *nmp;
	mbuf_t md, mheadend;
	char nickv[RPCX_NICKVERF];
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
	int verf_len, verf_type;
	u_long xid;
	char *auth_str, *verf_str;
	NFSKERBKEY_T key;		/* save session key */
	int nmsotype;
	struct timeval now;

	if (mrp)
		*mrp = NULL;
	if (xidp)
		*xidp = 0;
	nmp = VFSTONFS(mp);

	rep = &re;

	/* Prefer the vnode's mount; bail if unmounted or force-unmount timed out. */
	if (vp)
		nmp = VFSTONFS(vnode_mount(vp));
	if (nmp == NULL ||
	    (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		mbuf_freem(mrest);
		return (ENXIO);
	}
	nmsotype = nmp->nm_sotype;

	FSDBG_TOP(531, vp, procnum, nmp, rep);

	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;
	microuptime(&now);
	/* Backdate r_lastmsg so the first "not responding" tprintf uses the initial delay. */
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	/* Total up the length of the request body for the RPC header. */
	i = 0;
	m = mrest;
	while (m) {
		i += mbuf_len(m);
		m = mbuf_next(m);
	}
	mrest_len = i;

	/*
	 * Get the RPC header with authorization.
	 * Note: we re-fetch nmp after any call that may sleep, since a
	 * forced unmount can clear it out from under us.
	 */
kerbauth:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (!nmp) {
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		mbuf_freem(mrest);
		return (ENXIO);
	}
	verf_str = auth_str = (char *)0;
	if (nmp->nm_flag & NFSMNT_KERB) {
		/* Kerberos: try the cached nickname auth first, fall back to full auth. */
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)key, sizeof (key));
		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
			&auth_len, verf_str, verf_len)) {
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!nmp) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (ENXIO);
			}
			error = nfs_getauth(nmp, rep, cred, &auth_str,
			    &auth_len, verf_str, &verf_len, key);
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!error && !nmp)
				error = ENXIO;
			if (error) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (error);
			}
		}
	} else {
		/* AUTH_UNIX: size the cred, clipping the group list at nm_numgrps. */
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	/* Prepend the RPC header; mrest is linked into m and owned by it from here. */
	error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
	if (auth_str)
		_FREE(auth_str, M_TEMP);
	if (error) {
		mbuf_freem(mrest);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}
	if (xidp)
		*xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 * High bit marks "last fragment"; low 31 bits are the length.
	 */
	if (nmsotype == SOCK_STREAM) {
		error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
		if (error) {
			mbuf_freem(m);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}
		*((u_long*)mbuf_data(m)) =
			htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	/* (Re)initialize per-attempt state; NFSERR_TRYLATER loops back here. */
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
		   (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		   nmp->nm_sent < nmp->nm_cwnd)) {
		int connrequired = (nmp->nm_sotype == SOCK_STREAM);

		if (connrequired)
			error = nfs_sndlock(rep);

		/*
		 * Set the R_SENT before doing the send in case another thread
		 * processes the reply before the nfs_send returns here
		 */
		if (!error) {
			if ((rep->r_flags & R_MUSTRESEND) == 0) {
				FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
				      nmp->nm_cwnd);
				nmp->nm_sent += NFS_CWNDSCALE;
				rep->r_flags |= R_SENT;
			}

			/* Send a copy; r_mreq is kept for retransmission by the timer. */
			error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
			if (!error)
				error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (connrequired)
				nfs_sndunlock(rep);
		}
		nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
		if (error) {
			/* Send failed: undo the congestion-window charge. */
			if (nmp)
				nmp->nm_sent -= NFS_CWNDSCALE;
			rep->r_flags &= ~R_SENT;
		}
	} else {
		/* Deferred to the timer; r_rtt = -1 means "not yet sent". */
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	nfs_repdequeue(rep);

	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		if (nmp) {
			FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
			nmp->nm_sent -= NFS_CWNDSCALE;
		}
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error)
		nfs_up(nmp, procp, NFSSTA_TIMEO,
		       (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (!error && !nmp)
		error = ENXIO;
	if (error) {
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 * (nfsm_dissect/nfsm_adv advance md/dpos and goto nfsmout on error)
	 */
	nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
			/* One free retry with fresh Kerberos credentials. */
			if (!failed_auth) {
				failed_auth++;
				/* Detach the old auth header before rebuilding. */
				error = mbuf_setnext(mheadend, NULL);
				mbuf_freem(mrep);
				mbuf_freem(rep->r_mreq);
				if (!error)
					goto kerbauth;
				printf("nfs_request: mbuf_setnext failed\n");
			} else
				error = EAUTH;
		} else
			error = EACCES;
		mbuf_freem(mrep);
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * Grab any Kerberos verifier, otherwise just throw it away.
	 */
	verf_type = fxdr_unsigned(int, *tl++);
	i = fxdr_unsigned(int, *tl);
	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
		if (error)
			goto nfsmout;
	} else if (i > 0)
		nfsm_adv(nfsm_rndup(i));
	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			/* NFSv3 JUKEBOX: back off (doubling, capped at 60s) and resend. */
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
				error == NFSERR_TRYLATER) {
				mbuf_freem(mrep);
				error = 0;
				microuptime(&now);
				waituntil = now.tv_sec + trylater_delay;
				while (now.tv_sec < waituntil) {
					tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
					microuptime(&now);
				}
				trylater_delay *= 2;
				if (trylater_delay > 60)
					trylater_delay = 60;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if ((error == ESTALE) && vp)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				/* V3: hand the reply back so the caller can parse post-op data. */
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else {
				mbuf_freem(mrep);
				error &= ~NFSERR_RETERR;
			}
			mbuf_freem(rep->r_mreq);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}

		/* Success: transfer reply ownership to the caller. */
		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
		return (0);
	}
	mbuf_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	mbuf_freem(rep->r_mreq);
	FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
	return (error);
}
1494
1495 #ifndef NFS_NOSERVER
1496 /*
1497 * Generate the rpc reply header
1498 * siz arg. is used to decide if adding a cluster is worthwhile
1499 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
	int siz;			/* estimated reply payload size */
	struct nfsrv_descript *nd;	/* request descriptor (xid, flags, cred) */
	struct nfssvc_sock *slp;	/* server socket, for the Kerberos uid hash */
	int err;			/* RPC/NFS error to encode in the reply */
	mbuf_t *mrq;			/* OUT: head of the reply chain (may be NULL) */
	mbuf_t *mbp;			/* OUT: current mbuf for further appends */
	caddr_t *bposp;			/* OUT: current build position in *mbp */
{
	u_long *tl;
	mbuf_t mreq;
	caddr_t bpos;
	mbuf_t mb, mb2;			/* mb/mb2/bpos are used by the nfsm_build macro */
	int error, mlen;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mreq);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX nfsstat? */
		return (error);
	}
	mb = mreq;
	tl = mbuf_data(mreq);
	mlen = 6 * NFSX_UNSIGNED;
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl += 80/sizeof(*tl);	/* XXX max_hdr? XXX */
		mbuf_setdata(mreq, tl, mlen);
	} else {
		mbuf_setlen(mreq, mlen);
	}
	bpos = ((caddr_t)tl) + mlen;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		/* MSG_DENIED reply: either auth failure or RPC version mismatch. */
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			/* This reply is one word shorter than the pre-set length. */
			mlen -= NFSX_UNSIGNED;
			mbuf_setlen(mreq, mlen);
			bpos -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			/* low/high supported RPC versions (both 2) */
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/*
		 * For Kerberos authentication, we must send the nickname
		 * verifier back, otherwise just RPCAUTH_NULL.
		 */
		if (nd->nd_flag & ND_KERBFULL) {
			struct nfsuid *nuidp;
			struct timeval ktvin, ktvout;
			uid_t uid = kauth_cred_getuid(nd->nd_cr);

			/* Look up the cached session for this uid (and peer address). */
			lck_rw_lock_shared(&slp->ns_rwlock);
			for (nuidp = NUIDHASH(slp, uid)->lh_first;
			    nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
				    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
				    &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (nuidp) {
				ktvin.tv_sec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
				ktvin.tv_usec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_usec);

				/*
				 * Encrypt the timestamp in ecb mode using the
				 * session key.
				 */
#if NFSKERB
				XXX
#endif

				*tl++ = rpc_auth_kerb;
				*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
				/*
				 * NOTE(review): tv_sec is written into the last
				 * pre-set word *before* nfsm_build switches tl to
				 * freshly appended space; this matches the
				 * historical BSD layout, but verify the ordering
				 * against the wire format if touching this code.
				 */
				*tl = ktvout.tv_sec;
				nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
				*tl++ = ktvout.tv_usec;
				*tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
			} else {
				/* No cached session: AUTH_NULL verifier. */
				*tl++ = 0;
				*tl++ = 0;
			}
			lck_rw_done(&slp->ns_rwlock);
		} else {
			/* AUTH_NULL verifier (flavor 0, length 0). */
			*tl++ = 0;
			*tl++ = 0;
		}
		/* Encode the accept status word (and any status-specific data). */
		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
			// XXX hard coded versions
			*tl++ = txdr_unsigned(2);
			*tl = txdr_unsigned(3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;	/* RPC_SUCCESS */
			if (err != NFSERR_RETVOID) {
				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	if (mrq != NULL)
		*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != NFSERR_RETVOID) {
		OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
	}
	return (0);
}
1645
1646
1647 #endif /* NFS_NOSERVER */
1648
1649
1650 /*
1651 * From FreeBSD 1.58, a Matt Dillon fix...
1652 * Flag a request as being about to terminate.
1653 * The nm_sent count is decremented now to avoid deadlocks when the process
1654 * in soreceive() hasn't yet managed to send its own request.
1655 */
1656 static void
1657 nfs_softterm(struct nfsreq *rep)
1658 {
1659
1660 rep->r_flags |= R_SOFTTERM;
1661 if (rep->r_flags & R_SENT) {
1662 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1663 rep->r_nmp->nm_cwnd);
1664 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1665 rep->r_flags &= ~R_SENT;
1666 }
1667 }
1668
/*
 * Funnel wrapper for nfs_timer(): acquire the kernel funnel for the
 * duration of the timer scan and release it before returning, since
 * the callout that invokes us does not hold it.
 */
void
nfs_timer_funnel(void * arg)
{
	(void) thread_funnel_set(kernel_flock, TRUE);
	nfs_timer(arg);
	(void) thread_funnel_set(kernel_flock, FALSE);

}
1677
1678 /*
1679 * Ensure rep isn't in use by the timer, then dequeue it.
1680 */
1681 static void
1682 nfs_repdequeue(struct nfsreq *rep)
1683 {
1684
1685 while ((rep->r_flags & R_BUSY)) {
1686 rep->r_flags |= R_WAITING;
1687 tsleep(rep, PSOCK, "repdeq", 0);
1688 }
1689 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1690 }
1691
1692 /*
1693 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1694 * free()'d out from under it.
1695 */
1696 static void
1697 nfs_repbusy(struct nfsreq *rep)
1698 {
1699
1700 if ((rep->r_flags & R_BUSY))
1701 panic("rep locked");
1702 rep->r_flags |= R_BUSY;
1703 }
1704
1705 /*
1706 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1707 */
1708 static struct nfsreq *
1709 nfs_repnext(struct nfsreq *rep)
1710 {
1711 struct nfsreq * nextrep;
1712
1713 if (rep == NULL)
1714 return (NULL);
1715 /*
1716 * We need to get and busy the next req before signalling the
1717 * current one, otherwise wakeup() may block us and we'll race to
1718 * grab the next req.
1719 */
1720 nextrep = TAILQ_NEXT(rep, r_chain);
1721 if (nextrep != NULL)
1722 nfs_repbusy(nextrep);
1723 /* unbusy and signal. */
1724 rep->r_flags &= ~R_BUSY;
1725 if ((rep->r_flags & R_WAITING)) {
1726 rep->r_flags &= ~R_WAITING;
1727 wakeup(rep);
1728 }
1729 return (nextrep);
1730 }
1731
1732 /*
1733 * Nfs timer routine
1734 * Scan the nfsreq list and retranmit any requests that have timed out
1735 * To avoid retransmission attempts on STREAM sockets (in the future) make
1736 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1737 */
void
nfs_timer(__unused void *arg)
{
	struct nfsreq *rep;
	mbuf_t m;
	socket_t so;
	struct nfsmount *nmp;
	int timeo;
	int error;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */
	int flags, rexmit, cwnd, sent;
	u_long xid;
	struct timeval now;

	/* Walk the request queue hand-over-hand (busy current, then advance). */
	rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);
	microuptime(&now);
	for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
		nmp = rep->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		/* skip requests that are answered or already terminating */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp))
			continue;
		/* Rate-limited "server not responding" console message. */
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
				"not responding");
			rep->r_flags |= R_TPRINTFMSG;
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(rep);
				continue;
			}
		}
		/* r_rtt >= 0 means the request is in flight; check its timeout. */
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			/* ensure 62.5 ms floor */
			while (16 * timeo < hz)
				timeo *= 2;
			/* exponential backoff per accumulated mount timeouts */
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < 8)
				nmp->nm_timeouts++;
		}
		/*
		 * Check for too many retransmits. This is never true for
		 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
		 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
		 */
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
			nfs_softterm(rep);
			continue;
		}
		/* Stream sockets: never retransmit here, just bump the counter. */
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}
		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
			struct msghdr msg;
			/*
			 * Iff first send, start timing
			 * else turn timing off, backoff timer
			 * and divide congestion window by 2.
			 * We update these *before* the send to avoid
			 * racing against receiving the reply.
			 * We save them so we can restore them on send error.
			 */
			flags = rep->r_flags;
			rexmit = rep->r_rexmit;
			cwnd = nmp->nm_cwnd;
			sent = nmp->nm_sent;
			xid = rep->r_xid;
			if (rep->r_flags & R_SENT) {
				/* retransmission: stop timing, halve the window */
				rep->r_flags &= ~R_TIMING;
				if (++rep->r_rexmit > NFS_MAXREXMIT)
					rep->r_rexmit = NFS_MAXREXMIT;
				nmp->nm_cwnd >>= 1;
				if (nmp->nm_cwnd < NFS_CWNDSCALE)
					nmp->nm_cwnd = NFS_CWNDSCALE;
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
			} else {
				/* first (deferred) send: charge the window now */
				rep->r_flags |= R_SENT;
				nmp->nm_sent += NFS_CWNDSCALE;
			}
			FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);

			bzero(&msg, sizeof(msg));
			/* unconnected socket: supply the destination address */
			if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
				msg.msg_name = mbuf_data(nmp->nm_nam);
				msg.msg_namelen = mbuf_len(nmp->nm_nam);
			}
			error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);

			FSDBG(535, xid, error, sent, cwnd);

			if (error) {
				/* Send failed: restore the saved accounting. */
				if (error == EWOULDBLOCK) {
					rep->r_flags = flags;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					rep->r_xid = xid;
				}
				else {
					/* clear a transient socket error if ignorable */
					if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
						int clearerror;
						int optlen = sizeof(clearerror);
						sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
					}
					rep->r_flags = flags | R_RESENDERR;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					if (flags & R_SENT)
						OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
				}
			} else
				rep->r_rtt = 0;	/* restart timing from this send */
		}
	}
	microuptime(&now);
#ifndef NFS_NOSERVER
	/*
	 * Scan the write gathering queues for writes that need to be
	 * completed now.
	 */
	cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
	lck_mtx_lock(nfsd_mutex);
	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
		if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
			nfsrv_wakenfsd(slp);
	}
	/* Reap server sockets that have been dead for more than 5 seconds. */
	while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
		if ((slp->ns_timestamp + 5) > now.tv_sec)
			break;
		TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
		nfsrv_slpfree(slp);
	}
	lck_mtx_unlock(nfsd_mutex);
#endif /* NFS_NOSERVER */

	if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
		/*
		 * We haven't called nfs_buf_freeup() in a little while.
		 * So, see if we can free up any stale/unused bufs now.
		 */
		nfs_buf_freeup(1);
	}

	/* re-arm ourselves */
	timeout(nfs_timer_funnel, (void *)0, nfs_ticks);

}
1920
1921
1922 /*
1923 * Test for a termination condition pending on the process.
1924 * This is used to determine if we need to bail on a mount.
1925 * EIO is returned if there has been a soft timeout.
1926 * EINTR is returned if there is a signal pending that is not being ignored
1927 * and the mount is interruptable, or if we are a thread that is in the process
1928 * of cancellation (also SIGKILL posted).
1929 */
int
nfs_sigintr(nmp, rep, p)
	struct nfsmount *nmp;	/* mount to check (required) */
	struct nfsreq *rep;	/* outstanding request, or NULL */
	proc_t p;		/* process to check signals for, or NULL */
{
	sigset_t pending_sigs;
	int context_good = 0;
	struct nfsmount *repnmp;
	extern proc_t kernproc;

	if (nmp == NULL)
		return (ENXIO);
	if (rep != NULL) {
		repnmp = rep->r_nmp;
		/* we've had a forced unmount. */
		if (repnmp == NULL)
			return (ENXIO);
		/* request has timed out on a 'soft' mount. */
		if (rep->r_flags & R_SOFTTERM)
			return (EIO);
		/*
		 * We're in the progress of a force unmount and there's
		 * been a timeout we're dead and fail IO.
		 */
		if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
		    (NFSSTA_FORCE|NFSSTA_TIMEO))
			return (EIO);
		/* Someone is unmounting us, go soft and mark it. */
		if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
			/*
			 * NOTE(review): NFSMNT_SOFT is set on the request's
			 * mount (repnmp) while NFSSTA_FORCE is set on the
			 * caller-supplied nmp; these are normally the same
			 * mount, but confirm before relying on it.
			 */
			repnmp->nm_flag |= NFSMNT_SOFT;
			nmp->nm_state |= NFSSTA_FORCE;
		}
		/*
		 * If the mount is hung and we've requested not to hang
		 * on remote filesystems, then bail now.
		 */
		if (p != NULL && (proc_noremotehang(p)) != 0 &&
		    (repnmp->nm_state & NFSSTA_TIMEO) != 0)
			return (EIO);
	}
	/* XXX: is this valid? this probably should be an assertion. */
	if (p == NULL)
		return (0);

	/* If this thread belongs to the kernel task, the abort check is not needed. */
	if ((current_proc() != kernproc) && current_thread_aborted()) {
		return (EINTR);
	}
	/* mask off thread and process blocked signals. */

	pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
	if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
		return (EINTR);
	return (0);
}
1986
1987 /*
1988 * Lock a socket against others.
1989 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1990 * and also to avoid race conditions between the processes with nfs requests
1991 * in progress when a reconnect is necessary.
1992 */
int
nfs_sndlock(rep)
	struct nfsreq *rep;	/* request whose mount's send lock to take */
{
	int *statep;
	proc_t p;
	int error, slpflag = 0, slptimeo = 0;

	if (rep->r_nmp == NULL)
		return (ENXIO);
	statep = &rep->r_nmp->nm_state;

	p = rep->r_procp;
	/* interruptible mounts allow signals to break the wait */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	while (*statep & NFSSTA_SNDLOCK) {
		error = nfs_sigintr(rep->r_nmp, rep, p);
		if (error)
			return (error);
		*statep |= NFSSTA_WANTSND;
		/* poll rather than block forever if P_NOREMOTEHANG is set */
		if (p != NULL && (proc_noremotehang(p)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		/* after one interruptible sleep, fall back to timed polling */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and callers expect it intact.
		 */
		if (!rep->r_nmp)
			return (ENXIO); /* don't have lock until out of loop */
	}
	*statep |= NFSSTA_SNDLOCK;
	return (0);
}
2030
2031 /*
2032 * Unlock the stream socket for others.
2033 */
2034 void
2035 nfs_sndunlock(rep)
2036 struct nfsreq *rep;
2037 {
2038 int *statep;
2039
2040 if (rep->r_nmp == NULL)
2041 return;
2042 statep = &rep->r_nmp->nm_state;
2043 if ((*statep & NFSSTA_SNDLOCK) == 0)
2044 panic("nfs sndunlock");
2045 *statep &= ~NFSSTA_SNDLOCK;
2046 if (*statep & NFSSTA_WANTSND) {
2047 *statep &= ~NFSSTA_WANTSND;
2048 wakeup((caddr_t)statep);
2049 }
2050 }
2051
static int
nfs_rcvlock(struct nfsreq *rep)
{
	int *statep;
	int error, slpflag, slptimeo = 0;

	/* make sure we still have our mountpoint */
	if (!rep->r_nmp) {
		/* reply may already have been matched before the unmount */
		if (rep->r_mrep != NULL)
			return (EALREADY);
		return (ENXIO);
	}

	statep = &rep->r_nmp->nm_state;
	FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
	/* interruptible mounts allow signals to break the wait */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;
	while (*statep & NFSSTA_RCVLOCK) {
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
			return (error);
		} else if (rep->r_mrep != NULL) {
			/*
			 * Don't bother sleeping if reply already arrived
			 */
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
			return (EALREADY);
		}
		FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
		*statep |= NFSSTA_WANTRCV;
		/*
		 * We need to poll if we're P_NOREMOTEHANG so that we
		 * call nfs_sigintr periodically above.
		 */
		if (rep->r_procp != NULL &&
		    (proc_noremotehang(rep->r_procp)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		/* after one interruptible sleep, fall back to timed polling */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and caller nfs_reply expect it intact.
		 */
		if (!rep->r_nmp) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
			return (ENXIO); /* don't have lock until out of loop */
		}
	}
	/*
	 * nfs_reply will handle it if reply already arrived.
	 * (We may have slept or been preempted).
	 */
	FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
	*statep |= NFSSTA_RCVLOCK;
	return (0);
}
2113
2114 /*
2115 * Unlock the stream socket for others.
2116 */
2117 static void
2118 nfs_rcvunlock(struct nfsreq *rep)
2119 {
2120 int *statep;
2121
2122 if (rep->r_nmp == NULL)
2123 return;
2124 statep = &rep->r_nmp->nm_state;
2125
2126 FSDBG(533, statep, *statep, 0, 0);
2127 if ((*statep & NFSSTA_RCVLOCK) == 0)
2128 panic("nfs rcvunlock");
2129 *statep &= ~NFSSTA_RCVLOCK;
2130 if (*statep & NFSSTA_WANTRCV) {
2131 *statep &= ~NFSSTA_WANTRCV;
2132 wakeup((caddr_t)statep);
2133 }
2134 }
2135
2136
2137 #ifndef NFS_NOSERVER
2138 /*
2139 * Socket upcall routine for the nfsd sockets.
2140 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2141 * Essentially do as much as possible non-blocking, else punt and it will
2142 * be called with MBUF_WAITOK from an nfsd.
2143 */
2144 void
2145 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2146 {
2147 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2148
2149 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2150 return;
2151
2152 lck_rw_lock_exclusive(&slp->ns_rwlock);
2153 nfsrv_rcv_locked(so, slp, waitflag);
2154 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2155 }
/*
 * Receive processing for an nfsd socket; ns_rwlock must be held
 * exclusive on entry.  Drains available data from the socket and queues
 * complete RPC request records on slp->ns_rec for nfsd threads.
 *
 * For stream (TCP) sockets, raw bytes are appended to the ns_raw chain
 * and nfsrv_getstream() carves records out of them.  For datagram
 * sockets, each received mbuf chain is one record, prefixed with an
 * MBUF_TYPE_SONAME mbuf holding the sender's address.
 *
 * Lock contract: when called with waitflag == MBUF_DONTWAIT (from the
 * socket upcall), ns_rwlock is DROPPED before returning and, if work
 * was queued, a sleeping nfsd is woken.  With MBUF_WAITOK the lock is
 * left held for the caller.
 */
void
nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
{
	mbuf_t m, mp, mhck, m2;
	int ns_flag=0, error;
	struct msghdr msg;
	size_t bytes_read;

	/* Socket is being torn down; just honor the lock-drop contract. */
	if ((slp->ns_flag & SLP_VALID) == 0) {
		if (waitflag == MBUF_DONTWAIT)
			lck_rw_done(&slp->ns_rwlock);
		return;
	}

#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == MBUF_DONTWAIT) {
		ns_flag = SLP_NEEDQ;
		goto dorecs;
	}
#endif
	if (slp->ns_sotype == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
			ns_flag = SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().  The huge bytes_read value just means
		 * "take everything currently available".
		 */
		bytes_read = 1000000000;
		error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				ns_flag = SLP_NEEDQ;	/* nothing ready yet; retry later */
			else
				ns_flag = SLP_DISCONN;	/* real error: treat as disconnect */
			goto dorecs;
		}
		m = mp;
		/* Append the new data to the raw byte chain (ns_raw/ns_rawend). */
		if (slp->ns_rawend) {
			if ((error = mbuf_setnext(slp->ns_rawend, m)))
				panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
			slp->ns_cc += bytes_read;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = bytes_read;
		}
		/* Walk to the last mbuf so ns_rawend stays accurate. */
		while ((m2 = mbuf_next(m)))
			m = m2;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				ns_flag = SLP_DISCONN;	/* bogus record mark: drop connection */
			else
				ns_flag = SLP_NEEDQ;
		}
	} else {
		/* Datagram socket: each receive is one complete record. */
		struct sockaddr_storage nam;

		bzero(&msg, sizeof(msg));
		msg.msg_name = (caddr_t)&nam;
		msg.msg_namelen = sizeof(nam);

		do {
			bytes_read = 1000000000;
			error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
			if (mp) {
				/*
				 * Prepend an MBUF_TYPE_SONAME mbuf carrying the
				 * sender's address; if that fails, queue the
				 * record without it.
				 */
				if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
					mbuf_setlen(mhck, nam.ss_len);
					bcopy(&nam, mbuf_data(mhck), nam.ss_len);
					m = mhck;
					if (mbuf_setnext(m, mp)) {
						/* trouble... just drop it */
						printf("nfsrv_rcv: mbuf_setnext failed\n");
						mbuf_free(mhck);
						m = mp;
					}
				} else {
					m = mp;
				}
				/* Append the record to the ns_rec packet queue. */
				if (slp->ns_recend)
					mbuf_setnextpkt(slp->ns_recend, m);
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				mbuf_setnextpkt(m, NULL);
			}
#if 0
			if (error) {
				/*
				 * This may be needed in the future to support
				 * non-byte-stream connection-oriented protocols
				 * such as SCTP.
				 */
				/*
				 * This (slp->ns_sotype == SOCK_STREAM) should really
				 * be a check for PR_CONNREQUIRED.
				 */
				if ((slp->ns_sotype == SOCK_STREAM)
					&& error != EWOULDBLOCK) {
					ns_flag = SLP_DISCONN;
					goto dorecs;
				}
			}
#endif
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (ns_flag)
		slp->ns_flag |= ns_flag;
	if (waitflag == MBUF_DONTWAIT) {
		int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
		lck_rw_done(&slp->ns_rwlock);
		if (wake && nfs_numnfsd) {
			/* Work was queued: wake an nfsd to service it. */
			lck_mtx_lock(nfsd_mutex);
			nfsrv_wakenfsd(slp);
			lck_mtx_unlock(nfsd_mutex);
		}
	}
}
2293
/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 *
 * Consumes bytes from the raw chain (slp->ns_raw / ns_cc), using the
 * RPC record mark (NFSX_UNSIGNED bytes, network order, high bit set on
 * the last fragment) to find record boundaries, and accumulates
 * fragments on slp->ns_frag until a complete record can be appended to
 * the ns_rec packet queue.
 *
 * Returns 0 when no more complete records are available, EPERM on an
 * insane record length (caller treats this as a disconnect), or
 * EWOULDBLOCK if an mbuf operation failed.
 */
static int
nfsrv_getstream(slp, waitflag)
	struct nfssvc_sock *slp;
	int waitflag;
{
	mbuf_t m;
	char *cp1, *cp2, *mdata;
	int len, mlen, error;
	mbuf_t om, m2, recm;
	u_long recmark;

	/* SLP_GETSTREAM serializes this parser per socket. */
	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			/* Need a new record mark; wait until enough bytes arrive. */
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			mdata = mbuf_data(m);
			mlen = mbuf_len(m);
			if (mlen >= NFSX_UNSIGNED) {
				/* Mark is contained entirely in the first mbuf. */
				bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
				mdata += NFSX_UNSIGNED;
				mlen -= NFSX_UNSIGNED;
				mbuf_setdata(m, mdata, mlen);
			} else {
				/* Mark straddles mbufs; gather it a byte at a time. */
				cp1 = (caddr_t)&recmark;
				cp2 = mdata;
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (mlen == 0) {
						m = mbuf_next(m);
						cp2 = mbuf_data(m);
						mlen = mbuf_len(m);
					}
					*cp1++ = *cp2++;
					mlen--;
					mbuf_setdata(m, cp2, mlen);
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			/* High bit flags the last fragment; the rest is the length. */
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Raw chain is exactly one record: take it whole. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/* Split the record off the front of the raw chain. */
			len = 0;
			m = slp->ns_raw;
			mlen = mbuf_len(m);
			mdata = mbuf_data(m);
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + mlen) > slp->ns_reclen) {
					/* Record ends mid-mbuf: copy the head, keep the tail raw. */
					if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					if (om) {
						if (mbuf_setnext(om, m2)) {
							/* trouble... just drop it */
							printf("nfsrv_getstream: mbuf_setnext failed\n");
							mbuf_freem(m2);
							slp->ns_flag &= ~SLP_GETSTREAM;
							return (EWOULDBLOCK);
						}
						recm = slp->ns_raw;
					} else {
						recm = m2;
					}
					mdata += slp->ns_reclen - len;
					mlen -= slp->ns_reclen - len;
					mbuf_setdata(m, mdata, mlen);
					len = slp->ns_reclen;
				} else if ((len + mlen) == slp->ns_reclen) {
					/* Record ends exactly on an mbuf boundary. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					recm = slp->ns_raw;
					if (mbuf_setnext(om, NULL)) {
						printf("nfsrv_getstream: mbuf_setnext failed 2\n");
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				} else {
					/* Whole mbuf belongs to the record; keep walking. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Not enough data buffered for this record yet. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		if (slp->ns_frag == NULL) {
			slp->ns_frag = recm;
		} else {
			m = slp->ns_frag;
			while ((m2 = mbuf_next(m)))
				m = m2;
			if ((error = mbuf_setnext(m, recm)))
				panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
		}
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Complete record: move it to the ns_rec packet queue. */
			if (slp->ns_recend)
				mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
2443
2444 /*
2445 * Parse an RPC header.
2446 */
2447 int
2448 nfsrv_dorec(slp, nfsd, ndp)
2449 struct nfssvc_sock *slp;
2450 struct nfsd *nfsd;
2451 struct nfsrv_descript **ndp;
2452 {
2453 mbuf_t m;
2454 mbuf_t nam;
2455 struct nfsrv_descript *nd;
2456 int error;
2457
2458 *ndp = NULL;
2459 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2460 return (ENOBUFS);
2461 MALLOC_ZONE(nd, struct nfsrv_descript *,
2462 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2463 if (!nd)
2464 return (ENOMEM);
2465 m = slp->ns_rec;
2466 slp->ns_rec = mbuf_nextpkt(m);
2467 if (slp->ns_rec)
2468 mbuf_setnextpkt(m, NULL);
2469 else
2470 slp->ns_recend = NULL;
2471 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2472 nam = m;
2473 m = mbuf_next(m);
2474 if ((error = mbuf_setnext(nam, NULL)))
2475 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2476 } else
2477 nam = NULL;
2478 nd->nd_md = nd->nd_mrep = m;
2479 nd->nd_nam2 = nam;
2480 nd->nd_dpos = mbuf_data(m);
2481 error = nfs_getreq(nd, nfsd, TRUE);
2482 if (error) {
2483 if (nam)
2484 mbuf_freem(nam);
2485 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2486 return (error);
2487 }
2488 *ndp = nd;
2489 nfsd->nfsd_nd = nd;
2490 return (0);
2491 }
2492
/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 *
 * Dissects the RPC call header from nd->nd_mrep/nd_md/nd_dpos, filling
 * in nd_retxid, nd_procnum and nd_flag, and builds a credential (nd_cr)
 * from the AUTH_UNIX or Kerberos credential data.  Protocol-level
 * problems are reported by setting nd_repstat, forcing nd_procnum to
 * NFSPROC_NOOP and returning 0 (so a reply is still generated);
 * malformed requests free the request mbufs and return EBADRPC.
 *
 * Note: the nfsm_* dissect macros jump to the "nfsmout" label on parse
 * errors, with "error" set.
 */
int
nfs_getreq(nd, nfsd, has_header)
	struct nfsrv_descript *nd;
	struct nfsd *nfsd;
	int has_header;
{
	int len, i;
	u_long *tl;
	long t1;
	uio_t uiop;
	caddr_t dpos, cp2, cp;
	u_long nfsvers, auth_type;
	uid_t nickuid;
	int error = 0, ticklen;
	mbuf_t mrep, md;
	struct nfsuid *nuidp;
	uid_t user_id;
	gid_t group_id;
	int ngroups;
	struct ucred temp_cred;
	struct timeval tvin, tvout, now;
	char uio_buf[ UIO_SIZEOF(1) ];
#if 0			/* until encrypted keys are implemented */
	NFSKERBKEYSCHED_T keys;	/* stores key schedule */
#endif

	nd->nd_cr = NULL;	/* no credential built yet */

	mrep = nd->nd_mrep;
	md = nd->nd_md;
	dpos = nd->nd_dpos;
	if (has_header) {
		/* xid + call direction, then the 8-word call body below */
		nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
		if (*tl++ != rpc_call) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	/* RPC version, program number, NFS version checks */
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_long, *tl++);
	if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	else if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/* Range-check the procedure number (v2 defines fewer procedures). */
	if ((nd->nd_procnum >= NFS_NPROCS) ||
		(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Map v2 procedure numbers onto the v3 numbering used internally. */
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);	/* credential body length */
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		mbuf_freem(mrep);
		return (EBADRPC);
	}

	nd->nd_flag &= ~ND_KERBAUTH;
	/*
	 * Handle auth_unix or auth_kerb.
	 */
	if (auth_type == rpc_auth_unix) {
		/* Machine-name length; the name itself is skipped below. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		bzero(&temp_cred, sizeof(temp_cred));
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
		user_id = fxdr_unsigned(uid_t, *tl++);
		group_id = fxdr_unsigned(gid_t, *tl++);
		temp_cred.cr_groups[0] = group_id;
		len = fxdr_unsigned(int, *tl);	/* supplementary gid count */
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
		/* Copy at most NGROUPS-1 supplementary gids; skip the rest. */
		for (i = 1; i <= len; i++)
			if (i < NGROUPS)
				temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
		if (ngroups > 1)
			nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
		/* Verifier length; the verifier body is skipped below. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		temp_cred.cr_uid = user_id;
		temp_cred.cr_ngroups = ngroups;
		nd->nd_cr = kauth_cred_create(&temp_cred);
		if (nd->nd_cr == NULL) {
			nd->nd_repstat = ENOMEM;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else if (auth_type == rpc_auth_kerb) {
		switch (fxdr_unsigned(int, *tl++)) {
		case RPCAKN_FULLNAME:
			ticklen = fxdr_unsigned(int, *tl);
			*((u_long *)nfsd->nfsd_authstr) = *tl;
			uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
				&uio_buf[0], sizeof(uio_buf));
			if (!uiop) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			// LP64todo - fix this
			nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
			if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
				mbuf_freem(mrep);
				return (EBADRPC);
			}
			/* Copy the ticket into nfsd_authstr for later verification. */
			uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
			// LP64todo - fix this
			nfsm_mtouio(uiop, uio_resid(uiop));
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
				printf("Bad kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
			tl = (u_long *)cp;
			if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
				printf("Not fullname kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			cp += NFSX_UNSIGNED;
			bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
			nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
			nd->nd_flag |= ND_KERBFULL;
			/* Defer: an nfsd must contact the auth server to finish. */
			nfsd->nfsd_flag |= NFSD_NEEDAUTH;
			break;
		case RPCAKN_NICKNAME:
			if (len != 2 * NFSX_UNSIGNED) {
				printf("Kerb nickname short\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nickuid = fxdr_unsigned(uid_t, *tl);
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
				printf("Kerb nick verifier bad\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
			tvin.tv_sec = *tl++;
			tvin.tv_usec = *tl;

			/* Look up a cached credential for this nickname uid. */
			for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
				nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
					(!nd->nd_nam2 ||
					 netaddr_match(NU_NETFAM(nuidp),
					  &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (!nuidp) {
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			/*
			 * Now, decrypt the timestamp using the session key
			 * and validate it.
			 */
#if NFSKERB
			XXX
#endif

			/*
			 * NOTE(review): without NFSKERB, tvout is read here
			 * without ever being assigned (the decryption above is
			 * stubbed out) — confirm this path is compiled out or
			 * unreachable in practice.
			 */
			tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
			tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
			microtime(&now);
			/* Reject expired entries and stale/replayed timestamps. */
			if (nuidp->nu_expire < now.tv_sec ||
				nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
				(nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
				 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
				nuidp->nu_expire = 0;
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			/* Build a credential from the cached nfsuid entry. */
			bzero(&temp_cred, sizeof(temp_cred));
			ngroups = nuidp->nu_cr->cr_ngroups;
			for (i = 0; i < ngroups; i++)
				temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
			if (ngroups > 1)
				nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);

			temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
			temp_cred.cr_ngroups = ngroups;
			nd->nd_cr = kauth_cred_create(&temp_cred);
			if (!nd->nd_cr) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nd->nd_flag |= ND_KERBNICK;
		};
	} else {
		/* Neither AUTH_UNIX nor AUTH_KERB: reject the credential. */
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	/* Hand the updated parse position back to the caller. */
	nd->nd_md = md;
	nd->nd_dpos = dpos;
	return (0);
nfsmout:
	/* Parse error from an nfsm_* macro: drop any credential we built. */
	if (nd->nd_cr)
		kauth_cred_rele(nd->nd_cr);
	return (error);
}
2753
2754 /*
2755 * Search for a sleeping nfsd and wake it up.
2756 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2757 * running nfsds will go look for the work in the nfssvc_sock list.
2758 * Note: Must be called with nfsd_mutex held.
2759 */
2760 void
2761 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2762 {
2763 struct nfsd *nd;
2764
2765 if ((slp->ns_flag & SLP_VALID) == 0)
2766 return;
2767
2768 lck_rw_lock_exclusive(&slp->ns_rwlock);
2769
2770 if (nfsd_waiting) {
2771 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2772 if (nd->nfsd_flag & NFSD_WAITING) {
2773 nd->nfsd_flag &= ~NFSD_WAITING;
2774 if (nd->nfsd_slp)
2775 panic("nfsd wakeup");
2776 slp->ns_sref++;
2777 nd->nfsd_slp = slp;
2778 lck_rw_done(&slp->ns_rwlock);
2779 wakeup((caddr_t)nd);
2780 return;
2781 }
2782 }
2783 }
2784
2785 slp->ns_flag |= SLP_DOREC;
2786
2787 lck_rw_done(&slp->ns_rwlock);
2788
2789 nfsd_head_flag |= NFSD_CHECKSLP;
2790 }
2791 #endif /* NFS_NOSERVER */
2792
2793 static int
2794 nfs_msg(proc_t p,
2795 const char *server,
2796 const char *msg,
2797 int error)
2798 {
2799 tpr_t tpr;
2800
2801 if (p)
2802 tpr = tprintf_open(p);
2803 else
2804 tpr = NULL;
2805 if (error)
2806 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2807 error);
2808 else
2809 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2810 tprintf_close(tpr);
2811 return (0);
2812 }
2813
2814 void
2815 nfs_down(nmp, proc, error, flags, msg)
2816 struct nfsmount *nmp;
2817 proc_t proc;
2818 int error, flags;
2819 const char *msg;
2820 {
2821 if (nmp == NULL)
2822 return;
2823 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2824 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2825 nmp->nm_state |= NFSSTA_TIMEO;
2826 }
2827 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2828 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2829 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2830 }
2831 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2832 }
2833
2834 void
2835 nfs_up(nmp, proc, flags, msg)
2836 struct nfsmount *nmp;
2837 proc_t proc;
2838 int flags;
2839 const char *msg;
2840 {
2841 if (nmp == NULL)
2842 return;
2843 if (msg)
2844 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2845 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2846 nmp->nm_state &= ~NFSSTA_TIMEO;
2847 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2848 }
2849 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2850 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2851 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2852 }
2853 }
2854