/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
 */
/*
 * Socket operations for use by nfs
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/kpi_mbuf.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/uio_internal.h>
#include <libkern/OSAtomic.h>

#include <kern/clock.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/thread_call.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/kdebug.h>
boolean_t	current_thread_aborted(void);
kern_return_t	thread_terminate(thread_t);

#if NFSSERVER
int nfsrv_sock_max_rec_queue_length = 128;	/* max # RPC records queued on (UDP) socket */

static int	nfsrv_getstream(struct nfsrv_sock *, int);
static int	nfsrv_getreq(struct nfsrv_descript *);
extern int	nfsv3_procid[NFS_NPROCS];
#endif /* NFSSERVER */
static int	nfs_connect_setup(struct nfsmount *);
static void	nfs_reqdequeue(struct nfsreq *);
static void	nfs_udp_rcv(socket_t, void *, int);
static void	nfs_tcp_rcv(socket_t, void *, int);
static void	nfs_request_match_reply(struct nfsmount *, mbuf_t);
static void	nfs_softterm(struct nfsreq *);
#ifdef NFS_SOCKET_DEBUGGING
#define NFS_SOCK_DBG(X)	printf X
#else
#define NFS_SOCK_DBG(X)
#endif
/*
 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
 * Use the mean and mean deviation of rtt for the appropriate type of rpc
 * for the frequent rpcs and a default for the others.
 * The justification for doing "other" this way is that these rpcs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these rpcs are
 * non-idempotent, a conservative timeout is desired.
 * getattr, lookup - A+2D
 * read, write     - A+4D
 * other           - nm_timeo
 */
#define	NFS_RTO(n, t) \
	((t) == 0 ? (n)->nm_timeo : \
	 ((t) < 3 ? \
	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
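/*
 * Illustrative worked example (added commentary, not part of the original
 * source; the values are hypothetical): nm_srtt[] holds the smoothed rtt
 * scaled by 8 and nm_sdrtt[] the smoothed deviation scaled by 4, as implied
 * by the shifts above.  With nm_srtt[0] == 16 (a 2-tick mean) and
 * nm_sdrtt[0] == 4 (a 1-tick deviation), a timer-type-1 rpc gets:
 *
 *	NFS_RTO(nmp, 1) = (((16 + 3) >> 2) + 4 + 1) >> 1 = 4 ticks
 *
 * which matches the "getattr, lookup - A+2D" formula: 2 + 2*1 = 4.
 */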
/*
 * Defines which timer to use for the procnum.
 * 0 - default
 * 1 - getattr
 * 2 - lookup
 * 3 - read
 * 4 - write
 */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
};
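/*
 * Reading the table above (added commentary; this assumes the usual v2/v3
 * procedure numbering from nfsproto.h): proct[NFSPROC_GETATTR] == 1 and
 * proct[NFSPROC_LOOKUP] == 2 get their own rtt timers, reads map to 3 and
 * writes to 4, while a 0 entry means the rpc falls back to the default
 * nm_timeo in NFS_RTO() above.
 */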
/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
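/*
 * Illustrative sketch of the arithmetic above (added commentary, not from
 * the original source): with NFS_CWNDSCALE 256, one outstanding rpc "costs"
 * 256 and NFS_MAXCWND allows up to 32 rpcs in flight.  After a retransmit
 * timeout a window of 4096 (16 rpcs) is halved to 2048; each reply received
 * while the window is full then grows it additively by roughly
 * (256*256)/cwnd, e.g. 2048 -> 2048 + 65536/2048 = 2080, so it takes about
 * one window's worth of replies to gain one rpc's worth of cwnd.
 * nfs_backoff[] similarly doubles the retransmit timeout for each
 * consecutive timeout, capped by the table's last entry.
 */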
/*
 * Initialize socket state and perform setup for a new NFS connection.
 */
int
nfs_connect(struct nfsmount *nmp)
{
	socket_t so;
	int error, on = 1, proto;
	sock_upcall upcall;
	struct sockaddr *saddr;
	struct sockaddr_in sin;
	struct timeval timeo;

	lck_mtx_lock(&nmp->nm_lock);
	nmp->nm_sockflags |= NMSOCK_CONNECTING;
	saddr = mbuf_data(nmp->nm_nam);
	upcall = (nmp->nm_sotype == SOCK_STREAM) ? nfs_tcp_rcv : nfs_udp_rcv;
	lck_mtx_unlock(&nmp->nm_lock);
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
		nmp->nm_soproto, upcall, nmp, &nmp->nm_so);
	if (error)
		goto bad;
	lck_mtx_lock(&nmp->nm_lock);
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		int tport;
		lck_mtx_unlock(&nmp->nm_lock);
		sin.sin_len = sizeof (struct sockaddr_in);
		sin.sin_family = AF_INET;
		sin.sin_addr.s_addr = INADDR_ANY;
		tport = IPPORT_RESERVED - 1;
		sin.sin_port = htons(tport);
		while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
		       (--tport > IPPORT_RESERVED / 2))
			sin.sin_port = htons(tport);
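		/*
		 * Note (added commentary, not in the original source): since
		 * IPPORT_RESERVED is 1024, the loop above walks the bind down
		 * from port 1023 and gives up below 513, so the client ends up
		 * bound somewhere in the privileged 513-1023 range that
		 * "secure" NFS servers require.
		 */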
		if (error)
			goto bad;
		lck_mtx_lock(&nmp->nm_lock);
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a different address/port.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			error = ENOTCONN;
			lck_mtx_unlock(&nmp->nm_lock);
			goto bad;
		}
	} else {
		int tocnt = 0, optlen = sizeof(error);
		struct timespec ts = { 2, 0 };

		lck_mtx_unlock(&nmp->nm_lock);
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && (error != EINPROGRESS))
			goto bad;
		lck_mtx_lock(&nmp->nm_lock);
		while (!sock_isconnected(so)) {
			if (tocnt++ == 15)	/* log a warning if connect is taking a while */
				log(LOG_INFO, "nfs_connect: socket connect taking a while for %s\n",
					vfs_statfs(nmp->nm_mountp)->f_mntfromname);
			/* check for error on socket */
			sock_getsockopt(so, SOL_SOCKET, SO_ERROR, &error, &optlen);
			if (error) {
				log(LOG_INFO, "nfs_connect: socket error %d for %s\n",
					error, vfs_statfs(nmp->nm_mountp)->f_mntfromname);
				break;
			}
			/* abort if this is taking too long */
			if (tocnt > 120) {
				error = ENOTCONN;
				break;
			}
			if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
				break;
			msleep(&nmp->nm_so, &nmp->nm_lock, PSOCK, "nfs_socket_connect", &ts);
		}
		if (tocnt > 15)
			log(LOG_INFO, "nfs_connect: socket connect %s for %s\n",
				error ? "aborted" : "completed",
				vfs_statfs(nmp->nm_mountp)->f_mntfromname);
		if (error) {
			lck_mtx_unlock(&nmp->nm_lock);
			goto bad;
		}
	}

	/*
	 * Set socket send/receive timeouts
	 * - Receive timeout shouldn't matter because all receives are performed
	 *   in the socket upcall non-blocking.
	 * - Send timeout should allow us to react to a blocked socket.
	 *   Soft mounts will want to abort sooner.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = (nmp->nm_flag & NFSMNT_SOFT) ? 10 : 60;
	error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
	if (error) {
		log(LOG_INFO, "nfs_connect: socket timeout setting errors for %s\n",
			vfs_statfs(nmp->nm_mountp)->f_mntfromname);
		error = 0;
	}

	if (nmp->nm_sotype == SOCK_STREAM) {
		/* Assume that SOCK_STREAM always requires a connection */
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
		/* set nodelay for TCP */
		sock_gettype(so, NULL, NULL, &proto);
		if (proto == IPPROTO_TCP)
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
	}

	if (nmp->nm_sotype == SOCK_DGRAM) {	/* set socket buffer sizes for UDP */
		int reserve = NFS_UDPSOCKBUF;
		error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
		error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
		if (error) {
			log(LOG_INFO, "nfs_connect: socket buffer setting errors for %s\n",
				vfs_statfs(nmp->nm_mountp)->f_mntfromname);
			error = 0;
		}
	}

	/* set SO_NOADDRERR to detect network changes ASAP */
	error = sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on));
	if (error) {
		lck_mtx_unlock(&nmp->nm_lock);
		goto bad;
	}
	/* just playin' it safe */
	sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on));

	if (!(nmp->nm_flag & NFSMNT_INT))
		sock_nointerrupt(so, 1);

	/* Initialize socket state variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	if (nmp->nm_sotype == SOCK_DGRAM) {
		/* XXX do we really want to reset this on each reconnect? */
		nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
		nmp->nm_sent = 0;
	} else if (nmp->nm_sotype == SOCK_STREAM) {
		nmp->nm_markerleft = sizeof(nmp->nm_fragleft);
		nmp->nm_fragleft = nmp->nm_reclen = 0;
		nmp->nm_timeouts = 0;
	}
	nmp->nm_sockflags &= ~NMSOCK_CONNECTING;
	nmp->nm_sockflags |= NMSOCK_SETUP;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_flag, nmp->nm_cwnd);
	lck_mtx_unlock(&nmp->nm_lock);
	error = nfs_connect_setup(nmp);
bad:
	lck_mtx_lock(&nmp->nm_lock);
	nmp->nm_sockflags &= ~(NMSOCK_CONNECTING|NMSOCK_SETUP);
	if (!error) {
		nmp->nm_sockflags |= NMSOCK_READY;
		wakeup(&nmp->nm_sockflags);
	}
	lck_mtx_unlock(&nmp->nm_lock);
	if (error)
		nfs_disconnect(nmp);
	return (error);
}
/* setup & confirm socket connection is functional */
static int
nfs_connect_setup(struct nfsmount *nmp)
{
	struct nfsm_chain nmreq, nmrep;
	int error = 0, status;
	u_int64_t xid = 0;

	if (nmp->nm_vers >= NFS_VER4) {
		error = nfs4_setclientid(nmp);
	} else {
		/* verify connection's OK by sending a NULL request */
		nfsm_chain_null(&nmreq);
		nfsm_chain_null(&nmrep);
		nfsm_chain_build_alloc_init(error, &nmreq, 0);
		nfsm_chain_build_done(error, &nmreq);
		nfsmout_if(error);
		error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC_NULL,
			current_thread(), NULL, R_SETUP, &nmrep, &xid, &status);
		if (!error && status)
			error = status;
nfsmout:
		nfsm_chain_cleanup(&nmreq);
		nfsm_chain_cleanup(&nmrep);
	}
	return (error);
}
/*
 * NFS socket reconnect routine:
 * Called when a connection is broken.
 * - disconnect the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 */
int
nfs_reconnect(struct nfsmount *nmp)
{
	struct nfsreq *rq;
	struct timeval now;
	thread_t thd = current_thread();
	int error, lastmsg, wentdown = 0;

	microuptime(&now);
	lastmsg = now.tv_sec - (nmp->nm_tprintf_delay - nmp->nm_tprintf_initial_delay);

	nfs_disconnect(nmp);

	while ((error = nfs_connect(nmp))) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		microuptime(&now);
		if ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec) {
			lastmsg = now.tv_sec;
			nfs_down(nmp, thd, error, NFSSTA_TIMEO, "can not connect");
			wentdown = 1;
		}
		lck_mtx_lock(&nmp->nm_lock);
		if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
			/* we're not yet completely mounted and */
			/* we can't reconnect, so we fail */
			lck_mtx_unlock(&nmp->nm_lock);
			return (error);
		}
		if ((error = nfs_sigintr(nmp, NULL, thd, 1))) {
			lck_mtx_unlock(&nmp->nm_lock);
			return (error);
		}
		lck_mtx_unlock(&nmp->nm_lock);
		tsleep(&lbolt, PSOCK, "nfs_reconnect_delay", 0);
		if ((error = nfs_sigintr(nmp, NULL, thd, 0)))
			return (error);
	}

	if (wentdown)
		nfs_up(nmp, thd, NFSSTA_TIMEO, "connected");

	/*
	 * Loop through outstanding request list and mark all requests
	 * as needing a resend.  (Though nfs_need_reconnect() probably
	 * marked them all already.)
	 */
	lck_mtx_lock(nfs_request_mutex);
	TAILQ_FOREACH(rq, &nfs_reqq, r_chain) {
		if (rq->r_nmp == nmp) {
			lck_mtx_lock(&rq->r_mtx);
			if (!rq->r_error && !rq->r_nmrep.nmc_mhead && !(rq->r_flags & R_MUSTRESEND)) {
				rq->r_flags |= R_MUSTRESEND;
				rq->r_rtt = -1;
				wakeup(rq);
				if ((rq->r_flags & (R_ASYNC|R_ASYNCWAIT)) == R_ASYNC)
					nfs_asyncio_resend(rq);
			}
			lck_mtx_unlock(&rq->r_mtx);
		}
	}
	lck_mtx_unlock(nfs_request_mutex);
	return (0);
}
/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	socket_t so;

	lck_mtx_lock(&nmp->nm_lock);
	if ((nmp->nm_sotype == SOCK_STREAM) && nmp->nm_m) {
		mbuf_freem(nmp->nm_m);
		nmp->nm_m = nmp->nm_mlast = NULL;
	}
	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		lck_mtx_unlock(&nmp->nm_lock);
		sock_shutdown(so, SHUT_RDWR);
		sock_close(so);
	} else
		lck_mtx_unlock(&nmp->nm_lock);
}
/*
 * mark an NFS mount as needing a reconnect/resends.
 */
void
nfs_need_reconnect(struct nfsmount *nmp)
{
	struct nfsreq *rq;

	lck_mtx_lock(&nmp->nm_lock);
	nmp->nm_sockflags &= ~(NMSOCK_READY|NMSOCK_SETUP);
	lck_mtx_unlock(&nmp->nm_lock);

	/*
	 * Loop through outstanding request list and
	 * mark all requests as needing a resend.
	 */
	lck_mtx_lock(nfs_request_mutex);
	TAILQ_FOREACH(rq, &nfs_reqq, r_chain) {
		if (rq->r_nmp == nmp) {
			lck_mtx_lock(&rq->r_mtx);
			if (!rq->r_error && !rq->r_nmrep.nmc_mhead && !(rq->r_flags & R_MUSTRESEND)) {
				rq->r_flags |= R_MUSTRESEND;
				rq->r_rtt = -1;
				wakeup(rq);
				if ((rq->r_flags & (R_ASYNC|R_ASYNCWAIT)) == R_ASYNC)
					nfs_asyncio_resend(rq);
			}
			lck_mtx_unlock(&rq->r_mtx);
		}
	}
	lck_mtx_unlock(nfs_request_mutex);
}
/*
 * thread to handle miscellaneous async NFS socket work (reconnects/resends)
 */
void
nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
{
	struct nfsmount *nmp = arg;
	struct timespec ts = { 30, 0 };
	thread_t thd = current_thread();
	struct nfsreq *req;
	struct timeval now;
	int error, dofinish, force;

	lck_mtx_lock(&nmp->nm_lock);

	while (!(nmp->nm_sockflags & NMSOCK_READY) || !TAILQ_EMPTY(&nmp->nm_resendq)) {
		if (nmp->nm_sockflags & NMSOCK_UNMOUNT)
			break;
		force = (nmp->nm_state & NFSSTA_FORCE);
		/* do reconnect, if necessary */
		if (!(nmp->nm_sockflags & NMSOCK_READY) && !force) {
			if (nmp->nm_reconnect_start <= 0) {
				microuptime(&now);
				nmp->nm_reconnect_start = now.tv_sec;
			}
			lck_mtx_unlock(&nmp->nm_lock);
			NFS_SOCK_DBG(("nfs reconnect %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname));
			if ((error = nfs_reconnect(nmp)))
				printf("nfs_reconnect failed %d for %s\n", error,
					vfs_statfs(nmp->nm_mountp)->f_mntfromname);
			else
				nmp->nm_reconnect_start = 0;
			lck_mtx_lock(&nmp->nm_lock);
		}
		/* do resends, if necessary/possible */
		while (((nmp->nm_sockflags & NMSOCK_READY) || force) && ((req = TAILQ_FIRST(&nmp->nm_resendq)))) {
			if (req->r_resendtime)
				microuptime(&now);
			while (req && !force && req->r_resendtime && (now.tv_sec < req->r_resendtime))
				req = TAILQ_NEXT(req, r_rchain);
			if (!req)
				break;
			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
			req->r_rchain.tqe_next = NFSREQNOLIST;
			lck_mtx_unlock(&nmp->nm_lock);
			lck_mtx_lock(&req->r_mtx);
			if (req->r_error || req->r_nmrep.nmc_mhead) {
				dofinish = req->r_callback.rcb_func && !(req->r_flags & R_WAITSENT);
				req->r_flags &= ~R_RESENDQ;
				wakeup(req);
				lck_mtx_unlock(&req->r_mtx);
				if (dofinish)
					nfs_asyncio_finish(req);
				lck_mtx_lock(&nmp->nm_lock);
				continue;
			}
			if ((req->r_flags & R_RESTART) || req->r_gss_ctx) {
				req->r_flags &= ~R_RESTART;
				req->r_resendtime = 0;
				lck_mtx_unlock(&req->r_mtx);
				/* async RPCs on GSS mounts need to be rebuilt and resent. */
				nfs_reqdequeue(req);
				if (req->r_gss_ctx) {
					nfs_gss_clnt_rpcdone(req);
					error = nfs_gss_clnt_args_restore(req);
					if (error == ENEEDAUTH)
						req->r_xid = 0;
				}
				NFS_SOCK_DBG(("nfs async%s restart: p %d x 0x%llx f 0x%x rtt %d\n",
					req->r_gss_ctx ? " gss" : "", req->r_procnum, req->r_xid,
					req->r_flags, req->r_rtt));
				error = !req->r_nmp ? ENXIO : 0;	/* unmounted? */
				if (!error)
					error = nfs_sigintr(nmp, req, req->r_thread, 0);
				if (!error)
					error = nfs_request_add_header(req);
				if (!error)
					error = nfs_request_send(req, 0);
				lck_mtx_lock(&req->r_mtx);
				if (req->r_rchain.tqe_next == NFSREQNOLIST)
					req->r_flags &= ~R_RESENDQ;
				if (error)
					req->r_error = error;
				wakeup(req);
				dofinish = error && req->r_callback.rcb_func && !(req->r_flags & R_WAITSENT);
				lck_mtx_unlock(&req->r_mtx);
				if (dofinish)
					nfs_asyncio_finish(req);
				lck_mtx_lock(&nmp->nm_lock);
				continue;
			}
			NFS_SOCK_DBG(("nfs async resend: p %d x 0x%llx f 0x%x rtt %d\n",
				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt));
			error = !req->r_nmp ? ENXIO : 0;	/* unmounted? */
			if (!error)
				error = nfs_sigintr(nmp, req, req->r_thread, 0);
			if (!error) {
				lck_mtx_unlock(&req->r_mtx);
				error = nfs_send(req, 0);
				lck_mtx_lock(&req->r_mtx);
				if (!error) {
					if (req->r_rchain.tqe_next == NFSREQNOLIST)
						req->r_flags &= ~R_RESENDQ;
					wakeup(req);
					lck_mtx_unlock(&req->r_mtx);
					lck_mtx_lock(&nmp->nm_lock);
					continue;
				}
			}
			req->r_error = error;
			if (req->r_rchain.tqe_next == NFSREQNOLIST)
				req->r_flags &= ~R_RESENDQ;
			wakeup(req);
			dofinish = req->r_callback.rcb_func && !(req->r_flags & R_WAITSENT);
			lck_mtx_unlock(&req->r_mtx);
			if (dofinish)
				nfs_asyncio_finish(req);
			lck_mtx_lock(&nmp->nm_lock);
		}
		if (nmp->nm_sockflags & NMSOCK_READY) {
			ts.tv_sec = TAILQ_EMPTY(&nmp->nm_resendq) ? 30 : 1;
			msleep(&nmp->nm_sockthd, &nmp->nm_lock, PSOCK, "nfssockthread", &ts);
		}
	}

	if (nmp->nm_sockthd == thd)
		nmp->nm_sockthd = NULL;
	lck_mtx_unlock(&nmp->nm_lock);
	wakeup(&nmp->nm_sockthd);
	thread_terminate(thd);
}
/* start or wake a mount's socket thread */
void
nfs_mount_sock_thread_wake(struct nfsmount *nmp)
{
	if (nmp->nm_sockthd)
		wakeup(&nmp->nm_sockthd);
	else if (kernel_thread_start(nfs_mount_sock_thread, nmp, &nmp->nm_sockthd) == KERN_SUCCESS)
		thread_deallocate(nmp->nm_sockthd);
}
/*
 * The NFS client send routine.
 *
 * Send the given NFS request out the mount's socket.
 * Holds nfs_sndlock() for the duration of this call.
 *
 * - check for request termination (sigintr)
 * - perform reconnect, if necessary
 * - UDP: check the congestion window
 * - make a copy of the request to send
 * - UDP: update the congestion window
 * - send the request
 *
 * If sent successfully, R_MUSTRESEND and R_RESENDERR are cleared.
 * rexmit count is also updated if this isn't the first send.
 *
 * If the send is not successful, make sure R_MUSTRESEND is set.
 * If this wasn't the first transmit, set R_RESENDERR.
 * Also, undo any UDP congestion window changes made.
 *
 * If the error appears to indicate that the socket should
 * be reconnected, mark the socket for reconnection.
 *
 * Only return errors when the request should be aborted.
 */
int
nfs_send(struct nfsreq *req, int wait)
{
	struct nfsmount *nmp;
	socket_t so;
	int error, error2, sotype, rexmit, slpflag = 0, needrecon;
	struct msghdr msg;
	struct sockaddr *sendnam;
	mbuf_t mreqcopy;
	size_t sentlen = 0;
	struct timespec ts = { 2, 0 };

again:
	error = nfs_sndlock(req);
	if (error)
		return (error);

	error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0);
	if (error) {
		nfs_sndunlock(req);
		return (error);
	}
	nmp = req->r_nmp;
	sotype = nmp->nm_sotype;

	if ((req->r_flags & R_SETUP) && !(nmp->nm_sockflags & NMSOCK_SETUP)) {
		/* a setup RPC but we're not in SETUP... must need reconnect */
		nfs_sndunlock(req);
		return (EPIPE);
	}

	/* If the socket needs reconnection, do that now. */
	/* wait until socket is ready - unless this request is part of setup */
	lck_mtx_lock(&nmp->nm_lock);
	if (!(nmp->nm_sockflags & NMSOCK_READY) &&
	    !((nmp->nm_sockflags & NMSOCK_SETUP) && (req->r_flags & R_SETUP))) {
		if (nmp->nm_flag & NFSMNT_INT)
			slpflag |= PCATCH;
		lck_mtx_unlock(&nmp->nm_lock);
		nfs_sndunlock(req);
		if (!wait) {
			lck_mtx_lock(&req->r_mtx);
			req->r_flags |= R_MUSTRESEND;
			req->r_rtt = 0;
			lck_mtx_unlock(&req->r_mtx);
			return (0);
		}
		NFS_SOCK_DBG(("nfs_send: 0x%llx wait reconnect\n", req->r_xid));
		lck_mtx_lock(&req->r_mtx);
		req->r_flags &= ~R_MUSTRESEND;
		req->r_rtt = 0;
		lck_mtx_unlock(&req->r_mtx);
		lck_mtx_lock(&nmp->nm_lock);
		while (!(nmp->nm_sockflags & NMSOCK_READY)) {
			/* don't bother waiting if the socket thread won't be reconnecting it */
			if (nmp->nm_state & NFSSTA_FORCE) {
				error = EIO;
				break;
			}
			/* make sure socket thread is running, then wait */
			nfs_mount_sock_thread_wake(nmp);
			if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 1)))
				break;
			msleep(req, &nmp->nm_lock, slpflag|PSOCK, "nfsconnectwait", &ts);
			slpflag = 0;
		}
		lck_mtx_unlock(&nmp->nm_lock);
		if (error)
			return (error);
		goto again;
	}
	so = nmp->nm_so;
	lck_mtx_unlock(&nmp->nm_lock);
	if (!so) {
		nfs_sndunlock(req);
		lck_mtx_lock(&req->r_mtx);
		req->r_flags |= R_MUSTRESEND;
		req->r_rtt = 0;
		lck_mtx_unlock(&req->r_mtx);
		return (0);
	}

	lck_mtx_lock(&req->r_mtx);
	rexmit = (req->r_flags & R_SENT);

	if (sotype == SOCK_DGRAM) {
		lck_mtx_lock(&nmp->nm_lock);
		if (!(req->r_flags & R_CWND) && (nmp->nm_sent >= nmp->nm_cwnd)) {
			/* if we can't send this out yet, wait on the cwnd queue */
			slpflag = ((nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0;
			lck_mtx_unlock(&nmp->nm_lock);
			nfs_sndunlock(req);
			req->r_flags |= R_MUSTRESEND;
			lck_mtx_unlock(&req->r_mtx);
			if (!wait) {
				req->r_rtt = 0;
				return (0);
			}
			lck_mtx_lock(&nmp->nm_lock);
			while (nmp->nm_sent >= nmp->nm_cwnd) {
				if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 1)))
					break;
				TAILQ_INSERT_TAIL(&nmp->nm_cwndq, req, r_cchain);
				msleep(req, &nmp->nm_lock, slpflag | (PZERO - 1), "nfswaitcwnd", &ts);
				slpflag = 0;
				if ((req->r_cchain.tqe_next != NFSREQNOLIST)) {
					TAILQ_REMOVE(&nmp->nm_cwndq, req, r_cchain);
					req->r_cchain.tqe_next = NFSREQNOLIST;
				}
			}
			lck_mtx_unlock(&nmp->nm_lock);
			goto again;
		}
		/*
		 * We update these *before* the send to avoid racing
		 * against others who may be looking to send requests.
		 */
		if (!rexmit) {
			/* first transmit */
			req->r_flags |= R_CWND;
			nmp->nm_sent += NFS_CWNDSCALE;
		} else {
			/*
			 * When retransmitting, turn timing off
			 * and divide congestion window by 2.
			 */
			req->r_flags &= ~R_TIMING;
			nmp->nm_cwnd >>= 1;
			if (nmp->nm_cwnd < NFS_CWNDSCALE)
				nmp->nm_cwnd = NFS_CWNDSCALE;
		}
		lck_mtx_unlock(&nmp->nm_lock);
	}

	req->r_flags &= ~R_MUSTRESEND;
	lck_mtx_unlock(&req->r_mtx);

	error = mbuf_copym(req->r_mhead, 0, MBUF_COPYALL,
		wait ? MBUF_WAITOK : MBUF_DONTWAIT, &mreqcopy);
	if (error) {
		if (wait)
			log(LOG_INFO, "nfs_send: mbuf copy failed %d\n", error);
		nfs_sndunlock(req);
		lck_mtx_lock(&req->r_mtx);
		req->r_flags |= R_MUSTRESEND;
		req->r_rtt = 0;
		lck_mtx_unlock(&req->r_mtx);
		return (0);
	}

	bzero(&msg, sizeof(msg));
	if (nmp->nm_nam && (sotype != SOCK_STREAM) && !sock_isconnected(so)) {
		if ((sendnam = mbuf_data(nmp->nm_nam))) {
			msg.msg_name = (caddr_t)sendnam;
			msg.msg_namelen = sendnam->sa_len;
		}
	}
	error = sock_sendmbuf(so, &msg, mreqcopy, 0, &sentlen);
#ifdef NFS_SOCKET_DEBUGGING
	if (error || (sentlen != req->r_mreqlen))
		NFS_SOCK_DBG(("nfs_send: 0x%llx sent %d/%d error %d\n",
			req->r_xid, (int)sentlen, (int)req->r_mreqlen, error));
#endif
	if (!error && (sentlen != req->r_mreqlen))
		error = EWOULDBLOCK;
	needrecon = ((sotype == SOCK_STREAM) && sentlen && (sentlen != req->r_mreqlen));

	lck_mtx_lock(&req->r_mtx);
	req->r_rtt = 0;
	if (rexmit && (++req->r_rexmit > NFS_MAXREXMIT))
		req->r_rexmit = NFS_MAXREXMIT;

	if (!error) {
		/* SUCCESS */
		req->r_flags &= ~R_RESENDERR;
		if (rexmit)
			OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
		req->r_flags |= R_SENT;
		if (req->r_flags & R_WAITSENT) {
			req->r_flags &= ~R_WAITSENT;
			wakeup(req);
		}
		nfs_sndunlock(req);
		lck_mtx_unlock(&req->r_mtx);
		return (0);
	}

	/* send failed */
	req->r_flags |= R_MUSTRESEND;
	if (rexmit)
		req->r_flags |= R_RESENDERR;
	if ((error == EINTR) || (error == ERESTART))
		req->r_error = error;
	lck_mtx_unlock(&req->r_mtx);

	if (sotype == SOCK_DGRAM) {
		/*
		 * Note: even though a first send may fail, we consider
		 * the request sent for congestion window purposes.
		 * So we don't need to undo any of the changes made above.
		 */
		/*
		 * Socket errors ignored for connectionless sockets??
		 * For now, ignore them all
		 */
		if ((error != EINTR) && (error != ERESTART) &&
		    (error != EWOULDBLOCK) && (error != EIO)) {
			int clearerror = 0, optlen = sizeof(clearerror);
			sock_getsockopt(so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
#ifdef NFS_SOCKET_DEBUGGING
			if (clearerror)
				NFS_SOCK_DBG(("nfs_send: ignoring UDP socket error %d so %d\n",
					error, clearerror));
#endif
		}
	}

	/* check if it appears we should reconnect the socket */
	switch (error) {
	case EWOULDBLOCK:
		/* if send timed out, reconnect if on TCP */
		if (sotype != SOCK_STREAM)
			break;
	case EPIPE:
	case EADDRNOTAVAIL:
	case ENETDOWN:
	case ENETUNREACH:
	case ENETRESET:
	case ECONNABORTED:
	case ECONNRESET:
	case ENOTCONN:
	case ESHUTDOWN:
	case ECONNREFUSED:
	case EHOSTDOWN:
	case EHOSTUNREACH:
		needrecon = 1;
		break;
	}
	if (needrecon) { /* mark socket as needing reconnect */
		NFS_SOCK_DBG(("nfs_send: 0x%llx need reconnect %d\n", req->r_xid, error));
		nfs_need_reconnect(nmp);
	}

	nfs_sndunlock(req);

	/*
	 * Don't log some errors:
	 * EPIPE errors may be common with servers that drop idle connections.
	 * EADDRNOTAVAIL may occur on network transitions.
	 * ENOTCONN may occur under some network conditions.
	 */
	if ((error == EPIPE) || (error == EADDRNOTAVAIL) || (error == ENOTCONN))
		error = 0;
	if (error && (error != EINTR) && (error != ERESTART))
		log(LOG_INFO, "nfs send error %d for server %s\n", error,
			!req->r_nmp ? "<unmounted>" :
			vfs_statfs(req->r_nmp->nm_mountp)->f_mntfromname);

	/* prefer request termination error over other errors */
	error2 = nfs_sigintr(req->r_nmp, req, req->r_thread, 0);
	if (error2)
		error = error2;

	/* only allow the following errors to be returned */
	if ((error != EINTR) && (error != ERESTART) && (error != EIO) &&
	    (error != ENXIO) && (error != ETIMEDOUT))
		error = 0;
	return (error);
}
/*
 * NFS client socket upcalls
 *
 * Pull RPC replies out of an NFS mount's socket and match them
 * up with the pending request.
 *
 * The datagram code is simple because we always get whole
 * messages out of the socket.
 *
 * The stream code is more involved because we have to parse
 * the RPC records out of the stream.
 */
/* NFS client UDP socket upcall */
static void
nfs_udp_rcv(socket_t so, void *arg, __unused int waitflag)
{
	struct nfsmount *nmp = arg;
	size_t rcvlen;
	mbuf_t m;
	int error = 0;

	if (nmp->nm_sockflags & NMSOCK_CONNECTING) {
		wakeup(&nmp->nm_so);
		return;
	}

	/* make sure we're on the current socket */
	if (nmp->nm_so != so)
		return;

	do {
		m = NULL;
		rcvlen = 1000000;
		error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen);
		if (m)
			nfs_request_match_reply(nmp, m);
	} while (m && !error);

	if (error && (error != EWOULDBLOCK)) {
		/* problems with the socket... mark for reconnection */
		NFS_SOCK_DBG(("nfs_udp_rcv: need reconnect %d\n", error));
		nfs_need_reconnect(nmp);
	}
}
/* NFS client TCP socket upcall */
static void
nfs_tcp_rcv(socket_t so, void *arg, __unused int waitflag)
{
	struct nfsmount *nmp = arg;
	struct iovec_32 aio;
	struct msghdr msg;
	size_t rcvlen;
	mbuf_t m;
	int error = 0;
	int recv;

	if (nmp->nm_sockflags & NMSOCK_CONNECTING) {
		wakeup(&nmp->nm_so);
		return;
	}

	/* make sure we're on the current socket */
	if (nmp->nm_so != so)
		return;

	lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_sockflags & NMSOCK_UPCALL) {
		/* upcall is already receiving data - just return */
		lck_mtx_unlock(&nmp->nm_lock);
		return;
	}
	nmp->nm_sockflags |= NMSOCK_UPCALL;

nextfrag:
	recv = 0;

	/* read the TCP RPC record marker */
	while (!error && nmp->nm_markerleft) {
		aio.iov_base = (uintptr_t)((char*)&nmp->nm_fragleft +
			sizeof(nmp->nm_fragleft) - nmp->nm_markerleft);
		aio.iov_len = nmp->nm_markerleft;
		bzero(&msg, sizeof(msg));
		msg.msg_iov = (struct iovec *) &aio;
		msg.msg_iovlen = 1;
		lck_mtx_unlock(&nmp->nm_lock);
		error = sock_receive(so, &msg, MSG_DONTWAIT, &rcvlen);
		lck_mtx_lock(&nmp->nm_lock);
		if (error || !rcvlen)
			break;
		recv = 1;
		nmp->nm_markerleft -= rcvlen;
		if (nmp->nm_markerleft)
			continue;
		/* record marker complete */
		nmp->nm_fragleft = ntohl(nmp->nm_fragleft);
		if (nmp->nm_fragleft & 0x80000000) {
			nmp->nm_sockflags |= NMSOCK_LASTFRAG;
			nmp->nm_fragleft &= ~0x80000000;
		}
		nmp->nm_reclen += nmp->nm_fragleft;
		if (nmp->nm_reclen > NFS_MAXPACKET) {
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			log(LOG_ERR, "%s (%d) from nfs server %s\n",
				"impossible RPC record length", nmp->nm_reclen,
				vfs_statfs(nmp->nm_mountp)->f_mntfromname);
			error = EFBIG;
		}
	}
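	/*
	 * Added commentary (not in the original source): an RPC-over-TCP
	 * record marker is a 4-byte big-endian word whose high bit flags the
	 * last fragment and whose low 31 bits give the fragment length.  For
	 * example, a marker of 0x8000012c decodes as NMSOCK_LASTFRAG set and
	 * nm_fragleft = 0x12c = 300 bytes; once nm_fragleft drains to zero on
	 * a last fragment, the accumulated nm_reclen-byte record is complete
	 * and can be matched to its request.
	 */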
	/* read the TCP RPC record fragment */
	while (!error && !nmp->nm_markerleft && nmp->nm_fragleft) {
		m = NULL;
		rcvlen = nmp->nm_fragleft;
		lck_mtx_unlock(&nmp->nm_lock);
		error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen);
		lck_mtx_lock(&nmp->nm_lock);
		if (error || !rcvlen || !m)
			break;
		recv = 1;
		/* append mbufs to list */
		nmp->nm_fragleft -= rcvlen;
		if (!nmp->nm_m) {
			nmp->nm_m = m;
		} else {
			error = mbuf_setnext(nmp->nm_mlast, m);
			if (error) {
				printf("nfs_tcp_rcv: mbuf_setnext failed %d\n", error);
				mbuf_freem(m);
				break;
			}
		}
		while (mbuf_next(m))
			m = mbuf_next(m);
		nmp->nm_mlast = m;
	}

	/* done reading fragment? */
	m = NULL;
	if (!error && !nmp->nm_markerleft && !nmp->nm_fragleft) {
		/* reset socket fragment parsing state */
		nmp->nm_markerleft = sizeof(nmp->nm_fragleft);
		if (nmp->nm_sockflags & NMSOCK_LASTFRAG) {
			/* RPC record complete */
			m = nmp->nm_m;
			/* reset socket record parsing state */
			nmp->nm_reclen = 0;
			nmp->nm_m = nmp->nm_mlast = NULL;
			nmp->nm_sockflags &= ~NMSOCK_LASTFRAG;
		}
	}

	if (m) { /* match completed response with request */
		lck_mtx_unlock(&nmp->nm_lock);
		nfs_request_match_reply(nmp, m);
		lck_mtx_lock(&nmp->nm_lock);
	}

	/* loop if we've been making error-free progress */
	if (!error && recv)
		goto nextfrag;

	nmp->nm_sockflags &= ~NMSOCK_UPCALL;
	lck_mtx_unlock(&nmp->nm_lock);
#ifdef NFS_SOCKET_DEBUGGING
	if (!recv && (error != EWOULDBLOCK))
		NFS_SOCK_DBG(("nfs_tcp_rcv: got nothing, error %d, got FIN?\n", error));
#endif
	/* note: no error and no data indicates server closed its end */
	if ((error != EWOULDBLOCK) && (error || !recv)) {
		/* problems with the socket... mark for reconnection */
		NFS_SOCK_DBG(("nfs_tcp_rcv: need reconnect %d\n", error));
		nfs_need_reconnect(nmp);
	}
}
/*
 * "poke" a socket to try to provoke any pending errors
 */
void
nfs_sock_poke(struct nfsmount *nmp)
{
	struct iovec_32 aio;
	struct msghdr msg;
	size_t len;
	int error = 0;
	int dummy;

	lck_mtx_lock(&nmp->nm_lock);
	if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) || !nmp->nm_so) {
		lck_mtx_unlock(&nmp->nm_lock);
		return;
	}
	lck_mtx_unlock(&nmp->nm_lock);
	aio.iov_base = (uintptr_t)&dummy;
	aio.iov_len = 0;
	len = 0;
	bzero(&msg, sizeof(msg));
	msg.msg_iov = (struct iovec *) &aio;
	msg.msg_iovlen = 1;
	error = sock_send(nmp->nm_so, &msg, MSG_DONTWAIT, &len);
	NFS_SOCK_DBG(("nfs_sock_poke: error %d\n", error));
}
/*
 * Match an RPC reply with the corresponding request
 */
static void
nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
{
	struct nfsreq *req;
	struct nfsm_chain nmrep;
	u_long reply = 0, rxid = 0;
	int t1;
	int error = 0, asyncioq, asyncgss;

	/* Get the xid and check that it is an rpc reply */
	nfsm_chain_dissect_init(error, &nmrep, mrep);
	nfsm_chain_get_32(error, &nmrep, rxid);
	nfsm_chain_get_32(error, &nmrep, reply);
	if (error || (reply != RPC_REPLY)) {
		OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
		mbuf_freem(mrep);
		return;
	}

	/*
	 * Loop through the request list to match up the reply
	 * Iff no match, just drop it.
	 */
	lck_mtx_lock(nfs_request_mutex);
	TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
		if (req->r_nmrep.nmc_mhead || (rxid != R_XID32(req->r_xid)))
			continue;
		/* looks like we have it, grab lock and double check */
		lck_mtx_lock(&req->r_mtx);
		if (req->r_nmrep.nmc_mhead || (rxid != R_XID32(req->r_xid))) {
			lck_mtx_unlock(&req->r_mtx);
			continue;
		}
		/* Found it.. */
		req->r_nmrep = nmrep;
		lck_mtx_lock(&nmp->nm_lock);
		if (nmp->nm_sotype == SOCK_DGRAM) {
			/*
			 * Update congestion window.
			 * Do the additive increase of one rpc/rtt.
			 */
			FSDBG(530, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd);
			if (nmp->nm_cwnd <= nmp->nm_sent) {
				nmp->nm_cwnd +=
					((NFS_CWNDSCALE * NFS_CWNDSCALE) +
					 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
				if (nmp->nm_cwnd > NFS_MAXCWND)
					nmp->nm_cwnd = NFS_MAXCWND;
			}
			if (req->r_flags & R_CWND) {
				nmp->nm_sent -= NFS_CWNDSCALE;
				req->r_flags &= ~R_CWND;
			}
			if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
				/* congestion window is open, poke the cwnd queue */
				struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
				TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
				req2->r_cchain.tqe_next = NFSREQNOLIST;
				wakeup(req2);
			}
		}
		/*
		 * Update rtt using a gain of 0.125 on the mean
		 * and a gain of 0.25 on the deviation.
		 */
		if (req->r_flags & R_TIMING) {
			/*
			 * Since the timer resolution of
			 * NFS_HZ is so course, it can often
			 * result in r_rtt == 0. Since
			 * r_rtt == N means that the actual
			 * rtt is between N+dt and N+2-dt ticks,
			 * add 1.
			 */
			if (proct[req->r_procnum] == 0)
				panic("nfs_request_match_reply: proct[%d] is zero", req->r_procnum);
			t1 = req->r_rtt + 1;
			t1 -= (NFS_SRTT(req) >> 3);
			NFS_SRTT(req) += t1;
			if (t1 < 0)
				t1 = -t1;
			t1 -= (NFS_SDRTT(req) >> 2);
			NFS_SDRTT(req) += t1;
		}
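		/*
		 * Worked example of the gains above (added commentary,
		 * hypothetical numbers): with NFS_SRTT(req) == 32 (a mean of
		 * 4 ticks, scaled by 8) and a measured r_rtt of 7,
		 * t1 = (7+1) - (32>>3) = 4, so the scaled mean moves to 36 --
		 * one eighth of the way toward the new sample.  The deviation
		 * then absorbs |t1| at a gain of one quarter via the
		 * (NFS_SDRTT(req) >> 2) term.
		 */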
		nmp->nm_timeouts = 0;
		lck_mtx_unlock(&nmp->nm_lock);
		/* signal anyone waiting on this request */
		wakeup(req);
		asyncioq = (req->r_callback.rcb_func != NULL);
		if ((asyncgss = ((req->r_gss_ctx != NULL) && ((req->r_flags & (R_ASYNC|R_ASYNCWAIT|R_ALLOCATED)) == (R_ASYNC|R_ALLOCATED)))))
			nfs_request_ref(req, 1);
		lck_mtx_unlock(&req->r_mtx);
		lck_mtx_unlock(nfs_request_mutex);
		if (asyncgss) {
			nfs_gss_clnt_rpcdone(req);
			nfs_request_rele(req);
		}
		/* if it's an async RPC with a callback, queue it up */
		if (asyncioq)
			nfs_asyncio_finish(req);
		break;
	}

	if (!req) {
		/* not matched to a request, so drop it. */
		lck_mtx_unlock(nfs_request_mutex);
		OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
		mbuf_freem(mrep);
	}
}
/*
 * Wait for the reply for a given request...
 * ...potentially resending the request if necessary.
 */
int
nfs_wait_reply(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_nmp;
	struct timespec ts = { 30, 0 };
	int error = 0, slpflag;

	if ((nmp->nm_flag & NFSMNT_INT) && req->r_thread)
		slpflag = PCATCH;
	else
		slpflag = 0;

	lck_mtx_lock(&req->r_mtx);
	while (!req->r_nmrep.nmc_mhead) {
		if ((error = nfs_sigintr(nmp, req, req->r_thread, 0)))
			break;
		if (((error = req->r_error)) || req->r_nmrep.nmc_mhead)
			break;
		/* check if we need to resend */
		if (req->r_flags & R_MUSTRESEND) {
			NFS_SOCK_DBG(("nfs wait resend: p %d x 0x%llx f 0x%x rtt %d\n",
				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt));
			lck_mtx_unlock(&req->r_mtx);
			if (req->r_gss_ctx) {
				/*
				 * It's an RPCSEC_GSS mount.
				 * Can't just resend the original request
				 * without bumping the cred sequence number.
				 * Go back and re-build the request.
				 */
				return (EAGAIN);
			}
			error = nfs_send(req, 1);
			lck_mtx_lock(&req->r_mtx);
			NFS_SOCK_DBG(("nfs wait resend: p %d x 0x%llx f 0x%x rtt %d err %d\n",
				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt, error));
			if (error)
				break;
			if (((error = req->r_error)) || req->r_nmrep.nmc_mhead)
				break;
		}
		/* need to poll if we're P_NOREMOTEHANG */
		if (nfs_noremotehang(req->r_thread))
			ts.tv_sec = 1;
		msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitreply", &ts);
		slpflag = 0;
	}
	lck_mtx_unlock(&req->r_mtx);

	return (error);
}
/*
 * An NFS request goes something like this:
 * (nb: always frees up mreq mbuf list)
 * nfs_request_create()
 *	- allocates a request struct if one is not provided
 *	- initial fill-in of the request struct
 * nfs_request_add_header()
 *	- add the RPC header
 * nfs_request_send()
 *	- link it into list
 *	- call nfs_send() for first transmit
 * nfs_request_wait()
 *	- call nfs_wait_reply() to wait for the reply
 * nfs_request_finish()
 *	- break down rpc header and return with error or nfs reply
 *	  pointed to by nmrep.
 * nfs_request_rele()
 * nfs_request_destroy()
 *	- clean up the request struct
 *	- free the request struct if it was allocated by nfs_request_create()
 */
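/*
 * Minimal usage sketch of the lifecycle above (illustrative only, compiled
 * out, and not part of the original source -- nfs_request2() below is the
 * real embodiment of this pattern; the function name here is hypothetical):
 */
#if 0	/* example, not built */
static int
nfs_request_sketch(nfsnode_t np, struct nfsm_chain *nmrest, int procnum,
	thread_t thd, kauth_cred_t cred, struct nfsm_chain *nmrepp, int *status)
{
	struct nfsreq rq, *req = &rq;	/* caller-provided, so not R_ALLOCATED */
	int error;

	/* np must be non-NULL here since we pass a NULL mount_t */
	if ((error = nfs_request_create(np, NULL, nmrest, procnum, thd, cred, &req)))
		return (error);
	do {
		/* rebuild the header (new XID) and resend on R_RESTART */
		req->r_flags &= ~R_RESTART;
		if ((error = nfs_request_add_header(req)))
			break;
		if ((error = nfs_request_send(req, 1)))
			break;
		nfs_request_wait(req);
		if ((error = nfs_request_finish(req, nmrepp, status)))
			break;
	} while (req->r_flags & R_RESTART);
	nfs_request_rele(req);
	return (error);
}
#endif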
/*
 * Set up an NFS request struct (allocating if no request passed in).
 */
int
nfs_request_create(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	thread_t thd,
	kauth_cred_t cred,
	struct nfsreq **reqp)
{
	struct nfsreq *req, *newreq = NULL;
	struct nfsmount *nmp;

	req = *reqp;
	if (!req) {
		/* allocate a new NFS request structure */
		MALLOC_ZONE(newreq, struct nfsreq*, sizeof(*newreq), M_NFSREQ, M_WAITOK);
		if (!newreq) {
			mbuf_freem(nmrest->nmc_mhead);
			nmrest->nmc_mhead = NULL;
			return (ENOMEM);
		}
		req = newreq;
	}

	bzero(req, sizeof(*req));
	if (req == newreq)
		req->r_flags = R_ALLOCATED;

	nmp = VFSTONFS(np ? NFSTOMP(np) : mp);
	if (!nmp) {
		if (newreq)
			FREE_ZONE(newreq, sizeof(*newreq), M_NFSREQ);
		return (ENXIO);
	}
	lck_mtx_lock(&nmp->nm_lock);
	if ((nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		lck_mtx_unlock(&nmp->nm_lock);
		mbuf_freem(nmrest->nmc_mhead);
		nmrest->nmc_mhead = NULL;
		if (newreq)
			FREE_ZONE(newreq, sizeof(*newreq), M_NFSREQ);
		return (ENXIO);
	}

	if ((nmp->nm_vers != NFS_VER4) && (procnum >= 0) && (procnum < NFS_NPROCS))
		OSAddAtomic(1, (SInt32*)&nfsstats.rpccnt[procnum]);
	if ((nmp->nm_vers == NFS_VER4) && (procnum != NFSPROC4_COMPOUND) && (procnum != NFSPROC4_NULL))
		panic("nfs_request: invalid NFSv4 RPC request %d\n", procnum);

	lck_mtx_init(&req->r_mtx, nfs_request_grp, LCK_ATTR_NULL);
	req->r_nmp = nmp;
	req->r_np = np;
	req->r_thread = thd;
	if (IS_VALID_CRED(cred)) {
		kauth_cred_ref(cred);
		req->r_cred = cred;
	}
	req->r_procnum = procnum;
	if (proct[procnum] > 0)
		req->r_flags |= R_TIMING;
	req->r_nmrep.nmc_mhead = NULL;
	SLIST_INIT(&req->r_gss_seqlist);
	req->r_achain.tqe_next = NFSREQNOLIST;
	req->r_rchain.tqe_next = NFSREQNOLIST;
	req->r_cchain.tqe_next = NFSREQNOLIST;

	lck_mtx_unlock(&nmp->nm_lock);

	/* move the request mbuf chain to the nfsreq */
	req->r_mrest = nmrest->nmc_mhead;
	nmrest->nmc_mhead = NULL;

	req->r_flags |= R_INITTED;
	req->r_refs = 1;
	if (newreq)
		*reqp = req;
	return (0);
}
/*
 * Clean up and free an NFS request structure.
 */
void
nfs_request_destroy(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
	struct gss_seq *gsp, *ngsp;
	struct timespec ts = { 1, 0 };

	if (!req || !(req->r_flags & R_INITTED))
		return;
	req->r_flags &= ~R_INITTED;
	if (req->r_lflags & RL_QUEUED)
		nfs_reqdequeue(req);
	if (req->r_achain.tqe_next != NFSREQNOLIST) {
		/* still on an async I/O queue? */
		lck_mtx_lock(nfsiod_mutex);
		if (nmp && (req->r_achain.tqe_next != NFSREQNOLIST)) {
			TAILQ_REMOVE(&nmp->nm_iodq, req, r_achain);
			req->r_achain.tqe_next = NFSREQNOLIST;
		}
		lck_mtx_unlock(nfsiod_mutex);
	}
	if (nmp) {
		lck_mtx_lock(&nmp->nm_lock);
		if (req->r_rchain.tqe_next != NFSREQNOLIST) {
			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
			req->r_rchain.tqe_next = NFSREQNOLIST;
			req->r_flags &= ~R_RESENDQ;
		}
		if (req->r_cchain.tqe_next != NFSREQNOLIST) {
			TAILQ_REMOVE(&nmp->nm_cwndq, req, r_cchain);
			req->r_cchain.tqe_next = NFSREQNOLIST;
		}
		lck_mtx_unlock(&nmp->nm_lock);
	}
	lck_mtx_lock(&req->r_mtx);
	while (req->r_flags & R_RESENDQ)
		msleep(req, &req->r_mtx, (PZERO - 1), "nfsresendqwait", &ts);
	lck_mtx_unlock(&req->r_mtx);
	if (req->r_mhead)
		mbuf_freem(req->r_mhead);
	else if (req->r_mrest)
		mbuf_freem(req->r_mrest);
	if (req->r_nmrep.nmc_mhead)
		mbuf_freem(req->r_nmrep.nmc_mhead);
	if (IS_VALID_CRED(req->r_cred))
		kauth_cred_unref(&req->r_cred);
	if (req->r_gss_ctx)
		nfs_gss_clnt_rpcdone(req);
	SLIST_FOREACH_SAFE(gsp, &req->r_gss_seqlist, gss_seqnext, ngsp)
		FREE(gsp, M_TEMP);
	if (req->r_gss_ctx)
		nfs_gss_clnt_ctx_unref(req);

	lck_mtx_destroy(&req->r_mtx, nfs_request_grp);
	if (req->r_flags & R_ALLOCATED)
		FREE_ZONE(req, sizeof(*req), M_NFSREQ);
}
void
nfs_request_ref(struct nfsreq *req, int locked)
{
	if (!locked)
		lck_mtx_lock(&req->r_mtx);
	if (req->r_refs <= 0)
		panic("nfsreq reference error");
	req->r_refs++;
	if (!locked)
		lck_mtx_unlock(&req->r_mtx);
}

void
nfs_request_rele(struct nfsreq *req)
{
	int destroy;

	lck_mtx_lock(&req->r_mtx);
	if (req->r_refs <= 0)
		panic("nfsreq reference underflow");
	req->r_refs--;
	destroy = (req->r_refs == 0);
	lck_mtx_unlock(&req->r_mtx);
	if (destroy)
		nfs_request_destroy(req);
}
/*
 * Add an (updated) RPC header with authorization to an NFS request.
 */
int
nfs_request_add_header(struct nfsreq *req)
{
	struct nfsmount *nmp;
	int error = 0, auth_len = 0;
	mbuf_t m;

	/* free up any previous header */
	if ((m = req->r_mhead)) {
		while (m && (m != req->r_mrest))
			m = mbuf_free(m);
		req->r_mhead = NULL;
	}

	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
	if (!nmp)
		return (ENXIO);

	if (!req->r_cred) /* RPCAUTH_NULL */
		auth_len = 0;
	else switch (nmp->nm_auth) {
	case RPCAUTH_UNIX:
		if (req->r_cred->cr_ngroups < 1)
			return (EINVAL);
		auth_len = ((((req->r_cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (req->r_cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
		break;
	case RPCAUTH_KRB5:
	case RPCAUTH_KRB5I:
	case RPCAUTH_KRB5P:
		auth_len = 5 * NFSX_UNSIGNED + 0;	// zero context handle for now
		break;
	}

	error = nfsm_rpchead(req, auth_len, req->r_mrest, &req->r_xid, &req->r_mhead);
	if (error)
		return (error);

	req->r_mreqlen = mbuf_pkthdr_len(req->r_mhead);
	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
	if (!nmp)
		return (ENXIO);
	lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_flag & NFSMNT_SOFT)
		req->r_retry = nmp->nm_retry;
	else
		req->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	lck_mtx_unlock(&nmp->nm_lock);

	return (error);
}
/*
 * Queue an NFS request up and send it out.
 */
int
nfs_request_send(struct nfsreq *req, int wait)
{
	struct nfsmount *nmp;
	struct timeval now;

	lck_mtx_lock(nfs_request_mutex);

	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
	if (!nmp) {
		lck_mtx_unlock(nfs_request_mutex);
		return (ENXIO);
	}

	microuptime(&now);
	if (!req->r_start) {
		req->r_start = now.tv_sec;
		req->r_lastmsg = now.tv_sec -
			((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	}

	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);

	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 * Make sure that the request queue timer is running
	 * to check for possible request timeout.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, req, r_chain);
	req->r_lflags |= RL_QUEUED;
	if (!nfs_request_timer_on) {
		nfs_request_timer_on = 1;
		nfs_interval_timer_start(nfs_request_timer_call,
			NFS_REQUESTDELAY);
	}
	lck_mtx_unlock(nfs_request_mutex);

	/* Send the request... */
	return (nfs_send(req, wait));
}
/*
 * Call nfs_wait_reply() to wait for the reply.
 */
void
nfs_request_wait(struct nfsreq *req)
{
	req->r_error = nfs_wait_reply(req);
}
/*
 * Finish up an NFS request by dequeueing it and
 * doing the initial NFS request reply processing.
 */
int
nfs_request_finish(
	struct nfsreq *req,
	struct nfsm_chain *nmrepp,
	int *status)
{
	struct nfsmount *nmp;
	mbuf_t mrep;
	uint32_t verf_type = 0;
	uint32_t verf_len = 0;
	uint32_t reply_status = 0;
	uint32_t rejected_status = 0;
	uint32_t auth_status = 0;
	uint32_t accepted_status = 0;
	struct nfsm_chain nmrep;
	int error, auth;

	error = req->r_error;

	if (nmrepp)
		nmrepp->nmc_mhead = NULL;

	/* RPC done, unlink the request. */
	nfs_reqdequeue(req);

	mrep = req->r_nmrep.nmc_mhead;

	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (req->r_flags & R_CWND) {
		req->r_flags &= ~R_CWND;
		lck_mtx_lock(&nmp->nm_lock);
		FSDBG(273, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd);
		nmp->nm_sent -= NFS_CWNDSCALE;
		if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
			/* congestion window is open, poke the cwnd queue */
			struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
			TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
			req2->r_cchain.tqe_next = NFSREQNOLIST;
			wakeup(req2);
		}
		lck_mtx_unlock(&nmp->nm_lock);
	}

	if (req->r_gss_ctx) {	// Using gss cred ?
		/*
		 * If the request had an RPCSEC_GSS credential
		 * then reset its sequence number bit in the
		 * context.
		 */
		nfs_gss_clnt_rpcdone(req);

		/*
		 * If we need to re-send, go back and re-build the
		 * request based on a new sequence number.
		 * Note that we're using the original XID.
		 */
		if (error == EAGAIN) {
			req->r_error = 0;
			if (mrep)
				mbuf_freem(mrep);
			error = nfs_gss_clnt_args_restore(req);	// remove any trailer mbufs
			req->r_nmrep.nmc_mhead = NULL;
			req->r_flags |= R_RESTART;
			if (error == ENEEDAUTH) {
				req->r_xid = 0;		// get a new XID
				error = 0;
			}
			goto nfsmout;
		}
	}

	/*
	 * If there was a successful reply, make sure to mark the mount as up.
	 * If a tprintf message was given (or if this is a timed-out soft mount)
	 * then post a tprintf message indicating the server is alive again.
	 */
	if (!error) {
		if ((req->r_flags & R_TPRINTFMSG) ||
		    (nmp && (nmp->nm_flag & NFSMNT_SOFT) &&
		     ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_FORCE)) == NFSSTA_TIMEO)))
			nfs_up(nmp, req->r_thread, NFSSTA_TIMEO, "is alive again");
		else
			nfs_up(nmp, req->r_thread, NFSSTA_TIMEO, NULL);
	}
	if (!error && !nmp)
		error = ENXIO;
	nfsmout_if(error);

	/*
	 * break down the RPC header and check if ok
	 */
	nmrep = req->r_nmrep;
	nfsm_chain_get_32(error, &nmrep, reply_status);
	nfsmout_if(error);
	if (reply_status == RPC_MSGDENIED) {
		nfsm_chain_get_32(error, &nmrep, rejected_status);
		nfsmout_if(error);
		if (rejected_status == RPC_MISMATCH) {
			error = ENOTSUP;
			goto nfsmout;
		}
		nfsm_chain_get_32(error, &nmrep, auth_status);
		nfsmout_if(error);
		switch (auth_status) {
		case RPCSEC_GSS_CREDPROBLEM:
		case RPCSEC_GSS_CTXPROBLEM:
			/*
			 * An RPCSEC_GSS cred or context problem.
			 * We can't use it anymore.
			 * Restore the args, renew the context
			 * and set up for a resend.
			 */
			error = nfs_gss_clnt_args_restore(req);
			if (error && error != ENEEDAUTH)
				break;

			if (!error) {
				error = nfs_gss_clnt_ctx_renew(req);
				if (error)
					break;
			}
			mbuf_freem(mrep);
			req->r_nmrep.nmc_mhead = NULL;
			req->r_xid = 0;		// get a new XID
			req->r_flags |= R_RESTART;
			goto nfsmout;
		default:
			error = EACCES;
			break;
		}
		goto nfsmout;
	}

	/* Now check the verifier */
	nfsm_chain_get_32(error, &nmrep, verf_type);	// verifier flavor
	nfsm_chain_get_32(error, &nmrep, verf_len);	// verifier length
	nfsmout_if(error);

	auth = !req->r_cred ? RPCAUTH_NULL : nmp->nm_auth;
	switch (auth) {
	case RPCAUTH_NULL:
	case RPCAUTH_UNIX:
		/* Any AUTH_UNIX verifier is ignored */
		if (verf_len > 0)
			nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len));
		nfsm_chain_get_32(error, &nmrep, accepted_status);
		break;
	case RPCAUTH_KRB5:
	case RPCAUTH_KRB5I:
	case RPCAUTH_KRB5P:
		error = nfs_gss_clnt_verf_get(req, &nmrep,
			verf_type, verf_len, &accepted_status);
		break;
	}
	nfsmout_if(error);

	switch (accepted_status) {
	case RPC_SUCCESS:
		if (req->r_procnum == NFSPROC_NULL) {
			/*
			 * The NFS null procedure is unique,
			 * in not returning an NFS status.
			 */
			*status = NFS_OK;
		} else {
			nfsm_chain_get_32(error, &nmrep, *status);
			nfsmout_if(error);
		}

		if ((nmp->nm_vers != NFS_VER2) && (*status == NFSERR_TRYLATER)) {
			/*
			 * It's a JUKEBOX error - delay and try again
			 */
			int delay, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;

			mbuf_freem(mrep);
			req->r_nmrep.nmc_mhead = NULL;
			if ((req->r_delay >= 30) && !(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(req);
				error = req->r_error;
				goto nfsmout;
			}
			req->r_delay = !req->r_delay ? NFS_TRYLATERDEL : (req->r_delay * 2);
			if (req->r_delay > 30)
				req->r_delay = 30;
			if (nmp->nm_tprintf_initial_delay && (req->r_delay == 30)) {
				nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_JUKEBOXTIMEO,
					"resource temporarily unavailable (jukebox)");
				req->r_flags |= R_JBTPRINTFMSG;
			}
			delay = req->r_delay;
			if (req->r_callback.rcb_func) {
				struct timeval now;
				microuptime(&now);
				req->r_resendtime = now.tv_sec + delay;
			} else {
				do {
					if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0)))
						return (error);
					tsleep(&lbolt, PSOCK|slpflag, "nfs_jukebox_trylater", 0);
				} while (--delay > 0);
			}
			req->r_xid = 0;			// get a new XID
			req->r_flags |= R_RESTART;
			FSDBG(273, R_XID32(req->r_xid), nmp, req, NFSERR_TRYLATER);
			return (0);
		}

		if (req->r_flags & R_JBTPRINTFMSG)
			nfs_up(nmp, req->r_thread, NFSSTA_JUKEBOXTIMEO, "resource available again");

		if (*status == NFS_OK) {
			/*
			 * Successful NFS request
			 */
			*nmrepp = nmrep;
			req->r_nmrep.nmc_mhead = NULL;
			break;
		}
		/* Got an NFS error of some kind */

		/*
		 * If the File Handle was stale, invalidate the
		 * lookup cache, just in case.
		 */
		if ((*status == ESTALE) && req->r_np)
			cache_purge(NFSTOV(req->r_np));
		if (nmp->nm_vers == NFS_VER2)
			mbuf_freem(mrep);
		else
			*nmrepp = nmrep;
		req->r_nmrep.nmc_mhead = NULL;
		error = *status;
		break;
	case RPC_PROGUNAVAIL:
		error = EPROGUNAVAIL;
		break;
	case RPC_PROGMISMATCH:
		error = ERPCMISMATCH;
		break;
	case RPC_PROCUNAVAIL:
		error = EPROCUNAVAIL;
		break;
	case RPC_GARBAGE:
		error = EBADRPC;
		break;
	case RPC_SYSTEM_ERR:
	default:
		error = EIO;
		break;
	}
nfsmout:
	if (!error && (req->r_flags & R_JBTPRINTFMSG))
		nfs_up(nmp, req->r_thread, NFSSTA_JUKEBOXTIMEO, NULL);
	FSDBG(273, R_XID32(req->r_xid), nmp, req,
		(!error && (*status == NFS_OK)) ? 0xf0f0f0f0 : error);
	return (error);
}
/*
 * Perform an NFS request synchronously.
 */
int
nfs_request(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	vfs_context_t ctx,
	struct nfsm_chain *nmrepp,
	u_int64_t *xidp,
	int *status)
{
	return nfs_request2(np, mp, nmrest, procnum,
		vfs_context_thread(ctx), vfs_context_ucred(ctx),
		0, nmrepp, xidp, status);
}
int
nfs_request2(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	thread_t thd,
	kauth_cred_t cred,
	int flags,
	struct nfsm_chain *nmrepp,
	u_int64_t *xidp,
	int *status)
{
	struct nfsreq rq, *req = &rq;
	int error;

	if ((error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, &req)))
		return (error);
	req->r_flags |= (flags & R_OPTMASK);

	FSDBG_TOP(273, R_XID32(req->r_xid), np, procnum, 0);
	do {
		req->r_error = 0;
		req->r_flags &= ~R_RESTART;
		if ((error = nfs_request_add_header(req)))
			break;
		if (xidp)
			*xidp = req->r_xid;
		if ((error = nfs_request_send(req, 1)))
			break;
		nfs_request_wait(req);
		if ((error = nfs_request_finish(req, nmrepp, status)))
			break;
	} while (req->r_flags & R_RESTART);

	FSDBG_BOT(273, R_XID32(req->r_xid), np, procnum, error);
	nfs_request_rele(req);
	return (error);
}
/*
 * Create and start an asynchronous NFS request.
 */
int
nfs_request_async(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	thread_t thd,
	kauth_cred_t cred,
	struct nfsreq_cbinfo *cb,
	struct nfsreq **reqp)
{
	struct nfsreq *req;
	int error, sent;

	error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, reqp);
	req = *reqp;
	FSDBG(274, (req ? R_XID32(req->r_xid) : 0), np, procnum, error);
	if (error)
		return (error);
	req->r_flags |= R_ASYNC;
	if (cb)
		req->r_callback = *cb;
	error = nfs_request_add_header(req);
	if (!error) {
		req->r_flags |= R_WAITSENT;
		if (req->r_callback.rcb_func)
			nfs_request_ref(req, 0);
		error = nfs_request_send(req, 1);
		lck_mtx_lock(&req->r_mtx);
		if (!error && !(req->r_flags & R_SENT) && req->r_callback.rcb_func) {
			/* make sure to wait until this async I/O request gets sent */
			int slpflag = (req->r_nmp && (req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0;
			struct timespec ts = { 2, 0 };
			while (!(req->r_flags & R_SENT)) {
				if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0)))
					break;
				msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitsent", &ts);
				slpflag = 0;
			}
		}
		sent = req->r_flags & R_SENT;
		lck_mtx_unlock(&req->r_mtx);
		if (error && req->r_callback.rcb_func && !sent)
			nfs_request_rele(req);
	}
	FSDBG(274, R_XID32(req->r_xid), np, procnum, error);
	if (error || req->r_callback.rcb_func)
		nfs_request_rele(req);
	return (error);
}
/*
 * Wait for and finish an asynchronous NFS request.
 */
int
nfs_request_async_finish(
	struct nfsreq *req,
	struct nfsm_chain *nmrepp,
	u_int64_t *xidp,
	int *status)
{
	int error, asyncio = req->r_callback.rcb_func ? 1 : 0;

	lck_mtx_lock(&req->r_mtx);
	if (!asyncio)
		req->r_flags |= R_ASYNCWAIT;
	while (req->r_flags & R_RESENDQ)	/* wait until the request is off the resend queue */
		msleep(req, &req->r_mtx, PZERO-1, "nfsresendqwait", NULL);
	lck_mtx_unlock(&req->r_mtx);

	nfs_request_wait(req);
	error = nfs_request_finish(req, nmrepp, status);

	while (!error && (req->r_flags & R_RESTART)) {
		if (asyncio && req->r_resendtime) {	/* send later */
			lck_mtx_lock(&req->r_mtx);
			nfs_asyncio_resend(req);
			lck_mtx_unlock(&req->r_mtx);
			return (EINPROGRESS);
		}
		req->r_error = 0;
		req->r_flags &= ~R_RESTART;
		if ((error = nfs_request_add_header(req)))
			break;
		if ((error = nfs_request_send(req, !asyncio)))
			break;
		if (asyncio)
			return (EINPROGRESS);
		nfs_request_wait(req);
		if ((error = nfs_request_finish(req, nmrepp, status)))
			break;
	}
	if (xidp)
		*xidp = req->r_xid;

	FSDBG(275, R_XID32(req->r_xid), req->r_np, req->r_procnum, error);
	nfs_request_rele(req);
	return (error);
}
/*
 * Cancel a pending asynchronous NFS request.
 */
void
nfs_request_async_cancel(struct nfsreq *req)
{
	nfs_reqdequeue(req);
	FSDBG(275, R_XID32(req->r_xid), req->r_np, req->r_procnum, 0xD1ED1E);
	nfs_request_rele(req);
}
/*
 * Flag a request as being terminated.
 */
static void
nfs_softterm(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_nmp;

	req->r_flags |= R_SOFTTERM;
	req->r_error = ETIMEDOUT;
	if (!(req->r_flags & R_CWND) || !nmp)
		return;
	/* update congestion window */
	req->r_flags &= ~R_CWND;
	lck_mtx_lock(&nmp->nm_lock);
	FSDBG(532, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd);
	nmp->nm_sent -= NFS_CWNDSCALE;
	if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
		/* congestion window is open, poke the cwnd queue */
		struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
		TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
		req2->r_cchain.tqe_next = NFSREQNOLIST;
		wakeup(req2);
	}
	lck_mtx_unlock(&nmp->nm_lock);
}
/*
 * Ensure req isn't in use by the timer, then dequeue it.
 */
static void
nfs_reqdequeue(struct nfsreq *req)
{
	lck_mtx_lock(nfs_request_mutex);
	while (req->r_lflags & RL_BUSY) {
		req->r_lflags |= RL_WAITING;
		msleep(&req->r_lflags, nfs_request_mutex, PSOCK, "reqdeq", NULL);
	}
	if (req->r_lflags & RL_QUEUED) {
		TAILQ_REMOVE(&nfs_reqq, req, r_chain);
		req->r_lflags &= ~RL_QUEUED;
	}
	lck_mtx_unlock(nfs_request_mutex);
}
/*
 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
 * free()'d out from under it.
 */
static void
nfs_reqbusy(struct nfsreq *req)
{
	if (req->r_lflags & RL_BUSY)
		panic("req locked");
	req->r_lflags |= RL_BUSY;
}
2145 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
2147 static struct nfsreq
*
2148 nfs_reqnext(struct nfsreq
*req
)
2150 struct nfsreq
* nextreq
;
2155 * We need to get and busy the next req before signalling the
2156 * current one, otherwise wakeup() may block us and we'll race to
2157 * grab the next req.
2159 nextreq
= TAILQ_NEXT(req
, r_chain
);
2160 if (nextreq
!= NULL
)
2161 nfs_reqbusy(nextreq
);
2162 /* unbusy and signal. */
2163 req
->r_lflags
&= ~RL_BUSY
;
2164 if (req
->r_lflags
& RL_WAITING
) {
2165 req
->r_lflags
&= ~RL_WAITING
;
2166 wakeup(&req
->r_lflags
);

/*
 * NFS request queue timer routine
 *
 * Scan the NFS request queue for any requests that have timed out.
 *
 * Alert the system of unresponsive servers.
 * Mark expired requests on soft mounts as terminated.
 * For UDP, mark/signal requests for retransmission.
 */
void
nfs_request_timer(__unused void *param0, __unused void *param1)
{
    struct nfsreq *req;
    struct nfsmount *nmp;
    int timeo, maxtime, finish_asyncio, error;
    struct timeval now;
    TAILQ_HEAD(nfs_mount_pokeq, nfsmount) nfs_mount_poke_queue;

    lck_mtx_lock(nfs_request_mutex);
    req = TAILQ_FIRST(&nfs_reqq);
    if (req == NULL) {  /* no requests - turn timer off */
        nfs_request_timer_on = 0;
        lck_mtx_unlock(nfs_request_mutex);
        return;
    }

    nfs_reqbusy(req);
    TAILQ_INIT(&nfs_mount_poke_queue);

    microuptime(&now);
    for ( ; req != NULL ; req = nfs_reqnext(req)) {
        nmp = req->r_nmp;
        if (!nmp) /* unmounted */
            continue;
        if (req->r_error || req->r_nmrep.nmc_mhead)
            continue;
        if ((error = nfs_sigintr(nmp, req, req->r_thread, 0))) {
            if (req->r_callback.rcb_func != NULL) {
                /* async I/O RPC needs to be finished */
                lck_mtx_lock(&req->r_mtx);
                req->r_error = error;
                finish_asyncio = !(req->r_flags & R_WAITSENT);
                wakeup(req);
                lck_mtx_unlock(&req->r_mtx);
                if (finish_asyncio)
                    nfs_asyncio_finish(req);
            }
            continue;
        }

        lck_mtx_lock(&req->r_mtx);

        if (nmp->nm_tprintf_initial_delay &&
            ((req->r_rexmit > 2) || (req->r_flags & R_RESENDERR)) &&
            ((req->r_lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
            req->r_lastmsg = now.tv_sec;
            nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_TIMEO,
                "not responding");
            req->r_flags |= R_TPRINTFMSG;
            lck_mtx_lock(&nmp->nm_lock);
            if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
                lck_mtx_unlock(&nmp->nm_lock);
                /* we're not yet completely mounted and */
                /* we can't complete an RPC, so we fail */
                OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
                nfs_softterm(req);
                finish_asyncio = ((req->r_callback.rcb_func != NULL) && !(req->r_flags & R_WAITSENT));
                wakeup(req);
                lck_mtx_unlock(&req->r_mtx);
                if (finish_asyncio)
                    nfs_asyncio_finish(req);
                continue;
            }
            lck_mtx_unlock(&nmp->nm_lock);
        }

        /*
         * Put a reasonable limit on the maximum timeout,
         * and reduce that limit when soft mounts get timeouts or are in reconnect.
         */
        if (!(nmp->nm_flag & NFSMNT_SOFT))
            maxtime = NFS_MAXTIMEO;
        else if ((req->r_flags & R_SETUP) || ((nmp->nm_reconnect_start <= 0) || ((now.tv_sec - nmp->nm_reconnect_start) < 8)))
            maxtime = (NFS_MAXTIMEO / (nmp->nm_timeouts+1))/2;
        else
            maxtime = NFS_MINTIMEO/4;

        /*
         * Check for request timeout.
         */
        if (req->r_rtt >= 0) {
            req->r_rtt++;
            lck_mtx_lock(&nmp->nm_lock);
            if (req->r_flags & R_RESENDERR) {
                /* with resend errors, retry every few seconds */
                timeo = 4*hz;
            } else {
                if (req->r_procnum == NFSPROC_NULL && req->r_gss_ctx != NULL)
                    timeo = NFS_MINIDEMTIMEO; // gss context setup
                else if (nmp->nm_flag & NFSMNT_DUMBTIMR)
                    timeo = nmp->nm_timeo;
                else
                    timeo = NFS_RTO(nmp, proct[req->r_procnum]);

                /* ensure 62.5 ms floor */
                while (16 * timeo < hz)
                    timeo *= 2;

                if (nmp->nm_timeouts > 0)
                    timeo *= nfs_backoff[nmp->nm_timeouts - 1];
            }
            /* limit timeout to max */
            if (timeo > maxtime)
                timeo = maxtime;
            if (req->r_rtt <= timeo) {
                lck_mtx_unlock(&nmp->nm_lock);
                lck_mtx_unlock(&req->r_mtx);
                continue;
            }
            /* The request has timed out */
            NFS_SOCK_DBG(("nfs timeout: proc %d %d xid %llx rtt %d to %d # %d, t %ld/%d\n",
                req->r_procnum, proct[req->r_procnum],
                req->r_xid, req->r_rtt, timeo, nmp->nm_timeouts,
                (now.tv_sec - req->r_start)*NFS_HZ, maxtime));
            if (nmp->nm_timeouts < 8)
                nmp->nm_timeouts++;
            /* if it's been a few seconds, try poking the socket */
            if ((nmp->nm_sotype == SOCK_STREAM) &&
                ((now.tv_sec - req->r_start) >= 3) &&
                !(nmp->nm_sockflags & NMSOCK_POKE)) {
                nmp->nm_sockflags |= NMSOCK_POKE;
                TAILQ_INSERT_TAIL(&nfs_mount_poke_queue, nmp, nm_pokeq);
            }
            lck_mtx_unlock(&nmp->nm_lock);
        }

        /* For soft mounts (& SETUPs), check for too many retransmits/timeout. */
        if (((nmp->nm_flag & NFSMNT_SOFT) || (req->r_flags & R_SETUP)) &&
            ((req->r_rexmit >= req->r_retry) || /* too many */
             ((now.tv_sec - req->r_start)*NFS_HZ > maxtime))) { /* too long */
            OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
            lck_mtx_lock(&nmp->nm_lock);
            if (!(nmp->nm_state & NFSSTA_TIMEO)) {
                lck_mtx_unlock(&nmp->nm_lock);
                /* make sure we note the unresponsive server */
                /* (maxtime may be less than tprintf delay) */
                nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_TIMEO,
                    "not responding");
                req->r_lastmsg = now.tv_sec;
                req->r_flags |= R_TPRINTFMSG;
            } else {
                lck_mtx_unlock(&nmp->nm_lock);
            }
            NFS_SOCK_DBG(("nfs timer TERMINATE: p %d x 0x%llx f 0x%x rtt %d t %ld\n",
                req->r_procnum, req->r_xid, req->r_flags, req->r_rtt,
                now.tv_sec - req->r_start));
            nfs_softterm(req);
            finish_asyncio = ((req->r_callback.rcb_func != NULL) && !(req->r_flags & R_WAITSENT));
            wakeup(req);
            lck_mtx_unlock(&req->r_mtx);
            if (finish_asyncio)
                nfs_asyncio_finish(req);
            continue;
        }

        /* for TCP, only resend if explicitly requested */
        if ((nmp->nm_sotype == SOCK_STREAM) && !(req->r_flags & R_MUSTRESEND)) {
            if (++req->r_rexmit > NFS_MAXREXMIT)
                req->r_rexmit = NFS_MAXREXMIT;
            req->r_rtt = 0;
            lck_mtx_unlock(&req->r_mtx);
            continue;
        }

        /*
         * The request needs to be (re)sent.  Kick the requester to resend it.
         * (unless it's already marked as needing a resend)
         */
        if ((req->r_flags & R_MUSTRESEND) && (req->r_rtt == -1)) {
            lck_mtx_unlock(&req->r_mtx);
            continue;
        }
        NFS_SOCK_DBG(("nfs timer mark resend: p %d x 0x%llx f 0x%x rtt %d\n",
            req->r_procnum, req->r_xid, req->r_flags, req->r_rtt));
        req->r_flags |= R_MUSTRESEND;
        req->r_rtt = -1;
        wakeup(req);
        if ((req->r_flags & (R_ASYNC|R_ASYNCWAIT)) == R_ASYNC)
            nfs_asyncio_resend(req);
        lck_mtx_unlock(&req->r_mtx);
    }

    lck_mtx_unlock(nfs_request_mutex);

    /* poke any sockets */
    while ((nmp = TAILQ_FIRST(&nfs_mount_poke_queue))) {
        TAILQ_REMOVE(&nfs_mount_poke_queue, nmp, nm_pokeq);
        nfs_sock_poke(nmp);
        lck_mtx_lock(&nmp->nm_lock);
        nmp->nm_sockflags &= ~NMSOCK_POKE;
        if (!(nmp->nm_state & NFSSTA_MOUNTED))
            wakeup(&nmp->nm_sockflags);
        lck_mtx_unlock(&nmp->nm_lock);
    }

    nfs_interval_timer_start(nfs_request_timer_call, NFS_REQUESTDELAY);
}
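
/*
 * Illustrative sketch (added, not part of the original source): the timeout
 * computed in nfs_request_timer starts from a per-procedure RTO estimate,
 * is floored at hz/16 (62.5 ms), is inflated by a backoff factor once the
 * mount has accumulated timeouts, and is finally clamped to maxtime.  The
 * compiled-out, hypothetical compute_timeo() restates that arithmetic in
 * isolation; the backoff factors are whatever nfs_backoff[] holds.
 */
#if 0
static int
compute_timeo(int rto, int ntimeouts, int maxtime)
{
    int timeo = rto;

    while (16 * timeo < hz)     /* ensure 62.5 ms floor */
        timeo *= 2;
    if (ntimeouts > 0)          /* back off after repeated timeouts */
        timeo *= nfs_backoff[ntimeouts - 1];
    if (timeo > maxtime)        /* never exceed the per-mount cap */
        timeo = maxtime;
    return (timeo);
}
#endif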

/*
 * check a thread's proc for the "noremotehang" flag.
 */
int
nfs_noremotehang(thread_t thd)
{
    proc_t p = thd ? get_bsdthreadtask_info(thd) : NULL;
    return (p && proc_noremotehang(p));
}

/*
 * Test for a termination condition pending on the process.
 * This is used to determine if we need to bail on a mount.
 * ETIMEDOUT is returned if there has been a soft timeout.
 * EINTR is returned if there is a signal pending that is not being ignored
 * and the mount is interruptable, or if we are a thread that is in the process
 * of cancellation (also SIGKILL posted).
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocked)
{
    int error = 0;

    if (nmp == NULL)
        return (ENXIO);

    if (req && (req->r_flags & R_SOFTTERM))
        return (ETIMEDOUT); /* request has been terminated. */

    /*
     * If we're in the progress of a force unmount and there's
     * been a timeout, we're dead and fail IO.
     */
    if (!nmplocked)
        lck_mtx_lock(&nmp->nm_lock);
    if ((nmp->nm_state & NFSSTA_FORCE) &&
        (nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_JUKEBOXTIMEO|NFSSTA_LOCKTIMEO))) {
        error = EIO;
    } else if (nmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
        /* Someone is unmounting us, go soft and mark it. */
        nmp->nm_flag |= NFSMNT_SOFT;
        nmp->nm_state |= NFSSTA_FORCE;
    }

    /*
     * If the mount is hung and we've requested not to hang
     * on remote filesystems, then bail now.
     */
    if (!error && (nmp->nm_state & NFSSTA_TIMEO) && nfs_noremotehang(thd))
        error = EIO;

    if (!nmplocked)
        lck_mtx_unlock(&nmp->nm_lock);
    if (error)
        return (error);

    /* may not have a thread for async I/O */
    if (thd == NULL)
        return (0);

    /* If this thread belongs to kernel task; then abort check is not needed */
    if ((current_proc() != kernproc) && current_thread_aborted())
        return (EINTR);

    /* mask off thread and process blocked signals. */
    if ((nmp->nm_flag & NFSMNT_INT) &&
        proc_pendingsignals(get_bsdthreadtask_info(thd), NFSINT_SIGMASK))
        return (EINTR);
    return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(struct nfsreq *req)
{
    struct nfsmount *nmp = req->r_nmp;
    int *statep;
    int error = 0, slpflag = 0;
    struct timespec ts = { 0, 0 };

    if (nmp == NULL)
        return (ENXIO);

    lck_mtx_lock(&nmp->nm_lock);
    statep = &nmp->nm_state;

    if ((nmp->nm_flag & NFSMNT_INT) && req->r_thread)
        slpflag = PCATCH;
    while (*statep & NFSSTA_SNDLOCK) {
        if ((error = nfs_sigintr(nmp, req, req->r_thread, 1)))
            break;
        *statep |= NFSSTA_WANTSND;
        if (nfs_noremotehang(req->r_thread))
            ts.tv_sec = 1;
        msleep(statep, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsndlck", &ts);
        if (slpflag == PCATCH) {
            slpflag = 0;
            ts.tv_sec = 2;
        }
    }
    if (!error)
        *statep |= NFSSTA_SNDLOCK;
    lck_mtx_unlock(&nmp->nm_lock);
    return (error);
}
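
/*
 * Illustrative sketch (added, not part of the original source):
 * NFSSTA_SNDLOCK/NFSSTA_WANTSND above follow the classic flag-based sleep
 * lock: contenders set the "want" bit and msleep() on the state word, and
 * the unlocker clears the lock bit and wakeup()s the word when the want
 * bit was set.  A compiled-out minimal version over hypothetical LOCKED/
 * WANTED bits:
 */
#if 0
#define LOCKED  0x01
#define WANTED  0x02

static void
flag_lock(int *statep, lck_mtx_t *mtx)
{
    lck_mtx_lock(mtx);
    while (*statep & LOCKED) {
        *statep |= WANTED;      /* ask the holder to wake us */
        msleep(statep, mtx, PZERO - 1, "flaglck", NULL);
    }
    *statep |= LOCKED;
    lck_mtx_unlock(mtx);
}
#endif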

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(struct nfsreq *req)
{
    struct nfsmount *nmp = req->r_nmp;
    int *statep, wake = 0;

    if (nmp == NULL)
        return;
    lck_mtx_lock(&nmp->nm_lock);
    statep = &nmp->nm_state;
    if ((*statep & NFSSTA_SNDLOCK) == 0)
        panic("nfs sndunlock");
    *statep &= ~NFSSTA_SNDLOCK;
    if (*statep & NFSSTA_WANTSND) {
        *statep &= ~NFSSTA_WANTSND;
        wake = 1;
    }
    lck_mtx_unlock(&nmp->nm_lock);
    if (wake)
        wakeup(statep);
}

#endif /* NFSCLIENT */

#if NFSSERVER

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfsrv_rephead(
    struct nfsrv_descript *nd,
    __unused struct nfsrv_sock *slp,
    struct nfsm_chain *nmrepp,
    size_t siz)
{
    mbuf_t mrep;
    u_long *tl;
    struct nfsm_chain nmrep;
    int err, error;

    err = nd->nd_repstat;
    if (err && (nd->nd_vers == NFS_VER2))
        siz = 0;

    /*
     * If this is a big reply, use a cluster else
     * try and leave leading space for the lower level headers.
     */
    siz += RPC_REPLYSIZ;
    if (siz >= nfs_mbuf_minclsize) {
        error = mbuf_getpacket(MBUF_WAITOK, &mrep);
    } else {
        error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mrep);
    }
    if (error) {
        /* unable to allocate packet */
        /* XXX should we keep statistics for these errors? */
        return (error);
    }
    if (siz < nfs_mbuf_minclsize) {
        /* leave space for lower level headers */
        tl = mbuf_data(mrep);
        tl += 80/sizeof(*tl);  /* XXX max_hdr? XXX */
        mbuf_setdata(mrep, tl, 6 * NFSX_UNSIGNED);
    }
    nfsm_chain_init(&nmrep, mrep);
    nfsm_chain_add_32(error, &nmrep, nd->nd_retxid);
    nfsm_chain_add_32(error, &nmrep, RPC_REPLY);
    if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
        nfsm_chain_add_32(error, &nmrep, RPC_MSGDENIED);
        if (err & NFSERR_AUTHERR) {
            nfsm_chain_add_32(error, &nmrep, RPC_AUTHERR);
            nfsm_chain_add_32(error, &nmrep, (err & ~NFSERR_AUTHERR));
        } else {
            nfsm_chain_add_32(error, &nmrep, RPC_MISMATCH);
            nfsm_chain_add_32(error, &nmrep, RPC_VER2);
            nfsm_chain_add_32(error, &nmrep, RPC_VER2);
        }
    } else {
        /* reply status */
        nfsm_chain_add_32(error, &nmrep, RPC_MSGACCEPTED);
        if (nd->nd_gss_context != NULL) {
            /* RPCSEC_GSS verifier */
            error = nfs_gss_svc_verf_put(nd, &nmrep);
            if (error) {
                nfsm_chain_add_32(error, &nmrep, RPC_SYSTEM_ERR);
                goto done;
            }
        } else {
            /* RPCAUTH_NULL verifier */
            nfsm_chain_add_32(error, &nmrep, RPCAUTH_NULL);
            nfsm_chain_add_32(error, &nmrep, 0);
        }
        /* accepted status */
        switch (err) {
        case EPROGUNAVAIL:
            nfsm_chain_add_32(error, &nmrep, RPC_PROGUNAVAIL);
            break;
        case EPROGMISMATCH:
            nfsm_chain_add_32(error, &nmrep, RPC_PROGMISMATCH);
            /* XXX hard coded versions? */
            nfsm_chain_add_32(error, &nmrep, NFS_VER2);
            nfsm_chain_add_32(error, &nmrep, NFS_VER3);
            break;
        case EPROCUNAVAIL:
            nfsm_chain_add_32(error, &nmrep, RPC_PROCUNAVAIL);
            break;
        case EBADRPC:
            nfsm_chain_add_32(error, &nmrep, RPC_GARBAGE);
            break;
        default:
            nfsm_chain_add_32(error, &nmrep, RPC_SUCCESS);
            if (nd->nd_gss_context != NULL)
                error = nfs_gss_svc_prepare_reply(nd, &nmrep);
            if (err != NFSERR_RETVOID)
                nfsm_chain_add_32(error, &nmrep,
                    (err ? nfsrv_errmap(nd, err) : 0));
            break;
        }
    }

done:
    nfsm_chain_build_done(error, &nmrep);
    if (error) {
        /* error composing reply header */
        /* XXX should we keep statistics for these errors? */
        mbuf_freem(mrep);
        return (error);
    }

    *nmrepp = nmrep;
    if ((err != 0) && (err != NFSERR_RETVOID))
        OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
    return (0);
}
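
/*
 * Note (added): the reply built above follows the ONC RPC reply layout:
 * xid, REPLY, then either MSG_DENIED (auth error or RPC version mismatch)
 * or MSG_ACCEPTED followed by a verifier and an accept status
 * (RPC_SUCCESS, RPC_PROGUNAVAIL, RPC_PROGMISMATCH, RPC_PROCUNAVAIL, or
 * RPC_GARBAGE), with the NFS status word appended only for successful,
 * non-void procedures.
 */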

/*
 * The nfs server send routine.
 *
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top)
{
    int error;
    socket_t so = slp->ns_so;
    struct sockaddr *sendnam;
    struct msghdr msg;

    bzero(&msg, sizeof(msg));
    if (nam && !sock_isconnected(so) && (slp->ns_sotype != SOCK_STREAM)) {
        if ((sendnam = mbuf_data(nam))) {
            msg.msg_name = (caddr_t)sendnam;
            msg.msg_namelen = sendnam->sa_len;
        }
    }
    error = sock_sendmbuf(so, &msg, top, 0, NULL);
    if (!error)
        return (0);
    log(LOG_INFO, "nfsd send error %d\n", error);

    if ((error == EWOULDBLOCK) && (slp->ns_sotype == SOCK_STREAM))
        error = EPIPE;  /* zap TCP sockets if they time out on send */

    /* Handle any recoverable (soft) socket errors here. (???) */
    if (error != EINTR && error != ERESTART && error != EIO &&
        error != EWOULDBLOCK && error != EPIPE)
        error = 0;

    return (error);
}

/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfsrv_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with MBUF_WAITOK from an nfsd.
 */
void
nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
{
    struct nfsrv_sock *slp = (struct nfsrv_sock *)arg;

    if (!nfsd_thread_count || !(slp->ns_flag & SLP_VALID))
        return;

    lck_rw_lock_exclusive(&slp->ns_rwlock);
    nfsrv_rcv_locked(so, slp, waitflag);
    /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
}

void
nfsrv_rcv_locked(socket_t so, struct nfsrv_sock *slp, int waitflag)
{
    mbuf_t m, mp, mhck, m2;
    int ns_flag=0, error;
    struct msghdr msg;
    size_t bytes_read;

    if ((slp->ns_flag & SLP_VALID) == 0) {
        if (waitflag == MBUF_DONTWAIT)
            lck_rw_done(&slp->ns_rwlock);
        return;
    }

#ifdef notdef
    /*
     * Define this to test for nfsds handling this under heavy load.
     */
    if (waitflag == MBUF_DONTWAIT) {
        ns_flag = SLP_NEEDQ;
        goto dorecs;
    }
#endif
    if (slp->ns_sotype == SOCK_STREAM) {
        /*
         * If there are already records on the queue, defer soreceive()
         * to an nfsd so that there is feedback to the TCP layer that
         * the nfs servers are heavily loaded.
         */
        if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
            ns_flag = SLP_NEEDQ;
            goto dorecs;
        }

        /*
         * Do soreceive().
         */
        bytes_read = 1000000000;
        error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
        if (error || mp == NULL) {
            if (error == EWOULDBLOCK)
                ns_flag = (waitflag == MBUF_DONTWAIT) ? SLP_NEEDQ : 0;
            else
                ns_flag = SLP_DISCONN;
            goto dorecs;
        }
        m = mp;
        if (slp->ns_rawend) {
            if ((error = mbuf_setnext(slp->ns_rawend, m)))
                panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
            slp->ns_cc += bytes_read;
        } else {
            slp->ns_raw = m;
            slp->ns_cc = bytes_read;
        }
        while ((m2 = mbuf_next(m)))
            m = m2;
        slp->ns_rawend = m;

        /*
         * Now try and parse record(s) out of the raw stream data.
         */
        error = nfsrv_getstream(slp, waitflag);
        if (error) {
            if (error == EPERM)
                ns_flag = SLP_DISCONN;
            else
                ns_flag = SLP_NEEDQ;
        }
    } else {
        struct sockaddr_storage nam;

        if (slp->ns_reccnt >= nfsrv_sock_max_rec_queue_length) {
            /* already have max # RPC records queued on this socket */
            ns_flag = SLP_NEEDQ;
            goto dorecs;
        }

        bzero(&msg, sizeof(msg));
        msg.msg_name = (caddr_t)&nam;
        msg.msg_namelen = sizeof(nam);

        do {
            bytes_read = 1000000000;
            error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
            if (mp) {
                if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
                    mbuf_setlen(mhck, nam.ss_len);
                    bcopy(&nam, mbuf_data(mhck), nam.ss_len);
                    m = mhck;
                    if (mbuf_setnext(m, mp)) {
                        /* trouble... just drop it */
                        printf("nfsrv_rcv: mbuf_setnext failed\n");
                        mbuf_free(mhck);
                        m = mp;
                    }
                } else {
                    m = mp;
                }
                if (slp->ns_recend)
                    mbuf_setnextpkt(slp->ns_recend, m);
                else {
                    slp->ns_rec = m;
                    slp->ns_flag |= SLP_DOREC;
                }
                slp->ns_recend = m;
                mbuf_setnextpkt(m, NULL);
                slp->ns_reccnt++;
            }
        } while (mp);
    }

    /*
     * Now try and process the request records, non-blocking.
     */
dorecs:
    slp->ns_flag |= ns_flag;
    if (waitflag == MBUF_DONTWAIT) {
        int wake = (slp->ns_flag & SLP_WORKTODO);
        lck_rw_done(&slp->ns_rwlock);
        if (wake && nfsd_thread_count) {
            lck_mtx_lock(nfsd_mutex);
            nfsrv_wakenfsd(slp);
            lck_mtx_unlock(nfsd_mutex);
        }
    }
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
static int
nfsrv_getstream(struct nfsrv_sock *slp, int waitflag)
{
    mbuf_t m;
    char *cp1, *cp2, *mdata;
    int len, mlen, error;
    mbuf_t om, m2, recm;
    u_long recmark;

    if (slp->ns_flag & SLP_GETSTREAM)
        panic("nfs getstream");
    slp->ns_flag |= SLP_GETSTREAM;
    for (;;) {
        if (slp->ns_reclen == 0) {
            if (slp->ns_cc < NFSX_UNSIGNED) {
                slp->ns_flag &= ~SLP_GETSTREAM;
                return (0);
            }
            m = slp->ns_raw;
            mdata = mbuf_data(m);
            mlen = mbuf_len(m);
            if (mlen >= NFSX_UNSIGNED) {
                bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
                mdata += NFSX_UNSIGNED;
                mlen -= NFSX_UNSIGNED;
                mbuf_setdata(m, mdata, mlen);
            } else {
                cp1 = (caddr_t)&recmark;
                cp2 = mdata;
                while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
                    while (mlen == 0) {
                        m = mbuf_next(m);
                        cp2 = mbuf_data(m);
                        mlen = mbuf_len(m);
                    }
                    *cp1++ = *cp2++;
                    mlen--;
                    mbuf_setdata(m, cp2, mlen);
                }
            }
            slp->ns_cc -= NFSX_UNSIGNED;
            recmark = ntohl(recmark);
            slp->ns_reclen = recmark & ~0x80000000;
            if (recmark & 0x80000000)
                slp->ns_flag |= SLP_LASTFRAG;
            else
                slp->ns_flag &= ~SLP_LASTFRAG;
            if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
                slp->ns_flag &= ~SLP_GETSTREAM;
                return (EPERM);
            }
        }

        /*
         * Now get the record part.
         *
         * Note that slp->ns_reclen may be 0.  Linux sometimes
         * generates 0-length RPCs
         */
        recm = NULL;
        if (slp->ns_cc == slp->ns_reclen) {
            recm = slp->ns_raw;
            slp->ns_raw = slp->ns_rawend = NULL;
            slp->ns_cc = slp->ns_reclen = 0;
        } else if (slp->ns_cc > slp->ns_reclen) {
            len = 0;
            m = slp->ns_raw;
            mlen = mbuf_len(m);
            mdata = mbuf_data(m);
            om = NULL;
            while (len < slp->ns_reclen) {
                if ((len + mlen) > slp->ns_reclen) {
                    if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
                        slp->ns_flag &= ~SLP_GETSTREAM;
                        return (EWOULDBLOCK);
                    }
                    if (om) {
                        if (mbuf_setnext(om, m2)) {
                            /* trouble... just drop it */
                            printf("nfsrv_getstream: mbuf_setnext failed\n");
                            mbuf_freem(m2);
                            slp->ns_flag &= ~SLP_GETSTREAM;
                            return (EWOULDBLOCK);
                        }
                        recm = slp->ns_raw;
                    } else {
                        recm = m2;
                    }
                    mdata += slp->ns_reclen - len;
                    mlen -= slp->ns_reclen - len;
                    mbuf_setdata(m, mdata, mlen);
                    len = slp->ns_reclen;
                } else if ((len + mlen) == slp->ns_reclen) {
                    om = m;
                    len += mlen;
                    m = mbuf_next(m);
                    recm = slp->ns_raw;
                    if (mbuf_setnext(om, NULL)) {
                        printf("nfsrv_getstream: mbuf_setnext failed 2\n");
                        slp->ns_flag &= ~SLP_GETSTREAM;
                        return (EWOULDBLOCK);
                    }
                    mlen = mbuf_len(m);
                    mdata = mbuf_data(m);
                } else {
                    om = m;
                    len += mlen;
                    m = mbuf_next(m);
                    mlen = mbuf_len(m);
                    mdata = mbuf_data(m);
                }
            }
            slp->ns_raw = m;
            slp->ns_cc -= len;
            slp->ns_reclen = 0;
        } else {
            slp->ns_flag &= ~SLP_GETSTREAM;
            return (0);
        }

        /*
         * Accumulate the fragments into a record.
         */
        if (slp->ns_frag == NULL) {
            slp->ns_frag = recm;
        } else {
            m = slp->ns_frag;
            while ((m2 = mbuf_next(m)))
                m = m2;
            if ((error = mbuf_setnext(m, recm)))
                panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
        }
        if (slp->ns_flag & SLP_LASTFRAG) {
            if (slp->ns_recend)
                mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
            else {
                slp->ns_rec = slp->ns_frag;
                slp->ns_flag |= SLP_DOREC;
            }
            slp->ns_recend = slp->ns_frag;
            slp->ns_frag = NULL;
        }
    }
}
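
/*
 * Illustrative sketch (added, not part of the original source): RPC over
 * TCP frames each message with a 4-byte record mark -- the high bit flags
 * the last fragment and the low 31 bits carry the fragment length -- which
 * is exactly what the recmark handling above decodes.  A compiled-out,
 * hypothetical helper restating that decode:
 */
#if 0
static void
decode_record_mark(u_long recmark, u_long *fraglen, int *lastfrag)
{
    recmark = ntohl(recmark);               /* marks are sent big-endian */
    *lastfrag = (recmark & 0x80000000) != 0;
    *fraglen = recmark & ~0x80000000;       /* low 31 bits = length */
}
#endif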

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(
    struct nfsrv_sock *slp,
    struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
    mbuf_t m;
    mbuf_t nam;
    struct nfsrv_descript *nd;
    int error = 0;

    *ndp = NULL;
    if (!(slp->ns_flag & (SLP_VALID|SLP_DOREC)) || (slp->ns_rec == NULL))
        return (ENOBUFS);
    MALLOC_ZONE(nd, struct nfsrv_descript *,
            sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
    if (!nd)
        return (ENOMEM);
    m = slp->ns_rec;
    slp->ns_rec = mbuf_nextpkt(m);
    if (slp->ns_rec)
        mbuf_setnextpkt(m, NULL);
    else {
        slp->ns_flag &= ~SLP_DOREC;
        slp->ns_recend = NULL;
    }
    slp->ns_reccnt--;
    if (mbuf_type(m) == MBUF_TYPE_SONAME) {
        nam = m;
        m = mbuf_next(m);
        if ((error = mbuf_setnext(nam, NULL)))
            panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
    } else
        nam = NULL;
    nd->nd_nam2 = nam;

    nfsm_chain_dissect_init(error, &nd->nd_nmreq, m);
    if (!error)
        error = nfsrv_getreq(nd);
    if (error) {
        if (nam)
            mbuf_freem(nam);
        FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
        return (error);
    }
    nd->nd_mrep = NULL;
    *ndp = nd;
    nfsd->nfsd_nd = nd;
    return (0);
}

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
static int
nfsrv_getreq(struct nfsrv_descript *nd)
{
    struct nfsm_chain *nmreq;
    int len, i;
    u_long nfsvers, auth_type;
    int error = 0;
    uid_t user_id;
    gid_t group_id;
    int ngroups;
    struct ucred temp_cred;
    uint32_t val;

    nd->nd_cr = NULL;
    nd->nd_gss_context = NULL;
    nd->nd_gss_seqnum = 0;
    nd->nd_gss_mb = NULL;

    user_id = group_id = -2;
    val = auth_type = len = 0;

    nmreq = &nd->nd_nmreq;
    nfsm_chain_get_32(error, nmreq, nd->nd_retxid); // XID
    nfsm_chain_get_32(error, nmreq, val);  // RPC Call
    if (!error && (val != RPC_CALL))
        error = EBADRPC;
    nfsmout_if(error);
    nd->nd_repstat = 0;
    nfsm_chain_get_32(error, nmreq, val);   // RPC Version
    nfsmout_if(error);
    if (val != RPC_VER2) {
        nd->nd_repstat = ERPCMISMATCH;
        nd->nd_procnum = NFSPROC_NOOP;
        return (0);
    }
    nfsm_chain_get_32(error, nmreq, val);   // RPC Program Number
    nfsmout_if(error);
    if (val != NFS_PROG) {
        nd->nd_repstat = EPROGUNAVAIL;
        nd->nd_procnum = NFSPROC_NOOP;
        return (0);
    }
    nfsm_chain_get_32(error, nmreq, nfsvers);// NFS Version Number
    nfsmout_if(error);
    if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
        nd->nd_repstat = EPROGMISMATCH;
        nd->nd_procnum = NFSPROC_NOOP;
        return (0);
    }
    nd->nd_vers = nfsvers;
    nfsm_chain_get_32(error, nmreq, nd->nd_procnum);// NFS Procedure Number
    nfsmout_if(error);
    if ((nd->nd_procnum >= NFS_NPROCS) ||
        ((nd->nd_vers == NFS_VER2) && (nd->nd_procnum > NFSV2PROC_STATFS))) {
        nd->nd_repstat = EPROCUNAVAIL;
        nd->nd_procnum = NFSPROC_NOOP;
        return (0);
    }
    if (nfsvers != NFS_VER3)
        nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
    nfsm_chain_get_32(error, nmreq, auth_type); // Auth Flavor
    nfsm_chain_get_32(error, nmreq, len);       // Auth Length
    if (!error && (len < 0 || len > RPCAUTH_MAXSIZ))
        error = EBADRPC;
    nfsmout_if(error);

    /* Handle authentication */
    if (auth_type == RPCAUTH_UNIX) {
        if (nd->nd_procnum == NFSPROC_NULL)
            return (0);
        nd->nd_sec = RPCAUTH_UNIX;
        nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED);    // skip stamp
        nfsm_chain_get_32(error, nmreq, len);           // hostname length
        if (len < 0 || len > NFS_MAXNAMLEN)
            error = EBADRPC;
        nfsm_chain_adv(error, nmreq, nfsm_rndup(len));  // skip hostname
        nfsmout_if(error);

        /* create a temporary credential using the bits from the wire */
        bzero(&temp_cred, sizeof(temp_cred));
        nfsm_chain_get_32(error, nmreq, user_id);
        nfsm_chain_get_32(error, nmreq, group_id);
        temp_cred.cr_groups[0] = group_id;
        nfsm_chain_get_32(error, nmreq, len);           // extra GID count
        if ((len < 0) || (len > RPCAUTH_UNIXGIDS))
            error = EBADRPC;
        nfsmout_if(error);
        for (i = 1; i <= len; i++)
            if (i < NGROUPS)
                nfsm_chain_get_32(error, nmreq, temp_cred.cr_groups[i]);
            else
                nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED);
        nfsmout_if(error);
        ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
        if (ngroups > 1)
            nfsrv_group_sort(&temp_cred.cr_groups[0], ngroups);
        nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED);    // verifier flavor (should be AUTH_NONE)
        nfsm_chain_get_32(error, nmreq, len);           // verifier length
        if (len < 0 || len > RPCAUTH_MAXSIZ)
            error = EBADRPC;
        if (len > 0)
            nfsm_chain_adv(error, nmreq, nfsm_rndup(len));

        /* request creation of a real credential */
        temp_cred.cr_uid = user_id;
        temp_cred.cr_ngroups = ngroups;
        nd->nd_cr = kauth_cred_create(&temp_cred);
        if (nd->nd_cr == NULL) {
            nd->nd_repstat = ENOMEM;
            nd->nd_procnum = NFSPROC_NOOP;
            return (0);
        }
    } else if (auth_type == RPCSEC_GSS) {
        error = nfs_gss_svc_cred_get(nd, nmreq);
        if (error) {
            if (error == EINVAL)
                goto nfsmout;   // drop the request
            nd->nd_repstat = error;
            nd->nd_procnum = NFSPROC_NOOP;
            return (0);
        }
    } else {
        if (nd->nd_procnum == NFSPROC_NULL) // assume it's AUTH_NONE
            return (0);
        nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
        nd->nd_procnum = NFSPROC_NOOP;
        return (0);
    }
    return (0);
nfsmout:
    if (IS_VALID_CRED(nd->nd_cr))
        kauth_cred_unref(&nd->nd_cr);
    nfsm_chain_cleanup(nmreq);
    return (error);
}
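
/*
 * Note (added): the parse above walks the standard ONC RPC call header in
 * wire order -- xid, CALL, RPC version (2), program, program version,
 * procedure -- then the credential (flavor plus opaque length) and the
 * verifier.  Each bad field is answered with the matching reply status
 * (ERPCMISMATCH, EPROGUNAVAIL, EPROGMISMATCH, EPROCUNAVAIL) rather than
 * the request being dropped, so the client learns why it was rejected;
 * only an unparseable GSS credential is dropped outright.
 */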

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, make sure the socket is queued up so that one
 * of the running nfsds will go look for the work in the nfsrv_sockwait list.
 * Note: Must be called with nfsd_mutex held.
 */
void
nfsrv_wakenfsd(struct nfsrv_sock *slp)
{
    struct nfsd *nd;

    if ((slp->ns_flag & SLP_VALID) == 0)
        return;

    lck_rw_lock_exclusive(&slp->ns_rwlock);
    /* if there's work to do on this socket, make sure it's queued up */
    if ((slp->ns_flag & SLP_WORKTODO) && !(slp->ns_flag & SLP_QUEUED)) {
        TAILQ_INSERT_TAIL(&nfsrv_sockwait, slp, ns_svcq);
        slp->ns_flag |= SLP_WAITQ;
    }
    lck_rw_done(&slp->ns_rwlock);

    /* wake up a waiting nfsd, if possible */
    nd = TAILQ_FIRST(&nfsd_queue);
    if (!nd)
        return;

    TAILQ_REMOVE(&nfsd_queue, nd, nfsd_queue);
    nd->nfsd_flag &= ~NFSD_WAITING;
    wakeup(nd);
}

#endif /* NFSSERVER */

static void
nfs_msg(thread_t thd,
    const char *server,
    const char *msg,
    int error)
{
    proc_t p = thd ? get_bsdthreadtask_info(thd) : NULL;
    tpr_t tpr;

    if (p)
        tpr = tprintf_open(p);
    else
        tpr = NULL;
    if (error)
        tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg, error);
    else
        tprintf(tpr, "nfs server %s: %s\n", server, msg);
    tprintf_close(tpr);
}

void
nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *msg)
{
    int ostate;

    if (nmp == NULL)
        return;

    lck_mtx_lock(&nmp->nm_lock);
    ostate = nmp->nm_state;
    if ((flags & NFSSTA_TIMEO) && !(ostate & NFSSTA_TIMEO))
        nmp->nm_state |= NFSSTA_TIMEO;
    if ((flags & NFSSTA_LOCKTIMEO) && !(ostate & NFSSTA_LOCKTIMEO))
        nmp->nm_state |= NFSSTA_LOCKTIMEO;
    if ((flags & NFSSTA_JUKEBOXTIMEO) && !(ostate & NFSSTA_JUKEBOXTIMEO))
        nmp->nm_state |= NFSSTA_JUKEBOXTIMEO;
    lck_mtx_unlock(&nmp->nm_lock);

    if (!(ostate & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO|NFSSTA_JUKEBOXTIMEO)))
        vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);

    nfs_msg(thd, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
}
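
/*
 * Note (added): nfs_down/nfs_up are edge-triggered -- VQ_NOTRESP is
 * signalled only when the first timeout flag is set above, and cleared
 * (see nfs_up below) only when the last one goes away, so user space sees
 * one notification per outage rather than one per timed-out request.
 */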

void
nfs_up(struct nfsmount *nmp, thread_t thd, int flags, const char *msg)
{
    int ostate, state;

    if (nmp == NULL)
        return;

    if (msg)
        nfs_msg(thd, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);

    lck_mtx_lock(&nmp->nm_lock);
    ostate = nmp->nm_state;
    if ((flags & NFSSTA_TIMEO) && (ostate & NFSSTA_TIMEO))
        nmp->nm_state &= ~NFSSTA_TIMEO;
    if ((flags & NFSSTA_LOCKTIMEO) && (ostate & NFSSTA_LOCKTIMEO))
        nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
    if ((flags & NFSSTA_JUKEBOXTIMEO) && (ostate & NFSSTA_JUKEBOXTIMEO))
        nmp->nm_state &= ~NFSSTA_JUKEBOXTIMEO;
    state = nmp->nm_state;
    lck_mtx_unlock(&nmp->nm_lock);

    if ((ostate & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO|NFSSTA_JUKEBOXTIMEO)) &&
        !(state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO|NFSSTA_JUKEBOXTIMEO)))
        vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
}