2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 * Copyright (c) 1989, 1991, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
69 * Socket operations for use by nfs
72 #include <sys/param.h>
73 #include <sys/systm.h>
75 #include <sys/kauth.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/kpi_mbuf.h>
79 #include <sys/malloc.h>
80 #include <sys/vnode.h>
81 #include <sys/domain.h>
82 #include <sys/protosw.h>
83 #include <sys/socket.h>
84 #include <sys/syslog.h>
85 #include <sys/tprintf.h>
86 #include <sys/uio_internal.h>
87 #include <libkern/OSAtomic.h>
90 #include <kern/clock.h>
91 #include <kern/task.h>
92 #include <kern/thread.h>
95 #include <netinet/in.h>
96 #include <netinet/tcp.h>
98 #include <nfs/rpcv2.h>
99 #include <nfs/nfsproto.h>
101 #include <nfs/xdr_subs.h>
102 #include <nfs/nfsm_subs.h>
103 #include <nfs/nfsmount.h>
104 #include <nfs/nfsnode.h>
105 #include <nfs/nfsrtt.h>
107 #include <sys/kdebug.h>
/*
 * File-system debug tracing helpers: emit a KERNEL_DEBUG trace point in
 * the DBG_FSRW class with up to four integer-cast arguments.
 *   FSDBG      - plain event (no begin/end pairing)
 *   FSDBG_TOP  - start of a paired begin/end interval
 *   FSDBG_BOT  - end of a paired begin/end interval
 */
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
120 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
121 * Use the mean and mean deviation of rtt for the appropriate type of rpc
122 * for the frequent rpcs and a default for the others.
123 * The justification for doing "other" this way is that these rpcs
124 * happen so infrequently that timer est. would probably be stale.
125 * Also, since many of these rpcs are
126 * non-idempotent, a conservative timeout is desired.
127 * getattr, lookup - A+2D
131 #define NFS_RTO(n, t) \
132 ((t) == 0 ? (n)->nm_timeo : \
134 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
135 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
/*
 * Accessors for the smoothed RTT and smoothed RTT deviation slots of a
 * request's mount point, indexed by the timer class of the RPC procedure.
 * NOTE(review): proct[] entries appear to be 1-based with 0 meaning
 * "untimed" — callers must not use these when proct[procnum] == 0.
 */
#define NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
#define NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
139 * External data, mostly RPC constants in XDR form
141 extern u_long rpc_reply
, rpc_msgdenied
, rpc_mismatch
, rpc_vers
, rpc_auth_unix
,
142 rpc_msgaccepted
, rpc_call
, rpc_autherr
,
144 extern u_long nfs_prog
;
145 extern struct nfsstats nfsstats
;
146 extern int nfsv3_procid
[NFS_NPROCS
];
147 extern int nfs_ticks
;
148 extern u_long nfs_xidwrap
;
151 * Defines which timer to use for the procnum.
158 static int proct
[NFS_NPROCS
] = {
159 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
163 * There is a congestion window for outstanding rpcs maintained per mount
164 * point. The cwnd size is adjusted in roughly the way that:
165 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
166 * SIGCOMM '88". ACM, August 1988.
167 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
168 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
169 * of rpcs is in progress.
170 * (The sent count and cwnd are scaled for integer arith.)
171 * Variants of "slow start" were tried and were found to be too much of a
172 * performance hit (ave. rtt 3 times larger),
173 * I suspect due to the large rtt that nfs rpcs have.
/*
 * Congestion window bookkeeping: nm_cwnd and nm_sent are kept scaled by
 * NFS_CWNDSCALE so the additive-increase arithmetic can stay in integers.
 * nfs_backoff[] gives the exponential retransmit backoff multipliers,
 * indexed by the retransmit count.
 */
#define NFS_CWNDSCALE	256
#define NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
179 struct nfsrtt nfsrtt
;
181 static int nfs_rcvlock(struct nfsreq
*);
182 static void nfs_rcvunlock(struct nfsreq
*);
183 static int nfs_receive(struct nfsreq
*rep
, mbuf_t
*mp
);
184 static int nfs_reconnect(struct nfsreq
*rep
);
185 static void nfs_repdequeue(struct nfsreq
*rep
);
188 boolean_t
current_thread_aborted(void);
189 kern_return_t
thread_terminate(thread_t
);
192 static int nfsrv_getstream(struct nfssvc_sock
*,int);
194 int (*nfsrv3_procs
[NFS_NPROCS
])(struct nfsrv_descript
*nd
,
195 struct nfssvc_sock
*slp
,
222 #endif /* NFS_NOSERVER */
226 * attempt to bind a socket to a reserved port
229 nfs_bind_resv(struct nfsmount
*nmp
)
231 socket_t so
= nmp
->nm_so
;
232 struct sockaddr_in sin
;
239 sin
.sin_len
= sizeof (struct sockaddr_in
);
240 sin
.sin_family
= AF_INET
;
241 sin
.sin_addr
.s_addr
= INADDR_ANY
;
242 tport
= IPPORT_RESERVED
- 1;
243 sin
.sin_port
= htons(tport
);
245 while (((error
= sock_bind(so
, (struct sockaddr
*) &sin
)) == EADDRINUSE
) &&
246 (--tport
> IPPORT_RESERVED
/ 2))
247 sin
.sin_port
= htons(tport
);
252 * variables for managing the nfs_bind_resv_thread
254 int nfs_resv_mounts
= 0;
255 static int nfs_bind_resv_thread_state
= 0;
256 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
257 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
258 lck_grp_t
*nfs_bind_resv_lck_grp
;
259 lck_grp_attr_t
*nfs_bind_resv_lck_grp_attr
;
260 lck_attr_t
*nfs_bind_resv_lck_attr
;
261 lck_mtx_t
*nfs_bind_resv_mutex
;
262 struct nfs_bind_resv_request
{
263 TAILQ_ENTRY(nfs_bind_resv_request
) brr_chain
;
264 struct nfsmount
*brr_nmp
;
267 static TAILQ_HEAD(, nfs_bind_resv_request
) nfs_bind_resv_request_queue
;
270 * thread to handle any reserved port bind requests
273 nfs_bind_resv_thread(void)
275 struct nfs_bind_resv_request
*brreq
;
277 nfs_bind_resv_thread_state
= NFS_BIND_RESV_THREAD_STATE_RUNNING
;
279 while (nfs_resv_mounts
> 0) {
280 lck_mtx_lock(nfs_bind_resv_mutex
);
281 while ((brreq
= TAILQ_FIRST(&nfs_bind_resv_request_queue
))) {
282 TAILQ_REMOVE(&nfs_bind_resv_request_queue
, brreq
, brr_chain
);
283 lck_mtx_unlock(nfs_bind_resv_mutex
);
284 brreq
->brr_error
= nfs_bind_resv(brreq
->brr_nmp
);
286 lck_mtx_lock(nfs_bind_resv_mutex
);
288 msleep((caddr_t
)&nfs_bind_resv_request_queue
,
289 nfs_bind_resv_mutex
, PSOCK
| PDROP
,
290 "nfs_bind_resv_request_queue", 0);
293 nfs_bind_resv_thread_state
= NFS_BIND_RESV_THREAD_STATE_INITTED
;
294 (void) thread_terminate(current_thread());
298 nfs_bind_resv_thread_wake(void)
300 if (nfs_bind_resv_thread_state
< NFS_BIND_RESV_THREAD_STATE_RUNNING
)
302 wakeup(&nfs_bind_resv_request_queue
);
307 * underprivileged procs call this to request nfs_bind_resv_thread
308 * to perform the reserved port binding for them.
311 nfs_bind_resv_nopriv(struct nfsmount
*nmp
)
313 struct nfs_bind_resv_request brreq
;
316 if (nfs_bind_resv_thread_state
< NFS_BIND_RESV_THREAD_STATE_RUNNING
) {
317 if (nfs_bind_resv_thread_state
< NFS_BIND_RESV_THREAD_STATE_INITTED
) {
318 nfs_bind_resv_lck_grp_attr
= lck_grp_attr_alloc_init();
319 lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr
);
320 nfs_bind_resv_lck_grp
= lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr
);
321 nfs_bind_resv_lck_attr
= lck_attr_alloc_init();
322 nfs_bind_resv_mutex
= lck_mtx_alloc_init(nfs_bind_resv_lck_grp
, nfs_bind_resv_lck_attr
);
323 TAILQ_INIT(&nfs_bind_resv_request_queue
);
324 nfs_bind_resv_thread_state
= NFS_BIND_RESV_THREAD_STATE_INITTED
;
326 kernel_thread(kernel_task
, nfs_bind_resv_thread
);
327 nfs_bind_resv_thread_state
= NFS_BIND_RESV_THREAD_STATE_RUNNING
;
333 lck_mtx_lock(nfs_bind_resv_mutex
);
334 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue
, &brreq
, brr_chain
);
335 lck_mtx_unlock(nfs_bind_resv_mutex
);
337 error
= nfs_bind_resv_thread_wake();
339 TAILQ_REMOVE(&nfs_bind_resv_request_queue
, &brreq
, brr_chain
);
340 /* Note: we might be able to simply restart the thread */
344 tsleep((caddr_t
)&brreq
, PSOCK
, "nfsbindresv", 0);
346 return (brreq
.brr_error
);
350 * Initialize sockets and congestion for a new NFS connection.
351 * We do not free the sockaddr if error.
355 struct nfsmount
*nmp
,
356 __unused
struct nfsreq
*rep
)
359 int error
, rcvreserve
, sndreserve
;
360 struct sockaddr
*saddr
;
361 struct timeval timeo
;
364 saddr
= mbuf_data(nmp
->nm_nam
);
365 error
= sock_socket(saddr
->sa_family
, nmp
->nm_sotype
,
366 nmp
->nm_soproto
, 0, 0, &nmp
->nm_so
);
373 * Some servers require that the client port be a reserved port number.
375 if (saddr
->sa_family
== AF_INET
&& (nmp
->nm_flag
& NFSMNT_RESVPORT
)) {
378 * sobind() requires current_proc() to have superuser privs.
379 * If this bind is part of a reconnect, and the current proc
380 * doesn't have superuser privs, we hand the sobind() off to
381 * a kernel thread to process.
383 if ((nmp
->nm_state
& NFSSTA_MOUNTED
) &&
384 (p
= current_proc()) && suser(kauth_cred_get(), 0)) {
385 /* request nfs_bind_resv_thread() to do bind */
386 error
= nfs_bind_resv_nopriv(nmp
);
388 error
= nfs_bind_resv(nmp
);
395 * Protocols that do not require connections may be optionally left
396 * unconnected for servers that reply from a port other than NFS_PORT.
398 if (nmp
->nm_flag
& NFSMNT_NOCONN
) {
399 if (nmp
->nm_sotype
== SOCK_STREAM
) {
407 error
= sock_connect(so
, mbuf_data(nmp
->nm_nam
), MSG_DONTWAIT
);
408 if (error
&& error
!= EINPROGRESS
) {
412 while ((error
= sock_connectwait(so
, &tv
)) == EINPROGRESS
) {
413 if (rep
&& (error
= nfs_sigintr(nmp
, rep
, rep
->r_procp
))) {
420 * Always time out on recieve, this allows us to reconnect the
421 * socket to deal with network changes.
425 error
= sock_setsockopt(so
, SOL_SOCKET
, SO_RCVTIMEO
, &timeo
, sizeof(timeo
));
426 if (nmp
->nm_flag
& (NFSMNT_SOFT
| NFSMNT_INT
)) {
431 error
= sock_setsockopt(so
, SOL_SOCKET
, SO_SNDTIMEO
, &timeo
, sizeof(timeo
));
433 if (nmp
->nm_sotype
== SOCK_DGRAM
) {
434 sndreserve
= (nmp
->nm_wsize
+ NFS_MAXPKTHDR
) * 3;
435 rcvreserve
= (nmp
->nm_rsize
+ NFS_MAXPKTHDR
) *
436 (nmp
->nm_readahead
> 0 ? nmp
->nm_readahead
+ 1 : 2);
437 } else if (nmp
->nm_sotype
== SOCK_SEQPACKET
) {
438 sndreserve
= (nmp
->nm_wsize
+ NFS_MAXPKTHDR
) * 3;
439 rcvreserve
= (nmp
->nm_rsize
+ NFS_MAXPKTHDR
) *
440 (nmp
->nm_readahead
> 0 ? nmp
->nm_readahead
+ 1 : 2);
445 sock_gettype(so
, NULL
, NULL
, &proto
);
446 if (nmp
->nm_sotype
!= SOCK_STREAM
)
447 panic("nfscon sotype");
449 // Assume that SOCK_STREAM always requires a connection
450 sock_setsockopt(so
, SOL_SOCKET
, SO_KEEPALIVE
, &on
, sizeof(on
));
452 if (proto
== IPPROTO_TCP
) {
453 sock_setsockopt(so
, IPPROTO_TCP
, TCP_NODELAY
, &on
, sizeof(on
));
456 sndreserve
= (nmp
->nm_wsize
+ NFS_MAXPKTHDR
+ sizeof (u_long
)) * 3;
457 rcvreserve
= (nmp
->nm_rsize
+ NFS_MAXPKTHDR
+ sizeof (u_long
)) *
458 (nmp
->nm_readahead
> 0 ? nmp
->nm_readahead
+ 1 : 2);
461 if (sndreserve
> NFS_MAXSOCKBUF
)
462 sndreserve
= NFS_MAXSOCKBUF
;
463 if (rcvreserve
> NFS_MAXSOCKBUF
)
464 rcvreserve
= NFS_MAXSOCKBUF
;
465 error
= sock_setsockopt(so
, SOL_SOCKET
, SO_SNDBUF
, &sndreserve
, sizeof(sndreserve
));
469 error
= sock_setsockopt(so
, SOL_SOCKET
, SO_RCVBUF
, &rcvreserve
, sizeof(rcvreserve
));
474 sock_nointerrupt(so
, 1);
476 /* Initialize other non-zero congestion variables */
477 nmp
->nm_srtt
[0] = nmp
->nm_srtt
[1] = nmp
->nm_srtt
[2] =
478 nmp
->nm_srtt
[3] = (NFS_TIMEO
<< 3);
479 nmp
->nm_sdrtt
[0] = nmp
->nm_sdrtt
[1] = nmp
->nm_sdrtt
[2] =
480 nmp
->nm_sdrtt
[3] = 0;
481 nmp
->nm_cwnd
= NFS_MAXCWND
/ 2; /* Initial send window */
483 FSDBG(529, nmp
, nmp
->nm_state
, nmp
->nm_soflags
, nmp
->nm_cwnd
);
484 nmp
->nm_timeouts
= 0;
494 * Called when a connection is broken on a reliable protocol.
495 * - clean up the old socket
496 * - nfs_connect() again
497 * - set R_MUSTRESEND for all outstanding requests on mount point
498 * If this fails the mount point is DEAD!
499 * nb: Must be called with the nfs_sndlock() set on the mount point.
502 nfs_reconnect(struct nfsreq
*rep
)
505 struct nfsmount
*nmp
= rep
->r_nmp
;
509 while ((error
= nfs_connect(nmp
, rep
))) {
510 if (error
== EINTR
|| error
== ERESTART
)
514 nfs_down(rep
->r_nmp
, rep
->r_procp
, error
, NFSSTA_TIMEO
,
516 rep
->r_flags
|= R_TPRINTFMSG
;
517 if (!(nmp
->nm_state
& NFSSTA_MOUNTED
)) {
518 /* we're not yet completely mounted and */
519 /* we can't reconnect, so we fail */
522 if ((error
= nfs_sigintr(rep
->r_nmp
, rep
, rep
->r_procp
)))
524 tsleep((caddr_t
)&lbolt
, PSOCK
, "nfscon", 0);
528 * Loop through outstanding request list and fix up all requests
531 TAILQ_FOREACH(rp
, &nfs_reqq
, r_chain
) {
532 if (rp
->r_nmp
== nmp
)
533 rp
->r_flags
|= R_MUSTRESEND
;
539 * NFS disconnect. Clean up and unlink.
542 nfs_disconnect(struct nfsmount
*nmp
)
549 sock_shutdown(so
, 2);
555 * This is the nfs send routine. For connection based socket types, it
556 * must be called with an nfs_sndlock() on the socket.
557 * "rep == NULL" indicates that it has been called from a server.
558 * For the client side:
559 * - return EINTR if the RPC is terminated, 0 otherwise
560 * - set R_MUSTRESEND if the send fails for any reason
561 * - do any cleanup required by recoverable socket errors (???)
562 * For the server side:
563 * - return EINTR or ERESTART if interrupted by a signal
564 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
565 * - do any cleanup required by recoverable socket errors (???)
568 nfs_send(so
, nam
, top
, rep
)
574 struct sockaddr
*sendnam
;
575 int error
, error2
, sotype
, flags
;
576 u_long xidqueued
= 0;
578 char savenametolog
[MAXPATHLEN
];
582 error
= nfs_sigintr(rep
->r_nmp
, rep
, rep
->r_procp
);
587 if ((so
= rep
->r_nmp
->nm_so
) == NULL
) {
588 rep
->r_flags
|= R_MUSTRESEND
;
592 rep
->r_flags
&= ~R_MUSTRESEND
;
593 TAILQ_FOREACH(rp
, &nfs_reqq
, r_chain
)
597 xidqueued
= rp
->r_xid
;
599 sock_gettype(so
, NULL
, &sotype
, NULL
);
600 if ((sotype
== SOCK_STREAM
) || (sock_isconnected(so
)) ||
602 sendnam
= (struct sockaddr
*)0;
604 sendnam
= mbuf_data(nam
);
606 if (sotype
== SOCK_SEQPACKET
)
612 * Save the name here in case mount point goes away if we block.
613 * The name is using local stack and is large, but don't
614 * want to block if we malloc.
617 strncpy(savenametolog
,
618 vfs_statfs(rep
->r_nmp
->nm_mountp
)->f_mntfromname
,
620 bzero(&msg
, sizeof(msg
));
621 msg
.msg_name
= (caddr_t
)sendnam
;
622 msg
.msg_namelen
= sendnam
== 0 ? 0 : sendnam
->sa_len
;
623 error
= sock_sendmbuf(so
, &msg
, top
, flags
, NULL
);
628 TAILQ_FOREACH(rp
, &nfs_reqq
, r_chain
)
629 if (rp
== rep
&& rp
->r_xid
== xidqueued
)
632 panic("nfs_send: error %d xid %x gone",
635 log(LOG_INFO
, "nfs send error %d for server %s\n",
636 error
, savenametolog
);
638 * Deal with errors for the client side.
640 error2
= nfs_sigintr(rep
->r_nmp
, rep
, rep
->r_procp
);
644 rep
->r_flags
|= R_MUSTRESEND
;
647 log(LOG_INFO
, "nfsd send error %d\n", error
);
650 * Handle any recoverable (soft) socket errors here. (???)
652 if (error
!= EINTR
&& error
!= ERESTART
&& error
!= EIO
&&
653 error
!= EWOULDBLOCK
&& error
!= EPIPE
) {
661 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
662 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
663 * Mark and consolidate the data into a new mbuf list.
664 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
666 * For SOCK_STREAM we must be very careful to read an entire record once
667 * we have read any of it, even if the system call has been interrupted.
670 nfs_receive(struct nfsreq
*rep
, mbuf_t
*mp
)
676 int error
, error2
, sotype
;
677 proc_t p
= current_proc(); /* XXX */
683 * Set up arguments for soreceive()
686 sotype
= rep
->r_nmp
->nm_sotype
;
689 * For reliable protocols, lock against other senders/receivers
690 * in case a reconnect is necessary.
691 * For SOCK_STREAM, first get the Record Mark to find out how much
692 * more there is to get.
693 * We must lock the socket against other receivers
694 * until we have an entire rpc request/reply.
696 if (sotype
!= SOCK_DGRAM
) {
697 error
= nfs_sndlock(rep
);
702 * Check for fatal errors and resending request.
705 * Ugh: If a reconnect attempt just happened, nm_so
706 * would have changed. NULL indicates a failed
707 * attempt that has essentially shut down this
710 if ((error
= nfs_sigintr(rep
->r_nmp
, rep
, p
)) || rep
->r_mrep
) {
716 so
= rep
->r_nmp
->nm_so
;
718 error
= nfs_reconnect(rep
);
725 while (rep
->r_flags
& R_MUSTRESEND
) {
726 error
= mbuf_copym(rep
->r_mreq
, 0, MBUF_COPYALL
, MBUF_WAITOK
, &m
);
728 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpcretries
);
729 error
= nfs_send(so
, rep
->r_nmp
->nm_nam
, m
, rep
);
732 * we also hold rcv lock so rep is still
736 if (error
== EINTR
|| error
== ERESTART
||
737 (error
= nfs_reconnect(rep
))) {
745 if (sotype
== SOCK_STREAM
) {
750 while (!error
&& !lastfragment
) {
751 aio
.iov_base
= (uintptr_t) &fraglen
;
752 aio
.iov_len
= sizeof(u_long
);
753 bzero(&msg
, sizeof(msg
));
754 msg
.msg_iov
= (struct iovec
*) &aio
;
757 error
= sock_receive(so
, &msg
, MSG_WAITALL
, &rcvlen
);
758 if (!rep
->r_nmp
) /* if unmounted then bailout */
760 if (error
== EWOULDBLOCK
&& rep
) {
761 error2
= nfs_sigintr(rep
->r_nmp
, rep
, p
);
765 } while (error
== EWOULDBLOCK
);
766 if (!error
&& rcvlen
< aio
.iov_len
) {
767 /* only log a message if we got a partial word */
770 "short receive (%d/%d) from nfs server %s\n",
771 rcvlen
, sizeof(u_long
),
772 vfs_statfs(rep
->r_nmp
->nm_mountp
)->f_mntfromname
);
777 lastfragment
= ntohl(fraglen
) & 0x80000000;
778 fraglen
= ntohl(fraglen
) & ~0x80000000;
781 * This is SERIOUS! We are out of sync with the sender
782 * and forcing a disconnect/reconnect is all I can do.
784 if (len
> NFS_MAXPACKET
) {
785 log(LOG_ERR
, "%s (%d) from nfs server %s\n",
786 "impossible RPC record length", len
,
787 vfs_statfs(rep
->r_nmp
->nm_mountp
)->f_mntfromname
);
795 error
= sock_receivembuf(so
, NULL
, &m
, MSG_WAITALL
, &rcvlen
);
796 if (!rep
->r_nmp
) /* if unmounted then bailout */ {
799 } while (error
== EWOULDBLOCK
|| error
== EINTR
||
802 if (!error
&& fraglen
> rcvlen
) {
804 "short receive (%d/%d) from nfs server %s\n",
806 vfs_statfs(rep
->r_nmp
->nm_mountp
)->f_mntfromname
);
815 error
= mbuf_setnext(mlast
, m
);
817 printf("nfs_receive: mbuf_setnext failed %d\n", error
);
821 while (mbuf_next(mlast
))
822 mlast
= mbuf_next(mlast
);
826 bzero(&msg
, sizeof(msg
));
829 error
= sock_receivembuf(so
, &msg
, mp
, 0, &rcvlen
);
830 if (!rep
->r_nmp
) /* if unmounted then bailout */ {
833 if (error
== EWOULDBLOCK
&& rep
) {
834 error2
= nfs_sigintr(rep
->r_nmp
, rep
, p
);
839 } while (error
== EWOULDBLOCK
);
841 if ((msg
.msg_flags
& MSG_EOR
) == 0)
843 if (!error
&& *mp
== NULL
)
848 if (error
&& error
!= EINTR
&& error
!= ERESTART
) {
853 "receive error %d from nfs server %s\n", error
,
854 vfs_statfs(rep
->r_nmp
->nm_mountp
)->f_mntfromname
);
855 error
= nfs_sndlock(rep
);
857 error
= nfs_reconnect(rep
);
865 * We could have failed while rebinding the datagram socket
866 * so we need to attempt to rebind here.
868 if ((so
= rep
->r_nmp
->nm_so
) == NULL
) {
869 error
= nfs_sndlock(rep
);
871 error
= nfs_reconnect(rep
);
876 if (!rep
->r_nmp
) /* if unmounted then bailout */
878 so
= rep
->r_nmp
->nm_so
;
880 bzero(&msg
, sizeof(msg
));
884 error
= sock_receivembuf(so
, &msg
, mp
, 0, &rcvlen
);
885 if (!rep
->r_nmp
) /* if unmounted then bailout */
888 error2
= nfs_sigintr(rep
->r_nmp
, rep
, p
);
894 /* Reconnect for all errors. We may be receiving
895 * soft/hard/blocking errors because of a network
897 * XXX: we should rate limit or delay this
898 * to once every N attempts or something.
899 * although TCP doesn't seem to.
902 error2
= nfs_sndlock(rep
);
904 error2
= nfs_reconnect(rep
);
907 else if (!rep
->r_nmp
) /* if unmounted then bailout */
910 so
= rep
->r_nmp
->nm_so
;
916 } while (error
== EWOULDBLOCK
);
927 * Implement receipt of reply on a socket.
928 * We must search through the list of received datagrams matching them
929 * with outstanding requests using the xid, until ours is found.
934 struct nfsreq
*myrep
;
937 struct nfsmount
*nmp
= myrep
->r_nmp
;
945 * Loop around until we get our own reply
949 * Lock against other receivers so that I don't get stuck in
950 * sbwait() after someone else has received my reply for me.
951 * Also necessary for connection based protocols to avoid
952 * race conditions during a reconnect.
953 * If nfs_rcvlock() returns EALREADY, that means that
954 * the reply has already been recieved by another
955 * process and we can return immediately. In this
956 * case, the lock is not taken to avoid races with
959 error
= nfs_rcvlock(myrep
);
960 if (error
== EALREADY
)
966 * If we slept after putting bits otw, then reply may have
967 * arrived. In which case returning is required, or we
968 * would hang trying to nfs_receive an already received reply.
970 if (myrep
->r_mrep
!= NULL
) {
971 nfs_rcvunlock(myrep
);
972 FSDBG(530, myrep
->r_xid
, myrep
, myrep
->r_nmp
, -1);
976 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
977 * is still intact by checks done in nfs_rcvlock.
979 error
= nfs_receive(myrep
, &mrep
);
981 * Bailout asap if nfsmount struct gone (unmounted).
984 FSDBG(530, myrep
->r_xid
, myrep
, nmp
, -2);
990 FSDBG(530, myrep
->r_xid
, myrep
, nmp
, error
);
991 nfs_rcvunlock(myrep
);
993 /* Bailout asap if nfsmount struct gone (unmounted). */
1001 * Ignore routing errors on connectionless protocols??
1003 if (NFSIGNORE_SOERROR(nmp
->nm_sotype
, error
)) {
1006 int optlen
= sizeof(clearerror
);
1007 sock_getsockopt(nmp
->nm_so
, SOL_SOCKET
, SO_ERROR
, &clearerror
, &optlen
);
1017 * We assume all is fine, but if we did not have an error
1018 * and mrep is 0, better not dereference it. nfs_receive
1019 * calls soreceive which carefully sets error=0 when it got
1020 * errors on sbwait (tsleep). In most cases, I assume that's
1021 * so we could go back again. In tcp case, EPIPE is returned.
1022 * In udp, case nfs_receive gets back here with no error and no
1023 * mrep. Is the right fix to have soreceive check for process
1024 * aborted after sbwait and return something non-zero? Should
1025 * nfs_receive give an EPIPE? Too risky to play with those
1026 * two this late in game for a shutdown problem. Instead,
1027 * just check here and get out. (ekn)
1030 nfs_rcvunlock(myrep
);
1031 FSDBG(530, myrep
->r_xid
, myrep
, nmp
, -3);
1032 return (ENXIO
); /* sounds good */
1036 * Get the xid and check that it is an rpc reply
1039 dpos
= mbuf_data(md
);
1040 nfsm_dissect(tl
, u_long
*, 2*NFSX_UNSIGNED
);
1042 if (*tl
!= rpc_reply
) {
1043 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpcinvalid
);
1046 if (nmp
->nm_state
& NFSSTA_RCVLOCK
)
1047 nfs_rcvunlock(myrep
);
1052 * Loop through the request list to match up the reply
1053 * Iff no match, just drop the datagram
1055 TAILQ_FOREACH(rep
, &nfs_reqq
, r_chain
) {
1056 if (rep
->r_mrep
== NULL
&& rxid
== rep
->r_xid
) {
1062 * If we're tracking the round trip time
1063 * then we update the circular log here
1064 * with the stats from our current request.
1069 rt
= &nfsrtt
.rttl
[nfsrtt
.pos
];
1070 rt
->proc
= rep
->r_procnum
;
1071 rt
->rto
= NFS_RTO(nmp
, proct
[rep
->r_procnum
]);
1072 rt
->sent
= nmp
->nm_sent
;
1073 rt
->cwnd
= nmp
->nm_cwnd
;
1074 if (proct
[rep
->r_procnum
] == 0)
1075 panic("nfs_reply: proct[%d] is zero", rep
->r_procnum
);
1076 rt
->srtt
= nmp
->nm_srtt
[proct
[rep
->r_procnum
] - 1];
1077 rt
->sdrtt
= nmp
->nm_sdrtt
[proct
[rep
->r_procnum
] - 1];
1078 rt
->fsid
= vfs_statfs(nmp
->nm_mountp
)->f_fsid
;
1079 microtime(&rt
->tstamp
); // XXX unused
1080 if (rep
->r_flags
& R_TIMING
)
1081 rt
->rtt
= rep
->r_rtt
;
1084 nfsrtt
.pos
= (nfsrtt
.pos
+ 1) % NFSRTTLOGSIZ
;
1087 * Update congestion window.
1088 * Do the additive increase of
1091 FSDBG(530, rep
->r_xid
, rep
, nmp
->nm_sent
,
1093 if (nmp
->nm_cwnd
<= nmp
->nm_sent
) {
1095 (NFS_CWNDSCALE
* NFS_CWNDSCALE
+
1096 (nmp
->nm_cwnd
>> 1)) / nmp
->nm_cwnd
;
1097 if (nmp
->nm_cwnd
> NFS_MAXCWND
)
1098 nmp
->nm_cwnd
= NFS_MAXCWND
;
1100 if (rep
->r_flags
& R_SENT
) {
1101 rep
->r_flags
&= ~R_SENT
;
1102 nmp
->nm_sent
-= NFS_CWNDSCALE
;
1105 * Update rtt using a gain of 0.125 on the mean
1106 * and a gain of 0.25 on the deviation.
1108 if (rep
->r_flags
& R_TIMING
) {
1110 * Since the timer resolution of
1111 * NFS_HZ is so course, it can often
1112 * result in r_rtt == 0. Since
1113 * r_rtt == N means that the actual
1114 * rtt is between N+dt and N+2-dt ticks,
1117 if (proct
[rep
->r_procnum
] == 0)
1118 panic("nfs_reply: proct[%d] is zero", rep
->r_procnum
);
1119 t1
= rep
->r_rtt
+ 1;
1120 t1
-= (NFS_SRTT(rep
) >> 3);
1121 NFS_SRTT(rep
) += t1
;
1124 t1
-= (NFS_SDRTT(rep
) >> 2);
1125 NFS_SDRTT(rep
) += t1
;
1127 nmp
->nm_timeouts
= 0;
1131 nfs_rcvunlock(myrep
);
1133 * If not matched to a request, drop it.
1134 * If it's mine, get out.
1137 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpcunexpected
);
1139 } else if (rep
== myrep
) {
1140 if (rep
->r_mrep
== NULL
)
1141 panic("nfs_reply: nil r_mrep");
1144 FSDBG(530, myrep
->r_xid
, myrep
, rep
,
1145 rep
? rep
->r_xid
: myrep
->r_flags
);
1150 * nfs_request - goes something like this
1151 * - fill in request struct
1152 * - links it into list
1153 * - calls nfs_send() for first transmit
1154 * - calls nfs_receive() to get reply
1155 * - break down rpc header and return with nfs reply pointed to
1157 * nb: always frees up mreq mbuf list
1160 nfs_request(vp
, mp
, mrest
, procnum
, procp
, cred
, mrp
, mdp
, dposp
, xidp
)
1173 struct nfsreq re
, *rep
;
1176 struct nfsmount
*nmp
;
1177 mbuf_t md
, mheadend
;
1178 char nickv
[RPCX_NICKVERF
];
1181 int t1
, error
= 0, mrest_len
, auth_len
, auth_type
;
1182 int trylater_delay
= NFS_TRYLATERDEL
, failed_auth
= 0;
1183 int verf_len
, verf_type
;
1185 char *auth_str
, *verf_str
;
1186 NFSKERBKEY_T key
; /* save session key */
1199 nmp
= VFSTONFS(vnode_mount(vp
));
1201 (nmp
->nm_state
& (NFSSTA_FORCE
|NFSSTA_TIMEO
)) ==
1202 (NFSSTA_FORCE
|NFSSTA_TIMEO
)) {
1206 nmsotype
= nmp
->nm_sotype
;
1208 FSDBG_TOP(531, vp
, procnum
, nmp
, rep
);
1212 rep
->r_procp
= procp
;
1213 rep
->r_procnum
= procnum
;
1215 rep
->r_lastmsg
= now
.tv_sec
-
1216 ((nmp
->nm_tprintf_delay
) - (nmp
->nm_tprintf_initial_delay
));
1226 * Get the RPC header with authorization.
1229 nmp
= vp
? VFSTONFS(vnode_mount(vp
)) : rep
->r_nmp
;
1231 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1235 verf_str
= auth_str
= (char *)0;
1236 if (nmp
->nm_flag
& NFSMNT_KERB
) {
1238 verf_len
= sizeof (nickv
);
1239 auth_type
= RPCAUTH_KERB4
;
1240 bzero((caddr_t
)key
, sizeof (key
));
1241 if (failed_auth
|| nfs_getnickauth(nmp
, cred
, &auth_str
,
1242 &auth_len
, verf_str
, verf_len
)) {
1243 nmp
= vp
? VFSTONFS(vnode_mount(vp
)) : rep
->r_nmp
;
1245 FSDBG_BOT(531, 2, vp
, error
, rep
);
1249 error
= nfs_getauth(nmp
, rep
, cred
, &auth_str
,
1250 &auth_len
, verf_str
, &verf_len
, key
);
1251 nmp
= vp
? VFSTONFS(vnode_mount(vp
)) : rep
->r_nmp
;
1255 FSDBG_BOT(531, 2, vp
, error
, rep
);
1261 auth_type
= RPCAUTH_UNIX
;
1262 if (cred
->cr_ngroups
< 1)
1263 panic("nfsreq nogrps");
1264 auth_len
= ((((cred
->cr_ngroups
- 1) > nmp
->nm_numgrps
) ?
1265 nmp
->nm_numgrps
: (cred
->cr_ngroups
- 1)) << 2) +
1268 error
= nfsm_rpchead(cred
, nmp
->nm_flag
, procnum
, auth_type
, auth_len
,
1269 auth_str
, verf_len
, verf_str
, mrest
, mrest_len
, &mheadend
, &xid
, &m
);
1271 _FREE(auth_str
, M_TEMP
);
1274 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1278 *xidp
= ntohl(xid
) + ((u_int64_t
)nfs_xidwrap
<< 32);
1281 * For stream protocols, insert a Sun RPC Record Mark.
1283 if (nmsotype
== SOCK_STREAM
) {
1284 error
= mbuf_prepend(&m
, NFSX_UNSIGNED
, MBUF_WAITOK
);
1287 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1290 *((u_long
*)mbuf_data(m
)) =
1291 htonl(0x80000000 | (mbuf_pkthdr_len(m
) - NFSX_UNSIGNED
));
1296 nmp
= vp
? VFSTONFS(vnode_mount(vp
)) : rep
->r_nmp
;
1297 if (nmp
&& (nmp
->nm_flag
& NFSMNT_SOFT
))
1298 rep
->r_retry
= nmp
->nm_retry
;
1300 rep
->r_retry
= NFS_MAXREXMIT
+ 1; /* past clip limit */
1301 rep
->r_rtt
= rep
->r_rexmit
= 0;
1302 if (proct
[procnum
] > 0)
1303 rep
->r_flags
= R_TIMING
;
1309 * Do the client side RPC.
1311 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpcrequests
);
1313 * Chain request into list of outstanding requests. Be sure
1314 * to put it LAST so timer finds oldest requests first.
1316 TAILQ_INSERT_TAIL(&nfs_reqq
, rep
, r_chain
);
1319 * If backing off another request or avoiding congestion, don't
1320 * send this one now but let timer do it. If not timing a request,
1323 if (nmp
&& nmp
->nm_so
&& (nmp
->nm_sotype
!= SOCK_DGRAM
||
1324 (nmp
->nm_flag
& NFSMNT_DUMBTIMR
) ||
1325 nmp
->nm_sent
< nmp
->nm_cwnd
)) {
1326 int connrequired
= (nmp
->nm_sotype
== SOCK_STREAM
);
1329 error
= nfs_sndlock(rep
);
1332 * Set the R_SENT before doing the send in case another thread
1333 * processes the reply before the nfs_send returns here
1336 if ((rep
->r_flags
& R_MUSTRESEND
) == 0) {
1337 FSDBG(531, rep
->r_xid
, rep
, nmp
->nm_sent
,
1339 nmp
->nm_sent
+= NFS_CWNDSCALE
;
1340 rep
->r_flags
|= R_SENT
;
1343 error
= mbuf_copym(m
, 0, MBUF_COPYALL
, MBUF_WAITOK
, &m2
);
1345 error
= nfs_send(nmp
->nm_so
, nmp
->nm_nam
, m2
, rep
);
1349 nmp
= vp
? VFSTONFS(vnode_mount(vp
)) : rep
->r_nmp
;
1352 nmp
->nm_sent
-= NFS_CWNDSCALE
;
1353 rep
->r_flags
&= ~R_SENT
;
1360 * Wait for the reply from our send or the timer's.
1362 if (!error
|| error
== EPIPE
)
1363 error
= nfs_reply(rep
);
1366 * RPC done, unlink the request.
1368 nfs_repdequeue(rep
);
1370 nmp
= vp
? VFSTONFS(vnode_mount(vp
)) : rep
->r_nmp
;
1373 * Decrement the outstanding request count.
1375 if (rep
->r_flags
& R_SENT
) {
1376 rep
->r_flags
&= ~R_SENT
; /* paranoia */
1378 FSDBG(531, rep
->r_xid
, rep
, nmp
->nm_sent
, nmp
->nm_cwnd
);
1379 nmp
->nm_sent
-= NFS_CWNDSCALE
;
1384 * If there was a successful reply and a tprintf msg.
1385 * tprintf a response.
1388 nfs_up(nmp
, procp
, NFSSTA_TIMEO
,
1389 (rep
->r_flags
& R_TPRINTFMSG
) ? "is alive again" : NULL
);
1396 mbuf_freem(rep
->r_mreq
);
1397 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1402 * break down the rpc header and check if ok
1404 nfsm_dissect(tl
, u_long
*, 3 * NFSX_UNSIGNED
);
1405 if (*tl
++ == rpc_msgdenied
) {
1406 if (*tl
== rpc_mismatch
)
1408 else if ((nmp
->nm_flag
& NFSMNT_KERB
) && *tl
++ == rpc_autherr
) {
1411 error
= mbuf_setnext(mheadend
, NULL
);
1413 mbuf_freem(rep
->r_mreq
);
1416 printf("nfs_request: mbuf_setnext failed\n");
1422 mbuf_freem(rep
->r_mreq
);
1423 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1428 * Grab any Kerberos verifier, otherwise just throw it away.
1430 verf_type
= fxdr_unsigned(int, *tl
++);
1431 i
= fxdr_unsigned(int, *tl
);
1432 if ((nmp
->nm_flag
& NFSMNT_KERB
) && verf_type
== RPCAUTH_KERB4
) {
1433 error
= nfs_savenickauth(nmp
, cred
, i
, key
, &md
, &dpos
, mrep
);
1437 nfsm_adv(nfsm_rndup(i
));
1438 nfsm_dissect(tl
, u_long
*, NFSX_UNSIGNED
);
1441 nfsm_dissect(tl
, u_long
*, NFSX_UNSIGNED
);
1443 error
= fxdr_unsigned(int, *tl
);
1444 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) &&
1445 error
== NFSERR_TRYLATER
) {
1449 waituntil
= now
.tv_sec
+ trylater_delay
;
1450 while (now
.tv_sec
< waituntil
) {
1451 tsleep((caddr_t
)&lbolt
, PSOCK
, "nfstrylater", 0);
1454 trylater_delay
*= 2;
1455 if (trylater_delay
> 60)
1456 trylater_delay
= 60;
1461 * If the File Handle was stale, invalidate the
1462 * lookup cache, just in case.
1464 if ((error
== ESTALE
) && vp
)
1466 if (nmp
->nm_flag
& NFSMNT_NFSV3
) {
1470 error
|= NFSERR_RETERR
;
1473 error
&= ~NFSERR_RETERR
;
1475 mbuf_freem(rep
->r_mreq
);
1476 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1483 mbuf_freem(rep
->r_mreq
);
1484 FSDBG_BOT(531, 0xf0f0f0f0, rep
->r_xid
, nmp
, rep
);
1488 error
= EPROTONOSUPPORT
;
1490 mbuf_freem(rep
->r_mreq
);
1491 FSDBG_BOT(531, error
, rep
->r_xid
, nmp
, rep
);
1495 #ifndef NFS_NOSERVER
1497 * Generate the rpc reply header
1498 * siz arg. is used to decide if adding a cluster is worthwhile
1501 nfs_rephead(siz
, nd
, slp
, err
, mrq
, mbp
, bposp
)
1503 struct nfsrv_descript
*nd
;
1504 struct nfssvc_sock
*slp
;
1517 * If this is a big reply, use a cluster else
1518 * try and leave leading space for the lower level headers.
1520 siz
+= RPC_REPLYSIZ
;
1521 if (siz
>= nfs_mbuf_minclsize
) {
1522 error
= mbuf_getpacket(MBUF_WAITOK
, &mreq
);
1524 error
= mbuf_gethdr(MBUF_WAITOK
, MBUF_TYPE_DATA
, &mreq
);
1527 /* unable to allocate packet */
1532 tl
= mbuf_data(mreq
);
1533 mlen
= 6 * NFSX_UNSIGNED
;
1534 if (siz
< nfs_mbuf_minclsize
) {
1535 /* leave space for lower level headers */
1536 tl
+= 80/sizeof(*tl
); /* XXX max_hdr? XXX */
1537 mbuf_setdata(mreq
, tl
, mlen
);
1539 mbuf_setlen(mreq
, mlen
);
1541 bpos
= ((caddr_t
)tl
) + mlen
;
1542 *tl
++ = txdr_unsigned(nd
->nd_retxid
);
1544 if (err
== ERPCMISMATCH
|| (err
& NFSERR_AUTHERR
)) {
1545 *tl
++ = rpc_msgdenied
;
1546 if (err
& NFSERR_AUTHERR
) {
1547 *tl
++ = rpc_autherr
;
1548 *tl
= txdr_unsigned(err
& ~NFSERR_AUTHERR
);
1549 mlen
-= NFSX_UNSIGNED
;
1550 mbuf_setlen(mreq
, mlen
);
1551 bpos
-= NFSX_UNSIGNED
;
1553 *tl
++ = rpc_mismatch
;
1554 *tl
++ = txdr_unsigned(RPC_VER2
);
1555 *tl
= txdr_unsigned(RPC_VER2
);
1558 *tl
++ = rpc_msgaccepted
;
1561 * For Kerberos authentication, we must send the nickname
1562 * verifier back, otherwise just RPCAUTH_NULL.
1564 if (nd
->nd_flag
& ND_KERBFULL
) {
1565 struct nfsuid
*nuidp
;
1566 struct timeval ktvin
, ktvout
;
1567 uid_t uid
= kauth_cred_getuid(nd
->nd_cr
);
1569 lck_rw_lock_shared(&slp
->ns_rwlock
);
1570 for (nuidp
= NUIDHASH(slp
, uid
)->lh_first
;
1571 nuidp
!= 0; nuidp
= nuidp
->nu_hash
.le_next
) {
1572 if (kauth_cred_getuid(nuidp
->nu_cr
) == uid
&&
1573 (!nd
->nd_nam2
|| netaddr_match(NU_NETFAM(nuidp
),
1574 &nuidp
->nu_haddr
, nd
->nd_nam2
)))
1579 txdr_unsigned(nuidp
->nu_timestamp
.tv_sec
- 1);
1581 txdr_unsigned(nuidp
->nu_timestamp
.tv_usec
);
1584 * Encrypt the timestamp in ecb mode using the
1591 *tl
++ = rpc_auth_kerb
;
1592 *tl
++ = txdr_unsigned(3 * NFSX_UNSIGNED
);
1593 *tl
= ktvout
.tv_sec
;
1594 nfsm_build(tl
, u_long
*, 3 * NFSX_UNSIGNED
);
1595 *tl
++ = ktvout
.tv_usec
;
1596 *tl
++ = txdr_unsigned(kauth_cred_getuid(nuidp
->nu_cr
));
1601 lck_rw_done(&slp
->ns_rwlock
);
1608 *tl
= txdr_unsigned(RPC_PROGUNAVAIL
);
1611 *tl
= txdr_unsigned(RPC_PROGMISMATCH
);
1612 nfsm_build(tl
, u_long
*, 2 * NFSX_UNSIGNED
);
1613 // XXX hard coded versions
1614 *tl
++ = txdr_unsigned(2);
1615 *tl
= txdr_unsigned(3);
1618 *tl
= txdr_unsigned(RPC_PROCUNAVAIL
);
1621 *tl
= txdr_unsigned(RPC_GARBAGE
);
1625 if (err
!= NFSERR_RETVOID
) {
1626 nfsm_build(tl
, u_long
*, NFSX_UNSIGNED
);
1628 *tl
= txdr_unsigned(nfsrv_errmap(nd
, err
));
1640 if (err
!= 0 && err
!= NFSERR_RETVOID
) {
1641 OSAddAtomic(1, (SInt32
*)&nfsstats
.srvrpc_errs
);
1647 #endif /* NFS_NOSERVER */
1651 * From FreeBSD 1.58, a Matt Dillon fix...
1652 * Flag a request as being about to terminate.
1653 * The nm_sent count is decremented now to avoid deadlocks when the process
1654 * in soreceive() hasn't yet managed to send its own request.
1657 nfs_softterm(struct nfsreq
*rep
)
1660 rep
->r_flags
|= R_SOFTTERM
;
1661 if (rep
->r_flags
& R_SENT
) {
1662 FSDBG(532, rep
->r_xid
, rep
, rep
->r_nmp
->nm_sent
,
1663 rep
->r_nmp
->nm_cwnd
);
1664 rep
->r_nmp
->nm_sent
-= NFS_CWNDSCALE
;
1665 rep
->r_flags
&= ~R_SENT
;
1670 nfs_timer_funnel(void * arg
)
1672 (void) thread_funnel_set(kernel_flock
, TRUE
);
1674 (void) thread_funnel_set(kernel_flock
, FALSE
);
1679 * Ensure rep isn't in use by the timer, then dequeue it.
1682 nfs_repdequeue(struct nfsreq
*rep
)
1685 while ((rep
->r_flags
& R_BUSY
)) {
1686 rep
->r_flags
|= R_WAITING
;
1687 tsleep(rep
, PSOCK
, "repdeq", 0);
1689 TAILQ_REMOVE(&nfs_reqq
, rep
, r_chain
);
1693 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1694 * free()'d out from under it.
1697 nfs_repbusy(struct nfsreq
*rep
)
1700 if ((rep
->r_flags
& R_BUSY
))
1701 panic("rep locked");
1702 rep
->r_flags
|= R_BUSY
;
1706 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1708 static struct nfsreq
*
1709 nfs_repnext(struct nfsreq
*rep
)
1711 struct nfsreq
* nextrep
;
1716 * We need to get and busy the next req before signalling the
1717 * current one, otherwise wakeup() may block us and we'll race to
1718 * grab the next req.
1720 nextrep
= TAILQ_NEXT(rep
, r_chain
);
1721 if (nextrep
!= NULL
)
1722 nfs_repbusy(nextrep
);
1723 /* unbusy and signal. */
1724 rep
->r_flags
&= ~R_BUSY
;
1725 if ((rep
->r_flags
& R_WAITING
)) {
1726 rep
->r_flags
&= ~R_WAITING
;
1734 * Scan the nfsreq list and retranmit any requests that have timed out
1735 * To avoid retransmission attempts on STREAM sockets (in the future) make
1736 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1739 nfs_timer(__unused
void *arg
)
1744 struct nfsmount
*nmp
;
1747 #ifndef NFS_NOSERVER
1748 struct nfssvc_sock
*slp
;
1750 #endif /* NFS_NOSERVER */
1751 int flags
, rexmit
, cwnd
, sent
;
1755 rep
= TAILQ_FIRST(&nfs_reqq
);
1759 for ( ; rep
!= NULL
; rep
= nfs_repnext(rep
)) {
1761 if (!nmp
) /* unmounted */
1763 if (rep
->r_mrep
|| (rep
->r_flags
& R_SOFTTERM
))
1765 if (nfs_sigintr(nmp
, rep
, rep
->r_procp
))
1767 if (nmp
->nm_tprintf_initial_delay
!= 0 &&
1768 (rep
->r_rexmit
> 2 || (rep
->r_flags
& R_RESENDERR
)) &&
1769 rep
->r_lastmsg
+ nmp
->nm_tprintf_delay
< now
.tv_sec
) {
1770 rep
->r_lastmsg
= now
.tv_sec
;
1771 nfs_down(rep
->r_nmp
, rep
->r_procp
, 0, NFSSTA_TIMEO
,
1773 rep
->r_flags
|= R_TPRINTFMSG
;
1774 if (!(nmp
->nm_state
& NFSSTA_MOUNTED
)) {
1775 /* we're not yet completely mounted and */
1776 /* we can't complete an RPC, so we fail */
1777 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpctimeouts
);
1782 if (rep
->r_rtt
>= 0) {
1784 if (nmp
->nm_flag
& NFSMNT_DUMBTIMR
)
1785 timeo
= nmp
->nm_timeo
;
1787 timeo
= NFS_RTO(nmp
, proct
[rep
->r_procnum
]);
1788 /* ensure 62.5 ms floor */
1789 while (16 * timeo
< hz
)
1791 if (nmp
->nm_timeouts
> 0)
1792 timeo
*= nfs_backoff
[nmp
->nm_timeouts
- 1];
1793 if (rep
->r_rtt
<= timeo
)
1795 if (nmp
->nm_timeouts
< 8)
1799 * Check for too many retransmits. This is never true for
1800 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1801 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1803 if (rep
->r_rexmit
>= rep
->r_retry
) { /* too many */
1804 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpctimeouts
);
1808 if (nmp
->nm_sotype
!= SOCK_DGRAM
) {
1809 if (++rep
->r_rexmit
> NFS_MAXREXMIT
)
1810 rep
->r_rexmit
= NFS_MAXREXMIT
;
1813 if ((so
= nmp
->nm_so
) == NULL
)
1817 * If there is enough space and the window allows..
1819 * Set r_rtt to -1 in case we fail to send it now.
1822 if (((nmp
->nm_flag
& NFSMNT_DUMBTIMR
) ||
1823 (rep
->r_flags
& R_SENT
) ||
1824 nmp
->nm_sent
< nmp
->nm_cwnd
) &&
1825 (mbuf_copym(rep
->r_mreq
, 0, MBUF_COPYALL
, MBUF_DONTWAIT
, &m
) == 0)){
1828 * Iff first send, start timing
1829 * else turn timing off, backoff timer
1830 * and divide congestion window by 2.
1831 * We update these *before* the send to avoid
1832 * racing against receiving the reply.
1833 * We save them so we can restore them on send error.
1835 flags
= rep
->r_flags
;
1836 rexmit
= rep
->r_rexmit
;
1837 cwnd
= nmp
->nm_cwnd
;
1838 sent
= nmp
->nm_sent
;
1840 if (rep
->r_flags
& R_SENT
) {
1841 rep
->r_flags
&= ~R_TIMING
;
1842 if (++rep
->r_rexmit
> NFS_MAXREXMIT
)
1843 rep
->r_rexmit
= NFS_MAXREXMIT
;
1845 if (nmp
->nm_cwnd
< NFS_CWNDSCALE
)
1846 nmp
->nm_cwnd
= NFS_CWNDSCALE
;
1847 OSAddAtomic(1, (SInt32
*)&nfsstats
.rpcretries
);
1849 rep
->r_flags
|= R_SENT
;
1850 nmp
->nm_sent
+= NFS_CWNDSCALE
;
1852 FSDBG(535, xid
, rep
, nmp
->nm_sent
, nmp
->nm_cwnd
);
1854 bzero(&msg
, sizeof(msg
));
1855 if ((nmp
->nm_flag
& NFSMNT_NOCONN
) == NFSMNT_NOCONN
) {
1856 msg
.msg_name
= mbuf_data(nmp
->nm_nam
);
1857 msg
.msg_namelen
= mbuf_len(nmp
->nm_nam
);
1859 error
= sock_sendmbuf(so
, &msg
, m
, MSG_DONTWAIT
, NULL
);
1861 FSDBG(535, xid
, error
, sent
, cwnd
);
1864 if (error
== EWOULDBLOCK
) {
1865 rep
->r_flags
= flags
;
1866 rep
->r_rexmit
= rexmit
;
1867 nmp
->nm_cwnd
= cwnd
;
1868 nmp
->nm_sent
= sent
;
1872 if (NFSIGNORE_SOERROR(nmp
->nm_sotype
, error
)) {
1874 int optlen
= sizeof(clearerror
);
1875 sock_getsockopt(nmp
->nm_so
, SOL_SOCKET
, SO_ERROR
, &clearerror
, &optlen
);
1877 rep
->r_flags
= flags
| R_RESENDERR
;
1878 rep
->r_rexmit
= rexmit
;
1879 nmp
->nm_cwnd
= cwnd
;
1880 nmp
->nm_sent
= sent
;
1882 OSAddAtomic(-1, (SInt32
*)&nfsstats
.rpcretries
);
1889 #ifndef NFS_NOSERVER
1891 * Scan the write gathering queues for writes that need to be
1894 cur_usec
= (u_quad_t
)now
.tv_sec
* 1000000 + (u_quad_t
)now
.tv_usec
;
1895 lck_mtx_lock(nfsd_mutex
);
1896 TAILQ_FOREACH(slp
, &nfssvc_sockhead
, ns_chain
) {
1897 if (slp
->ns_wgtime
&& (slp
->ns_wgtime
<= cur_usec
))
1898 nfsrv_wakenfsd(slp
);
1900 while ((slp
= TAILQ_FIRST(&nfssvc_deadsockhead
))) {
1901 if ((slp
->ns_timestamp
+ 5) > now
.tv_sec
)
1903 TAILQ_REMOVE(&nfssvc_deadsockhead
, slp
, ns_chain
);
1906 lck_mtx_unlock(nfsd_mutex
);
1907 #endif /* NFS_NOSERVER */
1909 if (nfsbuffreeuptimestamp
+ 30 <= now
.tv_sec
) {
1911 * We haven't called nfs_buf_freeup() in a little while.
1912 * So, see if we can free up any stale/unused bufs now.
1917 timeout(nfs_timer_funnel
, (void *)0, nfs_ticks
);
1923 * Test for a termination condition pending on the process.
1924 * This is used to determine if we need to bail on a mount.
1925 * EIO is returned if there has been a soft timeout.
1926 * EINTR is returned if there is a signal pending that is not being ignored
1927 * and the mount is interruptable, or if we are a thread that is in the process
1928 * of cancellation (also SIGKILL posted).
1931 nfs_sigintr(nmp
, rep
, p
)
1932 struct nfsmount
*nmp
;
1936 sigset_t pending_sigs
;
1937 int context_good
= 0;
1938 struct nfsmount
*repnmp
;
1939 extern proc_t kernproc
;
1944 repnmp
= rep
->r_nmp
;
1945 /* we've had a forced unmount. */
1948 /* request has timed out on a 'soft' mount. */
1949 if (rep
->r_flags
& R_SOFTTERM
)
1952 * We're in the progress of a force unmount and there's
1953 * been a timeout we're dead and fail IO.
1955 if ((repnmp
->nm_state
& (NFSSTA_FORCE
|NFSSTA_TIMEO
)) ==
1956 (NFSSTA_FORCE
|NFSSTA_TIMEO
))
1958 /* Someone is unmounting us, go soft and mark it. */
1959 if (repnmp
->nm_mountp
->mnt_kern_flag
& MNTK_FRCUNMOUNT
) {
1960 repnmp
->nm_flag
|= NFSMNT_SOFT
;
1961 nmp
->nm_state
|= NFSSTA_FORCE
;
1964 * If the mount is hung and we've requested not to hang
1965 * on remote filesystems, then bail now.
1967 if (p
!= NULL
&& (proc_noremotehang(p
)) != 0 &&
1968 (repnmp
->nm_state
& NFSSTA_TIMEO
) != 0)
1971 /* XXX: is this valid? this probably should be an assertion. */
1975 /* Is this thread belongs to kernel task; then abort check is not needed */
1976 if ((current_proc() != kernproc
) && current_thread_aborted()) {
1979 /* mask off thread and process blocked signals. */
1981 pending_sigs
= proc_pendingsignals(p
, NFSINT_SIGMASK
);
1982 if (pending_sigs
&& (nmp
->nm_flag
& NFSMNT_INT
) != 0)
1988 * Lock a socket against others.
1989 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1990 * and also to avoid race conditions between the processes with nfs requests
1991 * in progress when a reconnect is necessary.
1999 int error
, slpflag
= 0, slptimeo
= 0;
2001 if (rep
->r_nmp
== NULL
)
2003 statep
= &rep
->r_nmp
->nm_state
;
2006 if (rep
->r_nmp
->nm_flag
& NFSMNT_INT
)
2008 while (*statep
& NFSSTA_SNDLOCK
) {
2009 error
= nfs_sigintr(rep
->r_nmp
, rep
, p
);
2012 *statep
|= NFSSTA_WANTSND
;
2013 if (p
!= NULL
&& (proc_noremotehang(p
)) != 0)
2015 tsleep((caddr_t
)statep
, slpflag
| (PZERO
- 1), "nfsndlck", slptimeo
);
2016 if (slpflag
== PCATCH
) {
2021 * Make sure while we slept that the mountpoint didn't go away.
2022 * nfs_sigintr and callers expect it in tact.
2025 return (ENXIO
); /* don't have lock until out of loop */
2027 *statep
|= NFSSTA_SNDLOCK
;
2032 * Unlock the stream socket for others.
2040 if (rep
->r_nmp
== NULL
)
2042 statep
= &rep
->r_nmp
->nm_state
;
2043 if ((*statep
& NFSSTA_SNDLOCK
) == 0)
2044 panic("nfs sndunlock");
2045 *statep
&= ~NFSSTA_SNDLOCK
;
2046 if (*statep
& NFSSTA_WANTSND
) {
2047 *statep
&= ~NFSSTA_WANTSND
;
2048 wakeup((caddr_t
)statep
);
2053 nfs_rcvlock(struct nfsreq
*rep
)
2056 int error
, slpflag
, slptimeo
= 0;
2058 /* make sure we still have our mountpoint */
2060 if (rep
->r_mrep
!= NULL
)
2065 statep
= &rep
->r_nmp
->nm_state
;
2066 FSDBG_TOP(534, rep
->r_xid
, rep
, rep
->r_nmp
, *statep
);
2067 if (rep
->r_nmp
->nm_flag
& NFSMNT_INT
)
2071 while (*statep
& NFSSTA_RCVLOCK
) {
2072 if ((error
= nfs_sigintr(rep
->r_nmp
, rep
, rep
->r_procp
))) {
2073 FSDBG_BOT(534, rep
->r_xid
, rep
, rep
->r_nmp
, 0x100);
2075 } else if (rep
->r_mrep
!= NULL
) {
2077 * Don't bother sleeping if reply already arrived
2079 FSDBG_BOT(534, rep
->r_xid
, rep
, rep
->r_nmp
, 0x101);
2082 FSDBG(534, rep
->r_xid
, rep
, rep
->r_nmp
, 0x102);
2083 *statep
|= NFSSTA_WANTRCV
;
2085 * We need to poll if we're P_NOREMOTEHANG so that we
2086 * call nfs_sigintr periodically above.
2088 if (rep
->r_procp
!= NULL
&&
2089 (proc_noremotehang(rep
->r_procp
)) != 0)
2091 tsleep((caddr_t
)statep
, slpflag
| (PZERO
- 1), "nfsrcvlk", slptimeo
);
2092 if (slpflag
== PCATCH
) {
2097 * Make sure while we slept that the mountpoint didn't go away.
2098 * nfs_sigintr and caller nfs_reply expect it intact.
2101 FSDBG_BOT(534, rep
->r_xid
, rep
, rep
->r_nmp
, 0x103);
2102 return (ENXIO
); /* don't have lock until out of loop */
2106 * nfs_reply will handle it if reply already arrived.
2107 * (We may have slept or been preempted).
2109 FSDBG_BOT(534, rep
->r_xid
, rep
, rep
->r_nmp
, *statep
);
2110 *statep
|= NFSSTA_RCVLOCK
;
2115 * Unlock the stream socket for others.
2118 nfs_rcvunlock(struct nfsreq
*rep
)
2122 if (rep
->r_nmp
== NULL
)
2124 statep
= &rep
->r_nmp
->nm_state
;
2126 FSDBG(533, statep
, *statep
, 0, 0);
2127 if ((*statep
& NFSSTA_RCVLOCK
) == 0)
2128 panic("nfs rcvunlock");
2129 *statep
&= ~NFSSTA_RCVLOCK
;
2130 if (*statep
& NFSSTA_WANTRCV
) {
2131 *statep
&= ~NFSSTA_WANTRCV
;
2132 wakeup((caddr_t
)statep
);
2137 #ifndef NFS_NOSERVER
2139 * Socket upcall routine for the nfsd sockets.
2140 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2141 * Essentially do as much as possible non-blocking, else punt and it will
2142 * be called with MBUF_WAITOK from an nfsd.
2145 nfsrv_rcv(socket_t so
, caddr_t arg
, int waitflag
)
2147 struct nfssvc_sock
*slp
= (struct nfssvc_sock
*)arg
;
2149 if (!nfs_numnfsd
|| !(slp
->ns_flag
& SLP_VALID
))
2152 lck_rw_lock_exclusive(&slp
->ns_rwlock
);
2153 nfsrv_rcv_locked(so
, slp
, waitflag
);
2154 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2157 nfsrv_rcv_locked(socket_t so
, struct nfssvc_sock
*slp
, int waitflag
)
2159 mbuf_t m
, mp
, mhck
, m2
;
2160 int ns_flag
=0, error
;
2164 if ((slp
->ns_flag
& SLP_VALID
) == 0) {
2165 if (waitflag
== MBUF_DONTWAIT
)
2166 lck_rw_done(&slp
->ns_rwlock
);
2172 * Define this to test for nfsds handling this under heavy load.
2174 if (waitflag
== MBUF_DONTWAIT
) {
2175 ns_flag
= SLP_NEEDQ
;
2179 if (slp
->ns_sotype
== SOCK_STREAM
) {
2181 * If there are already records on the queue, defer soreceive()
2182 * to an nfsd so that there is feedback to the TCP layer that
2183 * the nfs servers are heavily loaded.
2185 if (slp
->ns_rec
&& waitflag
== MBUF_DONTWAIT
) {
2186 ns_flag
= SLP_NEEDQ
;
2193 bytes_read
= 1000000000;
2194 error
= sock_receivembuf(so
, NULL
, &mp
, MSG_DONTWAIT
, &bytes_read
);
2195 if (error
|| mp
== NULL
) {
2196 if (error
== EWOULDBLOCK
)
2197 ns_flag
= SLP_NEEDQ
;
2199 ns_flag
= SLP_DISCONN
;
2203 if (slp
->ns_rawend
) {
2204 if ((error
= mbuf_setnext(slp
->ns_rawend
, m
)))
2205 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error
);
2206 slp
->ns_cc
+= bytes_read
;
2209 slp
->ns_cc
= bytes_read
;
2211 while ((m2
= mbuf_next(m
)))
2216 * Now try and parse record(s) out of the raw stream data.
2218 error
= nfsrv_getstream(slp
, waitflag
);
2221 ns_flag
= SLP_DISCONN
;
2223 ns_flag
= SLP_NEEDQ
;
2226 struct sockaddr_storage nam
;
2228 bzero(&msg
, sizeof(msg
));
2229 msg
.msg_name
= (caddr_t
)&nam
;
2230 msg
.msg_namelen
= sizeof(nam
);
2233 bytes_read
= 1000000000;
2234 error
= sock_receivembuf(so
, &msg
, &mp
, MSG_DONTWAIT
| MSG_NEEDSA
, &bytes_read
);
2236 if (msg
.msg_name
&& (mbuf_get(MBUF_WAITOK
, MBUF_TYPE_SONAME
, &mhck
) == 0)) {
2237 mbuf_setlen(mhck
, nam
.ss_len
);
2238 bcopy(&nam
, mbuf_data(mhck
), nam
.ss_len
);
2240 if (mbuf_setnext(m
, mp
)) {
2241 /* trouble... just drop it */
2242 printf("nfsrv_rcv: mbuf_setnext failed\n");
2250 mbuf_setnextpkt(slp
->ns_recend
, m
);
2254 mbuf_setnextpkt(m
, NULL
);
2259 * This may be needed in the future to support
2260 * non-byte-stream connection-oriented protocols
2264 * This (slp->ns_sotype == SOCK_STREAM) should really
2265 * be a check for PR_CONNREQUIRED.
2267 if ((slp
->ns_sotype
== SOCK_STREAM
)
2268 && error
!= EWOULDBLOCK
) {
2269 ns_flag
= SLP_DISCONN
;
2278 * Now try and process the request records, non-blocking.
2282 slp
->ns_flag
|= ns_flag
;
2283 if (waitflag
== MBUF_DONTWAIT
) {
2284 int wake
= (slp
->ns_rec
|| (slp
->ns_flag
& (SLP_NEEDQ
| SLP_DISCONN
)));
2285 lck_rw_done(&slp
->ns_rwlock
);
2286 if (wake
&& nfs_numnfsd
) {
2287 lck_mtx_lock(nfsd_mutex
);
2288 nfsrv_wakenfsd(slp
);
2289 lck_mtx_unlock(nfsd_mutex
);
2295 * Try and extract an RPC request from the mbuf data list received on a
2296 * stream socket. The "waitflag" argument indicates whether or not it
2300 nfsrv_getstream(slp
, waitflag
)
2301 struct nfssvc_sock
*slp
;
2305 char *cp1
, *cp2
, *mdata
;
2306 int len
, mlen
, error
;
2307 mbuf_t om
, m2
, recm
;
2310 if (slp
->ns_flag
& SLP_GETSTREAM
)
2311 panic("nfs getstream");
2312 slp
->ns_flag
|= SLP_GETSTREAM
;
2314 if (slp
->ns_reclen
== 0) {
2315 if (slp
->ns_cc
< NFSX_UNSIGNED
) {
2316 slp
->ns_flag
&= ~SLP_GETSTREAM
;
2320 mdata
= mbuf_data(m
);
2322 if (mlen
>= NFSX_UNSIGNED
) {
2323 bcopy(mdata
, (caddr_t
)&recmark
, NFSX_UNSIGNED
);
2324 mdata
+= NFSX_UNSIGNED
;
2325 mlen
-= NFSX_UNSIGNED
;
2326 mbuf_setdata(m
, mdata
, mlen
);
2328 cp1
= (caddr_t
)&recmark
;
2330 while (cp1
< ((caddr_t
)&recmark
) + NFSX_UNSIGNED
) {
2338 mbuf_setdata(m
, cp2
, mlen
);
2341 slp
->ns_cc
-= NFSX_UNSIGNED
;
2342 recmark
= ntohl(recmark
);
2343 slp
->ns_reclen
= recmark
& ~0x80000000;
2344 if (recmark
& 0x80000000)
2345 slp
->ns_flag
|= SLP_LASTFRAG
;
2347 slp
->ns_flag
&= ~SLP_LASTFRAG
;
2348 if (slp
->ns_reclen
< NFS_MINPACKET
|| slp
->ns_reclen
> NFS_MAXPACKET
) {
2349 slp
->ns_flag
&= ~SLP_GETSTREAM
;
2355 * Now get the record part.
2357 * Note that slp->ns_reclen may be 0. Linux sometimes
2358 * generates 0-length RPCs
2361 if (slp
->ns_cc
== slp
->ns_reclen
) {
2363 slp
->ns_raw
= slp
->ns_rawend
= NULL
;
2364 slp
->ns_cc
= slp
->ns_reclen
= 0;
2365 } else if (slp
->ns_cc
> slp
->ns_reclen
) {
2369 mdata
= mbuf_data(m
);
2371 while (len
< slp
->ns_reclen
) {
2372 if ((len
+ mlen
) > slp
->ns_reclen
) {
2373 if (mbuf_copym(m
, 0, slp
->ns_reclen
- len
, waitflag
, &m2
)) {
2374 slp
->ns_flag
&= ~SLP_GETSTREAM
;
2375 return (EWOULDBLOCK
);
2378 if (mbuf_setnext(om
, m2
)) {
2379 /* trouble... just drop it */
2380 printf("nfsrv_getstream: mbuf_setnext failed\n");
2382 slp
->ns_flag
&= ~SLP_GETSTREAM
;
2383 return (EWOULDBLOCK
);
2389 mdata
+= slp
->ns_reclen
- len
;
2390 mlen
-= slp
->ns_reclen
- len
;
2391 mbuf_setdata(m
, mdata
, mlen
);
2392 len
= slp
->ns_reclen
;
2393 } else if ((len
+ mlen
) == slp
->ns_reclen
) {
2398 if (mbuf_setnext(om
, NULL
)) {
2399 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2400 slp
->ns_flag
&= ~SLP_GETSTREAM
;
2401 return (EWOULDBLOCK
);
2404 mdata
= mbuf_data(m
);
2410 mdata
= mbuf_data(m
);
2417 slp
->ns_flag
&= ~SLP_GETSTREAM
;
2422 * Accumulate the fragments into a record.
2424 if (slp
->ns_frag
== NULL
) {
2425 slp
->ns_frag
= recm
;
2428 while ((m2
= mbuf_next(m
)))
2430 if ((error
= mbuf_setnext(m
, recm
)))
2431 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error
);
2433 if (slp
->ns_flag
& SLP_LASTFRAG
) {
2435 mbuf_setnextpkt(slp
->ns_recend
, slp
->ns_frag
);
2437 slp
->ns_rec
= slp
->ns_frag
;
2438 slp
->ns_recend
= slp
->ns_frag
;
2439 slp
->ns_frag
= NULL
;
2445 * Parse an RPC header.
2448 nfsrv_dorec(slp
, nfsd
, ndp
)
2449 struct nfssvc_sock
*slp
;
2451 struct nfsrv_descript
**ndp
;
2455 struct nfsrv_descript
*nd
;
2459 if ((slp
->ns_flag
& SLP_VALID
) == 0 || (slp
->ns_rec
== NULL
))
2461 MALLOC_ZONE(nd
, struct nfsrv_descript
*,
2462 sizeof (struct nfsrv_descript
), M_NFSRVDESC
, M_WAITOK
);
2466 slp
->ns_rec
= mbuf_nextpkt(m
);
2468 mbuf_setnextpkt(m
, NULL
);
2470 slp
->ns_recend
= NULL
;
2471 if (mbuf_type(m
) == MBUF_TYPE_SONAME
) {
2474 if ((error
= mbuf_setnext(nam
, NULL
)))
2475 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error
);
2478 nd
->nd_md
= nd
->nd_mrep
= m
;
2480 nd
->nd_dpos
= mbuf_data(m
);
2481 error
= nfs_getreq(nd
, nfsd
, TRUE
);
2485 FREE_ZONE((caddr_t
)nd
, sizeof *nd
, M_NFSRVDESC
);
2494 * Parse an RPC request
2496 * - fill in the cred struct.
2499 nfs_getreq(nd
, nfsd
, has_header
)
2500 struct nfsrv_descript
*nd
;
2508 caddr_t dpos
, cp2
, cp
;
2509 u_long nfsvers
, auth_type
;
2511 int error
= 0, ticklen
;
2513 struct nfsuid
*nuidp
;
2517 struct ucred temp_cred
;
2518 struct timeval tvin
, tvout
, now
;
2519 char uio_buf
[ UIO_SIZEOF(1) ];
2520 #if 0 /* until encrypted keys are implemented */
2521 NFSKERBKEYSCHED_T keys
; /* stores key schedule */
2530 nfsm_dissect(tl
, u_long
*, 10 * NFSX_UNSIGNED
);
2531 nd
->nd_retxid
= fxdr_unsigned(u_long
, *tl
++);
2532 if (*tl
++ != rpc_call
) {
2537 nfsm_dissect(tl
, u_long
*, 8 * NFSX_UNSIGNED
);
2540 if (*tl
++ != rpc_vers
) {
2541 nd
->nd_repstat
= ERPCMISMATCH
;
2542 nd
->nd_procnum
= NFSPROC_NOOP
;
2545 if (*tl
!= nfs_prog
) {
2546 nd
->nd_repstat
= EPROGUNAVAIL
;
2547 nd
->nd_procnum
= NFSPROC_NOOP
;
2551 nfsvers
= fxdr_unsigned(u_long
, *tl
++);
2552 if ((nfsvers
< NFS_VER2
) || (nfsvers
> NFS_VER3
)) {
2553 nd
->nd_repstat
= EPROGMISMATCH
;
2554 nd
->nd_procnum
= NFSPROC_NOOP
;
2557 else if (nfsvers
== NFS_VER3
)
2558 nd
->nd_flag
= ND_NFSV3
;
2559 nd
->nd_procnum
= fxdr_unsigned(u_long
, *tl
++);
2560 if (nd
->nd_procnum
== NFSPROC_NULL
)
2562 if ((nd
->nd_procnum
>= NFS_NPROCS
) ||
2563 (!nd
->nd_flag
&& nd
->nd_procnum
> NFSV2PROC_STATFS
)) {
2564 nd
->nd_repstat
= EPROCUNAVAIL
;
2565 nd
->nd_procnum
= NFSPROC_NOOP
;
2568 if ((nd
->nd_flag
& ND_NFSV3
) == 0)
2569 nd
->nd_procnum
= nfsv3_procid
[nd
->nd_procnum
];
2571 len
= fxdr_unsigned(int, *tl
++);
2572 if (len
< 0 || len
> RPCAUTH_MAXSIZ
) {
2577 nd
->nd_flag
&= ~ND_KERBAUTH
;
2579 * Handle auth_unix or auth_kerb.
2581 if (auth_type
== rpc_auth_unix
) {
2582 len
= fxdr_unsigned(int, *++tl
);
2583 if (len
< 0 || len
> NFS_MAXNAMLEN
) {
2587 bzero(&temp_cred
, sizeof(temp_cred
));
2588 nfsm_adv(nfsm_rndup(len
));
2589 nfsm_dissect(tl
, u_long
*, 3 * NFSX_UNSIGNED
);
2590 user_id
= fxdr_unsigned(uid_t
, *tl
++);
2591 group_id
= fxdr_unsigned(gid_t
, *tl
++);
2592 temp_cred
.cr_groups
[0] = group_id
;
2593 len
= fxdr_unsigned(int, *tl
);
2594 if (len
< 0 || len
> RPCAUTH_UNIXGIDS
) {
2598 nfsm_dissect(tl
, u_long
*, (len
+ 2) * NFSX_UNSIGNED
);
2599 for (i
= 1; i
<= len
; i
++)
2601 temp_cred
.cr_groups
[i
] = fxdr_unsigned(gid_t
, *tl
++);
2604 ngroups
= (len
>= NGROUPS
) ? NGROUPS
: (len
+ 1);
2606 nfsrvw_sort(&temp_cred
.cr_groups
[0], ngroups
);
2607 len
= fxdr_unsigned(int, *++tl
);
2608 if (len
< 0 || len
> RPCAUTH_MAXSIZ
) {
2612 temp_cred
.cr_uid
= user_id
;
2613 temp_cred
.cr_ngroups
= ngroups
;
2614 nd
->nd_cr
= kauth_cred_create(&temp_cred
);
2615 if (nd
->nd_cr
== NULL
) {
2616 nd
->nd_repstat
= ENOMEM
;
2617 nd
->nd_procnum
= NFSPROC_NOOP
;
2621 nfsm_adv(nfsm_rndup(len
));
2622 } else if (auth_type
== rpc_auth_kerb
) {
2623 switch (fxdr_unsigned(int, *tl
++)) {
2624 case RPCAKN_FULLNAME
:
2625 ticklen
= fxdr_unsigned(int, *tl
);
2626 *((u_long
*)nfsd
->nfsd_authstr
) = *tl
;
2627 uiop
= uio_createwithbuffer(1, 0, UIO_SYSSPACE
, UIO_READ
,
2628 &uio_buf
[0], sizeof(uio_buf
));
2630 nd
->nd_repstat
= ENOMEM
;
2631 nd
->nd_procnum
= NFSPROC_NOOP
;
2635 // LP64todo - fix this
2636 nfsd
->nfsd_authlen
= (nfsm_rndup(ticklen
) + (NFSX_UNSIGNED
* 2));
2637 if ((nfsm_rndup(ticklen
) + NFSX_UNSIGNED
) > (len
- 2 * NFSX_UNSIGNED
)) {
2641 uio_addiov(uiop
, CAST_USER_ADDR_T(&nfsd
->nfsd_authstr
[4]), RPCAUTH_MAXSIZ
- 4);
2642 // LP64todo - fix this
2643 nfsm_mtouio(uiop
, uio_resid(uiop
));
2644 nfsm_dissect(tl
, u_long
*, 2 * NFSX_UNSIGNED
);
2645 if (*tl
++ != rpc_auth_kerb
||
2646 fxdr_unsigned(int, *tl
) != 4 * NFSX_UNSIGNED
) {
2647 printf("Bad kerb verifier\n");
2648 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADVERF
);
2649 nd
->nd_procnum
= NFSPROC_NOOP
;
2652 nfsm_dissect(cp
, caddr_t
, 4 * NFSX_UNSIGNED
);
2654 if (fxdr_unsigned(int, *tl
) != RPCAKN_FULLNAME
) {
2655 printf("Not fullname kerb verifier\n");
2656 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADVERF
);
2657 nd
->nd_procnum
= NFSPROC_NOOP
;
2660 cp
+= NFSX_UNSIGNED
;
2661 bcopy(cp
, nfsd
->nfsd_verfstr
, 3 * NFSX_UNSIGNED
);
2662 nfsd
->nfsd_verflen
= 3 * NFSX_UNSIGNED
;
2663 nd
->nd_flag
|= ND_KERBFULL
;
2664 nfsd
->nfsd_flag
|= NFSD_NEEDAUTH
;
2666 case RPCAKN_NICKNAME
:
2667 if (len
!= 2 * NFSX_UNSIGNED
) {
2668 printf("Kerb nickname short\n");
2669 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADCRED
);
2670 nd
->nd_procnum
= NFSPROC_NOOP
;
2673 nickuid
= fxdr_unsigned(uid_t
, *tl
);
2674 nfsm_dissect(tl
, u_long
*, 2 * NFSX_UNSIGNED
);
2675 if (*tl
++ != rpc_auth_kerb
||
2676 fxdr_unsigned(int, *tl
) != 3 * NFSX_UNSIGNED
) {
2677 printf("Kerb nick verifier bad\n");
2678 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADVERF
);
2679 nd
->nd_procnum
= NFSPROC_NOOP
;
2682 nfsm_dissect(tl
, u_long
*, 3 * NFSX_UNSIGNED
);
2683 tvin
.tv_sec
= *tl
++;
2686 for (nuidp
= NUIDHASH(nfsd
->nfsd_slp
,nickuid
)->lh_first
;
2687 nuidp
!= 0; nuidp
= nuidp
->nu_hash
.le_next
) {
2688 if (kauth_cred_getuid(nuidp
->nu_cr
) == nickuid
&&
2690 netaddr_match(NU_NETFAM(nuidp
),
2691 &nuidp
->nu_haddr
, nd
->nd_nam2
)))
2696 (NFSERR_AUTHERR
|AUTH_REJECTCRED
);
2697 nd
->nd_procnum
= NFSPROC_NOOP
;
2702 * Now, decrypt the timestamp using the session key
2709 tvout
.tv_sec
= fxdr_unsigned(long, tvout
.tv_sec
);
2710 tvout
.tv_usec
= fxdr_unsigned(long, tvout
.tv_usec
);
2712 if (nuidp
->nu_expire
< now
.tv_sec
||
2713 nuidp
->nu_timestamp
.tv_sec
> tvout
.tv_sec
||
2714 (nuidp
->nu_timestamp
.tv_sec
== tvout
.tv_sec
&&
2715 nuidp
->nu_timestamp
.tv_usec
> tvout
.tv_usec
)) {
2716 nuidp
->nu_expire
= 0;
2718 (NFSERR_AUTHERR
|AUTH_REJECTVERF
);
2719 nd
->nd_procnum
= NFSPROC_NOOP
;
2722 bzero(&temp_cred
, sizeof(temp_cred
));
2723 ngroups
= nuidp
->nu_cr
->cr_ngroups
;
2724 for (i
= 0; i
< ngroups
; i
++)
2725 temp_cred
.cr_groups
[i
] = nuidp
->nu_cr
->cr_groups
[i
];
2727 nfsrvw_sort(&temp_cred
.cr_groups
[0], ngroups
);
2729 temp_cred
.cr_uid
= kauth_cred_getuid(nuidp
->nu_cr
);
2730 temp_cred
.cr_ngroups
= ngroups
;
2731 nd
->nd_cr
= kauth_cred_create(&temp_cred
);
2733 nd
->nd_repstat
= ENOMEM
;
2734 nd
->nd_procnum
= NFSPROC_NOOP
;
2737 nd
->nd_flag
|= ND_KERBNICK
;
2740 nd
->nd_repstat
= (NFSERR_AUTHERR
| AUTH_REJECTCRED
);
2741 nd
->nd_procnum
= NFSPROC_NOOP
;
2750 kauth_cred_rele(nd
->nd_cr
);
2755 * Search for a sleeping nfsd and wake it up.
2756 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2757 * running nfsds will go look for the work in the nfssvc_sock list.
2758 * Note: Must be called with nfsd_mutex held.
2761 nfsrv_wakenfsd(struct nfssvc_sock
*slp
)
2765 if ((slp
->ns_flag
& SLP_VALID
) == 0)
2768 lck_rw_lock_exclusive(&slp
->ns_rwlock
);
2771 TAILQ_FOREACH(nd
, &nfsd_head
, nfsd_chain
) {
2772 if (nd
->nfsd_flag
& NFSD_WAITING
) {
2773 nd
->nfsd_flag
&= ~NFSD_WAITING
;
2775 panic("nfsd wakeup");
2778 lck_rw_done(&slp
->ns_rwlock
);
2779 wakeup((caddr_t
)nd
);
2785 slp
->ns_flag
|= SLP_DOREC
;
2787 lck_rw_done(&slp
->ns_rwlock
);
2789 nfsd_head_flag
|= NFSD_CHECKSLP
;
2791 #endif /* NFS_NOSERVER */
2802 tpr
= tprintf_open(p
);
2806 tprintf(tpr
, "nfs server %s: %s, error %d\n", server
, msg
,
2809 tprintf(tpr
, "nfs server %s: %s\n", server
, msg
);
2815 nfs_down(nmp
, proc
, error
, flags
, msg
)
2816 struct nfsmount
*nmp
;
2823 if ((flags
& NFSSTA_TIMEO
) && !(nmp
->nm_state
& NFSSTA_TIMEO
)) {
2824 vfs_event_signal(&vfs_statfs(nmp
->nm_mountp
)->f_fsid
, VQ_NOTRESP
, 0);
2825 nmp
->nm_state
|= NFSSTA_TIMEO
;
2827 if ((flags
& NFSSTA_LOCKTIMEO
) && !(nmp
->nm_state
& NFSSTA_LOCKTIMEO
)) {
2828 vfs_event_signal(&vfs_statfs(nmp
->nm_mountp
)->f_fsid
, VQ_NOTRESPLOCK
, 0);
2829 nmp
->nm_state
|= NFSSTA_LOCKTIMEO
;
2831 nfs_msg(proc
, vfs_statfs(nmp
->nm_mountp
)->f_mntfromname
, msg
, error
);
2835 nfs_up(nmp
, proc
, flags
, msg
)
2836 struct nfsmount
*nmp
;
2844 nfs_msg(proc
, vfs_statfs(nmp
->nm_mountp
)->f_mntfromname
, msg
, 0);
2845 if ((flags
& NFSSTA_TIMEO
) && (nmp
->nm_state
& NFSSTA_TIMEO
)) {
2846 nmp
->nm_state
&= ~NFSSTA_TIMEO
;
2847 vfs_event_signal(&vfs_statfs(nmp
->nm_mountp
)->f_fsid
, VQ_NOTRESP
, 1);
2849 if ((flags
& NFSSTA_LOCKTIMEO
) && (nmp
->nm_state
& NFSSTA_LOCKTIMEO
)) {
2850 nmp
->nm_state
&= ~NFSSTA_LOCKTIMEO
;
2851 vfs_event_signal(&vfs_statfs(nmp
->nm_mountp
)->f_fsid
, VQ_NOTRESPLOCK
, 1);