/*
 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/kpi_mbuf.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/uio_internal.h>
#include <libkern/OSAtomic.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <sys/user.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfsrtt.h>

#include <sys/kdebug.h>

#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
/*
 * Estimate rto for an nfs rpc sent via an unreliable datagram.
 * Use the mean and mean deviation of rtt for the appropriate type of rpc
 * for the frequent rpcs and a default for the others.
 * The justification for doing "other" this way is that these rpcs
 * happen so infrequently that timer estimates would probably be stale.
 * Also, since many of these rpcs are
 * non-idempotent, a conservative timeout is desired.
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
#define	NFS_RTO(n, t) \
	((t) == 0 ? (n)->nm_timeo : \
	 ((t) < 3 ? \
	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
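/*
 * A worked reading of NFS_RTO (matching the table above): nm_srtt is kept
 * scaled by 8 and nm_sdrtt by 4 (see the smoothing in nfs_reply), so for
 * timers 1 and 2 the expression is roughly ((8A >> 2) + 4D) >> 1 = A + 2D,
 * and for timers 3 and 4 it is (8A >> 3) + 4D = A + 4D, where A is the
 * smoothed rtt and D the smoothed deviation.
 */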
/*
 * External data, mostly RPC constants in XDR form
 */
extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
	rpc_msgaccepted, rpc_call, rpc_autherr,
	rpc_auth_kerb;
extern u_long nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;
extern u_long nfs_xidwrap;

/*
 * Defines which timer to use for the procnum.
 * 0 - default
 * 1 - getattr
 * 2 - lookup
 * 3 - read
 * 4 - write
 */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
};

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
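/*
 * Note on the scaling: one in-flight rpc counts as NFS_CWNDSCALE (256) in
 * nm_sent, so NFS_MAXCWND corresponds to at most 32 outstanding rpcs.
 * nfs_backoff is indexed by nm_timeouts (capped at 8 in nfs_timer) and
 * multiplies the retransmit timeout, giving exponential backoff up to 256x.
 */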
int	nfsrtton = 0;
struct nfsrtt nfsrtt;

static int	nfs_rcvlock(struct nfsreq *);
static void	nfs_rcvunlock(struct nfsreq *);
static int	nfs_receive(struct nfsreq *rep, mbuf_t *mp);
static int	nfs_reconnect(struct nfsreq *rep);
static void	nfs_repdequeue(struct nfsreq *rep);

/* XXX */
boolean_t	current_thread_aborted(void);
kern_return_t	thread_terminate(thread_t);

#ifndef NFS_NOSERVER
static int	nfsrv_getstream(struct nfssvc_sock *, int);

int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
				struct nfssvc_sock *slp,
				proc_t procp,
				mbuf_t *mreqp) = {
	nfsrv_null,
	nfsrv_getattr,
	nfsrv_setattr,
	nfsrv_lookup,
	nfsrv3_access,
	nfsrv_readlink,
	nfsrv_read,
	nfsrv_write,
	nfsrv_create,
	nfsrv_mkdir,
	nfsrv_symlink,
	nfsrv_mknod,
	nfsrv_remove,
	nfsrv_rmdir,
	nfsrv_rename,
	nfsrv_link,
	nfsrv_readdir,
	nfsrv_readdirplus,
	nfsrv_statfs,
	nfsrv_fsinfo,
	nfsrv_pathconf,
	nfsrv_commit,
	nfsrv_noop
};
#endif /* NFS_NOSERVER */


/*
 * attempt to bind a socket to a reserved port
 */
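/*
 * (The loop below starts at IPPORT_RESERVED - 1, i.e. port 1023, and walks
 * downward past ports already in use, giving up once it reaches
 * IPPORT_RESERVED / 2, so only ports 513-1023 are tried.)
 */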
static int
nfs_bind_resv(struct nfsmount *nmp)
{
	socket_t so = nmp->nm_so;
	struct sockaddr_in sin;
	int error;
	u_short tport;

	if (!so)
		return (EINVAL);

	sin.sin_len = sizeof (struct sockaddr_in);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	tport = IPPORT_RESERVED - 1;
	sin.sin_port = htons(tport);

	while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
	       (--tport > IPPORT_RESERVED / 2))
		sin.sin_port = htons(tport);
	return (error);
}

/*
 * variables for managing the nfs_bind_resv_thread
 */
int nfs_resv_mounts = 0;
static int nfs_bind_resv_thread_state = 0;
#define NFS_BIND_RESV_THREAD_STATE_INITTED	1
#define NFS_BIND_RESV_THREAD_STATE_RUNNING	2
lck_grp_t *nfs_bind_resv_lck_grp;
lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
lck_attr_t *nfs_bind_resv_lck_attr;
lck_mtx_t *nfs_bind_resv_mutex;
struct nfs_bind_resv_request {
	TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
	struct nfsmount *brr_nmp;
	int brr_error;
};
static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;

/*
 * thread to handle any reserved port bind requests
 */
static void
nfs_bind_resv_thread(void)
{
	struct nfs_bind_resv_request *brreq;

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;

	while (nfs_resv_mounts > 0) {
		lck_mtx_lock(nfs_bind_resv_mutex);
		while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
			TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
			lck_mtx_unlock(nfs_bind_resv_mutex);
			brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
			wakeup(brreq);
			lck_mtx_lock(nfs_bind_resv_mutex);
		}
		msleep((caddr_t)&nfs_bind_resv_request_queue,
			nfs_bind_resv_mutex, PSOCK | PDROP,
			"nfs_bind_resv_request_queue", 0);
	}

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
	(void) thread_terminate(current_thread());
}

int
nfs_bind_resv_thread_wake(void)
{
	if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
		return (EIO);
	wakeup(&nfs_bind_resv_request_queue);
	return (0);
}

/*
 * underprivileged procs call this to request nfs_bind_resv_thread
 * to perform the reserved port binding for them.
 */
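/*
 * (Handoff pattern: the request is allocated on the caller's stack, queued
 * under nfs_bind_resv_mutex, and the caller tsleep()s on &brreq until the
 * thread has filled in brr_error and called wakeup(brreq).)
 */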
static int
nfs_bind_resv_nopriv(struct nfsmount *nmp)
{
	struct nfs_bind_resv_request brreq;
	int error;

	if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
		if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
			nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
			lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
			nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
			nfs_bind_resv_lck_attr = lck_attr_alloc_init();
			nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
			TAILQ_INIT(&nfs_bind_resv_request_queue);
			nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
		}
		kernel_thread(kernel_task, nfs_bind_resv_thread);
		nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
	}

	brreq.brr_nmp = nmp;
	brreq.brr_error = 0;

	lck_mtx_lock(nfs_bind_resv_mutex);
	TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
	lck_mtx_unlock(nfs_bind_resv_mutex);

	error = nfs_bind_resv_thread_wake();
	if (error) {
		TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
		/* Note: we might be able to simply restart the thread */
		return (error);
	}

	tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);

	return (brreq.brr_error);
}

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr on error.
 */
int
nfs_connect(
	struct nfsmount *nmp,
	__unused struct nfsreq *rep)
{
	socket_t so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct timeval timeo;

	nmp->nm_so = 0;
	saddr = mbuf_data(nmp->nm_nam);
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
			nmp->nm_soproto, 0, 0, &nmp->nm_so);
	if (error) {
		goto bad;
	}
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		proc_t p;
		/*
		 * sobind() requires current_proc() to have superuser privs.
		 * If this bind is part of a reconnect, and the current proc
		 * doesn't have superuser privs, we hand the sobind() off to
		 * a kernel thread to process.
		 */
		if ((nmp->nm_state & NFSSTA_MOUNTED) &&
		    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
			/* request nfs_bind_resv_thread() to do bind */
			error = nfs_bind_resv_nopriv(nmp);
		} else {
			error = nfs_bind_resv(nmp);
		}
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		struct timeval tv;
		tv.tv_sec = 2;
		tv.tv_usec = 0;
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && error != EINPROGRESS) {
			goto bad;
		}

		while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
			if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
				goto bad;
			}
		}
	}

	/*
	 * Always time out on receive; this allows us to reconnect the
	 * socket to deal with network changes.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = 2;
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
		timeo.tv_sec = 5;
	} else {
		timeo.tv_sec = 0;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));

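	/*
	 * (Buffer sizing below: room for ~3 writes in the send buffer and
	 * readahead+1 reads (minimum 2) in the receive buffer, each padded
	 * by NFS_MAXPKTHDR; the stream case adds sizeof(u_long) per request
	 * for the RPC record mark, and both are clamped to NFS_MAXSOCKBUF.)
	 */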
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else {
		int proto;
		int on = 1;

		sock_gettype(so, NULL, NULL, &proto);
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");

		// Assume that SOCK_STREAM always requires a connection
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

		if (proto == IPPROTO_TCP) {
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
		}

		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	}

	if (sndreserve > NFS_MAXSOCKBUF)
		sndreserve = NFS_MAXSOCKBUF;
	if (rcvreserve > NFS_MAXSOCKBUF)
		rcvreserve = NFS_MAXSOCKBUF;
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
	if (error) {
		goto bad;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
	if (error) {
		goto bad;
	}

	sock_nointerrupt(so, 1);

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
static int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep))) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		if (error == EIO)
			return (EIO);
		nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
			"can not connect");
		rep->r_flags |= R_TPRINTFMSG;
		if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
			/* we're not yet completely mounted and */
			/* we can't reconnect, so we fail */
			return (error);
		}
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
			return (error);
		tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp)
			rp->r_flags |= R_MUSTRESEND;
	}
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	socket_t so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = 0;
		sock_shutdown(so, 2);
		sock_close(so);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(so, nam, top, rep)
	socket_t so;
	mbuf_t nam;
	mbuf_t top;
	struct nfsreq *rep;
{
	struct sockaddr *sendnam;
	int error, error2, sotype, flags;
	u_long xidqueued = 0;
	struct nfsreq *rp;
	char savenametolog[MAXPATHLEN];
	struct msghdr msg;

	if (rep) {
		error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
		if (error) {
			mbuf_freem(top);
			return (error);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			mbuf_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
			if (rp == rep)
				break;
		if (rp)
			xidqueued = rp->r_xid;
	}
	sock_gettype(so, NULL, &sotype, NULL);
	if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
	    (nam == 0))
		sendnam = (struct sockaddr *)0;
	else
		sendnam = mbuf_data(nam);

	if (sotype == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	/*
	 * Save the name here in case the mount point goes away while we block.
	 * The name lives on the local stack and is large, but we don't
	 * want to risk blocking in a malloc.
	 */
	if (rep)
		strncpy(savenametolog,
			vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
			MAXPATHLEN - 1);
	bzero(&msg, sizeof(msg));
	msg.msg_name = (caddr_t)sendnam;
	msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
	error = sock_sendmbuf(so, &msg, top, flags, NULL);

	if (error) {
		if (rep) {
			if (xidqueued) {
				TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
					if (rp == rep && rp->r_xid == xidqueued)
						break;
				if (!rp)
					panic("nfs_send: error %d xid %x gone",
						error, xidqueued);
			}
			log(LOG_INFO, "nfs send error %d for server %s\n",
				error, savenametolog);
			/*
			 * Deal with errors for the client side.
			 */
			error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
			if (error2) {
				error = error2;
			} else {
				rep->r_flags |= R_MUSTRESEND;
			}
		} else
			log(LOG_INFO, "nfsd send error %d\n", error);

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART && error != EIO &&
		    error != EWOULDBLOCK && error != EPIPE) {
			error = 0;
		}
	}
	return (error);
}

/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
static int
nfs_receive(struct nfsreq *rep, mbuf_t *mp)
{
	socket_t so;
	struct iovec_32 aio;
	mbuf_t m, mlast;
	u_long len, fraglen;
	int error, error2, sotype;
	proc_t p = current_proc();	/* XXX */
	struct msghdr msg;
	size_t rcvlen;
	int lastfragment;

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
			nfs_sndunlock(rep);
			if (error)
				return (error);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(rep);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
			if (!error) {
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
				error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			}
			/*
			 * we also hold rcv lock so rep is still
			 * legit at this point
			 */
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep))) {
					nfs_sndunlock(rep);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(rep);
		if (sotype == SOCK_STREAM) {
			error = 0;
			len = 0;
			lastfragment = 0;
			mlast = NULL;
			while (!error && !lastfragment) {
				aio.iov_base = (uintptr_t) &fraglen;
				aio.iov_len = sizeof(u_long);
				bzero(&msg, sizeof(msg));
				msg.msg_iov = (struct iovec *) &aio;
				msg.msg_iovlen = 1;
				do {
					error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */
						goto shutout;
					if (error == EWOULDBLOCK && rep) {
						error2 = nfs_sigintr(rep->r_nmp, rep, p);
						if (error2)
							error = error2;
					}
				} while (error == EWOULDBLOCK);
				if (!error && rcvlen < aio.iov_len) {
					/* only log a message if we got a partial word */
					if (rcvlen != 0)
						log(LOG_INFO,
							"short receive (%d/%d) from nfs server %s\n",
							rcvlen, sizeof(u_long),
							vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
				}
				if (error)
					goto errout;
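				/*
				 * Decode the record mark just read: the high
				 * bit flags the last fragment of the record,
				 * the low 31 bits give this fragment's length.
				 */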
				lastfragment = ntohl(fraglen) & 0x80000000;
				fraglen = ntohl(fraglen) & ~0x80000000;
				len += fraglen;
				/*
				 * This is SERIOUS! We are out of sync with the sender
				 * and forcing a disconnect/reconnect is all I can do.
				 */
				if (len > NFS_MAXPACKET) {
					log(LOG_ERR, "%s (%d) from nfs server %s\n",
						"impossible RPC record length", len,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EFBIG;
					goto errout;
				}

				m = NULL;
				do {
					rcvlen = fraglen;
					error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */ {
						goto shutout;
					}
				} while (error == EWOULDBLOCK || error == EINTR ||
					 error == ERESTART);

				if (!error && fraglen > rcvlen) {
					log(LOG_INFO,
						"short receive (%d/%d) from nfs server %s\n",
						rcvlen, fraglen,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
					mbuf_freem(m);
				}
				if (!error) {
					if (!*mp) {
						*mp = m;
						mlast = m;
					} else {
						error = mbuf_setnext(mlast, m);
						if (error) {
							printf("nfs_receive: mbuf_setnext failed %d\n", error);
							mbuf_freem(m);
						}
					}
					while (mbuf_next(mlast))
						mlast = mbuf_next(mlast);
				}
			}
		} else {
			bzero(&msg, sizeof(msg));
			do {
				rcvlen = 100000000;
				error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
				if (!rep->r_nmp) /* if unmounted then bailout */ {
					goto shutout;
				}
				if (error == EWOULDBLOCK && rep) {
					error2 = nfs_sigintr(rep->r_nmp, rep, p);
					if (error2) {
						return (error2);
					}
				}
			} while (error == EWOULDBLOCK);

			if ((msg.msg_flags & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len = rcvlen;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			mbuf_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
					"receive error %d from nfs server %s\n", error,
					vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(rep);
			}
		}
	} else {
		/*
		 * We could have failed while rebinding the datagram socket
		 * so we need to attempt to rebind here.
		 */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				nfs_sndunlock(rep);
			}
			if (error)
				return (error);
			if (!rep->r_nmp) /* if unmounted then bailout */
				return (ENXIO);
			so = rep->r_nmp->nm_so;
		}
		bzero(&msg, sizeof(msg));
		len = 0;
		do {
			rcvlen = 1000000;
			error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
			if (!rep->r_nmp) /* if unmounted then bailout */
				goto shutout;
			if (error) {
				error2 = nfs_sigintr(rep->r_nmp, rep, p);
				if (error2) {
					error = error2;
					goto shutout;
				}
			}
			/* Reconnect for all errors. We may be receiving
			 * soft/hard/blocking errors because of a network
			 * change.
			 * XXX: we should rate limit or delay this
			 * to once every N attempts or something.
			 * although TCP doesn't seem to.
			 */
			if (error) {
				error2 = nfs_sndlock(rep);
				if (!error2) {
					error2 = nfs_reconnect(rep);
					if (error2)
						error = error2;
					else if (!rep->r_nmp) /* if unmounted then bailout */
						error = ENXIO;
					else
						so = rep->r_nmp->nm_so;
					nfs_sndunlock(rep);
				} else {
					error = error2;
				}
			}
		} while (error == EWOULDBLOCK);
	}
shutout:
	if (error) {
		mbuf_freem(*mp);
		*mp = NULL;
	}
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
/* ARGSUSED */
int
nfs_reply(myrep)
	struct nfsreq *myrep;
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	long t1;
	mbuf_t mrep, md;
	u_long rxid, *tl;
	caddr_t dpos, cp2;
	int error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 * If nfs_rcvlock() returns EALREADY, that means that
		 * the reply has already been received by another
		 * process and we can return immediately. In this
		 * case, the lock is not taken to avoid races with
		 * other processes.
		 */
		error = nfs_rcvlock(myrep);
		if (error == EALREADY)
			return (0);
		if (error)
			return (error);

		/*
		 * If we slept after putting bits otw, then reply may have
		 * arrived. In which case returning is required, or we
		 * would hang trying to nfs_receive an already received reply.
		 */
		if (myrep->r_mrep != NULL) {
			nfs_rcvunlock(myrep);
			FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
			return (0);
		}
		/*
		 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
		 * is still intact by checks done in nfs_rcvlock.
		 */
		error = nfs_receive(myrep, &mrep);
		/*
		 * Bailout asap if nfsmount struct gone (unmounted).
		 */
		if (!myrep->r_nmp) {
			FSDBG(530, myrep->r_xid, myrep, nmp, -2);
			if (mrep)
				mbuf_freem(mrep);
			return (ENXIO);
		}
		if (error) {
			FSDBG(530, myrep->r_xid, myrep, nmp, error);
			nfs_rcvunlock(myrep);

			/* Bailout asap if nfsmount struct gone (unmounted). */
			if (!myrep->r_nmp) {
				if (mrep)
					mbuf_freem(mrep);
				return (ENXIO);
			}

			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
				if (nmp->nm_so) {
					int clearerror;
					int optlen = sizeof(clearerror);
					sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
				}
				continue;
			}
			if (mrep)
				mbuf_freem(mrep);
			return (error);
		}

		/*
		 * We assume all is fine, but if we did not have an error
		 * and mrep is 0, better not dereference it. nfs_receive
		 * calls soreceive which carefully sets error=0 when it got
		 * errors on sbwait (tsleep). In most cases, I assume that's
		 * so we could go back again. In tcp case, EPIPE is returned.
		 * In udp, case nfs_receive gets back here with no error and no
		 * mrep. Is the right fix to have soreceive check for process
		 * aborted after sbwait and return something non-zero? Should
		 * nfs_receive give an EPIPE? Too risky to play with those
		 * two this late in game for a shutdown problem. Instead,
		 * just check here and get out. (ekn)
		 */
		if (!mrep) {
			nfs_rcvunlock(myrep);
			FSDBG(530, myrep->r_xid, myrep, nmp, -3);
			return (ENXIO); /* sounds good */
		}

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		md = mrep;
		dpos = mbuf_data(md);
		nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
			mbuf_freem(mrep);
nfsmout:
			if (nmp->nm_state & NFSSTA_RCVLOCK)
				nfs_rcvunlock(myrep);
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = mrep;
				rep->r_md = md;
				rep->r_dpos = dpos;
				/*
				 * If we're tracking the round trip time
				 * then we update the circular log here
				 * with the stats from our current request.
				 */
				if (nfsrtton) {
					struct rttl *rt;

					rt = &nfsrtt.rttl[nfsrtt.pos];
					rt->proc = rep->r_procnum;
					rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
					rt->sent = nmp->nm_sent;
					rt->cwnd = nmp->nm_cwnd;
					if (proct[rep->r_procnum] == 0)
						panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
					rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
					rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
					rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
					microtime(&rt->tstamp); // XXX unused
					if (rep->r_flags & R_TIMING)
						rt->rtt = rep->r_rtt;
					else
						rt->rtt = 1000000;
					nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
				}
				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
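				/*
				 * (E.g. with a full window, each reply adds
				 * roughly NFS_CWNDSCALE^2 / nm_cwnd to nm_cwnd,
				 * so after ~cwnd replies, about one rtt, the
				 * window grows by one scaled rpc; the +cwnd/2
				 * term just rounds the division.)
				 */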
				FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
					nmp->nm_cwnd);
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
						(NFS_CWNDSCALE * NFS_CWNDSCALE +
						(nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_SENT;
					nmp->nm_sent -= NFS_CWNDSCALE;
				}
				/*
				 * Update rtt using a gain of 0.125 on the mean
				 * and a gain of 0.25 on the deviation.
				 */
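				/*
				 * (This is Jacobson-style smoothing on scaled
				 * values: NFS_SRTT holds 8x the mean and
				 * NFS_SDRTT 4x the deviation, so adding the
				 * error t1 after subtracting SRTT>>3 or
				 * SDRTT>>2 applies gains of 1/8 and 1/4.)
				 */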
				if (rep->r_flags & R_TIMING) {
					/*
					 * Since the timer resolution of
					 * NFS_HZ is so coarse, it can often
					 * result in r_rtt == 0. Since
					 * r_rtt == N means that the actual
					 * rtt is between N+dt and N+2-dt ticks,
					 * add 1.
					 */
					if (proct[rep->r_procnum] == 0)
						panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
					t1 = rep->r_rtt + 1;
					t1 -= (NFS_SRTT(rep) >> 3);
					NFS_SRTT(rep) += t1;
					if (t1 < 0)
						t1 = -t1;
					t1 -= (NFS_SDRTT(rep) >> 2);
					NFS_SDRTT(rep) += t1;
				}
				nmp->nm_timeouts = 0;
				break;
			}
		}
		nfs_rcvunlock(myrep);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
			mbuf_freem(mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfs_reply: nil r_mrep");
			return (0);
		}
		FSDBG(530, myrep->r_xid, myrep, rep,
			rep ? rep->r_xid : myrep->r_flags);
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
	vnode_t vp;
	mount_t mp;
	mbuf_t mrest;
	int procnum;
	proc_t procp;
	kauth_cred_t cred;
	mbuf_t *mrp;
	mbuf_t *mdp;
	caddr_t *dposp;
	u_int64_t *xidp;
{
	mbuf_t m, mrep, m2;
	struct nfsreq re, *rep;
	u_long *tl;
	int i;
	struct nfsmount *nmp;
	mbuf_t md, mheadend;
	char nickv[RPCX_NICKVERF];
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
	int verf_len, verf_type;
	u_long xid;
	char *auth_str, *verf_str;
	NFSKERBKEY_T key;		/* save session key */
	int nmsotype;
	struct timeval now;

	if (mrp)
		*mrp = NULL;
	if (xidp)
		*xidp = 0;
	nmp = VFSTONFS(mp);

	rep = &re;

	if (vp)
		nmp = VFSTONFS(vnode_mount(vp));
	if (nmp == NULL ||
	    (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		mbuf_freem(mrest);
		return (ENXIO);
	}
	nmsotype = nmp->nm_sotype;

	FSDBG_TOP(531, vp, procnum, nmp, rep);

	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;
	microuptime(&now);
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	i = 0;
	m = mrest;
	while (m) {
		i += mbuf_len(m);
		m = mbuf_next(m);
	}
	mrest_len = i;

	/*
	 * Get the RPC header with authorization.
	 */
kerbauth:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (!nmp) {
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		mbuf_freem(mrest);
		return (ENXIO);
	}
	verf_str = auth_str = (char *)0;
	if (nmp->nm_flag & NFSMNT_KERB) {
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)key, sizeof (key));
		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
						   &auth_len, verf_str, verf_len)) {
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!nmp) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (ENXIO);
			}
			error = nfs_getauth(nmp, rep, cred, &auth_str,
					    &auth_len, verf_str, &verf_len, key);
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!error && !nmp)
				error = ENXIO;
			if (error) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (error);
			}
		}
	} else {
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
	if (auth_str)
		_FREE(auth_str, M_TEMP);
	if (error) {
		mbuf_freem(mrest);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}
	if (xidp)
		*xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
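	/*
	 * (The record mark is one u_long prepended to the request: the high
	 * bit set marks this as the final fragment, and the remaining 31
	 * bits carry the length of the RPC message that follows.)
	 */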
	if (nmsotype == SOCK_STREAM) {
		error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
		if (error) {
			mbuf_freem(m);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}
		*((u_long*)mbuf_data(m)) =
			htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
				  (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
				  nmp->nm_sent < nmp->nm_cwnd)) {
		int connrequired = (nmp->nm_sotype == SOCK_STREAM);

		if (connrequired)
			error = nfs_sndlock(rep);

		/*
		 * Set the R_SENT before doing the send in case another thread
		 * processes the reply before the nfs_send returns here
		 */
		if (!error) {
			if ((rep->r_flags & R_MUSTRESEND) == 0) {
				FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
					nmp->nm_cwnd);
				nmp->nm_sent += NFS_CWNDSCALE;
				rep->r_flags |= R_SENT;
			}

			error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
			if (!error)
				error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (connrequired)
				nfs_sndunlock(rep);
		}
		nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
		if (error) {
			if (nmp)
				nmp->nm_sent -= NFS_CWNDSCALE;
			rep->r_flags &= ~R_SENT;
		}
	} else {
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	nfs_repdequeue(rep);

	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		if (nmp) {
			FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
			nmp->nm_sent -= NFS_CWNDSCALE;
		}
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error)
		nfs_up(nmp, procp, NFSSTA_TIMEO,
			(rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (!error && !nmp)
		error = ENXIO;
	if (error) {
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
			if (!failed_auth) {
				failed_auth++;
				error = mbuf_setnext(mheadend, NULL);
				mbuf_freem(mrep);
				mbuf_freem(rep->r_mreq);
				if (!error)
					goto kerbauth;
				printf("nfs_request: mbuf_setnext failed\n");
			} else
				error = EAUTH;
		} else
			error = EACCES;
		mbuf_freem(mrep);
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * Grab any Kerberos verifier, otherwise just throw it away.
	 */
	verf_type = fxdr_unsigned(int, *tl++);
	i = fxdr_unsigned(int, *tl);
	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
		if (error)
			goto nfsmout;
	} else if (i > 0)
		nfsm_adv(nfsm_rndup(i));
	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				mbuf_freem(mrep);
				error = 0;
				microuptime(&now);
				waituntil = now.tv_sec + trylater_delay;
				while (now.tv_sec < waituntil) {
					tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
					microuptime(&now);
				}
				trylater_delay *= 2;
				if (trylater_delay > 60)
					trylater_delay = 60;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if ((error == ESTALE) && vp)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else {
				mbuf_freem(mrep);
				error &= ~NFSERR_RETERR;
			}
			mbuf_freem(rep->r_mreq);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}

		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
		return (0);
	}
	mbuf_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	mbuf_freem(rep->r_mreq);
	FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
	return (error);
}

#ifndef NFS_NOSERVER
/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
	int siz;
	struct nfsrv_descript *nd;
	struct nfssvc_sock *slp;
	int err;
	mbuf_t *mrq;
	mbuf_t *mbp;
	caddr_t *bposp;
{
	u_long *tl;
	mbuf_t mreq;
	caddr_t bpos;
	mbuf_t mb, mb2;
	int error, mlen;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mreq);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX nfsstat? */
		return (error);
	}
	mb = mreq;
	tl = mbuf_data(mreq);
	mlen = 6 * NFSX_UNSIGNED;
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl += 80 / sizeof(*tl);	/* XXX max_hdr? XXX */
		mbuf_setdata(mreq, tl, mlen);
	} else {
		mbuf_setlen(mreq, mlen);
	}
	bpos = ((caddr_t)tl) + mlen;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mlen -= NFSX_UNSIGNED;
			mbuf_setlen(mreq, mlen);
			bpos -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/*
		 * For Kerberos authentication, we must send the nickname
		 * verifier back, otherwise just RPCAUTH_NULL.
		 */
		if (nd->nd_flag & ND_KERBFULL) {
			struct nfsuid *nuidp;
			struct timeval ktvin, ktvout;
			uid_t uid = kauth_cred_getuid(nd->nd_cr);

			lck_rw_lock_shared(&slp->ns_rwlock);
			for (nuidp = NUIDHASH(slp, uid)->lh_first;
			     nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
				    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
				     &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (nuidp) {
				ktvin.tv_sec =
					txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
				ktvin.tv_usec =
					txdr_unsigned(nuidp->nu_timestamp.tv_usec);

				/*
				 * Encrypt the timestamp in ecb mode using the
				 * session key.
				 */
#if NFSKERB
				XXX
#endif

				*tl++ = rpc_auth_kerb;
				*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
				*tl = ktvout.tv_sec;
				nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
				*tl++ = ktvout.tv_usec;
				*tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
			} else {
				*tl++ = 0;
				*tl++ = 0;
			}
			lck_rw_done(&slp->ns_rwlock);
		} else {
			*tl++ = 0;
			*tl++ = 0;
		}
		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
			// XXX hard coded versions
			*tl++ = txdr_unsigned(2);
			*tl = txdr_unsigned(3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	if (mrq != NULL)
		*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != NFSERR_RETVOID) {
		OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
	}
	return (0);
}


#endif /* NFS_NOSERVER */


/*
 * From FreeBSD 1.58, a Matt Dillon fix...
 * Flag a request as being about to terminate.
 * The nm_sent count is decremented now to avoid deadlocks when the process
 * in soreceive() hasn't yet managed to send its own request.
 */
static void
nfs_softterm(struct nfsreq *rep)
{

	rep->r_flags |= R_SOFTTERM;
	if (rep->r_flags & R_SENT) {
		FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
			rep->r_nmp->nm_cwnd);
		rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
		rep->r_flags &= ~R_SENT;
	}
}

void
nfs_timer_funnel(void * arg)
{
	(void) thread_funnel_set(kernel_flock, TRUE);
	nfs_timer(arg);
	(void) thread_funnel_set(kernel_flock, FALSE);

}

/*
 * Ensure rep isn't in use by the timer, then dequeue it.
 */
static void
nfs_repdequeue(struct nfsreq *rep)
{

	while ((rep->r_flags & R_BUSY)) {
		rep->r_flags |= R_WAITING;
		tsleep(rep, PSOCK, "repdeq", 0);
	}
	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
}

/*
 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
 * free()'d out from under it.
 */
static void
nfs_repbusy(struct nfsreq *rep)
{

	if ((rep->r_flags & R_BUSY))
		panic("rep locked");
	rep->r_flags |= R_BUSY;
}

/*
 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
 */
static struct nfsreq *
nfs_repnext(struct nfsreq *rep)
{
	struct nfsreq * nextrep;

	if (rep == NULL)
		return (NULL);
	/*
	 * We need to get and busy the next req before signalling the
	 * current one, otherwise wakeup() may block us and we'll race to
	 * grab the next req.
	 */
	nextrep = TAILQ_NEXT(rep, r_chain);
	if (nextrep != NULL)
		nfs_repbusy(nextrep);
	/* unbusy and signal. */
	rep->r_flags &= ~R_BUSY;
	if ((rep->r_flags & R_WAITING)) {
		rep->r_flags &= ~R_WAITING;
		wakeup(rep);
	}
	return (nextrep);
}

/*
 * Nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out
 * To avoid retransmission attempts on STREAM sockets (in the future) make
 * sure to set the r_retry field to 0 (implies nm_retry == 0).
 */
void
nfs_timer(__unused void *arg)
{
	struct nfsreq *rep;
	mbuf_t m;
	socket_t so;
	struct nfsmount *nmp;
	int timeo;
	int error;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */
	int flags, rexmit, cwnd, sent;
	u_long xid;
	struct timeval now;

	rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);
	microuptime(&now);
	for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
		nmp = rep->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp))
			continue;
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
				"not responding");
			rep->r_flags |= R_TPRINTFMSG;
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(rep);
				continue;
			}
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			/* ensure 62.5 ms floor */
			while (16 * timeo < hz)
				timeo *= 2;
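			/*
			 * (timeo is in ticks; since hz ticks == 1 second,
			 * the doubling loop enforces timeo >= hz/16 ticks,
			 * i.e. the 62.5 ms floor noted above.)
			 */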
1793 if (nmp->nm_timeouts > 0)
1794 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1795 if (rep->r_rtt <= timeo)
1796 continue;
1797 if (nmp->nm_timeouts < 8)
1798 nmp->nm_timeouts++;
1799 }
1800 /*
1801 * Check for too many retransmits. This is never true for
1802 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1803 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1804 */
1805 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1806 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1807 nfs_softterm(rep);
1808 continue;
1809 }
1810 if (nmp->nm_sotype != SOCK_DGRAM) {
1811 if (++rep->r_rexmit > NFS_MAXREXMIT)
1812 rep->r_rexmit = NFS_MAXREXMIT;
1813 continue;
1814 }
1815 if ((so = nmp->nm_so) == NULL)
1816 continue;
1817
1818 /*
1819 * If there is enough space and the window allows..
1820 * Resend it
1821 * Set r_rtt to -1 in case we fail to send it now.
1822 */
1823 rep->r_rtt = -1;
1824 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1825 (rep->r_flags & R_SENT) ||
1826 nmp->nm_sent < nmp->nm_cwnd) &&
1827 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1828 struct msghdr msg;
1829 /*
1830 * Iff first send, start timing
1831 * else turn timing off, backoff timer
1832 * and divide congestion window by 2.
1833 * We update these *before* the send to avoid
1834 * racing against receiving the reply.
1835 * We save them so we can restore them on send error.
1836 */
1837 flags = rep->r_flags;
1838 rexmit = rep->r_rexmit;
1839 cwnd = nmp->nm_cwnd;
1840 sent = nmp->nm_sent;
1841 xid = rep->r_xid;
1842 if (rep->r_flags & R_SENT) {
1843 rep->r_flags &= ~R_TIMING;
1844 if (++rep->r_rexmit > NFS_MAXREXMIT)
1845 rep->r_rexmit = NFS_MAXREXMIT;
1846 nmp->nm_cwnd >>= 1;
1847 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1848 nmp->nm_cwnd = NFS_CWNDSCALE;
1849 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1850 } else {
1851 rep->r_flags |= R_SENT;
1852 nmp->nm_sent += NFS_CWNDSCALE;
1853 }
1854 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1855
1856 bzero(&msg, sizeof(msg));
1857 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1858 msg.msg_name = mbuf_data(nmp->nm_nam);
1859 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1860 }
1861 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1862
1863 FSDBG(535, xid, error, sent, cwnd);
1864
1865 if (error) {
1866 if (error == EWOULDBLOCK) {
1867 rep->r_flags = flags;
1868 rep->r_rexmit = rexmit;
1869 nmp->nm_cwnd = cwnd;
1870 nmp->nm_sent = sent;
1871 rep->r_xid = xid;
1872 }
1873 else {
1874 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1875 int clearerror;
1876 int optlen = sizeof(clearerror);
1877 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1878 }
1879 rep->r_flags = flags | R_RESENDERR;
1880 rep->r_rexmit = rexmit;
1881 nmp->nm_cwnd = cwnd;
1882 nmp->nm_sent = sent;
1883 if (flags & R_SENT)
1884 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1885 }
1886 } else
1887 rep->r_rtt = 0;
1888 }
1889 }
1890 microuptime(&now);
1891 #ifndef NFS_NOSERVER
1892 /*
1893 * Scan the write gathering queues for writes that need to be
1894 * completed now.
1895 */
1896 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1897 lck_mtx_lock(nfsd_mutex);
1898 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1899 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1900 nfsrv_wakenfsd(slp);
1901 }
1902 while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1903 if ((slp->ns_timestamp + 5) > now.tv_sec)
1904 break;
1905 TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1906 nfsrv_slpfree(slp);
1907 }
1908 lck_mtx_unlock(nfsd_mutex);
1909 #endif /* NFS_NOSERVER */
1910
1911 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1912 /*
1913 * We haven't called nfs_buf_freeup() in a little while.
1914 * So, see if we can free up any stale/unused bufs now.
1915 */
1916 nfs_buf_freeup(1);
1917 }
1918
1919 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1920
1921 }
1922
1923
1924 /*
1925 * Test for a termination condition pending on the process.
1926 * This is used to determine if we need to bail on a mount.
1927 * EIO is returned if there has been a soft timeout.
1928 * EINTR is returned if there is a signal pending that is not being ignored
1929 * and the mount is interruptable, or if we are a thread that is in the process
1930 * of cancellation (also SIGKILL posted).
1931 */
1932 int
1933 nfs_sigintr(nmp, rep, p)
1934 struct nfsmount *nmp;
1935 struct nfsreq *rep;
1936 proc_t p;
1937 {
1938 sigset_t pending_sigs;
1939 int context_good = 0;
1940 struct nfsmount *repnmp;
1941 extern proc_t kernproc;
1942
1943 if (nmp == NULL)
1944 return (ENXIO);
1945 if (rep != NULL) {
1946 repnmp = rep->r_nmp;
1947 /* we've had a forced unmount. */
1948 if (repnmp == NULL)
1949 return (ENXIO);
1950 /* request has timed out on a 'soft' mount. */
1951 if (rep->r_flags & R_SOFTTERM)
1952 return (EIO);
1953 /*
1954 * If we're in the midst of a forced unmount and there's been
1955 * a timeout, we're dead, so fail the I/O.
1956 */
1957 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1958 (NFSSTA_FORCE|NFSSTA_TIMEO))
1959 return (EIO);
1960 /* Someone is unmounting us, go soft and mark it. */
1961 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1962 repnmp->nm_flag |= NFSMNT_SOFT;
1963 nmp->nm_state |= NFSSTA_FORCE;
1964 }
1965 /*
1966 * If the mount is hung and we've requested not to hang
1967 * on remote filesystems, then bail now.
1968 */
1969 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1970 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1971 return (EIO);
1972 }
1973 /* XXX: is this valid? this probably should be an assertion. */
1974 if (p == NULL)
1975 return (0);
1976
1977 /* If this thread belongs to the kernel task, the abort check is not needed. */
1978 if ((current_proc() != kernproc) && current_thread_aborted()) {
1979 return (EINTR);
1980 }
1981 /* Mask off signals that are blocked by the thread and the process. */
1982
1983 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1984 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1985 return (EINTR);
1986 return (0);
1987 }
1988
1989 /*
1990 * Lock a socket against others.
1991 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1992 * and also to avoid race conditions between the processes with nfs requests
1993 * in progress when a reconnect is necessary.
1994 */
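/*
 * Callers pair this with nfs_sndunlock(), e.g. (an illustrative
 * sketch, assuming the usual send path):
 *
 *	if ((error = nfs_sndlock(rep)))
 *		return (error);
 *	error = nfs_send(so, nam, m, rep);
 *	nfs_sndunlock(rep);
 */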
1995 int
1996 nfs_sndlock(rep)
1997 struct nfsreq *rep;
1998 {
1999 int *statep;
2000 proc_t p;
2001 int error, slpflag = 0, slptimeo = 0;
2002
2003 if (rep->r_nmp == NULL)
2004 return (ENXIO);
2005 statep = &rep->r_nmp->nm_state;
2006
2007 p = rep->r_procp;
2008 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2009 slpflag = PCATCH;
2010 while (*statep & NFSSTA_SNDLOCK) {
2011 error = nfs_sigintr(rep->r_nmp, rep, p);
2012 if (error)
2013 return (error);
2014 *statep |= NFSSTA_WANTSND;
2015 if (p != NULL && (proc_noremotehang(p)) != 0)
2016 slptimeo = hz;
2017 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
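/*
 * After one interruptible (PCATCH) sleep, fall back to timed
 * sleeps so the nfs_sigintr check at the top of the loop still
 * runs periodically.
 */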
2018 if (slpflag == PCATCH) {
2019 slpflag = 0;
2020 slptimeo = 2 * hz;
2021 }
2022 /*
2023 * Make sure while we slept that the mountpoint didn't go away.
2024 * nfs_sigintr and callers expect it intact.
2025 */
2026 if (!rep->r_nmp)
2027 return (ENXIO); /* don't have lock until out of loop */
2028 }
2029 *statep |= NFSSTA_SNDLOCK;
2030 return (0);
2031 }
2032
2033 /*
2034 * Unlock the stream socket for others.
2035 */
2036 void
2037 nfs_sndunlock(rep)
2038 struct nfsreq *rep;
2039 {
2040 int *statep;
2041
2042 if (rep->r_nmp == NULL)
2043 return;
2044 statep = &rep->r_nmp->nm_state;
2045 if ((*statep & NFSSTA_SNDLOCK) == 0)
2046 panic("nfs sndunlock");
2047 *statep &= ~NFSSTA_SNDLOCK;
2048 if (*statep & NFSSTA_WANTSND) {
2049 *statep &= ~NFSSTA_WANTSND;
2050 wakeup((caddr_t)statep);
2051 }
2052 }
2053
2054 static int
2055 nfs_rcvlock(struct nfsreq *rep)
2056 {
2057 int *statep;
2058 int error, slpflag, slptimeo = 0;
2059
2060 /* make sure we still have our mountpoint */
2061 if (!rep->r_nmp) {
2062 if (rep->r_mrep != NULL)
2063 return (EALREADY);
2064 return (ENXIO);
2065 }
2066
2067 statep = &rep->r_nmp->nm_state;
2068 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2069 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2070 slpflag = PCATCH;
2071 else
2072 slpflag = 0;
2073 while (*statep & NFSSTA_RCVLOCK) {
2074 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2075 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2076 return (error);
2077 } else if (rep->r_mrep != NULL) {
2078 /*
2079 * Don't bother sleeping if reply already arrived
2080 */
2081 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2082 return (EALREADY);
2083 }
2084 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2085 *statep |= NFSSTA_WANTRCV;
2086 /*
2087 * We need to poll if we're P_NOREMOTEHANG so that the
2088 * nfs_sigintr check at the top of the loop runs periodically.
2089 */
2090 if (rep->r_procp != NULL &&
2091 (proc_noremotehang(rep->r_procp)) != 0)
2092 slptimeo = hz;
2093 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2094 if (slpflag == PCATCH) {
2095 slpflag = 0;
2096 slptimeo = 2 * hz;
2097 }
2098 /*
2099 * Make sure while we slept that the mountpoint didn't go away.
2100 * nfs_sigintr and caller nfs_reply expect it intact.
2101 */
2102 if (!rep->r_nmp) {
2103 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2104 return (ENXIO); /* don't have lock until out of loop */
2105 }
2106 }
2107 /*
2108 * nfs_reply will handle it if reply already arrived.
2109 * (We may have slept or been preempted).
2110 */
2111 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2112 *statep |= NFSSTA_RCVLOCK;
2113 return (0);
2114 }
2115
2116 /*
2117 * Unlock the receive side of the socket for others.
2118 */
2119 static void
2120 nfs_rcvunlock(struct nfsreq *rep)
2121 {
2122 int *statep;
2123
2124 if (rep->r_nmp == NULL)
2125 return;
2126 statep = &rep->r_nmp->nm_state;
2127
2128 FSDBG(533, statep, *statep, 0, 0);
2129 if ((*statep & NFSSTA_RCVLOCK) == 0)
2130 panic("nfs rcvunlock");
2131 *statep &= ~NFSSTA_RCVLOCK;
2132 if (*statep & NFSSTA_WANTRCV) {
2133 *statep &= ~NFSSTA_WANTRCV;
2134 wakeup((caddr_t)statep);
2135 }
2136 }
2137
2138
2139 #ifndef NFS_NOSERVER
2140 /*
2141 * Socket upcall routine for the nfsd sockets.
2142 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2143 * Essentially do as much as possible non-blocking; otherwise punt,
2144 * and this routine will be called again with MBUF_WAITOK from an nfsd.
2145 */
2146 void
2147 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2148 {
2149 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2150
2151 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2152 return;
2153
2154 lck_rw_lock_exclusive(&slp->ns_rwlock);
2155 nfsrv_rcv_locked(so, slp, waitflag);
2156 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2157 }
2158 void
2159 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2160 {
2161 mbuf_t m, mp, mhck, m2;
2162 int ns_flag=0, error;
2163 struct msghdr msg;
2164 size_t bytes_read;
2165
2166 if ((slp->ns_flag & SLP_VALID) == 0) {
2167 if (waitflag == MBUF_DONTWAIT)
2168 lck_rw_done(&slp->ns_rwlock);
2169 return;
2170 }
2171
2172 #ifdef notdef
2173 /*
2174 * Define this to test for nfsds handling this under heavy load.
2175 */
2176 if (waitflag == MBUF_DONTWAIT) {
2177 ns_flag = SLP_NEEDQ;
2178 goto dorecs;
2179 }
2180 #endif
2181 if (slp->ns_sotype == SOCK_STREAM) {
2182 /*
2183 * If there are already records on the queue, defer soreceive()
2184 * to an nfsd so that there is feedback to the TCP layer that
2185 * the nfs servers are heavily loaded.
2186 */
2187 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2188 ns_flag = SLP_NEEDQ;
2189 goto dorecs;
2190 }
2191
2192 /*
2193 * Do soreceive().
2194 */
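/*
 * The huge bytes_read value effectively asks for "everything
 * currently queued"; sock_receivembuf() rewrites it with the
 * number of bytes actually received.
 */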
2195 bytes_read = 1000000000;
2196 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2197 if (error || mp == NULL) {
2198 if (error == EWOULDBLOCK)
2199 ns_flag = SLP_NEEDQ;
2200 else
2201 ns_flag = SLP_DISCONN;
2202 goto dorecs;
2203 }
2204 m = mp;
2205 if (slp->ns_rawend) {
2206 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2207 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2208 slp->ns_cc += bytes_read;
2209 } else {
2210 slp->ns_raw = m;
2211 slp->ns_cc = bytes_read;
2212 }
2213 while ((m2 = mbuf_next(m)))
2214 m = m2;
2215 slp->ns_rawend = m;
2216
2217 /*
2218 * Now try and parse record(s) out of the raw stream data.
2219 */
2220 error = nfsrv_getstream(slp, waitflag);
2221 if (error) {
2222 if (error == EPERM)
2223 ns_flag = SLP_DISCONN;
2224 else
2225 ns_flag = SLP_NEEDQ;
2226 }
2227 } else {
2228 struct sockaddr_storage nam;
2229
2230 bzero(&msg, sizeof(msg));
2231 msg.msg_name = (caddr_t)&nam;
2232 msg.msg_namelen = sizeof(nam);
2233
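/*
 * Datagram socket: pull one packet at a time.  Each sender's
 * address is prepended to its record as an MBUF_TYPE_SONAME
 * mbuf below so that the nfsd can address its reply.
 */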
2234 do {
2235 bytes_read = 1000000000;
2236 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2237 if (mp) {
2238 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2239 mbuf_setlen(mhck, nam.ss_len);
2240 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2241 m = mhck;
2242 if (mbuf_setnext(m, mp)) {
2243 /* trouble... just drop it */
2244 printf("nfsrv_rcv: mbuf_setnext failed\n");
2245 mbuf_free(mhck);
2246 m = mp;
2247 }
2248 } else {
2249 m = mp;
2250 }
2251 if (slp->ns_recend)
2252 mbuf_setnextpkt(slp->ns_recend, m);
2253 else
2254 slp->ns_rec = m;
2255 slp->ns_recend = m;
2256 mbuf_setnextpkt(m, NULL);
2257 }
2258 #if 0
2259 if (error) {
2260 /*
2261 * This may be needed in the future to support
2262 * non-byte-stream connection-oriented protocols
2263 * such as SCTP.
2264 */
2265 /*
2266 * This (slp->ns_sotype == SOCK_STREAM) should really
2267 * be a check for PR_CONNREQUIRED.
2268 */
2269 if ((slp->ns_sotype == SOCK_STREAM)
2270 && error != EWOULDBLOCK) {
2271 ns_flag = SLP_DISCONN;
2272 goto dorecs;
2273 }
2274 }
2275 #endif
2276 } while (mp);
2277 }
2278
2279 /*
2280 * Now try and process the request records, non-blocking.
2281 */
2282 dorecs:
2283 if (ns_flag)
2284 slp->ns_flag |= ns_flag;
2285 if (waitflag == MBUF_DONTWAIT) {
2286 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2287 lck_rw_done(&slp->ns_rwlock);
2288 if (wake && nfs_numnfsd) {
2289 lck_mtx_lock(nfsd_mutex);
2290 nfsrv_wakenfsd(slp);
2291 lck_mtx_unlock(nfsd_mutex);
2292 }
2293 }
2294 }
2295
2296 /*
2297 * Try and extract an RPC request from the mbuf data list received on a
2298 * stream socket. The "waitflag" argument indicates whether or not it
2299 * can sleep.
2300 */
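/*
 * For reference: stream records use RPC record marking (RFC 1831).
 * Each fragment is preceded by a 4-byte mark whose low 31 bits give
 * the fragment length and whose high bit is set on the last
 * fragment; e.g. a mark of 0x80000064 introduces a final fragment
 * of 100 bytes.
 */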
2301 static int
2302 nfsrv_getstream(slp, waitflag)
2303 struct nfssvc_sock *slp;
2304 int waitflag;
2305 {
2306 mbuf_t m;
2307 char *cp1, *cp2, *mdata;
2308 int len, mlen, error;
2309 mbuf_t om, m2, recm;
2310 u_long recmark;
2311
2312 if (slp->ns_flag & SLP_GETSTREAM)
2313 panic("nfs getstream");
2314 slp->ns_flag |= SLP_GETSTREAM;
2315 for (;;) {
2316 if (slp->ns_reclen == 0) {
2317 if (slp->ns_cc < NFSX_UNSIGNED) {
2318 slp->ns_flag &= ~SLP_GETSTREAM;
2319 return (0);
2320 }
2321 m = slp->ns_raw;
2322 mdata = mbuf_data(m);
2323 mlen = mbuf_len(m);
2324 if (mlen >= NFSX_UNSIGNED) {
2325 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2326 mdata += NFSX_UNSIGNED;
2327 mlen -= NFSX_UNSIGNED;
2328 mbuf_setdata(m, mdata, mlen);
2329 } else {
2330 cp1 = (caddr_t)&recmark;
2331 cp2 = mdata;
2332 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2333 while (mlen == 0) {
2334 m = mbuf_next(m);
2335 cp2 = mbuf_data(m);
2336 mlen = mbuf_len(m);
2337 }
2338 *cp1++ = *cp2++;
2339 mlen--;
2340 mbuf_setdata(m, cp2, mlen);
2341 }
2342 }
2343 slp->ns_cc -= NFSX_UNSIGNED;
2344 recmark = ntohl(recmark);
2345 slp->ns_reclen = recmark & ~0x80000000;
2346 if (recmark & 0x80000000)
2347 slp->ns_flag |= SLP_LASTFRAG;
2348 else
2349 slp->ns_flag &= ~SLP_LASTFRAG;
2350 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2351 slp->ns_flag &= ~SLP_GETSTREAM;
2352 return (EPERM);
2353 }
2354 }
2355
2356 /*
2357 * Now get the record part.
2358 *
2359 * Note that slp->ns_reclen may be 0. Linux sometimes
2360 * generates 0-length RPCs.
2361 */
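/*
 * Three cases below: ns_cc == ns_reclen takes the whole raw chain
 * as the record; ns_cc > ns_reclen splits the chain at the record
 * boundary; ns_cc < ns_reclen means the record is still incomplete,
 * so return and wait for more data.
 */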
2362 recm = NULL;
2363 if (slp->ns_cc == slp->ns_reclen) {
2364 recm = slp->ns_raw;
2365 slp->ns_raw = slp->ns_rawend = NULL;
2366 slp->ns_cc = slp->ns_reclen = 0;
2367 } else if (slp->ns_cc > slp->ns_reclen) {
2368 len = 0;
2369 m = slp->ns_raw;
2370 mlen = mbuf_len(m);
2371 mdata = mbuf_data(m);
2372 om = NULL;
2373 while (len < slp->ns_reclen) {
2374 if ((len + mlen) > slp->ns_reclen) {
2375 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2376 slp->ns_flag &= ~SLP_GETSTREAM;
2377 return (EWOULDBLOCK);
2378 }
2379 if (om) {
2380 if (mbuf_setnext(om, m2)) {
2381 /* trouble... just drop it */
2382 printf("nfsrv_getstream: mbuf_setnext failed\n");
2383 mbuf_freem(m2);
2384 slp->ns_flag &= ~SLP_GETSTREAM;
2385 return (EWOULDBLOCK);
2386 }
2387 recm = slp->ns_raw;
2388 } else {
2389 recm = m2;
2390 }
2391 mdata += slp->ns_reclen - len;
2392 mlen -= slp->ns_reclen - len;
2393 mbuf_setdata(m, mdata, mlen);
2394 len = slp->ns_reclen;
2395 } else if ((len + mlen) == slp->ns_reclen) {
2396 om = m;
2397 len += mlen;
2398 m = mbuf_next(m);
2399 recm = slp->ns_raw;
2400 if (mbuf_setnext(om, NULL)) {
2401 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2402 slp->ns_flag &= ~SLP_GETSTREAM;
2403 return (EWOULDBLOCK);
2404 }
2405 mlen = mbuf_len(m);
2406 mdata = mbuf_data(m);
2407 } else {
2408 om = m;
2409 len += mlen;
2410 m = mbuf_next(m);
2411 mlen = mbuf_len(m);
2412 mdata = mbuf_data(m);
2413 }
2414 }
2415 slp->ns_raw = m;
2416 slp->ns_cc -= len;
2417 slp->ns_reclen = 0;
2418 } else {
2419 slp->ns_flag &= ~SLP_GETSTREAM;
2420 return (0);
2421 }
2422
2423 /*
2424 * Accumulate the fragments into a record.
2425 */
2426 if (slp->ns_frag == NULL) {
2427 slp->ns_frag = recm;
2428 } else {
2429 m = slp->ns_frag;
2430 while ((m2 = mbuf_next(m)))
2431 m = m2;
2432 if ((error = mbuf_setnext(m, recm)))
2433 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2434 }
2435 if (slp->ns_flag & SLP_LASTFRAG) {
2436 if (slp->ns_recend)
2437 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2438 else
2439 slp->ns_rec = slp->ns_frag;
2440 slp->ns_recend = slp->ns_frag;
2441 slp->ns_frag = NULL;
2442 }
2443 }
2444 }
2445
2446 /*
2447 * Dequeue the next request record from a server socket and parse its RPC header.
2448 */
2449 int
2450 nfsrv_dorec(slp, nfsd, ndp)
2451 struct nfssvc_sock *slp;
2452 struct nfsd *nfsd;
2453 struct nfsrv_descript **ndp;
2454 {
2455 mbuf_t m;
2456 mbuf_t nam;
2457 struct nfsrv_descript *nd;
2458 int error;
2459
2460 *ndp = NULL;
2461 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2462 return (ENOBUFS);
2463 MALLOC_ZONE(nd, struct nfsrv_descript *,
2464 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2465 if (!nd)
2466 return (ENOMEM);
2467 m = slp->ns_rec;
2468 slp->ns_rec = mbuf_nextpkt(m);
2469 if (slp->ns_rec)
2470 mbuf_setnextpkt(m, NULL);
2471 else
2472 slp->ns_recend = NULL;
2473 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2474 nam = m;
2475 m = mbuf_next(m);
2476 if ((error = mbuf_setnext(nam, NULL)))
2477 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2478 } else
2479 nam = NULL;
2480 nd->nd_md = nd->nd_mrep = m;
2481 nd->nd_nam2 = nam;
2482 nd->nd_dpos = mbuf_data(m);
2483 error = nfs_getreq(nd, nfsd, TRUE);
2484 if (error) {
2485 if (nam)
2486 mbuf_freem(nam);
2487 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2488 return (error);
2489 }
2490 *ndp = nd;
2491 nfsd->nfsd_nd = nd;
2492 return (0);
2493 }
2494
2495 /*
2496 * Parse an RPC request
2497 * - verify it
2498 * - fill in the cred struct.
2499 */
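/*
 * For reference, the call header dissected below is laid out per
 * RFC 1831: xid, message type (CALL), RPC version (2), program,
 * program version, procedure, then a credential and a verifier,
 * each an opaque body prefixed by its flavor and length.
 */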
2500 int
2501 nfs_getreq(nd, nfsd, has_header)
2502 struct nfsrv_descript *nd;
2503 struct nfsd *nfsd;
2504 int has_header;
2505 {
2506 int len, i;
2507 u_long *tl;
2508 long t1;
2509 uio_t uiop;
2510 caddr_t dpos, cp2, cp;
2511 u_long nfsvers, auth_type;
2512 uid_t nickuid;
2513 int error = 0, ticklen;
2514 mbuf_t mrep, md;
2515 struct nfsuid *nuidp;
2516 uid_t user_id;
2517 gid_t group_id;
2518 int ngroups;
2519 struct ucred temp_cred;
2520 struct timeval tvin, tvout, now;
2521 char uio_buf[ UIO_SIZEOF(1) ];
2522 #if 0 /* until encrypted keys are implemented */
2523 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2524 #endif
2525
2526 nd->nd_cr = NULL;
2527
2528 mrep = nd->nd_mrep;
2529 md = nd->nd_md;
2530 dpos = nd->nd_dpos;
2531 if (has_header) {
2532 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2533 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2534 if (*tl++ != rpc_call) {
2535 mbuf_freem(mrep);
2536 return (EBADRPC);
2537 }
2538 } else
2539 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2540 nd->nd_repstat = 0;
2541 nd->nd_flag = 0;
2542 if (*tl++ != rpc_vers) {
2543 nd->nd_repstat = ERPCMISMATCH;
2544 nd->nd_procnum = NFSPROC_NOOP;
2545 return (0);
2546 }
2547 if (*tl != nfs_prog) {
2548 nd->nd_repstat = EPROGUNAVAIL;
2549 nd->nd_procnum = NFSPROC_NOOP;
2550 return (0);
2551 }
2552 tl++;
2553 nfsvers = fxdr_unsigned(u_long, *tl++);
2554 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2555 nd->nd_repstat = EPROGMISMATCH;
2556 nd->nd_procnum = NFSPROC_NOOP;
2557 return (0);
2558 }
2559 else if (nfsvers == NFS_VER3)
2560 nd->nd_flag = ND_NFSV3;
2561 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2562 if (nd->nd_procnum == NFSPROC_NULL)
2563 return (0);
2564 if ((nd->nd_procnum >= NFS_NPROCS) ||
2565 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2566 nd->nd_repstat = EPROCUNAVAIL;
2567 nd->nd_procnum = NFSPROC_NOOP;
2568 return (0);
2569 }
2570 if ((nd->nd_flag & ND_NFSV3) == 0)
2571 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2572 auth_type = *tl++;
2573 len = fxdr_unsigned(int, *tl++);
2574 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2575 mbuf_freem(mrep);
2576 return (EBADRPC);
2577 }
2578
2579 nd->nd_flag &= ~ND_KERBAUTH;
2580 /*
2581 * Handle auth_unix or auth_kerb.
2582 */
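/*
 * An AUTH_UNIX credential body (RFC 1831, Appendix A) carries a
 * stamp, the machine name (skipped below), a uid, a gid, and up
 * to RPCAUTH_UNIXGIDS supplemental gids.
 */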
2583 if (auth_type == rpc_auth_unix) {
2584 len = fxdr_unsigned(int, *++tl);
2585 if (len < 0 || len > NFS_MAXNAMLEN) {
2586 mbuf_freem(mrep);
2587 return (EBADRPC);
2588 }
2589 bzero(&temp_cred, sizeof(temp_cred));
2590 nfsm_adv(nfsm_rndup(len));
2591 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2592 user_id = fxdr_unsigned(uid_t, *tl++);
2593 group_id = fxdr_unsigned(gid_t, *tl++);
2594 temp_cred.cr_groups[0] = group_id;
2595 len = fxdr_unsigned(int, *tl);
2596 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2597 mbuf_freem(mrep);
2598 return (EBADRPC);
2599 }
2600 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2601 for (i = 1; i <= len; i++)
2602 if (i < NGROUPS)
2603 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2604 else
2605 tl++;
2606 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2607 if (ngroups > 1)
2608 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2609 len = fxdr_unsigned(int, *++tl);
2610 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2611 mbuf_freem(mrep);
2612 return (EBADRPC);
2613 }
2614 temp_cred.cr_uid = user_id;
2615 temp_cred.cr_ngroups = ngroups;
2616 nd->nd_cr = kauth_cred_create(&temp_cred);
2617 if (nd->nd_cr == NULL) {
2618 nd->nd_repstat = ENOMEM;
2619 nd->nd_procnum = NFSPROC_NOOP;
2620 return (0);
2621 }
2622 if (len > 0)
2623 nfsm_adv(nfsm_rndup(len));
2624 } else if (auth_type == rpc_auth_kerb) {
2625 switch (fxdr_unsigned(int, *tl++)) {
2626 case RPCAKN_FULLNAME:
2627 ticklen = fxdr_unsigned(int, *tl);
2628 *((u_long *)nfsd->nfsd_authstr) = *tl;
2629 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2630 &uio_buf[0], sizeof(uio_buf));
2631 if (!uiop) {
2632 nd->nd_repstat = ENOMEM;
2633 nd->nd_procnum = NFSPROC_NOOP;
2634 return (0);
2635 }
2636
2637 // LP64todo - fix this
2638 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2639 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2640 mbuf_freem(mrep);
2641 return (EBADRPC);
2642 }
2643 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2644 // LP64todo - fix this
2645 nfsm_mtouio(uiop, uio_resid(uiop));
2646 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2647 if (*tl++ != rpc_auth_kerb ||
2648 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2649 printf("Bad kerb verifier\n");
2650 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2651 nd->nd_procnum = NFSPROC_NOOP;
2652 return (0);
2653 }
2654 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2655 tl = (u_long *)cp;
2656 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2657 printf("Not fullname kerb verifier\n");
2658 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2659 nd->nd_procnum = NFSPROC_NOOP;
2660 return (0);
2661 }
2662 cp += NFSX_UNSIGNED;
2663 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2664 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2665 nd->nd_flag |= ND_KERBFULL;
2666 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2667 break;
2668 case RPCAKN_NICKNAME:
2669 if (len != 2 * NFSX_UNSIGNED) {
2670 printf("Kerb nickname short\n");
2671 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2672 nd->nd_procnum = NFSPROC_NOOP;
2673 return (0);
2674 }
2675 nickuid = fxdr_unsigned(uid_t, *tl);
2676 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2677 if (*tl++ != rpc_auth_kerb ||
2678 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2679 printf("Kerb nick verifier bad\n");
2680 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2681 nd->nd_procnum = NFSPROC_NOOP;
2682 return (0);
2683 }
2684 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2685 tvin.tv_sec = *tl++;
2686 tvin.tv_usec = *tl;
2687
2688 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2689 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2690 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2691 (!nd->nd_nam2 ||
2692 netaddr_match(NU_NETFAM(nuidp),
2693 &nuidp->nu_haddr, nd->nd_nam2)))
2694 break;
2695 }
2696 if (!nuidp) {
2697 nd->nd_repstat =
2698 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2699 nd->nd_procnum = NFSPROC_NOOP;
2700 return (0);
2701 }
2702
2703 /*
2704 * Now, decrypt the timestamp using the session key
2705 * and validate it.
2706 */
2707 #if NFSKERB
2708 XXX
2709 #endif
2710
2711 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2712 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2713 microtime(&now);
2714 if (nuidp->nu_expire < now.tv_sec ||
2715 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2716 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2717 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2718 nuidp->nu_expire = 0;
2719 nd->nd_repstat =
2720 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2721 nd->nd_procnum = NFSPROC_NOOP;
2722 return (0);
2723 }
2724 bzero(&temp_cred, sizeof(temp_cred));
2725 ngroups = nuidp->nu_cr->cr_ngroups;
2726 for (i = 0; i < ngroups; i++)
2727 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2728 if (ngroups > 1)
2729 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2730
2731 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2732 temp_cred.cr_ngroups = ngroups;
2733 nd->nd_cr = kauth_cred_create(&temp_cred);
2734 if (!nd->nd_cr) {
2735 nd->nd_repstat = ENOMEM;
2736 nd->nd_procnum = NFSPROC_NOOP;
2737 return (0);
2738 }
2739 nd->nd_flag |= ND_KERBNICK;
2740 }
2741 } else {
2742 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2743 nd->nd_procnum = NFSPROC_NOOP;
2744 return (0);
2745 }
2746
2747 nd->nd_md = md;
2748 nd->nd_dpos = dpos;
2749 return (0);
2750 nfsmout:
2751 if (nd->nd_cr)
2752 kauth_cred_rele(nd->nd_cr);
2753 return (error);
2754 }
2755
2756 /*
2757 * Search for a sleeping nfsd and wake it up.
2758 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2759 * running nfsds will go look for the work in the nfssvc_sock list.
2760 * Note: Must be called with nfsd_mutex held.
2761 */
2762 void
2763 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2764 {
2765 struct nfsd *nd;
2766
2767 if ((slp->ns_flag & SLP_VALID) == 0)
2768 return;
2769
2770 lck_rw_lock_exclusive(&slp->ns_rwlock);
2771
2772 if (nfsd_waiting) {
2773 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2774 if (nd->nfsd_flag & NFSD_WAITING) {
2775 nd->nfsd_flag &= ~NFSD_WAITING;
2776 if (nd->nfsd_slp)
2777 panic("nfsd wakeup");
2778 slp->ns_sref++;
2779 nd->nfsd_slp = slp;
2780 lck_rw_done(&slp->ns_rwlock);
2781 wakeup((caddr_t)nd);
2782 return;
2783 }
2784 }
2785 }
2786
2787 slp->ns_flag |= SLP_DOREC;
2788
2789 lck_rw_done(&slp->ns_rwlock);
2790
2791 nfsd_head_flag |= NFSD_CHECKSLP;
2792 }
2793 #endif /* NFS_NOSERVER */
2794
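/*
 * Report an NFS server condition to the user: tprintf() the message
 * to the controlling terminal of "p" (if any) and to the system log.
 */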
2795 static int
2796 nfs_msg(proc_t p,
2797 const char *server,
2798 const char *msg,
2799 int error)
2800 {
2801 tpr_t tpr;
2802
2803 if (p)
2804 tpr = tprintf_open(p);
2805 else
2806 tpr = NULL;
2807 if (error)
2808 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2809 error);
2810 else
2811 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2812 tprintf_close(tpr);
2813 return (0);
2814 }
2815
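/*
 * Mark an NFS mount as unresponsive: post VQ_NOTRESP and/or
 * VQ_NOTRESPLOCK the first time the corresponding timeout flag is
 * set, then log a message for the user.
 */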
2816 void
2817 nfs_down(nmp, proc, error, flags, msg)
2818 struct nfsmount *nmp;
2819 proc_t proc;
2820 int error, flags;
2821 const char *msg;
2822 {
2823 if (nmp == NULL)
2824 return;
2825 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2826 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2827 nmp->nm_state |= NFSSTA_TIMEO;
2828 }
2829 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2830 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2831 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2832 }
2833 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2834 }
2835
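/*
 * Mark an NFS mount as responsive again: log the message (if any),
 * then clear whichever timeout flags were set and post the matching
 * "server OK" vfs events.
 */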
2836 void
2837 nfs_up(nmp, proc, flags, msg)
2838 struct nfsmount *nmp;
2839 proc_t proc;
2840 int flags;
2841 const char *msg;
2842 {
2843 if (nmp == NULL)
2844 return;
2845 if (msg)
2846 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2847 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2848 nmp->nm_state &= ~NFSSTA_TIMEO;
2849 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2850 }
2851 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2852 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2853 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2854 }
2855 }
2856