]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_socket.c
1f4fda19e576592c72b2d928476a4e13f2409682
[apple/xnu.git] / bsd / nfs / nfs_socket.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1991, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
66 */
67
68 /*
69 * Socket operations for use by nfs
70 */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc.h>
75 #include <sys/kauth.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/kpi_mbuf.h>
79 #include <sys/malloc.h>
80 #include <sys/vnode.h>
81 #include <sys/domain.h>
82 #include <sys/protosw.h>
83 #include <sys/socket.h>
84 #include <sys/syslog.h>
85 #include <sys/tprintf.h>
86 #include <sys/uio_internal.h>
87 #include <libkern/OSAtomic.h>
88
89 #include <sys/time.h>
90 #include <kern/clock.h>
91 #include <kern/task.h>
92 #include <kern/thread.h>
93 #include <sys/user.h>
94
95 #include <netinet/in.h>
96 #include <netinet/tcp.h>
97
98 #include <nfs/rpcv2.h>
99 #include <nfs/nfsproto.h>
100 #include <nfs/nfs.h>
101 #include <nfs/xdr_subs.h>
102 #include <nfs/nfsm_subs.h>
103 #include <nfs/nfsmount.h>
104 #include <nfs/nfsnode.h>
105 #include <nfs/nfsrtt.h>
106
107 #include <sys/kdebug.h>
108
109 #define FSDBG(A, B, C, D, E) \
110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
111 (int)(B), (int)(C), (int)(D), (int)(E), 0)
112 #define FSDBG_TOP(A, B, C, D, E) \
113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
114 (int)(B), (int)(C), (int)(D), (int)(E), 0)
115 #define FSDBG_BOT(A, B, C, D, E) \
116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
117 (int)(B), (int)(C), (int)(D), (int)(E), 0)
118
119 /*
120 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
121 * Use the mean and mean deviation of rtt for the appropriate type of rpc
122 * for the frequent rpcs and a default for the others.
123 * The justification for doing "other" this way is that these rpcs
124 * happen so infrequently that timer est. would probably be stale.
125 * Also, since many of these rpcs are
126 * non-idempotent, a conservative timeout is desired.
127 * getattr, lookup - A+2D
128 * read, write - A+4D
129 * other - nm_timeo
130 */
131 #define NFS_RTO(n, t) \
132 ((t) == 0 ? (n)->nm_timeo : \
133 ((t) < 3 ? \
134 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
135 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
136 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
137 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
138 /*
139 * External data, mostly RPC constants in XDR form
140 */
141 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
142 rpc_msgaccepted, rpc_call, rpc_autherr,
143 rpc_auth_kerb;
144 extern u_long nfs_prog;
145 extern struct nfsstats nfsstats;
146 extern int nfsv3_procid[NFS_NPROCS];
147 extern int nfs_ticks;
148 extern u_long nfs_xidwrap;
149
150 /*
151 * Defines which timer to use for the procnum.
152 * 0 - default
153 * 1 - getattr
154 * 2 - lookup
155 * 3 - read
156 * 4 - write
157 */
158 static int proct[NFS_NPROCS] = {
159 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
160 };
161
162 /*
163 * There is a congestion window for outstanding rpcs maintained per mount
164 * point. The cwnd size is adjusted in roughly the way that:
165 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
166 * SIGCOMM '88". ACM, August 1988.
167 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
168 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
169 * of rpcs is in progress.
170 * (The sent count and cwnd are scaled for integer arith.)
171 * Variants of "slow start" were tried and were found to be too much of a
172 * performance hit (ave. rtt 3 times larger),
173 * I suspect due to the large rtt that nfs rpcs have.
174 */
175 #define NFS_CWNDSCALE 256
176 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
177 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
178 int nfsrtton = 0;
179 struct nfsrtt nfsrtt;
180
181 static int nfs_rcvlock(struct nfsreq *);
182 static void nfs_rcvunlock(struct nfsreq *);
183 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
184 static int nfs_reconnect(struct nfsreq *rep);
185 static void nfs_repdequeue(struct nfsreq *rep);
186
187 /* XXX */
188 boolean_t current_thread_aborted(void);
189 kern_return_t thread_terminate(thread_t);
190
191 #ifndef NFS_NOSERVER
192 static int nfsrv_getstream(struct nfssvc_sock *,int);
193
194 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
195 struct nfssvc_sock *slp,
196 proc_t procp,
197 mbuf_t *mreqp) = {
198 nfsrv_null,
199 nfsrv_getattr,
200 nfsrv_setattr,
201 nfsrv_lookup,
202 nfsrv3_access,
203 nfsrv_readlink,
204 nfsrv_read,
205 nfsrv_write,
206 nfsrv_create,
207 nfsrv_mkdir,
208 nfsrv_symlink,
209 nfsrv_mknod,
210 nfsrv_remove,
211 nfsrv_rmdir,
212 nfsrv_rename,
213 nfsrv_link,
214 nfsrv_readdir,
215 nfsrv_readdirplus,
216 nfsrv_statfs,
217 nfsrv_fsinfo,
218 nfsrv_pathconf,
219 nfsrv_commit,
220 nfsrv_noop
221 };
222 #endif /* NFS_NOSERVER */
223
224
225 /*
226 * attempt to bind a socket to a reserved port
227 */
228 static int
229 nfs_bind_resv(struct nfsmount *nmp)
230 {
231 socket_t so = nmp->nm_so;
232 struct sockaddr_in sin;
233 int error;
234 u_short tport;
235
236 if (!so)
237 return (EINVAL);
238
239 sin.sin_len = sizeof (struct sockaddr_in);
240 sin.sin_family = AF_INET;
241 sin.sin_addr.s_addr = INADDR_ANY;
242 tport = IPPORT_RESERVED - 1;
243 sin.sin_port = htons(tport);
244
245 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
246 (--tport > IPPORT_RESERVED / 2))
247 sin.sin_port = htons(tport);
248 return (error);
249 }
250
251 /*
252 * variables for managing the nfs_bind_resv_thread
253 */
254 int nfs_resv_mounts = 0;
255 static int nfs_bind_resv_thread_state = 0;
256 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
257 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
258 lck_grp_t *nfs_bind_resv_lck_grp;
259 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
260 lck_attr_t *nfs_bind_resv_lck_attr;
261 lck_mtx_t *nfs_bind_resv_mutex;
262 struct nfs_bind_resv_request {
263 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
264 struct nfsmount *brr_nmp;
265 int brr_error;
266 };
267 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
268
/*
 * Kernel thread that services reserved-port bind requests queued by
 * nfs_bind_resv_nopriv().  Runs for as long as there are mounts that
 * need reserved ports (nfs_resv_mounts > 0); when the count drops to
 * zero it resets the state to INITTED and terminates itself.
 */
static void
nfs_bind_resv_thread(void)
{
	struct nfs_bind_resv_request *brreq;

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;

	while (nfs_resv_mounts > 0) {
		lck_mtx_lock(nfs_bind_resv_mutex);
		/* drain all queued requests before going back to sleep */
		while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
			TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
			/* drop the mutex across the (potentially slow) bind */
			lck_mtx_unlock(nfs_bind_resv_mutex);
			brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
			/* requester is tsleep()ing on &brreq; wake it */
			wakeup(brreq);
			lck_mtx_lock(nfs_bind_resv_mutex);
		}
		/*
		 * PDROP: msleep releases nfs_bind_resv_mutex before sleeping
		 * and does NOT reacquire it on wakeup — the loop re-locks at
		 * the top of the next iteration.
		 */
		msleep((caddr_t)&nfs_bind_resv_request_queue,
			nfs_bind_resv_mutex, PSOCK | PDROP,
			"nfs_bind_resv_request_queue", 0);
	}

	/* no more reserved-port mounts: mark thread as no longer running */
	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
	(void) thread_terminate(current_thread());
}
296
297 int
298 nfs_bind_resv_thread_wake(void)
299 {
300 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
301 return (EIO);
302 wakeup(&nfs_bind_resv_request_queue);
303 return (0);
304 }
305
306 /*
307 * underprivileged procs call this to request nfs_bind_resv_thread
308 * to perform the reserved port binding for them.
309 */
310 static int
311 nfs_bind_resv_nopriv(struct nfsmount *nmp)
312 {
313 struct nfs_bind_resv_request brreq;
314 int error;
315
316 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
317 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
318 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
319 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
320 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
321 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
322 TAILQ_INIT(&nfs_bind_resv_request_queue);
323 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
324 }
325 kernel_thread(kernel_task, nfs_bind_resv_thread);
326 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
327 }
328
329 brreq.brr_nmp = nmp;
330 brreq.brr_error = 0;
331
332 lck_mtx_lock(nfs_bind_resv_mutex);
333 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
334 lck_mtx_unlock(nfs_bind_resv_mutex);
335
336 error = nfs_bind_resv_thread_wake();
337 if (error) {
338 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
339 /* Note: we might be able to simply restart the thread */
340 return (error);
341 }
342
343 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
344
345 return (brreq.brr_error);
346 }
347
/*
 * Initialize sockets and congestion for a new NFS connection.
 * Creates the socket, optionally binds to a reserved port, connects
 * (unless NFSMNT_NOCONN), sets receive/send timeouts, sizes the socket
 * buffers, and resets the mount's RTT/congestion-window state.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(
	struct nfsmount *nmp,
	__unused struct nfsreq *rep)
{
	socket_t so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct timeval timeo;

	nmp->nm_so = 0;
	saddr = mbuf_data(nmp->nm_nam);
	/* create a socket matching the mount's address family/type/protocol */
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
		nmp->nm_soproto, 0, 0, &nmp->nm_so);
	if (error) {
		goto bad;
	}
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		proc_t p;
		/*
		 * sobind() requires current_proc() to have superuser privs.
		 * If this bind is part of a reconnect, and the current proc
		 * doesn't have superuser privs, we hand the sobind() off to
		 * a kernel thread to process.
		 */
		if ((nmp->nm_state & NFSSTA_MOUNTED) &&
		    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
			/* request nfs_bind_resv_thread() to do bind */
			error = nfs_bind_resv_nopriv(nmp);
		} else {
			error = nfs_bind_resv(nmp);
		}
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		struct timeval tv;
		tv.tv_sec = 2;
		tv.tv_usec = 0;
		/* non-blocking connect; completion is polled in 2s slices below */
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && error != EINPROGRESS) {
			goto bad;
		}

		while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
			/* abort the wait if the request has been interrupted */
			if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
				goto bad;
			}
		}
	}

	/*
	 * Always time out on receive, this allows us to reconnect the
	 * socket to deal with network changes.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = 2;
	/* NOTE(review): the error from this setsockopt is overwritten by the
	 * SO_SNDTIMEO call below and neither is checked — presumably
	 * intentional best-effort; confirm before changing. */
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
		timeo.tv_sec = 5;
	} else {
		timeo.tv_sec = 0;	/* hard, non-interruptible mounts: no send timeout */
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));

	/* size socket buffers: room for 3 requests out, readahead+1 replies in */
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else {
		int proto;
		int on = 1;

		sock_gettype(so, NULL, NULL, &proto);
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");

		// Assume that SOCK_STREAM always requires a connection
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

		if (proto == IPPROTO_TCP) {
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
		}

		/* streams also carry a 4-byte RPC record mark per message */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	}

	if (sndreserve > NFS_MAXSOCKBUF)
		sndreserve = NFS_MAXSOCKBUF;
	if (rcvreserve > NFS_MAXSOCKBUF)
		rcvreserve = NFS_MAXSOCKBUF;
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
	if (error) {
		goto bad;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
	if (error) {
		goto bad;
	}

	sock_nointerrupt(so, 1);

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}
490
/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
static int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	/* retry the connect until it succeeds or a fatal condition occurs */
	while ((error = nfs_connect(nmp, rep))) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		if (error == EIO)
			return (EIO);
		/* mark the server as unresponsive and log the condition */
		nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
			"can not connect");
		rep->r_flags |= R_TPRINTFMSG;
		if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
			/* we're not yet completely mounted and */
			/* we can't reconnect, so we fail */
			return (error);
		}
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
			return (error);
		/* pause until the next lbolt tick before retrying */
		tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp)
			rp->r_flags |= R_MUSTRESEND;
	}
	return (0);
}
536
537 /*
538 * NFS disconnect. Clean up and unlink.
539 */
540 void
541 nfs_disconnect(struct nfsmount *nmp)
542 {
543 socket_t so;
544
545 if (nmp->nm_so) {
546 so = nmp->nm_so;
547 nmp->nm_so = 0;
548 sock_shutdown(so, 2);
549 sock_close(so);
550 }
551 }
552
/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 *
 * Consumes (frees) "top" in all cases, either via sock_sendmbuf() or
 * explicitly on the early-error paths.
 */
int
nfs_send(so, nam, top, rep)
	socket_t so;
	mbuf_t nam;
	mbuf_t top;
	struct nfsreq *rep;
{
	struct sockaddr *sendnam;
	int error, error2, sotype, flags;
	u_long xidqueued = 0;		/* xid of rep if it was on the request queue */
	struct nfsreq *rp;
	char savenametolog[MAXPATHLEN];
	struct msghdr msg;

	if (rep) {
		/* bail early if the request has already been interrupted */
		error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
		if (error) {
			mbuf_freem(top);
			return (error);
		}
		/* no socket: mark for resend and report success (caller retries) */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			mbuf_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		/* remember the xid so we can sanity-check the queue after blocking */
		TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
			if (rp == rep)
				break;
		if (rp)
			xidqueued = rp->r_xid;
	}
	sock_gettype(so, NULL, &sotype, NULL);
	/* connected (or stream) sockets need no destination address */
	if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
		(nam == 0))
		sendnam = (struct sockaddr *)0;
	else
		sendnam = mbuf_data(nam);

	if (sotype == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	/*
	 * Save the name here in case mount point goes away if we block.
	 * The name is using local stack and is large, but don't
	 * want to block if we malloc.
	 * NOTE(review): strncpy does not guarantee NUL-termination when the
	 * source is MAXPATHLEN-1 chars or longer — relies on f_mntfromname
	 * being shorter; confirm before reusing this pattern.
	 */
	if (rep)
		strncpy(savenametolog,
			vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
			MAXPATHLEN - 1);
	bzero(&msg, sizeof(msg));
	msg.msg_name = (caddr_t)sendnam;
	msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
	/* sock_sendmbuf consumes "top" regardless of outcome */
	error = sock_sendmbuf(so, &msg, top, flags, NULL);

	if (error) {
		if (rep) {
			if (xidqueued) {
				/* verify the request did not vanish while we blocked */
				TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
					if (rp == rep && rp->r_xid == xidqueued)
						break;
				if (!rp)
					panic("nfs_send: error %d xid %x gone",
						error, xidqueued);
			}
			log(LOG_INFO, "nfs send error %d for server %s\n",
				error, savenametolog);
			/*
			 * Deal with errors for the client side.
			 */
			error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
			if (error2) {
				error = error2;
			} else {
				rep->r_flags |= R_MUSTRESEND;
			}
		} else
			log(LOG_INFO, "nfsd send error %d\n", error);

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 * Anything other than these fatal errors is swallowed so the
		 * caller will retry the request.
		 */
		if (error != EINTR && error != ERESTART && error != EIO &&
			error != EWOULDBLOCK && error != EPIPE) {
			error = 0;
		}
	}
	return (error);
}
658
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 * small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 *
 * On success *mp holds the assembled message; on error *mp is NULL and
 * any partial data has been freed.  May trigger resends and reconnects.
 */
static int
nfs_receive(struct nfsreq *rep, mbuf_t *mp)
{
	socket_t so;
	struct iovec_32 aio;
	mbuf_t m, mlast;
	u_long len, fraglen;
	int error, error2, sotype;
	proc_t p = current_proc();	/* XXX */
	struct msghdr msg;
	size_t rcvlen;
	int lastfragment;

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
			nfs_sndunlock(rep);
			if (error)
				return (error);
			/* reply already arrived while we were locking */
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			/* socket gone — try to reconnect, then start over */
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(rep);
				return (error);
			}
			goto tryagain;
		}
		/* resend the request as long as it is flagged R_MUSTRESEND */
		while (rep->r_flags & R_MUSTRESEND) {
			error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
			if (!error) {
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
				error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			}
			/*
			 * we also hold rcv lock so rep is still
			 * legit this point
			 */
			if (error) {
				if (error == EINTR || error == ERESTART ||
					(error = nfs_reconnect(rep))) {
					nfs_sndunlock(rep);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(rep);
		if (sotype == SOCK_STREAM) {
			error = 0;
			len = 0;
			lastfragment = 0;
			mlast = NULL;
			/* read RPC record fragments until the last-fragment bit */
			while (!error && !lastfragment) {
				/* first read the 4-byte record mark */
				aio.iov_base = (uintptr_t) &fraglen;
				aio.iov_len = sizeof(u_long);
				bzero(&msg, sizeof(msg));
				msg.msg_iov = (struct iovec *) &aio;
				msg.msg_iovlen = 1;
				do {
					error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */
						goto shutout;
					if (error == EWOULDBLOCK && rep) {
						error2 = nfs_sigintr(rep->r_nmp, rep, p);
						if (error2)
							error = error2;
					}
				} while (error == EWOULDBLOCK);
				if (!error && rcvlen < aio.iov_len) {
					/* only log a message if we got a partial word */
					if (rcvlen != 0)
						log(LOG_INFO,
							"short receive (%d/%d) from nfs server %s\n",
							rcvlen, sizeof(u_long),
							vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
				}
				if (error)
					goto errout;
				/* high bit of the record mark flags the final fragment */
				lastfragment = ntohl(fraglen) & 0x80000000;
				fraglen = ntohl(fraglen) & ~0x80000000;
				len += fraglen;
				/*
				 * This is SERIOUS! We are out of sync with the sender
				 * and forcing a disconnect/reconnect is all I can do.
				 */
				if (len > NFS_MAXPACKET) {
					log(LOG_ERR, "%s (%d) from nfs server %s\n",
						"impossible RPC record length", len,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EFBIG;
					goto errout;
				}

				/* now read the fragment payload itself */
				m = NULL;
				do {
					rcvlen = fraglen;
					error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */ {
						goto shutout;
					}
				} while (error == EWOULDBLOCK || error == EINTR ||
					error == ERESTART);

				if (!error && fraglen > rcvlen) {
					log(LOG_INFO,
						"short receive (%d/%d) from nfs server %s\n",
						rcvlen, fraglen,
						vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
					mbuf_freem(m);
				}
				if (!error) {
					/* append this fragment to the assembled chain in *mp */
					if (!*mp) {
						*mp = m;
						mlast = m;
					} else {
						error = mbuf_setnext(mlast, m);
						if (error) {
							printf("nfs_receive: mbuf_setnext failed %d\n", error);
							mbuf_freem(m);
						}
					}
					/* advance mlast to the tail for the next append */
					while (mbuf_next(mlast))
						mlast = mbuf_next(mlast);
				}
			}
		} else {
			/* connected non-stream (e.g. SOCK_SEQPACKET): one message */
			bzero(&msg, sizeof(msg));
			do {
				rcvlen = 100000000;
				error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
				if (!rep->r_nmp) /* if unmounted then bailout */ {
					goto shutout;
				}
				if (error == EWOULDBLOCK && rep) {
					error2 = nfs_sigintr(rep->r_nmp, rep, p);
					if (error2) {
						return (error2);
					}
				}
			} while (error == EWOULDBLOCK);

			if ((msg.msg_flags & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len = rcvlen;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			/* non-fatal stream error: drop partial data and reconnect */
			mbuf_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
					"receive error %d from nfs server %s\n", error,
					vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(rep);
			}
		}
	} else {
		/*
		 * We could have failed while rebinding the datagram socket
		 * so we need to attempt to rebind here.
		 */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				nfs_sndunlock(rep);
			}
			if (error)
				return (error);
			if (!rep->r_nmp) /* if unmounted then bailout */
				return (ENXIO);
			so = rep->r_nmp->nm_so;
		}
		bzero(&msg, sizeof(msg));
		len = 0;
		do {
			rcvlen = 1000000;
			error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
			if (!rep->r_nmp) /* if unmounted then bailout */
				goto shutout;
			if (error) {
				error2 = nfs_sigintr(rep->r_nmp, rep, p);
				if (error2) {
					error = error2;
					goto shutout;
				}
			}
			/* Reconnect for all errors. We may be receiving
			 * soft/hard/blocking errors because of a network
			 * change.
			 * XXX: we should rate limit or delay this
			 * to once every N attempts or something.
			 * although TCP doesn't seem to.
			 */
			if (error) {
				error2 = nfs_sndlock(rep);
				if (!error2) {
					error2 = nfs_reconnect(rep);
					if (error2)
						error = error2;
					else if (!rep->r_nmp) /* if unmounted then bailout */
						error = ENXIO;
					else
						so = rep->r_nmp->nm_so;
					nfs_sndunlock(rep);
				} else {
					error = error2;
				}
			}
		} while (error == EWOULDBLOCK);
	}
shutout:
	/* on any error, make sure no partial message escapes to the caller */
	if (error) {
		mbuf_freem(*mp);
		*mp = NULL;
	}
	return (error);
}
924
925 /*
926 * Implement receipt of reply on a socket.
927 * We must search through the list of received datagrams matching them
928 * with outstanding requests using the xid, until ours is found.
929 */
930 /* ARGSUSED */
931 int
932 nfs_reply(myrep)
933 struct nfsreq *myrep;
934 {
935 struct nfsreq *rep;
936 struct nfsmount *nmp = myrep->r_nmp;
937 long t1;
938 mbuf_t mrep, md;
939 u_long rxid, *tl;
940 caddr_t dpos, cp2;
941 int error;
942
943 /*
944 * Loop around until we get our own reply
945 */
946 for (;;) {
947 /*
948 * Lock against other receivers so that I don't get stuck in
949 * sbwait() after someone else has received my reply for me.
950 * Also necessary for connection based protocols to avoid
951 * race conditions during a reconnect.
952 * If nfs_rcvlock() returns EALREADY, that means that
953 * the reply has already been recieved by another
954 * process and we can return immediately. In this
955 * case, the lock is not taken to avoid races with
956 * other processes.
957 */
958 error = nfs_rcvlock(myrep);
959 if (error == EALREADY)
960 return (0);
961 if (error)
962 return (error);
963
964 /*
965 * If we slept after putting bits otw, then reply may have
966 * arrived. In which case returning is required, or we
967 * would hang trying to nfs_receive an already received reply.
968 */
969 if (myrep->r_mrep != NULL) {
970 nfs_rcvunlock(myrep);
971 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
972 return (0);
973 }
974 /*
975 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
976 * is still intact by checks done in nfs_rcvlock.
977 */
978 error = nfs_receive(myrep, &mrep);
979 /*
980 * Bailout asap if nfsmount struct gone (unmounted).
981 */
982 if (!myrep->r_nmp) {
983 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
984 if (mrep)
985 mbuf_freem(mrep);
986 return (ENXIO);
987 }
988 if (error) {
989 FSDBG(530, myrep->r_xid, myrep, nmp, error);
990 nfs_rcvunlock(myrep);
991
992 /* Bailout asap if nfsmount struct gone (unmounted). */
993 if (!myrep->r_nmp) {
994 if (mrep)
995 mbuf_freem(mrep);
996 return (ENXIO);
997 }
998
999 /*
1000 * Ignore routing errors on connectionless protocols??
1001 */
1002 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1003 if (nmp->nm_so) {
1004 int clearerror;
1005 int optlen = sizeof(clearerror);
1006 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1007 }
1008 continue;
1009 }
1010 if (mrep)
1011 mbuf_freem(mrep);
1012 return (error);
1013 }
1014
1015 /*
1016 * We assume all is fine, but if we did not have an error
1017 * and mrep is 0, better not dereference it. nfs_receive
1018 * calls soreceive which carefully sets error=0 when it got
1019 * errors on sbwait (tsleep). In most cases, I assume that's
1020 * so we could go back again. In tcp case, EPIPE is returned.
1021 * In udp, case nfs_receive gets back here with no error and no
1022 * mrep. Is the right fix to have soreceive check for process
1023 * aborted after sbwait and return something non-zero? Should
1024 * nfs_receive give an EPIPE? Too risky to play with those
1025 * two this late in game for a shutdown problem. Instead,
1026 * just check here and get out. (ekn)
1027 */
1028 if (!mrep) {
1029 nfs_rcvunlock(myrep);
1030 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1031 return (ENXIO); /* sounds good */
1032 }
1033
1034 /*
1035 * Get the xid and check that it is an rpc reply
1036 */
1037 md = mrep;
1038 dpos = mbuf_data(md);
1039 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1040 rxid = *tl++;
1041 if (*tl != rpc_reply) {
1042 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1043 mbuf_freem(mrep);
1044 nfsmout:
1045 if (nmp->nm_state & NFSSTA_RCVLOCK)
1046 nfs_rcvunlock(myrep);
1047 continue;
1048 }
1049
1050 /*
1051 * Loop through the request list to match up the reply
1052 * Iff no match, just drop the datagram
1053 */
1054 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1055 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1056 /* Found it.. */
1057 rep->r_mrep = mrep;
1058 rep->r_md = md;
1059 rep->r_dpos = dpos;
1060 /*
1061 * If we're tracking the round trip time
1062 * then we update the circular log here
1063 * with the stats from our current request.
1064 */
1065 if (nfsrtton) {
1066 struct rttl *rt;
1067
1068 rt = &nfsrtt.rttl[nfsrtt.pos];
1069 rt->proc = rep->r_procnum;
1070 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1071 rt->sent = nmp->nm_sent;
1072 rt->cwnd = nmp->nm_cwnd;
1073 if (proct[rep->r_procnum] == 0)
1074 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1075 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1076 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1077 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1078 microtime(&rt->tstamp); // XXX unused
1079 if (rep->r_flags & R_TIMING)
1080 rt->rtt = rep->r_rtt;
1081 else
1082 rt->rtt = 1000000;
1083 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1084 }
1085 /*
1086 * Update congestion window.
1087 * Do the additive increase of
1088 * one rpc/rtt.
1089 */
1090 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1091 nmp->nm_cwnd);
1092 if (nmp->nm_cwnd <= nmp->nm_sent) {
1093 nmp->nm_cwnd +=
1094 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1095 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1096 if (nmp->nm_cwnd > NFS_MAXCWND)
1097 nmp->nm_cwnd = NFS_MAXCWND;
1098 }
1099 if (rep->r_flags & R_SENT) {
1100 rep->r_flags &= ~R_SENT;
1101 nmp->nm_sent -= NFS_CWNDSCALE;
1102 }
1103 /*
1104 * Update rtt using a gain of 0.125 on the mean
1105 * and a gain of 0.25 on the deviation.
1106 */
1107 if (rep->r_flags & R_TIMING) {
1108 /*
1109 * Since the timer resolution of
1110 * NFS_HZ is so course, it can often
1111 * result in r_rtt == 0. Since
1112 * r_rtt == N means that the actual
1113 * rtt is between N+dt and N+2-dt ticks,
1114 * add 1.
1115 */
1116 if (proct[rep->r_procnum] == 0)
1117 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1118 t1 = rep->r_rtt + 1;
1119 t1 -= (NFS_SRTT(rep) >> 3);
1120 NFS_SRTT(rep) += t1;
1121 if (t1 < 0)
1122 t1 = -t1;
1123 t1 -= (NFS_SDRTT(rep) >> 2);
1124 NFS_SDRTT(rep) += t1;
1125 }
1126 nmp->nm_timeouts = 0;
1127 break;
1128 }
1129 }
1130 nfs_rcvunlock(myrep);
1131 /*
1132 * If not matched to a request, drop it.
1133 * If it's mine, get out.
1134 */
1135 if (rep == 0) {
1136 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1137 mbuf_freem(mrep);
1138 } else if (rep == myrep) {
1139 if (rep->r_mrep == NULL)
1140 panic("nfs_reply: nil r_mrep");
1141 return (0);
1142 }
1143 FSDBG(530, myrep->r_xid, myrep, rep,
1144 rep ? rep->r_xid : myrep->r_flags);
1145 }
1146 }
1147
1148 /*
1149 * nfs_request - goes something like this
1150 * - fill in request struct
1151 * - links it into list
1152 * - calls nfs_send() for first transmit
1153 * - calls nfs_receive() to get reply
1154 * - break down rpc header and return with nfs reply pointed to
1155 * by mrep or error
1156 * nb: always frees up mreq mbuf list
1157 */
/*
 * nfs_request:
 *   Build an RPC request from 'mrest', transmit it, wait for the matching
 *   reply, and parse the RPC-level header of that reply.
 *
 *   vp/mp     - vnode or mount identifying the target nfsmount
 *   mrest     - mbuf chain holding the NFS procedure arguments
 *               (always consumed: freed on every error path, or linked
 *               into the request that is freed before return)
 *   procnum   - NFS procedure number
 *   procp/cred- calling process and credential
 *   mrp/mdp/dposp - on success, the reply mbuf chain and parse position
 *   xidp      - if non-NULL, receives the 64-bit xid (wrap count in
 *               the high 32 bits)
 *
 *   Returns 0 on success, an errno on failure.  NFSERR_RETERR may be
 *   OR'd into the error for NFSv3 so the caller can still parse the
 *   reply (post-op attributes).
 *
 *   NOTE: the nfsm_dissect/nfsm_adv macros below can branch to the
 *   'nfsmout' label on a malformed reply.
 */
int
nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
	vnode_t vp;
	mount_t mp;
	mbuf_t mrest;
	int procnum;
	proc_t procp;
	kauth_cred_t cred;
	mbuf_t *mrp;
	mbuf_t *mdp;
	caddr_t *dposp;
	u_int64_t *xidp;
{
	mbuf_t m, mrep, m2;
	struct nfsreq re, *rep;		/* request record lives on our stack */
	u_long *tl;
	int i;
	struct nfsmount *nmp;
	mbuf_t md, mheadend;
	char nickv[RPCX_NICKVERF];
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
	int verf_len, verf_type;
	u_long xid;
	char *auth_str, *verf_str;
	NFSKERBKEY_T key;		/* save session key */
	int nmsotype;
	struct timeval now;

	if (mrp)
		*mrp = NULL;
	if (xidp)
		*xidp = 0;
	nmp = VFSTONFS(mp);

	rep = &re;

	if (vp)
		nmp = VFSTONFS(vnode_mount(vp));
	/* bail if unmounted, or force-unmounting and already timed out */
	if (nmp == NULL ||
	    (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		mbuf_freem(mrest);
		return (ENXIO);
	}
	nmsotype = nmp->nm_sotype;

	FSDBG_TOP(531, vp, procnum, nmp, rep);

	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;
	microuptime(&now);
	/* backdate r_lastmsg so the first "not responding" message isn't delayed */
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	/* total up the length of the argument mbuf chain */
	i = 0;
	m = mrest;
	while (m) {
		i += mbuf_len(m);
		m = mbuf_next(m);
	}
	mrest_len = i;

	/*
	 * Get the RPC header with authorization.
	 */
kerbauth:
	/* re-fetch nmp: the mount may have gone away while we slept/authed */
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (!nmp) {
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		mbuf_freem(mrest);
		return (ENXIO);
	}
	verf_str = auth_str = (char *)0;
	if (nmp->nm_flag & NFSMNT_KERB) {
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)key, sizeof (key));
		/* fall back to full auth if the nickname auth failed (or previously failed) */
		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
			&auth_len, verf_str, verf_len)) {
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!nmp) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (ENXIO);
			}
			error = nfs_getauth(nmp, rep, cred, &auth_str,
			    &auth_len, verf_str, &verf_len, key);
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!error && !nmp)
				error = ENXIO;
			if (error) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (error);
			}
		}
	} else {
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		/* AUTH_UNIX length: uid/gid plus up to nm_numgrps supplementary groups */
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
	if (auth_str)
		_FREE(auth_str, M_TEMP);
	if (error) {
		mbuf_freem(mrest);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}
	if (xidp)
		*xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmsotype == SOCK_STREAM) {
		error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
		if (error) {
			mbuf_freem(m);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}
		/* high bit = last fragment, low 31 bits = fragment length */
		*((u_long*)mbuf_data(m)) =
			htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	/* only time procedures that have an RTT class (proct entry) */
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
		   (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		   nmp->nm_sent < nmp->nm_cwnd)) {
		int connrequired = (nmp->nm_sotype == SOCK_STREAM);

		if (connrequired)
			error = nfs_sndlock(rep);

		/*
		 * Set the R_SENT before doing the send in case another thread
		 * processes the reply before the nfs_send returns here
		 */
		if (!error) {
			if ((rep->r_flags & R_MUSTRESEND) == 0) {
				FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
				      nmp->nm_cwnd);
				nmp->nm_sent += NFS_CWNDSCALE;
				rep->r_flags |= R_SENT;
			}

			/* send a copy so the original is kept for retransmits */
			error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
			if (!error)
				error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (connrequired)
				nfs_sndunlock(rep);
		}
		nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
		if (error) {
			/* send failed: give back the congestion-window slot */
			if (nmp)
				nmp->nm_sent -= NFS_CWNDSCALE;
			rep->r_flags &= ~R_SENT;
		}
	} else {
		rep->r_rtt = -1;	/* not sent yet; timer will send it */
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	nfs_repdequeue(rep);

	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		if (nmp) {
			FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
			nmp->nm_sent -= NFS_CWNDSCALE;
		}
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error)
		nfs_up(nmp, procp, NFSSTA_TIMEO,
		    (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (!error && !nmp)
		error = ENXIO;
	if (error) {
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
			/* one retry with fresh full Kerberos credentials */
			if (!failed_auth) {
				failed_auth++;
				/* detach the old header from the args before rebuilding */
				error = mbuf_setnext(mheadend, NULL);
				mbuf_freem(mrep);
				mbuf_freem(rep->r_mreq);
				if (!error)
					goto kerbauth;
				printf("nfs_request: mbuf_setnext failed\n");
			} else
				error = EAUTH;
		} else
			error = EACCES;
		mbuf_freem(mrep);
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * Grab any Kerberos verifier, otherwise just throw it away.
	 */
	verf_type = fxdr_unsigned(int, *tl++);
	i = fxdr_unsigned(int, *tl);
	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
		if (error)
			goto nfsmout;
	} else if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* skip verifier body */
	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			/* NFSv3 JUKEBOX-style "try later": back off and resend */
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				mbuf_freem(mrep);
				error = 0;
				microuptime(&now);
				waituntil = now.tv_sec + trylater_delay;
				while (now.tv_sec < waituntil) {
					tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
					microuptime(&now);
				}
				/* exponential backoff, capped at 60 seconds */
				trylater_delay *= 2;
				if (trylater_delay > 60)
					trylater_delay = 60;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if ((error == ESTALE) && vp)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				/* v3: hand back the reply so caller can parse post-op attrs */
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else {
				mbuf_freem(mrep);
				error &= ~NFSERR_RETERR;
			}
			mbuf_freem(rep->r_mreq);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}

		/* success: return the reply positioned just past the RPC header */
		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
		return (0);
	}
	mbuf_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	mbuf_freem(rep->r_mreq);
	FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
	return (error);
}
1493
1494 #ifndef NFS_NOSERVER
1495 /*
1496 * Generate the rpc reply header
1497 * siz arg. is used to decide if adding a cluster is worthwhile
1498 */
/*
 * nfs_rephead:
 *   Build the RPC reply header for an NFS server response.
 *
 *   siz   - expected size of the reply body; used only to decide whether
 *           to allocate a cluster mbuf up front
 *   nd    - request descriptor (supplies xid, flags, credentials)
 *   slp   - server socket (its nuid hash is consulted for Kerberos)
 *   err   - RPC/NFS-level error to encode (0, NFSERR_RETVOID, an
 *           NFSERR_AUTHERR-tagged auth error, ERPCMISMATCH, etc.)
 *   mrq   - if non-NULL, receives the head of the reply chain
 *   mbp   - receives the current mbuf for subsequent nfsm_build calls
 *   bposp - receives the current build position within *mbp
 *
 *   Returns 0 on success or an errno if mbuf allocation fails.
 *   NOTE: nfsm_build below may branch via the usual nfsm error macros.
 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
	int siz;
	struct nfsrv_descript *nd;
	struct nfssvc_sock *slp;
	int err;
	mbuf_t *mrq;
	mbuf_t *mbp;
	caddr_t *bposp;
{
	u_long *tl;
	mbuf_t mreq;
	caddr_t bpos;
	mbuf_t mb, mb2;		/* mb2 is used by the nfsm_build macro */
	int error, mlen;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mreq);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX nfsstat? */
		return (error);
	}
	mb = mreq;
	tl = mbuf_data(mreq);
	mlen = 6 * NFSX_UNSIGNED;	/* xid, reply, stat, 2 verf words, result word */
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl += 80/sizeof(*tl);	/* XXX max_hdr? XXX */
		mbuf_setdata(mreq, tl, mlen);
	} else {
		mbuf_setlen(mreq, mlen);
	}
	bpos = ((caddr_t)tl) + mlen;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			/* auth-error reply is one word shorter than the default layout */
			mlen -= NFSX_UNSIGNED;
			mbuf_setlen(mreq, mlen);
			bpos -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			/* lowest and highest supported RPC version (both VER2) */
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/*
		 * For Kerberos authentication, we must send the nickname
		 * verifier back, otherwise just RPCAUTH_NULL.
		 */
		if (nd->nd_flag & ND_KERBFULL) {
			struct nfsuid *nuidp;
			struct timeval ktvin, ktvout;
			uid_t uid = kauth_cred_getuid(nd->nd_cr);

			/* look up the cached session entry for this uid/address */
			lck_rw_lock_shared(&slp->ns_rwlock);
			for (nuidp = NUIDHASH(slp, uid)->lh_first;
			    nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
				    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
				    &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (nuidp) {
				ktvin.tv_sec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
				ktvin.tv_usec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_usec);

				/*
				 * Encrypt the timestamp in ecb mode using the
				 * session key.
				 */
#if NFSKERB
				XXX
#endif

				/*
				 * NOTE(review): ktvout is only filled in by the
				 * (disabled) NFSKERB encryption block above, and
				 * tv_sec is written before nfsm_build advances tl;
				 * this ordering looks suspect — confirm against a
				 * build with NFSKERB enabled.
				 */
				*tl++ = rpc_auth_kerb;
				*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
				*tl = ktvout.tv_sec;
				nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
				*tl++ = ktvout.tv_usec;
				*tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
			} else {
				/* no session entry: RPCAUTH_NULL verifier */
				*tl++ = 0;
				*tl++ = 0;
			}
			lck_rw_done(&slp->ns_rwlock);
		} else {
			/* RPCAUTH_NULL verifier: flavor 0, length 0 */
			*tl++ = 0;
			*tl++ = 0;
		}
		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
			// XXX hard coded versions
			*tl++ = txdr_unsigned(2);
			*tl = txdr_unsigned(3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;	/* RPC_SUCCESS */
			if (err != NFSERR_RETVOID) {
				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	if (mrq != NULL)
		*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != NFSERR_RETVOID) {
		OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
	}
	return (0);
}
1644
1645
1646 #endif /* NFS_NOSERVER */
1647
1648
1649 /*
1650 * From FreeBSD 1.58, a Matt Dillon fix...
1651 * Flag a request as being about to terminate.
1652 * The nm_sent count is decremented now to avoid deadlocks when the process
1653 * in soreceive() hasn't yet managed to send its own request.
1654 */
1655 static void
1656 nfs_softterm(struct nfsreq *rep)
1657 {
1658
1659 rep->r_flags |= R_SOFTTERM;
1660 if (rep->r_flags & R_SENT) {
1661 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1662 rep->r_nmp->nm_cwnd);
1663 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1664 rep->r_flags &= ~R_SENT;
1665 }
1666 }
1667
1668 void
1669 nfs_timer_funnel(void * arg)
1670 {
1671 (void) thread_funnel_set(kernel_flock, TRUE);
1672 nfs_timer(arg);
1673 (void) thread_funnel_set(kernel_flock, FALSE);
1674
1675 }
1676
1677 /*
1678 * Ensure rep isn't in use by the timer, then dequeue it.
1679 */
1680 static void
1681 nfs_repdequeue(struct nfsreq *rep)
1682 {
1683
1684 while ((rep->r_flags & R_BUSY)) {
1685 rep->r_flags |= R_WAITING;
1686 tsleep(rep, PSOCK, "repdeq", 0);
1687 }
1688 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1689 }
1690
1691 /*
1692 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1693 * free()'d out from under it.
1694 */
1695 static void
1696 nfs_repbusy(struct nfsreq *rep)
1697 {
1698
1699 if ((rep->r_flags & R_BUSY))
1700 panic("rep locked");
1701 rep->r_flags |= R_BUSY;
1702 }
1703
1704 /*
1705 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1706 */
1707 static struct nfsreq *
1708 nfs_repnext(struct nfsreq *rep)
1709 {
1710 struct nfsreq * nextrep;
1711
1712 if (rep == NULL)
1713 return (NULL);
1714 /*
1715 * We need to get and busy the next req before signalling the
1716 * current one, otherwise wakeup() may block us and we'll race to
1717 * grab the next req.
1718 */
1719 nextrep = TAILQ_NEXT(rep, r_chain);
1720 if (nextrep != NULL)
1721 nfs_repbusy(nextrep);
1722 /* unbusy and signal. */
1723 rep->r_flags &= ~R_BUSY;
1724 if ((rep->r_flags & R_WAITING)) {
1725 rep->r_flags &= ~R_WAITING;
1726 wakeup(rep);
1727 }
1728 return (nextrep);
1729 }
1730
1731 /*
1732 * Nfs timer routine
1733 * Scan the nfsreq list and retransmit any requests that have timed out
1734 * To avoid retransmission attempts on STREAM sockets (in the future) make
1735 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1736 */
/*
 * nfs_timer:
 *   Periodic housekeeping, rescheduled every nfs_ticks via
 *   nfs_timer_funnel().  Walks the outstanding request queue and
 *   retransmits UDP requests that have timed out (with exponential
 *   backoff and congestion-window halving), emits "not responding"
 *   messages, soft-terminates requests that exceeded their retry
 *   budget, wakes nfsds for gathered writes, reaps dead server
 *   sockets, and periodically frees stale nfs bufs.
 */
void
nfs_timer(__unused void *arg)
{
	struct nfsreq *rep;
	mbuf_t m;
	socket_t so;
	struct nfsmount *nmp;
	int timeo;
	int error;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */
	int flags, rexmit, cwnd, sent;
	u_long xid;
	struct timeval now;

	/* walk the queue using the busy/next protocol so entries can't be freed under us */
	rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);
	microuptime(&now);
	for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
		nmp = rep->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		/* skip requests that already have a reply or are terminating */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp))
			continue;
		/* rate-limited "server not responding" console/tprintf message */
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
				"not responding");
			rep->r_flags |= R_TPRINTFMSG;
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(rep);
				continue;
			}
		}
		if (rep->r_rtt >= 0) {
			/* request is in flight: advance its timer and check for timeout */
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			/* ensure 62.5 ms floor */
			while (16 * timeo < hz)
				timeo *= 2;
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < 8)
				nmp->nm_timeouts++;
		}
		/*
		 * Check for too many retransmits. This is never true for
		 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
		 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
		 */
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
			nfs_softterm(rep);
			continue;
		}
		/* stream sockets: count the timeout but let reconnect logic resend */
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}
		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		   (rep->r_flags & R_SENT) ||
		   nmp->nm_sent < nmp->nm_cwnd) &&
		   (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
			struct msghdr msg;
			/*
			 * Iff first send, start timing
			 * else turn timing off, backoff timer
			 * and divide congestion window by 2.
			 *
			 * We update these *before* the send to avoid
			 * racing against receiving the reply.
			 * We save them so we can restore them on send error.
			 */
			flags = rep->r_flags;
			rexmit = rep->r_rexmit;
			cwnd = nmp->nm_cwnd;
			sent = nmp->nm_sent;
			xid = rep->r_xid;
			if (rep->r_flags & R_SENT) {
				rep->r_flags &= ~R_TIMING;
				if (++rep->r_rexmit > NFS_MAXREXMIT)
					rep->r_rexmit = NFS_MAXREXMIT;
				nmp->nm_cwnd >>= 1;
				if (nmp->nm_cwnd < NFS_CWNDSCALE)
					nmp->nm_cwnd = NFS_CWNDSCALE;
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
			} else {
				rep->r_flags |= R_SENT;
				nmp->nm_sent += NFS_CWNDSCALE;
			}
			FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);

			bzero(&msg, sizeof(msg));
			/* unconnected socket: must supply the destination address */
			if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
				msg.msg_name = mbuf_data(nmp->nm_nam);
				msg.msg_namelen = mbuf_len(nmp->nm_nam);
			}
			error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);

			FSDBG(535, xid, error, sent, cwnd);

			if (error) {
				if (error == EWOULDBLOCK) {
					/* transient: restore pre-send state, try again later */
					rep->r_flags = flags;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					rep->r_xid = xid;
				}
				else {
					/* clear any lingering socket error, mark resend failure */
					if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
						int clearerror;
						int optlen = sizeof(clearerror);
						sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
					}
					rep->r_flags = flags | R_RESENDERR;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					if (flags & R_SENT)
						OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
				}
			} else
				rep->r_rtt = 0;	/* sent OK: restart the round-trip timer */
		}
	}
	microuptime(&now);
#ifndef NFS_NOSERVER
	/*
	 * Scan the write gathering queues for writes that need to be
	 * completed now.
	 */
	cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
	lck_mtx_lock(nfsd_mutex);
	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
	    if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
		nfsrv_wakenfsd(slp);
	}
	/* reap server sockets that have been dead for at least 5 seconds */
	while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
		if ((slp->ns_timestamp + 5) > now.tv_sec)
			break;
		TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
		nfsrv_slpfree(slp);
	}
	lck_mtx_unlock(nfsd_mutex);
#endif /* NFS_NOSERVER */

	if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
		/*
		 * We haven't called nfs_buf_freeup() in a little while.
		 * So, see if we can free up any stale/unused bufs now.
		 */
		nfs_buf_freeup(1);
	}

	/* reschedule ourselves */
	timeout(nfs_timer_funnel, (void *)0, nfs_ticks);

}
1919
1920
1921 /*
1922 * Test for a termination condition pending on the process.
1923 * This is used to determine if we need to bail on a mount.
1924 * EIO is returned if there has been a soft timeout.
1925 * EINTR is returned if there is a signal pending that is not being ignored
1926 * and the mount is interruptable, or if we are a thread that is in the process
1927 * of cancellation (also SIGKILL posted).
1928 */
1929 int
1930 nfs_sigintr(nmp, rep, p)
1931 struct nfsmount *nmp;
1932 struct nfsreq *rep;
1933 proc_t p;
1934 {
1935 sigset_t pending_sigs;
1936 int context_good = 0;
1937 struct nfsmount *repnmp;
1938 extern proc_t kernproc;
1939
1940 if (nmp == NULL)
1941 return (ENXIO);
1942 if (rep != NULL) {
1943 repnmp = rep->r_nmp;
1944 /* we've had a forced unmount. */
1945 if (repnmp == NULL)
1946 return (ENXIO);
1947 /* request has timed out on a 'soft' mount. */
1948 if (rep->r_flags & R_SOFTTERM)
1949 return (EIO);
1950 /*
1951 * We're in the progress of a force unmount and there's
1952 * been a timeout we're dead and fail IO.
1953 */
1954 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1955 (NFSSTA_FORCE|NFSSTA_TIMEO))
1956 return (EIO);
1957 /* Someone is unmounting us, go soft and mark it. */
1958 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1959 repnmp->nm_flag |= NFSMNT_SOFT;
1960 nmp->nm_state |= NFSSTA_FORCE;
1961 }
1962 /*
1963 * If the mount is hung and we've requested not to hang
1964 * on remote filesystems, then bail now.
1965 */
1966 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1967 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1968 return (EIO);
1969 }
1970 /* XXX: is this valid? this probably should be an assertion. */
1971 if (p == NULL)
1972 return (0);
1973
1974 /* Is this thread belongs to kernel task; then abort check is not needed */
1975 if ((current_proc() != kernproc) && current_thread_aborted()) {
1976 return (EINTR);
1977 }
1978 /* mask off thread and process blocked signals. */
1979
1980 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1981 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1982 return (EINTR);
1983 return (0);
1984 }
1985
1986 /*
1987 * Lock a socket against others.
1988 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1989 * and also to avoid race conditions between the processes with nfs requests
1990 * in progress when a reconnect is necessary.
1991 */
/*
 * nfs_sndlock:
 *   Acquire the per-mount send lock (NFSSTA_SNDLOCK), sleeping while
 *   another thread holds it.  Each wakeup re-checks for pending signals
 *   (nfs_sigintr) and for the mount disappearing under us.
 *
 *   Returns 0 with the lock held, or an errno (EINTR/EIO/ENXIO) without it.
 */
int
nfs_sndlock(rep)
	struct nfsreq *rep;
{
	int *statep;
	proc_t p;
	int error, slpflag = 0, slptimeo = 0;

	if (rep->r_nmp == NULL)
		return (ENXIO);
	statep = &rep->r_nmp->nm_state;

	p = rep->r_procp;
	/* interruptible mounts catch signals while sleeping */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	while (*statep & NFSSTA_SNDLOCK) {
		error = nfs_sigintr(rep->r_nmp, rep, p);
		if (error)
			return (error);
		*statep |= NFSSTA_WANTSND;
		/* poll (timeout) if noremotehang so we keep re-running nfs_sigintr */
		if (p != NULL && (proc_noremotehang(p)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		/* after one interruptible sleep, fall back to a plain timed sleep */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and callers expect it intact.
		 */
		if (!rep->r_nmp)
			return (ENXIO); /* don't have lock until out of loop */
	}
	*statep |= NFSSTA_SNDLOCK;
	return (0);
}
2029
2030 /*
2031 * Unlock the stream socket for others.
2032 */
2033 void
2034 nfs_sndunlock(rep)
2035 struct nfsreq *rep;
2036 {
2037 int *statep;
2038
2039 if (rep->r_nmp == NULL)
2040 return;
2041 statep = &rep->r_nmp->nm_state;
2042 if ((*statep & NFSSTA_SNDLOCK) == 0)
2043 panic("nfs sndunlock");
2044 *statep &= ~NFSSTA_SNDLOCK;
2045 if (*statep & NFSSTA_WANTSND) {
2046 *statep &= ~NFSSTA_WANTSND;
2047 wakeup((caddr_t)statep);
2048 }
2049 }
2050
/*
 * nfs_rcvlock:
 *   Acquire the per-mount receive lock (NFSSTA_RCVLOCK), sleeping while
 *   another thread holds it.  Returns EALREADY if the reply for this
 *   request arrives while we wait (no lock needed then), an errno from
 *   nfs_sigintr, ENXIO if the mount vanishes, or 0 with the lock held.
 */
static int
nfs_rcvlock(struct nfsreq *rep)
{
	int *statep;
	int error, slpflag, slptimeo = 0;

	/* make sure we still have our mountpoint */
	if (!rep->r_nmp) {
		if (rep->r_mrep != NULL)
			return (EALREADY);
		return (ENXIO);
	}

	statep = &rep->r_nmp->nm_state;
	FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
	/* interruptible mounts catch signals while sleeping */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;
	while (*statep & NFSSTA_RCVLOCK) {
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
			return (error);
		} else if (rep->r_mrep != NULL) {
			/*
			 * Don't bother sleeping if reply already arrived
			 */
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
			return (EALREADY);
		}
		FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
		*statep |= NFSSTA_WANTRCV;
		/*
		 * We need to poll if we're P_NOREMOTEHANG so that we
		 * call nfs_sigintr periodically above.
		 */
		if (rep->r_procp != NULL &&
		    (proc_noremotehang(rep->r_procp)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		/* after one interruptible sleep, fall back to a plain timed sleep */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and caller nfs_reply expect it intact.
		 */
		if (!rep->r_nmp) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
			return (ENXIO); /* don't have lock until out of loop */
		}
	}
	/*
	 * nfs_reply will handle it if reply already arrived.
	 * (We may have slept or been preempted).
	 */
	FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
	*statep |= NFSSTA_RCVLOCK;
	return (0);
}
2112
2113 /*
2114 * Unlock the stream socket for others.
2115 */
2116 static void
2117 nfs_rcvunlock(struct nfsreq *rep)
2118 {
2119 int *statep;
2120
2121 if (rep->r_nmp == NULL)
2122 return;
2123 statep = &rep->r_nmp->nm_state;
2124
2125 FSDBG(533, statep, *statep, 0, 0);
2126 if ((*statep & NFSSTA_RCVLOCK) == 0)
2127 panic("nfs rcvunlock");
2128 *statep &= ~NFSSTA_RCVLOCK;
2129 if (*statep & NFSSTA_WANTRCV) {
2130 *statep &= ~NFSSTA_WANTRCV;
2131 wakeup((caddr_t)statep);
2132 }
2133 }
2134
2135
2136 #ifndef NFS_NOSERVER
2137 /*
2138 * Socket upcall routine for the nfsd sockets.
2139 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2140 * Essentially do as much as possible non-blocking, else punt and it will
2141 * be called with MBUF_WAITOK from an nfsd.
2142 */
2143 void
2144 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2145 {
2146 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2147
2148 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2149 return;
2150
2151 lck_rw_lock_exclusive(&slp->ns_rwlock);
2152 nfsrv_rcv_locked(so, slp, waitflag);
2153 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2154 }
/*
 * Pull incoming data off an nfsd server socket (ns_rwlock held exclusive).
 *
 * Stream sockets: append received mbufs to the raw data chain (ns_raw /
 * ns_rawend / ns_cc) and try to parse complete RPC records out of it via
 * nfsrv_getstream().  Datagram sockets: each receive is a complete record;
 * prepend the sender's address as an MBUF_TYPE_SONAME mbuf and queue it on
 * the record list (ns_rec / ns_recend).
 *
 * Locking: when waitflag == MBUF_DONTWAIT (socket upcall path) this routine
 * DROPS ns_rwlock before returning and may wake an nfsd; otherwise the
 * caller retains the lock.
 */
void
nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
{
	mbuf_t m, mp, mhck, m2;
	int ns_flag=0, error;
	struct msghdr msg;
	size_t bytes_read;

	/* Socket was invalidated; just honor the lock-drop contract and leave. */
	if ((slp->ns_flag & SLP_VALID) == 0) {
		if (waitflag == MBUF_DONTWAIT)
			lck_rw_done(&slp->ns_rwlock);
		return;
	}

#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == MBUF_DONTWAIT) {
		ns_flag = SLP_NEEDQ;
		goto dorecs;
	}
#endif
	if (slp->ns_sotype == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
			ns_flag = SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 */
		/* large cap: take everything currently buffered on the socket */
		bytes_read = 1000000000;
		error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
		if (error || mp == NULL) {
			/* EWOULDBLOCK = retry from an nfsd; anything else = peer gone */
			if (error == EWOULDBLOCK)
				ns_flag = SLP_NEEDQ;
			else
				ns_flag = SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		/* Append new data to the raw chain and update the byte count. */
		if (slp->ns_rawend) {
			if ((error = mbuf_setnext(slp->ns_rawend, m)))
				panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
			slp->ns_cc += bytes_read;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = bytes_read;
		}
		/* Walk to the last mbuf so ns_rawend stays the chain tail. */
		while ((m2 = mbuf_next(m)))
			m = m2;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			/* EPERM = bogus record mark, drop connection; else requeue */
			if (error == EPERM)
				ns_flag = SLP_DISCONN;
			else
				ns_flag = SLP_NEEDQ;
		}
	} else {
		/* Datagram socket: drain all pending packets in a loop. */
		struct sockaddr_storage nam;

		bzero(&msg, sizeof(msg));
		msg.msg_name = (caddr_t)&nam;
		msg.msg_namelen = sizeof(nam);

		do {
			bytes_read = 1000000000;
			error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
			if (mp) {
				/*
				 * Copy the sender's address into an MBUF_TYPE_SONAME
				 * mbuf and chain it in front of the data so the nfsd
				 * knows where to send the reply.
				 */
				if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
					mbuf_setlen(mhck, nam.ss_len);
					bcopy(&nam, mbuf_data(mhck), nam.ss_len);
					m = mhck;
					if (mbuf_setnext(m, mp)) {
						/* trouble... just drop it */
						printf("nfsrv_rcv: mbuf_setnext failed\n");
						mbuf_free(mhck);
						m = mp;
					}
				} else {
					m = mp;
				}
				/* Queue the record (one packet per record) at the tail. */
				if (slp->ns_recend)
					mbuf_setnextpkt(slp->ns_recend, m);
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				mbuf_setnextpkt(m, NULL);
			}
#if 0
			if (error) {
				/*
				 * This may be needed in the future to support
				 * non-byte-stream connection-oriented protocols
				 * such as SCTP.
				 */
				/*
				 * This (slp->ns_sotype == SOCK_STREAM) should really
				 * be a check for PR_CONNREQUIRED.
				 */
				if ((slp->ns_sotype == SOCK_STREAM)
					&& error != EWOULDBLOCK) {
					ns_flag = SLP_DISCONN;
					goto dorecs;
				}
			}
#endif
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (ns_flag)
		slp->ns_flag |= ns_flag;
	if (waitflag == MBUF_DONTWAIT) {
		/* Decide whether to wake an nfsd before releasing the lock. */
		int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
		lck_rw_done(&slp->ns_rwlock);
		if (wake && nfs_numnfsd) {
			lck_mtx_lock(nfsd_mutex);
			nfsrv_wakenfsd(slp);
			lck_mtx_unlock(nfsd_mutex);
		}
	}
}
2292
2293 /*
2294 * Try and extract an RPC request from the mbuf data list received on a
2295 * stream socket. The "waitflag" argument indicates whether or not it
2296 * can sleep.
2297 */
/*
 * Parse RPC records out of the raw stream data (slp->ns_raw / ns_cc)
 * accumulated from a stream socket.  Each record is preceded by a 4-byte
 * RPC record mark: high bit = last-fragment flag, low 31 bits = fragment
 * length.  Completed fragments collect on slp->ns_frag; when the last
 * fragment arrives the whole record moves to the ns_rec packet queue.
 *
 * Returns 0 on success/no-more-data, EPERM for an invalid record length
 * (caller disconnects), or EWOULDBLOCK when an mbuf operation could not
 * complete (caller requeues).  SLP_GETSTREAM serializes re-entry.
 */
static int
nfsrv_getstream(slp, waitflag)
	struct nfssvc_sock *slp;
	int waitflag;
{
	mbuf_t m;
	char *cp1, *cp2, *mdata;
	int len, mlen, error;
	mbuf_t om, m2, recm;
	u_long recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		/* ns_reclen == 0 means we still need to read a record mark. */
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				/* Not even a full record mark buffered yet. */
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			mdata = mbuf_data(m);
			mlen = mbuf_len(m);
			if (mlen >= NFSX_UNSIGNED) {
				/* Record mark is contiguous in the first mbuf. */
				bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
				mdata += NFSX_UNSIGNED;
				mlen -= NFSX_UNSIGNED;
				mbuf_setdata(m, mdata, mlen);
			} else {
				/* Record mark straddles mbufs: gather it byte by byte. */
				cp1 = (caddr_t)&recmark;
				cp2 = mdata;
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (mlen == 0) {
						m = mbuf_next(m);
						cp2 = mbuf_data(m);
						mlen = mbuf_len(m);
					}
					*cp1++ = *cp2++;
					mlen--;
					mbuf_setdata(m, cp2, mlen);
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			/* Low 31 bits: fragment length; high bit: last fragment. */
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
				/* Implausible length: stream is corrupt, disconnect. */
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Exact fit: the whole raw chain is this fragment. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/* More data buffered than this fragment: split the chain. */
			len = 0;
			m = slp->ns_raw;
			mlen = mbuf_len(m);
			mdata = mbuf_data(m);
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + mlen) > slp->ns_reclen) {
					/* Fragment ends mid-mbuf: copy the head portion out. */
					if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					if (om) {
						if (mbuf_setnext(om, m2)) {
							/* trouble... just drop it */
							printf("nfsrv_getstream: mbuf_setnext failed\n");
							mbuf_freem(m2);
							slp->ns_flag &= ~SLP_GETSTREAM;
							return (EWOULDBLOCK);
						}
						recm = slp->ns_raw;
					} else {
						recm = m2;
					}
					/* Advance the raw chain past the copied bytes. */
					mdata += slp->ns_reclen - len;
					mlen -= slp->ns_reclen - len;
					mbuf_setdata(m, mdata, mlen);
					len = slp->ns_reclen;
				} else if ((len + mlen) == slp->ns_reclen) {
					/* Fragment ends exactly at an mbuf boundary: cut here. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					recm = slp->ns_raw;
					if (mbuf_setnext(om, NULL)) {
						printf("nfsrv_getstream: mbuf_setnext failed 2\n");
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				} else {
					/* Whole mbuf belongs to this fragment; keep walking. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Fragment not fully buffered yet; wait for more data. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		if (slp->ns_frag == NULL) {
			slp->ns_frag = recm;
		} else {
			m = slp->ns_frag;
			while ((m2 = mbuf_next(m)))
				m = m2;
			if ((error = mbuf_setnext(m, recm)))
				panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
		}
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Record complete: move it to the ns_rec packet queue. */
			if (slp->ns_recend)
				mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
2442
2443 /*
2444 * Parse an RPC header.
2445 */
2446 int
2447 nfsrv_dorec(slp, nfsd, ndp)
2448 struct nfssvc_sock *slp;
2449 struct nfsd *nfsd;
2450 struct nfsrv_descript **ndp;
2451 {
2452 mbuf_t m;
2453 mbuf_t nam;
2454 struct nfsrv_descript *nd;
2455 int error;
2456
2457 *ndp = NULL;
2458 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2459 return (ENOBUFS);
2460 MALLOC_ZONE(nd, struct nfsrv_descript *,
2461 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2462 if (!nd)
2463 return (ENOMEM);
2464 m = slp->ns_rec;
2465 slp->ns_rec = mbuf_nextpkt(m);
2466 if (slp->ns_rec)
2467 mbuf_setnextpkt(m, NULL);
2468 else
2469 slp->ns_recend = NULL;
2470 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2471 nam = m;
2472 m = mbuf_next(m);
2473 if ((error = mbuf_setnext(nam, NULL)))
2474 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2475 } else
2476 nam = NULL;
2477 nd->nd_md = nd->nd_mrep = m;
2478 nd->nd_nam2 = nam;
2479 nd->nd_dpos = mbuf_data(m);
2480 error = nfs_getreq(nd, nfsd, TRUE);
2481 if (error) {
2482 if (nam)
2483 mbuf_freem(nam);
2484 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2485 return (error);
2486 }
2487 *ndp = nd;
2488 nfsd->nfsd_nd = nd;
2489 return (0);
2490 }
2491
2492 /*
2493 * Parse an RPC request
2494 * - verify it
2495 * - fill in the cred struct.
2496 */
/*
 * Parse and verify the RPC call header of a request record and build the
 * credential (nd->nd_cr) from its auth data.  On protocol-level problems
 * it sets nd_repstat/nd_procnum (NFSPROC_NOOP) and returns 0 so the
 * caller sends an RPC-level error reply; on unrecoverable parse errors
 * it frees the request mbufs and returns EBADRPC.
 */
int
nfs_getreq(nd, nfsd, has_header)
	struct nfsrv_descript *nd;
	struct nfsd *nfsd;
	int has_header;
{
	int len, i;
	u_long *tl;
	long t1;		/* scratch used by the nfsm_* dissect macros */
	uio_t uiop;
	caddr_t dpos, cp2, cp;	/* dpos/cp2 also used implicitly by nfsm_* macros */
	u_long nfsvers, auth_type;
	uid_t nickuid;
	int error = 0, ticklen;
	mbuf_t mrep, md;
	struct nfsuid *nuidp;
	uid_t user_id;
	gid_t group_id;
	int ngroups;
	struct ucred temp_cred;
	struct timeval tvin, tvout, now;
	char uio_buf[ UIO_SIZEOF(1) ];
#if 0 /* until encrypted keys are implemented */
	NFSKERBKEYSCHED_T keys; /* stores key schedule */
#endif

	/* No credential yet; one is created only on successful auth parsing. */
	nd->nd_cr = NULL;

	mrep = nd->nd_mrep;
	md = nd->nd_md;
	dpos = nd->nd_dpos;
	if (has_header) {
		/* Full header: xid + call direction precede the common fields. */
		nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
		if (*tl++ != rpc_call) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	/* RPC version must be 2. */
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Program number must be the NFS program. */
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_long, *tl++);
	if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	else if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/* Reject out-of-range procedures (v2 has fewer procedures than v3). */
	if ((nd->nd_procnum >= NFS_NPROCS) ||
		(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Map v2 procedure numbers onto the v3 numbering used internally. */
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		mbuf_freem(mrep);
		return (EBADRPC);
	}

	nd->nd_flag &= ~ND_KERBAUTH;
	/*
	 * Handle auth_unix or auth_kerb.
	 */
	if (auth_type == rpc_auth_unix) {
		/* AUTH_UNIX: stamp skipped via *++tl; len = machine name length. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		bzero(&temp_cred, sizeof(temp_cred));
		/* Skip over the machine name; only uid/gids matter here. */
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
		user_id = fxdr_unsigned(uid_t, *tl++);
		group_id = fxdr_unsigned(gid_t, *tl++);
		temp_cred.cr_groups[0] = group_id;
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		/* +2 words: presumably the verifier flavor/length pair — TODO confirm */
		nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
		/* Copy at most NGROUPS-1 supplementary gids; skip any excess. */
		for (i = 1; i <= len; i++)
			if (i < NGROUPS)
				temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
		if (ngroups > 1)
			nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
		/* Verifier body length; skipped below after cred creation. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		temp_cred.cr_uid = user_id;
		temp_cred.cr_ngroups = ngroups;
		nd->nd_cr = kauth_cred_create(&temp_cred);
		if (nd->nd_cr == NULL) {
			nd->nd_repstat = ENOMEM;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else if (auth_type == rpc_auth_kerb) {
		switch (fxdr_unsigned(int, *tl++)) {
		case RPCAKN_FULLNAME:
			/* Full-name: stash the ticket for an nfsd to authenticate later. */
			ticklen = fxdr_unsigned(int, *tl);
			*((u_long *)nfsd->nfsd_authstr) = *tl;
			uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
				&uio_buf[0], sizeof(uio_buf));
			if (!uiop) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			// LP64todo - fix this
			nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
			if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
				mbuf_freem(mrep);
				return (EBADRPC);
			}
			uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
			// LP64todo - fix this
			nfsm_mtouio(uiop, uio_resid(uiop));
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
				printf("Bad kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
			tl = (u_long *)cp;
			if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
				printf("Not fullname kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			cp += NFSX_UNSIGNED;
			bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
			nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
			/* Defer actual ticket validation to an nfsd thread. */
			nd->nd_flag |= ND_KERBFULL;
			nfsd->nfsd_flag |= NFSD_NEEDAUTH;
			break;
		case RPCAKN_NICKNAME:
			/* Nickname: look up a previously established session. */
			if (len != 2 * NFSX_UNSIGNED) {
				printf("Kerb nickname short\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nickuid = fxdr_unsigned(uid_t, *tl);
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
				printf("Kerb nick verifier bad\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
			tvin.tv_sec = *tl++;
			tvin.tv_usec = *tl;

			/* Find a cached session matching the nickname uid (and address). */
			for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
				nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
					(!nd->nd_nam2 ||
					 netaddr_match(NU_NETFAM(nuidp),
						&nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (!nuidp) {
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			/*
			 * Now, decrypt the timestamp using the session key
			 * and validate it.
			 */
#if NFSKERB
			XXX
#endif

			/* NOTE(review): with NFSKERB off, tvout is never assigned from
			 * tvin before this check — decryption is unimplemented here. */
			tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
			tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
			microtime(&now);
			/* Reject expired sessions and non-monotonic timestamps (replay). */
			if (nuidp->nu_expire < now.tv_sec ||
				nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
				(nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
				 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
				nuidp->nu_expire = 0;
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			/* Clone the cached credential for this request. */
			bzero(&temp_cred, sizeof(temp_cred));
			ngroups = nuidp->nu_cr->cr_ngroups;
			for (i = 0; i < ngroups; i++)
				temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
			if (ngroups > 1)
				nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);

			temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
			temp_cred.cr_ngroups = ngroups;
			nd->nd_cr = kauth_cred_create(&temp_cred);
			if (!nd->nd_cr) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nd->nd_flag |= ND_KERBNICK;
		};
	} else {
		/* Unknown auth flavor. */
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	/* Hand the parse position back so the caller can decode arguments. */
	nd->nd_md = md;
	nd->nd_dpos = dpos;
	return (0);
nfsmout:
	/* nfsm_* macros jump here on dissect failure with mrep already freed. */
	if (IS_VALID_CRED(nd->nd_cr))
		kauth_cred_unref(&nd->nd_cr);
	return (error);
}
2752
2753 /*
2754 * Search for a sleeping nfsd and wake it up.
2755 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2756 * running nfsds will go look for the work in the nfssvc_sock list.
2757 * Note: Must be called with nfsd_mutex held.
2758 */
2759 void
2760 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2761 {
2762 struct nfsd *nd;
2763
2764 if ((slp->ns_flag & SLP_VALID) == 0)
2765 return;
2766
2767 lck_rw_lock_exclusive(&slp->ns_rwlock);
2768
2769 if (nfsd_waiting) {
2770 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2771 if (nd->nfsd_flag & NFSD_WAITING) {
2772 nd->nfsd_flag &= ~NFSD_WAITING;
2773 if (nd->nfsd_slp)
2774 panic("nfsd wakeup");
2775 slp->ns_sref++;
2776 nd->nfsd_slp = slp;
2777 lck_rw_done(&slp->ns_rwlock);
2778 wakeup((caddr_t)nd);
2779 return;
2780 }
2781 }
2782 }
2783
2784 slp->ns_flag |= SLP_DOREC;
2785
2786 lck_rw_done(&slp->ns_rwlock);
2787
2788 nfsd_head_flag |= NFSD_CHECKSLP;
2789 }
2790 #endif /* NFS_NOSERVER */
2791
2792 static int
2793 nfs_msg(proc_t p,
2794 const char *server,
2795 const char *msg,
2796 int error)
2797 {
2798 tpr_t tpr;
2799
2800 if (p)
2801 tpr = tprintf_open(p);
2802 else
2803 tpr = NULL;
2804 if (error)
2805 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2806 error);
2807 else
2808 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2809 tprintf_close(tpr);
2810 return (0);
2811 }
2812
2813 void
2814 nfs_down(nmp, proc, error, flags, msg)
2815 struct nfsmount *nmp;
2816 proc_t proc;
2817 int error, flags;
2818 const char *msg;
2819 {
2820 if (nmp == NULL)
2821 return;
2822 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2823 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2824 nmp->nm_state |= NFSSTA_TIMEO;
2825 }
2826 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2827 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2828 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2829 }
2830 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2831 }
2832
2833 void
2834 nfs_up(nmp, proc, flags, msg)
2835 struct nfsmount *nmp;
2836 proc_t proc;
2837 int flags;
2838 const char *msg;
2839 {
2840 if (nmp == NULL)
2841 return;
2842 if (msg)
2843 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2844 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2845 nmp->nm_state &= ~NFSSTA_TIMEO;
2846 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2847 }
2848 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2849 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2850 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2851 }
2852 }
2853