1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
31 /*
32 * Copyright (c) 1989, 1991, 1993, 1995
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * Rick Macklem at The University of Guelph.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
67 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
68 */
69
70 /*
71 * Socket operations for use by nfs
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>
77 #include <sys/kauth.h>
78 #include <sys/mount_internal.h>
79 #include <sys/kernel.h>
80 #include <sys/kpi_mbuf.h>
81 #include <sys/malloc.h>
82 #include <sys/vnode.h>
83 #include <sys/domain.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/syslog.h>
87 #include <sys/tprintf.h>
88 #include <sys/uio_internal.h>
89 #include <libkern/OSAtomic.h>
90
91 #include <sys/time.h>
92 #include <kern/clock.h>
93 #include <kern/task.h>
94 #include <kern/thread.h>
95 #include <sys/user.h>
96
97 #include <netinet/in.h>
98 #include <netinet/tcp.h>
99
100 #include <nfs/rpcv2.h>
101 #include <nfs/nfsproto.h>
102 #include <nfs/nfs.h>
103 #include <nfs/xdr_subs.h>
104 #include <nfs/nfsm_subs.h>
105 #include <nfs/nfsmount.h>
106 #include <nfs/nfsnode.h>
107 #include <nfs/nfsrtt.h>
108
109 #include <sys/kdebug.h>
110
111 #define FSDBG(A, B, C, D, E) \
112 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
113 (int)(B), (int)(C), (int)(D), (int)(E), 0)
114 #define FSDBG_TOP(A, B, C, D, E) \
115 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
116 (int)(B), (int)(C), (int)(D), (int)(E), 0)
117 #define FSDBG_BOT(A, B, C, D, E) \
118 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
119 (int)(B), (int)(C), (int)(D), (int)(E), 0)
120
121 /*
122 * Estimate rto for an nfs rpc sent via an unreliable datagram.
123 * Use the mean and mean deviation of rtt for the appropriate type of rpc
124 * for the frequent rpcs and a default for the others.
125 * The justification for doing "other" this way is that these rpcs
126 * happen so infrequently that timer est. would probably be stale.
127 * Also, since many of these rpcs are
128 * non-idempotent, a conservative timeout is desired.
129 * getattr, lookup - A+2D
130 * read, write - A+4D
131 * other - nm_timeo
132 */
133 #define NFS_RTO(n, t) \
134 ((t) == 0 ? (n)->nm_timeo : \
135 ((t) < 3 ? \
136 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
137 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
138 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
139 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
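/*
 * Illustrative sketch (not in the original source): a worked example of
 * the fixed-point arithmetic above.  nm_srtt[] holds the smoothed rtt
 * scaled by 8 and nm_sdrtt[] the smoothed deviation scaled by 4 (see the
 * R_TIMING update in nfs_reply() and the NFS_TIMEO << 3 initialization
 * in nfs_connect()).  Assuming nm_srtt[0] == 40 and nm_sdrtt[0] == 8,
 * then for getattr (timer type 1):
 *
 *	rto = (((40 + 3) >> 2) + 8 + 1) >> 1 = 9 ticks	(~ A + 2D)
 *
 * and for read/write (timer type 3 or 4) with the same values:
 *
 *	rto = ((40 + 7) >> 3) + 8 + 1 = 14 ticks	(~ A + 4D)
 */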
140 /*
141 * External data, mostly RPC constants in XDR form
142 */
143 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
144 rpc_msgaccepted, rpc_call, rpc_autherr,
145 rpc_auth_kerb;
146 extern u_long nfs_prog;
147 extern struct nfsstats nfsstats;
148 extern int nfsv3_procid[NFS_NPROCS];
149 extern int nfs_ticks;
150 extern u_long nfs_xidwrap;
151
152 /*
153 * Defines which timer to use for the procnum.
154 * 0 - default
155 * 1 - getattr
156 * 2 - lookup
157 * 3 - read
158 * 4 - write
159 */
160 static int proct[NFS_NPROCS] = {
161 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
162 };
163
164 /*
165 * There is a congestion window for outstanding rpcs maintained per mount
166 * point. The cwnd size is adjusted in roughly the way that:
167 * Van Jacobson, Congestion Avoidance and Control, in "Proceedings of
168 * SIGCOMM '88". ACM, August 1988.
169 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
170 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
171 * of rpcs is in progress.
172 * (The sent count and cwnd are scaled for integer arith.)
173 * Variants of "slow start" were tried and were found to be too much of a
174 * performance hit (ave. rtt 3 times larger),
175 * I suspect due to the large rtt that nfs rpcs have.
176 */
177 #define NFS_CWNDSCALE 256
178 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
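/*
 * Illustrative sketch (not in the original source): with NFS_CWNDSCALE
 * 256, a window of one rpc is 256 and NFS_MAXCWND caps the window at 32
 * rpcs in flight.  The additive increase in nfs_reply() is
 *
 *	nm_cwnd += (256 * 256 + (nm_cwnd >> 1)) / nm_cwnd;
 *
 * so at an assumed nm_cwnd of 2048 (8 rpcs) each reply adds 32, and one
 * full window of replies is needed to grow the window by a single rpc,
 * while a retransmit timeout in nfs_timer() halves nm_cwnd (with a
 * floor of NFS_CWNDSCALE) -- the usual AIMD shape.
 */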
179 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
180 int nfsrtton = 0;
181 struct nfsrtt nfsrtt;
182
183 static int nfs_rcvlock(struct nfsreq *);
184 static void nfs_rcvunlock(struct nfsreq *);
185 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
186 static int nfs_reconnect(struct nfsreq *rep);
187 static void nfs_repdequeue(struct nfsreq *rep);
188
189 /* XXX */
190 boolean_t current_thread_aborted(void);
191 kern_return_t thread_terminate(thread_t);
192
193 #ifndef NFS_NOSERVER
194 static int nfsrv_getstream(struct nfssvc_sock *,int);
195
196 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
197 struct nfssvc_sock *slp,
198 proc_t procp,
199 mbuf_t *mreqp) = {
200 nfsrv_null,
201 nfsrv_getattr,
202 nfsrv_setattr,
203 nfsrv_lookup,
204 nfsrv3_access,
205 nfsrv_readlink,
206 nfsrv_read,
207 nfsrv_write,
208 nfsrv_create,
209 nfsrv_mkdir,
210 nfsrv_symlink,
211 nfsrv_mknod,
212 nfsrv_remove,
213 nfsrv_rmdir,
214 nfsrv_rename,
215 nfsrv_link,
216 nfsrv_readdir,
217 nfsrv_readdirplus,
218 nfsrv_statfs,
219 nfsrv_fsinfo,
220 nfsrv_pathconf,
221 nfsrv_commit,
222 nfsrv_noop
223 };
224 #endif /* NFS_NOSERVER */
225
226
227 /*
228 * attempt to bind a socket to a reserved port
229 */
230 static int
231 nfs_bind_resv(struct nfsmount *nmp)
232 {
233 socket_t so = nmp->nm_so;
234 struct sockaddr_in sin;
235 int error;
236 u_short tport;
237
238 if (!so)
239 return (EINVAL);
240
241 sin.sin_len = sizeof (struct sockaddr_in);
242 sin.sin_family = AF_INET;
243 sin.sin_addr.s_addr = INADDR_ANY;
244 tport = IPPORT_RESERVED - 1;
245 sin.sin_port = htons(tport);
246
247 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
248 (--tport > IPPORT_RESERVED / 2))
249 sin.sin_port = htons(tport);
250 return (error);
251 }
252
253 /*
254 * variables for managing the nfs_bind_resv_thread
255 */
256 int nfs_resv_mounts = 0;
257 static int nfs_bind_resv_thread_state = 0;
258 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
259 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
260 lck_grp_t *nfs_bind_resv_lck_grp;
261 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
262 lck_attr_t *nfs_bind_resv_lck_attr;
263 lck_mtx_t *nfs_bind_resv_mutex;
264 struct nfs_bind_resv_request {
265 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
266 struct nfsmount *brr_nmp;
267 int brr_error;
268 };
269 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
270
271 /*
272 * thread to handle any reserved port bind requests
273 */
274 static void
275 nfs_bind_resv_thread(void)
276 {
277 struct nfs_bind_resv_request *brreq;
278
279 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
280
281 while (nfs_resv_mounts > 0) {
282 lck_mtx_lock(nfs_bind_resv_mutex);
283 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
284 TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
285 lck_mtx_unlock(nfs_bind_resv_mutex);
286 brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
287 wakeup(brreq);
288 lck_mtx_lock(nfs_bind_resv_mutex);
289 }
290 msleep((caddr_t)&nfs_bind_resv_request_queue,
291 nfs_bind_resv_mutex, PSOCK | PDROP,
292 "nfs_bind_resv_request_queue", 0);
293 }
294
295 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
296 (void) thread_terminate(current_thread());
297 }
298
299 int
300 nfs_bind_resv_thread_wake(void)
301 {
302 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
303 return (EIO);
304 wakeup(&nfs_bind_resv_request_queue);
305 return (0);
306 }
307
308 /*
309 * underprivileged procs call this to request nfs_bind_resv_thread
310 * to perform the reserved port binding for them.
311 */
312 static int
313 nfs_bind_resv_nopriv(struct nfsmount *nmp)
314 {
315 struct nfs_bind_resv_request brreq;
316 int error;
317
318 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
319 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
320 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
321 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
322 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
323 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
324 TAILQ_INIT(&nfs_bind_resv_request_queue);
325 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
326 }
327 kernel_thread(kernel_task, nfs_bind_resv_thread);
328 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
329 }
330
331 brreq.brr_nmp = nmp;
332 brreq.brr_error = 0;
333
334 lck_mtx_lock(nfs_bind_resv_mutex);
335 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
336 lck_mtx_unlock(nfs_bind_resv_mutex);
337
338 error = nfs_bind_resv_thread_wake();
339 if (error) {
340 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
341 /* Note: we might be able to simply restart the thread */
342 return (error);
343 }
344
345 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
346
347 return (brreq.brr_error);
348 }
349
350 /*
351 * Initialize sockets and congestion for a new NFS connection.
352 * We do not free the sockaddr if error.
353 */
354 int
355 nfs_connect(
356 struct nfsmount *nmp,
357 __unused struct nfsreq *rep)
358 {
359 socket_t so;
360 int error, rcvreserve, sndreserve;
361 struct sockaddr *saddr;
362 struct timeval timeo;
363
364 nmp->nm_so = 0;
365 saddr = mbuf_data(nmp->nm_nam);
366 error = sock_socket(saddr->sa_family, nmp->nm_sotype,
367 nmp->nm_soproto, 0, 0, &nmp->nm_so);
368 if (error) {
369 goto bad;
370 }
371 so = nmp->nm_so;
372
373 /*
374 * Some servers require that the client port be a reserved port number.
375 */
376 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
377 proc_t p;
378 /*
379 * sobind() requires current_proc() to have superuser privs.
380 * If this bind is part of a reconnect, and the current proc
381 * doesn't have superuser privs, we hand the sobind() off to
382 * a kernel thread to process.
383 */
384 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
385 (p = current_proc()) && suser(kauth_cred_get(), 0)) {
386 /* request nfs_bind_resv_thread() to do bind */
387 error = nfs_bind_resv_nopriv(nmp);
388 } else {
389 error = nfs_bind_resv(nmp);
390 }
391 if (error)
392 goto bad;
393 }
394
395 /*
396 * Protocols that do not require connections may be optionally left
397 * unconnected for servers that reply from a port other than NFS_PORT.
398 */
399 if (nmp->nm_flag & NFSMNT_NOCONN) {
400 if (nmp->nm_sotype == SOCK_STREAM) {
401 error = ENOTCONN;
402 goto bad;
403 }
404 } else {
405 struct timeval tv;
406 tv.tv_sec = 2;
407 tv.tv_usec = 0;
408 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
409 if (error && error != EINPROGRESS) {
410 goto bad;
411 }
412
413 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
414 if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
415 goto bad;
416 }
417 }
418 }
419
420 /*
421 * Always time out on receive; this allows us to reconnect the
422 * socket to deal with network changes.
423 */
424 timeo.tv_usec = 0;
425 timeo.tv_sec = 2;
426 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
427 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
428 timeo.tv_sec = 5;
429 } else {
430 timeo.tv_sec = 0;
431 }
432 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
433
434 if (nmp->nm_sotype == SOCK_DGRAM) {
435 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
436 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
437 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
438 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
439 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
440 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
441 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
442 } else {
443 int proto;
444 int on = 1;
445
446 sock_gettype(so, NULL, NULL, &proto);
447 if (nmp->nm_sotype != SOCK_STREAM)
448 panic("nfscon sotype");
449
450 // Assume that SOCK_STREAM always requires a connection
451 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
452
453 if (proto == IPPROTO_TCP) {
454 sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
455 }
456
457 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
458 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
459 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
460 }
461
462 if (sndreserve > NFS_MAXSOCKBUF)
463 sndreserve = NFS_MAXSOCKBUF;
464 if (rcvreserve > NFS_MAXSOCKBUF)
465 rcvreserve = NFS_MAXSOCKBUF;
466 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
467 if (error) {
468 goto bad;
469 }
470 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
471 if (error) {
472 goto bad;
473 }
474
475 sock_nointerrupt(so, 1);
476
477 /* Initialize other non-zero congestion variables */
478 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
479 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
480 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
481 nmp->nm_sdrtt[3] = 0;
482 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
483 nmp->nm_sent = 0;
484 FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
485 nmp->nm_timeouts = 0;
486 return (0);
487
488 bad:
489 nfs_disconnect(nmp);
490 return (error);
491 }
492
493 /*
494 * Reconnect routine:
495 * Called when a connection is broken on a reliable protocol.
496 * - clean up the old socket
497 * - nfs_connect() again
498 * - set R_MUSTRESEND for all outstanding requests on mount point
499 * If this fails the mount point is DEAD!
500 * nb: Must be called with the nfs_sndlock() set on the mount point.
501 */
502 static int
503 nfs_reconnect(struct nfsreq *rep)
504 {
505 struct nfsreq *rp;
506 struct nfsmount *nmp = rep->r_nmp;
507 int error;
508
509 nfs_disconnect(nmp);
510 while ((error = nfs_connect(nmp, rep))) {
511 if (error == EINTR || error == ERESTART)
512 return (EINTR);
513 if (error == EIO)
514 return (EIO);
515 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
516 "can not connect");
517 rep->r_flags |= R_TPRINTFMSG;
518 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
519 /* we're not yet completely mounted and */
520 /* we can't reconnect, so we fail */
521 return (error);
522 }
523 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
524 return (error);
525 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
526 }
527
528 /*
529 * Loop through outstanding request list and fix up all requests
530 * on old socket.
531 */
532 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
533 if (rp->r_nmp == nmp)
534 rp->r_flags |= R_MUSTRESEND;
535 }
536 return (0);
537 }
538
539 /*
540 * NFS disconnect. Clean up and unlink.
541 */
542 void
543 nfs_disconnect(struct nfsmount *nmp)
544 {
545 socket_t so;
546
547 if (nmp->nm_so) {
548 so = nmp->nm_so;
549 nmp->nm_so = 0;
550 sock_shutdown(so, 2);
551 sock_close(so);
552 }
553 }
554
555 /*
556 * This is the nfs send routine. For connection based socket types, it
557 * must be called with an nfs_sndlock() on the socket.
558 * "rep == NULL" indicates that it has been called from a server.
559 * For the client side:
560 * - return EINTR if the RPC is terminated, 0 otherwise
561 * - set R_MUSTRESEND if the send fails for any reason
562 * - do any cleanup required by recoverable socket errors (???)
563 * For the server side:
564 * - return EINTR or ERESTART if interrupted by a signal
565 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
566 * - do any cleanup required by recoverable socket errors (???)
567 */
568 int
569 nfs_send(
570 socket_t so,
571 mbuf_t nam,
572 mbuf_t top,
573 struct nfsreq *rep)
574 {
575 struct sockaddr *sendnam;
576 int error, error2, sotype, flags;
577 u_long xidqueued = 0;
578 struct nfsreq *rp;
579 char savenametolog[MAXPATHLEN];
580 struct msghdr msg;
581
582 if (rep) {
583 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
584 if (error) {
585 mbuf_freem(top);
586 return (error);
587 }
588 if ((so = rep->r_nmp->nm_so) == NULL) {
589 rep->r_flags |= R_MUSTRESEND;
590 mbuf_freem(top);
591 return (0);
592 }
593 rep->r_flags &= ~R_MUSTRESEND;
594 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
595 if (rp == rep)
596 break;
597 if (rp)
598 xidqueued = rp->r_xid;
599 }
600 sock_gettype(so, NULL, &sotype, NULL);
601 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
602 (nam == 0))
603 sendnam = (struct sockaddr *)0;
604 else
605 sendnam = mbuf_data(nam);
606
607 if (sotype == SOCK_SEQPACKET)
608 flags = MSG_EOR;
609 else
610 flags = 0;
611
612 /*
613 * Save the name here in case the mount point goes away while we block.
614 * The name is large, but it uses the local stack because we don't
615 * want to risk blocking in a malloc.
616 */
617 if (rep)
618 strncpy(savenametolog,
619 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
620 MAXPATHLEN - 1);
621 bzero(&msg, sizeof(msg));
622 msg.msg_name = (caddr_t)sendnam;
623 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
624 error = sock_sendmbuf(so, &msg, top, flags, NULL);
625
626 if (error) {
627 if (rep) {
628 if (xidqueued) {
629 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
630 if (rp == rep && rp->r_xid == xidqueued)
631 break;
632 if (!rp)
633 panic("nfs_send: error %d xid %x gone",
634 error, xidqueued);
635 }
636 log(LOG_INFO, "nfs send error %d for server %s\n",
637 error, savenametolog);
638 /*
639 * Deal with errors for the client side.
640 */
641 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
642 if (error2) {
643 error = error2;
644 } else {
645 rep->r_flags |= R_MUSTRESEND;
646 }
647 } else
648 log(LOG_INFO, "nfsd send error %d\n", error);
649
650 /*
651 * Handle any recoverable (soft) socket errors here. (???)
652 */
653 if (error != EINTR && error != ERESTART && error != EIO &&
654 error != EWOULDBLOCK && error != EPIPE) {
655 error = 0;
656 }
657 }
658 return (error);
659 }
660
661 /*
662 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
663 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
664 * Mark and consolidate the data into a new mbuf list.
665 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
666 * small mbufs.
667 * For SOCK_STREAM we must be very careful to read an entire record once
668 * we have read any of it, even if the system call has been interrupted.
669 */
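/*
 * Illustrative sketch (not in the original source): the Record Mark is
 * the standard RPC-over-TCP record marking (RFC 1831) -- each fragment
 * is preceded by a 4-byte big-endian word whose high bit flags the
 * final fragment and whose low 31 bits give the fragment length:
 *
 *	u_long mark = ntohl(fraglen);
 *	int last = mark & 0x80000000;		-- final fragment?
 *	u_long len = mark & ~0x80000000;	-- bytes that follow
 *
 * The sender side in nfs_request() builds the mark the same way, as
 * htonl(0x80000000 | length).
 */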
670 static int
671 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
672 {
673 socket_t so;
674 struct iovec_32 aio;
675 mbuf_t m, mlast;
676 u_long len, fraglen;
677 int error, error2, sotype;
678 proc_t p = current_proc(); /* XXX */
679 struct msghdr msg;
680 size_t rcvlen;
681 int lastfragment;
682
683 /*
684 * Set up arguments for soreceive()
685 */
686 *mp = NULL;
687 sotype = rep->r_nmp->nm_sotype;
688
689 /*
690 * For reliable protocols, lock against other senders/receivers
691 * in case a reconnect is necessary.
692 * For SOCK_STREAM, first get the Record Mark to find out how much
693 * more there is to get.
694 * We must lock the socket against other receivers
695 * until we have an entire rpc request/reply.
696 */
697 if (sotype != SOCK_DGRAM) {
698 error = nfs_sndlock(rep);
699 if (error)
700 return (error);
701 tryagain:
702 /*
703 * Check for fatal errors and resending request.
704 */
705 /*
706 * Ugh: If a reconnect attempt just happened, nm_so
707 * would have changed. NULL indicates a failed
708 * attempt that has essentially shut down this
709 * mount point.
710 */
711 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
712 nfs_sndunlock(rep);
713 if (error)
714 return (error);
715 return (EINTR);
716 }
717 so = rep->r_nmp->nm_so;
718 if (!so) {
719 error = nfs_reconnect(rep);
720 if (error) {
721 nfs_sndunlock(rep);
722 return (error);
723 }
724 goto tryagain;
725 }
726 while (rep->r_flags & R_MUSTRESEND) {
727 error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
728 if (!error) {
729 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
730 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
731 }
732 /*
733 * we also hold rcv lock so rep is still
734 * legit at this point
735 */
736 if (error) {
737 if (error == EINTR || error == ERESTART ||
738 (error = nfs_reconnect(rep))) {
739 nfs_sndunlock(rep);
740 return (error);
741 }
742 goto tryagain;
743 }
744 }
745 nfs_sndunlock(rep);
746 if (sotype == SOCK_STREAM) {
747 error = 0;
748 len = 0;
749 lastfragment = 0;
750 mlast = NULL;
751 while (!error && !lastfragment) {
752 aio.iov_base = (uintptr_t) &fraglen;
753 aio.iov_len = sizeof(u_long);
754 bzero(&msg, sizeof(msg));
755 msg.msg_iov = (struct iovec *) &aio;
756 msg.msg_iovlen = 1;
757 do {
758 error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
759 if (!rep->r_nmp) /* if unmounted then bail out */
760 goto shutout;
761 if (error == EWOULDBLOCK && rep) {
762 error2 = nfs_sigintr(rep->r_nmp, rep, p);
763 if (error2)
764 error = error2;
765 }
766 } while (error == EWOULDBLOCK);
767 if (!error && rcvlen < aio.iov_len) {
768 /* only log a message if we got a partial word */
769 if (rcvlen != 0)
770 log(LOG_INFO,
771 "short receive (%d/%d) from nfs server %s\n",
772 rcvlen, sizeof(u_long),
773 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
774 error = EPIPE;
775 }
776 if (error)
777 goto errout;
778 lastfragment = ntohl(fraglen) & 0x80000000;
779 fraglen = ntohl(fraglen) & ~0x80000000;
780 len += fraglen;
781 /*
782 * This is SERIOUS! We are out of sync with the sender
783 * and forcing a disconnect/reconnect is all I can do.
784 */
785 if (len > NFS_MAXPACKET) {
786 log(LOG_ERR, "%s (%d) from nfs server %s\n",
787 "impossible RPC record length", len,
788 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
789 error = EFBIG;
790 goto errout;
791 }
792
793 m = NULL;
794 do {
795 rcvlen = fraglen;
796 error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
797 if (!rep->r_nmp) /* if unmounted then bail out */ {
798 goto shutout;
799 }
800 } while (error == EWOULDBLOCK || error == EINTR ||
801 error == ERESTART);
802
803 if (!error && fraglen > rcvlen) {
804 log(LOG_INFO,
805 "short receive (%d/%d) from nfs server %s\n",
806 rcvlen, fraglen,
807 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
808 error = EPIPE;
809 mbuf_freem(m);
810 }
811 if (!error) {
812 if (!*mp) {
813 *mp = m;
814 mlast = m;
815 } else {
816 error = mbuf_setnext(mlast, m);
817 if (error) {
818 printf("nfs_receive: mbuf_setnext failed %d\n", error);
819 mbuf_freem(m);
820 }
821 }
822 while (mbuf_next(mlast))
823 mlast = mbuf_next(mlast);
824 }
825 }
826 } else {
827 bzero(&msg, sizeof(msg));
828 do {
829 rcvlen = 100000000;
830 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
831 if (!rep->r_nmp) /* if unmounted then bail out */ {
832 goto shutout;
833 }
834 if (error == EWOULDBLOCK && rep) {
835 error2 = nfs_sigintr(rep->r_nmp, rep, p);
836 if (error2) {
837 return (error2);
838 }
839 }
840 } while (error == EWOULDBLOCK);
841
842 if ((msg.msg_flags & MSG_EOR) == 0)
843 printf("Egad!!\n");
844 if (!error && *mp == NULL)
845 error = EPIPE;
846 len = rcvlen;
847 }
848 errout:
849 if (error && error != EINTR && error != ERESTART) {
850 mbuf_freem(*mp);
851 *mp = NULL;
852 if (error != EPIPE)
853 log(LOG_INFO,
854 "receive error %d from nfs server %s\n", error,
855 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
856 error = nfs_sndlock(rep);
857 if (!error) {
858 error = nfs_reconnect(rep);
859 if (!error)
860 goto tryagain;
861 nfs_sndunlock(rep);
862 }
863 }
864 } else {
865 /*
866 * We could have failed while rebinding the datagram socket
867 * so we need to attempt to rebind here.
868 */
869 if ((so = rep->r_nmp->nm_so) == NULL) {
870 error = nfs_sndlock(rep);
871 if (!error) {
872 error = nfs_reconnect(rep);
873 nfs_sndunlock(rep);
874 }
875 if (error)
876 return (error);
877 if (!rep->r_nmp) /* if unmounted then bail out */
878 return (ENXIO);
879 so = rep->r_nmp->nm_so;
880 }
881 bzero(&msg, sizeof(msg));
882 len = 0;
883 do {
884 rcvlen = 1000000;
885 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
886 if (!rep->r_nmp) /* if unmounted then bail out */
887 goto shutout;
888 if (error) {
889 error2 = nfs_sigintr(rep->r_nmp, rep, p);
890 if (error2) {
891 error = error2;
892 goto shutout;
893 }
894 }
895 /* Reconnect for all errors. We may be receiving
896 * soft/hard/blocking errors because of a network
897 * change.
898 * XXX: we should rate limit or delay this
899 * to once every N attempts or something,
900 * although TCP doesn't seem to.
901 */
902 if (error) {
903 error2 = nfs_sndlock(rep);
904 if (!error2) {
905 error2 = nfs_reconnect(rep);
906 if (error2)
907 error = error2;
908 else if (!rep->r_nmp) /* if unmounted then bail out */
909 error = ENXIO;
910 else
911 so = rep->r_nmp->nm_so;
912 nfs_sndunlock(rep);
913 } else {
914 error = error2;
915 }
916 }
917 } while (error == EWOULDBLOCK);
918 }
919 shutout:
920 if (error) {
921 mbuf_freem(*mp);
922 *mp = NULL;
923 }
924 return (error);
925 }
926
927 /*
928 * Implement receipt of reply on a socket.
929 * We must search through the list of received datagrams matching them
930 * with outstanding requests using the xid, until ours is found.
931 */
932 /* ARGSUSED */
933 int
934 nfs_reply(
935 struct nfsreq *myrep)
936 {
937 struct nfsreq *rep;
938 struct nfsmount *nmp = myrep->r_nmp;
939 long t1;
940 mbuf_t mrep, md;
941 u_long rxid, *tl;
942 caddr_t dpos, cp2;
943 int error;
944
945 /*
946 * Loop around until we get our own reply
947 */
948 for (;;) {
949 /*
950 * Lock against other receivers so that I don't get stuck in
951 * sbwait() after someone else has received my reply for me.
952 * Also necessary for connection based protocols to avoid
953 * race conditions during a reconnect.
954 * If nfs_rcvlock() returns EALREADY, that means that
955 * the reply has already been received by another
956 * process and we can return immediately. In this
957 * case, the lock is not taken to avoid races with
958 * other processes.
959 */
960 error = nfs_rcvlock(myrep);
961 if (error == EALREADY)
962 return (0);
963 if (error)
964 return (error);
965
966 /*
967 * If we slept after putting bits on the wire, the reply may
968 * have already arrived, in which case we must return, or we
969 * would hang trying to nfs_receive an already-received reply.
970 */
971 if (myrep->r_mrep != NULL) {
972 nfs_rcvunlock(myrep);
973 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
974 return (0);
975 }
976 /*
977 * Get the next RPC reply off the socket. The checks done in
978 * nfs_rcvlock ensure myrep->r_nmp is still intact.
979 */
980 error = nfs_receive(myrep, &mrep);
981 /*
982 * Bail out asap if nfsmount struct gone (unmounted).
983 */
984 if (!myrep->r_nmp) {
985 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
986 if (mrep)
987 mbuf_freem(mrep);
988 return (ENXIO);
989 }
990 if (error) {
991 FSDBG(530, myrep->r_xid, myrep, nmp, error);
992 nfs_rcvunlock(myrep);
993
994 /* Bail out asap if nfsmount struct gone (unmounted). */
995 if (!myrep->r_nmp) {
996 if (mrep)
997 mbuf_freem(mrep);
998 return (ENXIO);
999 }
1000
1001 /*
1002 * Ignore routing errors on connectionless protocols??
1003 */
1004 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1005 if (nmp->nm_so) {
1006 int clearerror;
1007 int optlen = sizeof(clearerror);
1008 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1009 }
1010 continue;
1011 }
1012 if (mrep)
1013 mbuf_freem(mrep);
1014 return (error);
1015 }
1016
1017 /*
1018 * We assume all is fine, but if we did not have an error
1019 * and mrep is 0, better not dereference it. nfs_receive
1020 * calls soreceive which carefully sets error=0 when it got
1021 * errors on sbwait (tsleep). In most cases, I assume that's
1022 * so we could go back again. In tcp case, EPIPE is returned.
1023 * In the udp case, nfs_receive gets back here with no error and no
1024 * mrep. Is the right fix to have soreceive check for process
1025 * aborted after sbwait and return something non-zero? Should
1026 * nfs_receive give an EPIPE? Too risky to play with those
1027 * two this late in the game for a shutdown problem. Instead,
1028 * just check here and get out. (ekn)
1029 */
1030 if (!mrep) {
1031 nfs_rcvunlock(myrep);
1032 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1033 return (ENXIO); /* sounds good */
1034 }
1035
1036 /*
1037 * Get the xid and check that it is an rpc reply
1038 */
1039 md = mrep;
1040 dpos = mbuf_data(md);
1041 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1042 rxid = *tl++;
1043 if (*tl != rpc_reply) {
1044 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1045 mbuf_freem(mrep);
1046 nfsmout:
1047 if (nmp->nm_state & NFSSTA_RCVLOCK)
1048 nfs_rcvunlock(myrep);
1049 continue;
1050 }
1051
1052 /*
1053 * Loop through the request list to match up the reply
1054 * Iff no match, just drop the datagram
1055 */
1056 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1057 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1058 /* Found it.. */
1059 rep->r_mrep = mrep;
1060 rep->r_md = md;
1061 rep->r_dpos = dpos;
1062 /*
1063 * If we're tracking the round trip time
1064 * then we update the circular log here
1065 * with the stats from our current request.
1066 */
1067 if (nfsrtton) {
1068 struct rttl *rt;
1069
1070 rt = &nfsrtt.rttl[nfsrtt.pos];
1071 rt->proc = rep->r_procnum;
1072 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1073 rt->sent = nmp->nm_sent;
1074 rt->cwnd = nmp->nm_cwnd;
1075 if (proct[rep->r_procnum] == 0)
1076 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1077 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1078 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1079 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1080 microtime(&rt->tstamp); // XXX unused
1081 if (rep->r_flags & R_TIMING)
1082 rt->rtt = rep->r_rtt;
1083 else
1084 rt->rtt = 1000000;
1085 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1086 }
1087 /*
1088 * Update congestion window.
1089 * Do the additive increase of
1090 * one rpc/rtt.
1091 */
1092 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1093 nmp->nm_cwnd);
1094 if (nmp->nm_cwnd <= nmp->nm_sent) {
1095 nmp->nm_cwnd +=
1096 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1097 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1098 if (nmp->nm_cwnd > NFS_MAXCWND)
1099 nmp->nm_cwnd = NFS_MAXCWND;
1100 }
1101 if (rep->r_flags & R_SENT) {
1102 rep->r_flags &= ~R_SENT;
1103 nmp->nm_sent -= NFS_CWNDSCALE;
1104 }
1105 /*
1106 * Update rtt using a gain of 0.125 on the mean
1107 * and a gain of 0.25 on the deviation.
1108 */
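/*
 * Illustrative worked example (not in the original source), using the
 * same scaling as noted at NFS_RTO above: with an assumed
 * NFS_SRTT(rep) of 40 (mean scaled by 8) and a measured r_rtt of 9
 * ticks,
 *
 *	t1 = (9 + 1) - (40 >> 3) = 5, so the new srtt is 45;
 *
 * the deviation term below then absorbs |t1| - (sdrtt >> 2) the same
 * way, e.g. an sdrtt of 8 becomes 8 + (5 - 2) = 11.
 */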
1109 if (rep->r_flags & R_TIMING) {
1110 /*
1111 * Since the timer resolution of
1112 * NFS_HZ is so coarse, it can often
1113 * result in r_rtt == 0. Since
1114 * r_rtt == N means that the actual
1115 * rtt is between N+dt and N+2-dt ticks,
1116 * add 1.
1117 */
1118 if (proct[rep->r_procnum] == 0)
1119 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1120 t1 = rep->r_rtt + 1;
1121 t1 -= (NFS_SRTT(rep) >> 3);
1122 NFS_SRTT(rep) += t1;
1123 if (t1 < 0)
1124 t1 = -t1;
1125 t1 -= (NFS_SDRTT(rep) >> 2);
1126 NFS_SDRTT(rep) += t1;
1127 }
1128 nmp->nm_timeouts = 0;
1129 break;
1130 }
1131 }
1132 nfs_rcvunlock(myrep);
1133 /*
1134 * If not matched to a request, drop it.
1135 * If it's mine, get out.
1136 */
1137 if (rep == 0) {
1138 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1139 mbuf_freem(mrep);
1140 } else if (rep == myrep) {
1141 if (rep->r_mrep == NULL)
1142 panic("nfs_reply: nil r_mrep");
1143 return (0);
1144 }
1145 FSDBG(530, myrep->r_xid, myrep, rep,
1146 rep ? rep->r_xid : myrep->r_flags);
1147 }
1148 }
1149
1150 /*
1151 * nfs_request - goes something like this
1152 * - fill in request struct
1153 * - links it into list
1154 * - calls nfs_send() for first transmit
1155 * - calls nfs_receive() to get reply
1156 * - break down rpc header and return with nfs reply pointed to
1157 * by mrep or error
1158 * nb: always frees up mreq mbuf list
1159 */
1160 int
1161 nfs_request(
1162 vnode_t vp,
1163 mount_t mp,
1164 mbuf_t mrest,
1165 int procnum,
1166 proc_t procp,
1167 kauth_cred_t cred,
1168 mbuf_t *mrp,
1169 mbuf_t *mdp,
1170 caddr_t *dposp,
1171 u_int64_t *xidp)
1172 {
1173 mbuf_t m, mrep, m2;
1174 struct nfsreq re, *rep;
1175 u_long *tl;
1176 int i;
1177 struct nfsmount *nmp;
1178 mbuf_t md, mheadend;
1179 char nickv[RPCX_NICKVERF];
1180 time_t waituntil;
1181 caddr_t dpos, cp2;
1182 int t1, error = 0, mrest_len, auth_len, auth_type;
1183 int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1184 int verf_len, verf_type;
1185 u_long xid;
1186 char *auth_str, *verf_str;
1187 NFSKERBKEY_T key; /* save session key */
1188 int nmsotype;
1189 struct timeval now;
1190
1191 if (mrp)
1192 *mrp = NULL;
1193 if (xidp)
1194 *xidp = 0;
1195 nmp = VFSTONFS(mp);
1196
1197 rep = &re;
1198
1199 if (vp)
1200 nmp = VFSTONFS(vnode_mount(vp));
1201 if (nmp == NULL ||
1202 (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1203 (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1204 mbuf_freem(mrest);
1205 return (ENXIO);
1206 }
1207 nmsotype = nmp->nm_sotype;
1208
1209 FSDBG_TOP(531, vp, procnum, nmp, rep);
1210
1211 rep->r_nmp = nmp;
1212 rep->r_vp = vp;
1213 rep->r_procp = procp;
1214 rep->r_procnum = procnum;
1215 microuptime(&now);
1216 rep->r_lastmsg = now.tv_sec -
1217 ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1218 i = 0;
1219 m = mrest;
1220 while (m) {
1221 i += mbuf_len(m);
1222 m = mbuf_next(m);
1223 }
1224 mrest_len = i;
1225
1226 /*
1227 * Get the RPC header with authorization.
1228 */
1229 kerbauth:
1230 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1231 if (!nmp) {
1232 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1233 mbuf_freem(mrest);
1234 return (ENXIO);
1235 }
1236 verf_str = auth_str = (char *)0;
1237 if (nmp->nm_flag & NFSMNT_KERB) {
1238 verf_str = nickv;
1239 verf_len = sizeof (nickv);
1240 auth_type = RPCAUTH_KERB4;
1241 bzero((caddr_t)key, sizeof (key));
1242 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1243 &auth_len, verf_str, verf_len)) {
1244 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1245 if (!nmp) {
1246 FSDBG_BOT(531, 2, vp, error, rep);
1247 mbuf_freem(mrest);
1248 return (ENXIO);
1249 }
1250 error = nfs_getauth(nmp, rep, cred, &auth_str,
1251 &auth_len, verf_str, &verf_len, key);
1252 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1253 if (!error && !nmp)
1254 error = ENXIO;
1255 if (error) {
1256 FSDBG_BOT(531, 2, vp, error, rep);
1257 mbuf_freem(mrest);
1258 return (error);
1259 }
1260 }
1261 } else {
1262 auth_type = RPCAUTH_UNIX;
1263 if (cred->cr_ngroups < 1)
1264 panic("nfsreq nogrps");
1265 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1266 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1267 5 * NFSX_UNSIGNED;
1268 }
1269 error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1270 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1271 if (auth_str)
1272 _FREE(auth_str, M_TEMP);
1273 if (error) {
1274 mbuf_freem(mrest);
1275 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1276 return (error);
1277 }
1278 if (xidp)
1279 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1280
1281 /*
1282 * For stream protocols, insert a Sun RPC Record Mark.
1283 */
1284 if (nmsotype == SOCK_STREAM) {
1285 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1286 if (error) {
1287 mbuf_freem(m);
1288 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1289 return (error);
1290 }
1291 *((u_long*)mbuf_data(m)) =
1292 htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1293 }
1294 rep->r_mreq = m;
1295 rep->r_xid = xid;
1296 tryagain:
1297 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1298 if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1299 rep->r_retry = nmp->nm_retry;
1300 else
1301 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1302 rep->r_rtt = rep->r_rexmit = 0;
1303 if (proct[procnum] > 0)
1304 rep->r_flags = R_TIMING;
1305 else
1306 rep->r_flags = 0;
1307 rep->r_mrep = NULL;
1308
1309 /*
1310 * Do the client side RPC.
1311 */
1312 OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1313 /*
1314 * Chain request into list of outstanding requests. Be sure
1315 * to put it LAST so timer finds oldest requests first.
1316 */
1317 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1318
1319 /*
1320 * If backing off another request or avoiding congestion, don't
1321 * send this one now but let timer do it. If not timing a request,
1322 * do it now.
1323 */
1324 if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1325 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1326 nmp->nm_sent < nmp->nm_cwnd)) {
1327 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1328
1329 if (connrequired)
1330 error = nfs_sndlock(rep);
1331
1332 /*
1333 * Set the R_SENT before doing the send in case another thread
1334 * processes the reply before the nfs_send returns here
1335 */
1336 if (!error) {
1337 if ((rep->r_flags & R_MUSTRESEND) == 0) {
1338 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1339 nmp->nm_cwnd);
1340 nmp->nm_sent += NFS_CWNDSCALE;
1341 rep->r_flags |= R_SENT;
1342 }
1343
1344 error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1345 if (!error)
1346 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1347 if (connrequired)
1348 nfs_sndunlock(rep);
1349 }
1350 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1351 if (error) {
1352 if (nmp)
1353 nmp->nm_sent -= NFS_CWNDSCALE;
1354 rep->r_flags &= ~R_SENT;
1355 }
1356 } else {
1357 rep->r_rtt = -1;
1358 }
1359
1360 /*
1361 * Wait for the reply from our send or the timer's.
1362 */
1363 if (!error || error == EPIPE)
1364 error = nfs_reply(rep);
1365
1366 /*
1367 * RPC done, unlink the request.
1368 */
1369 nfs_repdequeue(rep);
1370
1371 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1372
1373 /*
1374 * Decrement the outstanding request count.
1375 */
1376 if (rep->r_flags & R_SENT) {
1377 rep->r_flags &= ~R_SENT; /* paranoia */
1378 if (nmp) {
1379 FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1380 nmp->nm_sent -= NFS_CWNDSCALE;
1381 }
1382 }
1383
1384 /*
1385 * If there was a successful reply and a tprintf msg,
1386 * tprintf a response.
1387 */
1388 if (!error)
1389 nfs_up(nmp, procp, NFSSTA_TIMEO,
1390 (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1391 mrep = rep->r_mrep;
1392 md = rep->r_md;
1393 dpos = rep->r_dpos;
1394 if (!error && !nmp)
1395 error = ENXIO;
1396 if (error) {
1397 mbuf_freem(rep->r_mreq);
1398 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1399 return (error);
1400 }
1401
1402 /*
1403 * break down the rpc header and check if ok
1404 */
1405 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1406 if (*tl++ == rpc_msgdenied) {
1407 if (*tl == rpc_mismatch)
1408 error = EOPNOTSUPP;
1409 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1410 if (!failed_auth) {
1411 failed_auth++;
1412 error = mbuf_setnext(mheadend, NULL);
1413 mbuf_freem(mrep);
1414 mbuf_freem(rep->r_mreq);
1415 if (!error)
1416 goto kerbauth;
1417 printf("nfs_request: mbuf_setnext failed\n");
1418 } else
1419 error = EAUTH;
1420 } else
1421 error = EACCES;
1422 mbuf_freem(mrep);
1423 mbuf_freem(rep->r_mreq);
1424 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1425 return (error);
1426 }
1427
1428 /*
1429 * Grab any Kerberos verifier, otherwise just throw it away.
1430 */
1431 verf_type = fxdr_unsigned(int, *tl++);
1432 i = fxdr_unsigned(int, *tl);
1433 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1434 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1435 if (error)
1436 goto nfsmout;
1437 } else if (i > 0)
1438 nfsm_adv(nfsm_rndup(i));
1439 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1440 /* 0 == ok */
1441 if (*tl == 0) {
1442 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1443 if (*tl != 0) {
1444 error = fxdr_unsigned(int, *tl);
1445 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1446 error == NFSERR_TRYLATER) {
1447 mbuf_freem(mrep);
1448 error = 0;
1449 microuptime(&now);
1450 waituntil = now.tv_sec + trylater_delay;
1451 while (now.tv_sec < waituntil) {
1452 tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1453 microuptime(&now);
1454 }
1455 trylater_delay *= 2;
1456 if (trylater_delay > 60)
1457 trylater_delay = 60;
1458 goto tryagain;
1459 }
1460
1461 /*
1462 * If the File Handle was stale, invalidate the
1463 * lookup cache, just in case.
1464 */
1465 if ((error == ESTALE) && vp)
1466 cache_purge(vp);
1467 if (nmp->nm_flag & NFSMNT_NFSV3) {
1468 *mrp = mrep;
1469 *mdp = md;
1470 *dposp = dpos;
1471 error |= NFSERR_RETERR;
1472 } else {
1473 mbuf_freem(mrep);
1474 error &= ~NFSERR_RETERR;
1475 }
1476 mbuf_freem(rep->r_mreq);
1477 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1478 return (error);
1479 }
1480
1481 *mrp = mrep;
1482 *mdp = md;
1483 *dposp = dpos;
1484 mbuf_freem(rep->r_mreq);
1485 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1486 return (0);
1487 }
1488 mbuf_freem(mrep);
1489 error = EPROTONOSUPPORT;
1490 nfsmout:
1491 mbuf_freem(rep->r_mreq);
1492 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1493 return (error);
1494 }
1495
1496 #ifndef NFS_NOSERVER
1497 /*
1498 * Generate the rpc reply header
1499 * siz arg. is used to decide if adding a cluster is worthwhile
1500 */
1501 int
1502 nfs_rephead(
1503 int siz,
1504 struct nfsrv_descript *nd,
1505 struct nfssvc_sock *slp,
1506 int err,
1507 mbuf_t *mrq,
1508 mbuf_t *mbp,
1509 caddr_t *bposp)
1510 {
1511 u_long *tl;
1512 mbuf_t mreq;
1513 caddr_t bpos;
1514 mbuf_t mb, mb2;
1515 int error, mlen;
1516
1517 /*
1518 * If this is a big reply, use a cluster; otherwise
1519 * try to leave leading space for the lower level headers.
1520 */
1521 siz += RPC_REPLYSIZ;
1522 if (siz >= nfs_mbuf_minclsize) {
1523 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1524 } else {
1525 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1526 }
1527 if (error) {
1528 /* unable to allocate packet */
1529 /* XXX nfsstat? */
1530 return (error);
1531 }
1532 mb = mreq;
1533 tl = mbuf_data(mreq);
1534 mlen = 6 * NFSX_UNSIGNED;
1535 if (siz < nfs_mbuf_minclsize) {
1536 /* leave space for lower level headers */
1537 tl += 80/sizeof(*tl); /* XXX max_hdr? XXX */
1538 mbuf_setdata(mreq, tl, mlen);
1539 } else {
1540 mbuf_setlen(mreq, mlen);
1541 }
1542 bpos = ((caddr_t)tl) + mlen;
1543 *tl++ = txdr_unsigned(nd->nd_retxid);
1544 *tl++ = rpc_reply;
1545 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1546 *tl++ = rpc_msgdenied;
1547 if (err & NFSERR_AUTHERR) {
1548 *tl++ = rpc_autherr;
1549 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1550 mlen -= NFSX_UNSIGNED;
1551 mbuf_setlen(mreq, mlen);
1552 bpos -= NFSX_UNSIGNED;
1553 } else {
1554 *tl++ = rpc_mismatch;
1555 *tl++ = txdr_unsigned(RPC_VER2);
1556 *tl = txdr_unsigned(RPC_VER2);
1557 }
1558 } else {
1559 *tl++ = rpc_msgaccepted;
1560
1561 /*
1562 * For Kerberos authentication, we must send the nickname
1563 * verifier back, otherwise just RPCAUTH_NULL.
1564 */
1565 if (nd->nd_flag & ND_KERBFULL) {
1566 struct nfsuid *nuidp;
1567 struct timeval ktvin, ktvout;
1568 uid_t uid = kauth_cred_getuid(nd->nd_cr);
1569
1570 lck_rw_lock_shared(&slp->ns_rwlock);
1571 for (nuidp = NUIDHASH(slp, uid)->lh_first;
1572 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1573 if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1574 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1575 &nuidp->nu_haddr, nd->nd_nam2)))
1576 break;
1577 }
1578 if (nuidp) {
1579 ktvin.tv_sec =
1580 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1581 ktvin.tv_usec =
1582 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1583
1584 /*
1585 * Encrypt the timestamp in ecb mode using the
1586 * session key.
1587 */
1588 #if NFSKERB
1589 XXX
1590 #endif
1591
1592 *tl++ = rpc_auth_kerb;
1593 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1594 *tl = ktvout.tv_sec;
1595 nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1596 *tl++ = ktvout.tv_usec;
1597 *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1598 } else {
1599 *tl++ = 0;
1600 *tl++ = 0;
1601 }
1602 lck_rw_done(&slp->ns_rwlock);
1603 } else {
1604 *tl++ = 0;
1605 *tl++ = 0;
1606 }
1607 switch (err) {
1608 case EPROGUNAVAIL:
1609 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1610 break;
1611 case EPROGMISMATCH:
1612 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1613 nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1614 // XXX hard coded versions
1615 *tl++ = txdr_unsigned(2);
1616 *tl = txdr_unsigned(3);
1617 break;
1618 case EPROCUNAVAIL:
1619 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1620 break;
1621 case EBADRPC:
1622 *tl = txdr_unsigned(RPC_GARBAGE);
1623 break;
1624 default:
1625 *tl = 0;
1626 if (err != NFSERR_RETVOID) {
1627 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1628 if (err)
1629 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1630 else
1631 *tl = 0;
1632 }
1633 break;
1634 }
1635 }
1636
1637 if (mrq != NULL)
1638 *mrq = mreq;
1639 *mbp = mb;
1640 *bposp = bpos;
1641 if (err != 0 && err != NFSERR_RETVOID) {
1642 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1643 }
1644 return (0);
1645 }
1646
1647
1648 #endif /* NFS_NOSERVER */
1649
1650
1651 /*
1652 * From FreeBSD 1.58, a Matt Dillon fix...
1653 * Flag a request as being about to terminate.
1654 * The nm_sent count is decremented now to avoid deadlocks when the process
1655 * in soreceive() hasn't yet managed to send its own request.
1656 */
1657 static void
1658 nfs_softterm(struct nfsreq *rep)
1659 {
1660
1661 rep->r_flags |= R_SOFTTERM;
1662 if (rep->r_flags & R_SENT) {
1663 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1664 rep->r_nmp->nm_cwnd);
1665 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1666 rep->r_flags &= ~R_SENT;
1667 }
1668 }
1669
1670 void
1671 nfs_timer_funnel(void * arg)
1672 {
1673 (void) thread_funnel_set(kernel_flock, TRUE);
1674 nfs_timer(arg);
1675 (void) thread_funnel_set(kernel_flock, FALSE);
1676
1677 }
1678
1679 /*
1680 * Ensure rep isn't in use by the timer, then dequeue it.
1681 */
1682 static void
1683 nfs_repdequeue(struct nfsreq *rep)
1684 {
1685
1686 while ((rep->r_flags & R_BUSY)) {
1687 rep->r_flags |= R_WAITING;
1688 tsleep(rep, PSOCK, "repdeq", 0);
1689 }
1690 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1691 }
1692
1693 /*
1694 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
1695 * free()'d out from under it.
1696 */
1697 static void
1698 nfs_repbusy(struct nfsreq *rep)
1699 {
1700
1701 if ((rep->r_flags & R_BUSY))
1702 panic("rep locked");
1703 rep->r_flags |= R_BUSY;
1704 }
1705
1706 /*
1707 * Unbusy the nfsreq passed in and return the next nfsreq in the chain, busied.
1708 */
1709 static struct nfsreq *
1710 nfs_repnext(struct nfsreq *rep)
1711 {
1712 struct nfsreq * nextrep;
1713
1714 if (rep == NULL)
1715 return (NULL);
1716 /*
1717 * We need to get and busy the next req before signalling the
1718 * current one, otherwise wakeup() may block us and we'll race to
1719 * grab the next req.
1720 */
1721 nextrep = TAILQ_NEXT(rep, r_chain);
1722 if (nextrep != NULL)
1723 nfs_repbusy(nextrep);
1724 /* unbusy and signal. */
1725 rep->r_flags &= ~R_BUSY;
1726 if ((rep->r_flags & R_WAITING)) {
1727 rep->r_flags &= ~R_WAITING;
1728 wakeup(rep);
1729 }
1730 return (nextrep);
1731 }
1732
1733 /*
1734 * Nfs timer routine
1735 * Scan the nfsreq list and retransmit any requests that have timed out.
1736 * To avoid retransmission attempts on STREAM sockets (in the future) make
1737 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1738 */
1739 void
1740 nfs_timer(__unused void *arg)
1741 {
1742 struct nfsreq *rep;
1743 mbuf_t m;
1744 socket_t so;
1745 struct nfsmount *nmp;
1746 int timeo;
1747 int error;
1748 #ifndef NFS_NOSERVER
1749 struct nfssvc_sock *slp;
1750 u_quad_t cur_usec;
1751 #endif /* NFS_NOSERVER */
1752 int flags, rexmit, cwnd, sent;
1753 u_long xid;
1754 struct timeval now;
1755
1756 rep = TAILQ_FIRST(&nfs_reqq);
1757 if (rep != NULL)
1758 nfs_repbusy(rep);
1759 microuptime(&now);
1760 for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1761 nmp = rep->r_nmp;
1762 if (!nmp) /* unmounted */
1763 continue;
1764 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1765 continue;
1766 if (nfs_sigintr(nmp, rep, rep->r_procp))
1767 continue;
1768 if (nmp->nm_tprintf_initial_delay != 0 &&
1769 (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1770 rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1771 rep->r_lastmsg = now.tv_sec;
1772 nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1773 "not responding");
1774 rep->r_flags |= R_TPRINTFMSG;
1775 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1776 /* we're not yet completely mounted and */
1777 /* we can't complete an RPC, so we fail */
1778 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1779 nfs_softterm(rep);
1780 continue;
1781 }
1782 }
1783 if (rep->r_rtt >= 0) {
1784 rep->r_rtt++;
1785 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1786 timeo = nmp->nm_timeo;
1787 else
1788 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1789 /* ensure 62.5 ms floor */
1790 while (16 * timeo < hz)
1791 timeo *= 2;
1792 if (nmp->nm_timeouts > 0)
1793 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1794 if (rep->r_rtt <= timeo)
1795 continue;
1796 if (nmp->nm_timeouts < 8)
1797 nmp->nm_timeouts++;
1798 }
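/*
 * Illustrative note (not in the original source): with an assumed
 * hz of 100, the doubling loop above enforces timeo >= hz / 16, i.e.
 * the 62.5 ms floor, and each consecutive timeout then scales timeo
 * by nfs_backoff[] -- 2, 4, 8, ... up to 256x once nm_timeouts
 * reaches 8 -- before the retransmission logic below runs.
 */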
1799 /*
1800 * Check for too many retransmits. This is never true for
1801 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1802 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1803 */
1804 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1805 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1806 nfs_softterm(rep);
1807 continue;
1808 }
1809 if (nmp->nm_sotype != SOCK_DGRAM) {
1810 if (++rep->r_rexmit > NFS_MAXREXMIT)
1811 rep->r_rexmit = NFS_MAXREXMIT;
1812 continue;
1813 }
1814 if ((so = nmp->nm_so) == NULL)
1815 continue;
1816
1817 /*
1818 * If there is enough space and the window allows,
1819 * resend it.
1820 * Set r_rtt to -1 in case we fail to send it now.
1821 */
1822 rep->r_rtt = -1;
1823 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1824 (rep->r_flags & R_SENT) ||
1825 nmp->nm_sent < nmp->nm_cwnd) &&
1826 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1827 struct msghdr msg;
1828 /*
1829 * Iff first send, start timing
1830 * else turn timing off, back off the timer,
1831 * and divide congestion window by 2.
1832 * We update these *before* the send to avoid
1833 * racing against receiving the reply.
1834 * We save them so we can restore them on send error.
1835 */
1836 flags = rep->r_flags;
1837 rexmit = rep->r_rexmit;
1838 cwnd = nmp->nm_cwnd;
1839 sent = nmp->nm_sent;
1840 xid = rep->r_xid;
1841 if (rep->r_flags & R_SENT) {
1842 rep->r_flags &= ~R_TIMING;
1843 if (++rep->r_rexmit > NFS_MAXREXMIT)
1844 rep->r_rexmit = NFS_MAXREXMIT;
1845 nmp->nm_cwnd >>= 1;
1846 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1847 nmp->nm_cwnd = NFS_CWNDSCALE;
1848 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1849 } else {
1850 rep->r_flags |= R_SENT;
1851 nmp->nm_sent += NFS_CWNDSCALE;
1852 }
1853 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1854
1855 bzero(&msg, sizeof(msg));
1856 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1857 msg.msg_name = mbuf_data(nmp->nm_nam);
1858 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1859 }
1860 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1861
1862 FSDBG(535, xid, error, sent, cwnd);
1863
1864 if (error) {
1865 if (error == EWOULDBLOCK) {
1866 rep->r_flags = flags;
1867 rep->r_rexmit = rexmit;
1868 nmp->nm_cwnd = cwnd;
1869 nmp->nm_sent = sent;
1870 rep->r_xid = xid;
1871 }
1872 else {
1873 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1874 int clearerror;
1875 int optlen = sizeof(clearerror);
1876 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1877 }
1878 rep->r_flags = flags | R_RESENDERR;
1879 rep->r_rexmit = rexmit;
1880 nmp->nm_cwnd = cwnd;
1881 nmp->nm_sent = sent;
1882 if (flags & R_SENT)
1883 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1884 }
1885 } else
1886 rep->r_rtt = 0;
1887 }
1888 }
1889 microuptime(&now);
1890 #ifndef NFS_NOSERVER
1891 /*
1892 * Scan the write gathering queues for writes that need to be
1893 * completed now.
1894 */
1895 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1896 lck_mtx_lock(nfsd_mutex);
1897 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1898 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1899 nfsrv_wakenfsd(slp);
1900 }
1901 while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1902 if ((slp->ns_timestamp + 5) > now.tv_sec)
1903 break;
1904 TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1905 nfsrv_slpfree(slp);
1906 }
1907 lck_mtx_unlock(nfsd_mutex);
1908 #endif /* NFS_NOSERVER */
1909
1910 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1911 /*
1912 * We haven't called nfs_buf_freeup() in a little while.
1913 * So, see if we can free up any stale/unused bufs now.
1914 */
1915 nfs_buf_freeup(1);
1916 }
1917
1918 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1919
1920 }
1921
1922
1923 /*
1924 * Test for a termination condition pending on the process.
1925 * This is used to determine if we need to bail on a mount.
1926 * EIO is returned if there has been a soft timeout.
1927 * EINTR is returned if there is a signal pending that is not being ignored
1928 * and the mount is interruptible, or if we are a thread that is in the process
1929 * of cancellation (also SIGKILL posted).
1930 */
1931 int
1932 nfs_sigintr(
1933 struct nfsmount *nmp,
1934 struct nfsreq *rep,
1935 proc_t p)
1936 {
1937 sigset_t pending_sigs;
1938 int context_good = 0;
1939 struct nfsmount *repnmp;
1940 extern proc_t kernproc;
1941
1942 if (nmp == NULL)
1943 return (ENXIO);
1944 if (rep != NULL) {
1945 repnmp = rep->r_nmp;
1946 /* we've had a forced unmount. */
1947 if (repnmp == NULL)
1948 return (ENXIO);
1949 /* request has timed out on a 'soft' mount. */
1950 if (rep->r_flags & R_SOFTTERM)
1951 return (EIO);
1952 /*
1953 * We're in the process of a forced unmount and there's
1954 * been a timeout; we're dead, so fail the I/O.
1955 */
1956 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1957 (NFSSTA_FORCE|NFSSTA_TIMEO))
1958 return (EIO);
1959 /* Someone is unmounting us, go soft and mark it. */
1960 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1961 repnmp->nm_flag |= NFSMNT_SOFT;
1962 nmp->nm_state |= NFSSTA_FORCE;
1963 }
1964 /*
1965 * If the mount is hung and we've requested not to hang
1966 * on remote filesystems, then bail now.
1967 */
1968 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1969 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1970 return (EIO);
1971 }
1972 /* XXX: Is this valid?  This should probably be an assertion. */
1973 if (p == NULL)
1974 return (0);
1975
1976 /* If this thread belongs to the kernel task, the abort check is not needed. */
1977 if ((current_proc() != kernproc) && current_thread_aborted()) {
1978 return (EINTR);
1979 }
1980 /* mask off thread and process blocked signals. */
1981
1982 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1983 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1984 return (EINTR);
1985 return (0);
1986 }
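/*
 * Sketch of the typical caller pattern for nfs_sigintr(): poll it from
 * a wait loop so that a soft timeout (EIO) or a pending signal on an
 * interruptible mount (EINTR) breaks the wait.  The wait channel and
 * loop condition below are hypothetical placeholders.
 */
#if 0
	while (!done) {
		error = nfs_sigintr(nmp, rep, p);
		if (error)
			return (error);	/* EIO or EINTR: give up on the mount */
		tsleep((caddr_t)&chan, PZERO - 1, "nfswait", hz);
	}
#endif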
1987
1988 /*
1989 * Lock a socket against others.
1990 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1991 * and also to avoid race conditions between the processes with nfs requests
1992 * in progress when a reconnect is necessary.
1993 */
1994 int
1995 nfs_sndlock(rep)
1996 struct nfsreq *rep;
1997 {
1998 int *statep;
1999 proc_t p;
2000 int error, slpflag = 0, slptimeo = 0;
2001
2002 if (rep->r_nmp == NULL)
2003 return (ENXIO);
2004 statep = &rep->r_nmp->nm_state;
2005
2006 p = rep->r_procp;
2007 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2008 slpflag = PCATCH;
2009 while (*statep & NFSSTA_SNDLOCK) {
2010 error = nfs_sigintr(rep->r_nmp, rep, p);
2011 if (error)
2012 return (error);
2013 *statep |= NFSSTA_WANTSND;
2014 if (p != NULL && (proc_noremotehang(p)) != 0)
2015 slptimeo = hz;
2016 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2017 if (slpflag == PCATCH) {
2018 slpflag = 0;
2019 slptimeo = 2 * hz;
2020 }
2021 /*
2022 * Make sure while we slept that the mountpoint didn't go away.
2023 * nfs_sigintr and callers expect it intact.
2024 */
2025 if (!rep->r_nmp)
2026 return (ENXIO); /* don't have lock until out of loop */
2027 }
2028 *statep |= NFSSTA_SNDLOCK;
2029 return (0);
2030 }
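/*
 * Sketch of how the send lock brackets a transmit so that, on a STREAM
 * socket, one request's record cannot be interleaved with another's.
 * The transmit step is a stand-in for the real sock_sendmbuf() path.
 */
#if 0
	if ((error = nfs_sndlock(rep)))
		return (error);
	error = 0;	/* ... transmit the request, e.g. via sock_sendmbuf() ... */
	nfs_sndunlock(rep);
#endif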
2031
2032 /*
2033 * Unlock the socket send lock for others.
2034 */
2035 void
2036 nfs_sndunlock(rep)
2037 struct nfsreq *rep;
2038 {
2039 int *statep;
2040
2041 if (rep->r_nmp == NULL)
2042 return;
2043 statep = &rep->r_nmp->nm_state;
2044 if ((*statep & NFSSTA_SNDLOCK) == 0)
2045 panic("nfs sndunlock");
2046 *statep &= ~NFSSTA_SNDLOCK;
2047 if (*statep & NFSSTA_WANTSND) {
2048 *statep &= ~NFSSTA_WANTSND;
2049 wakeup((caddr_t)statep);
2050 }
2051 }
2052
2053 static int
2054 nfs_rcvlock(struct nfsreq *rep)
2055 {
2056 int *statep;
2057 int error, slpflag, slptimeo = 0;
2058
2059 /* make sure we still have our mountpoint */
2060 if (!rep->r_nmp) {
2061 if (rep->r_mrep != NULL)
2062 return (EALREADY);
2063 return (ENXIO);
2064 }
2065
2066 statep = &rep->r_nmp->nm_state;
2067 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2068 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2069 slpflag = PCATCH;
2070 else
2071 slpflag = 0;
2072 while (*statep & NFSSTA_RCVLOCK) {
2073 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2074 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2075 return (error);
2076 } else if (rep->r_mrep != NULL) {
2077 /*
2078 * Don't bother sleeping if the reply has already arrived.
2079 */
2080 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2081 return (EALREADY);
2082 }
2083 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2084 *statep |= NFSSTA_WANTRCV;
2085 /*
2086 * We need to poll if we're P_NOREMOTEHANG so that we
2087 * call nfs_sigintr periodically above.
2088 */
2089 if (rep->r_procp != NULL &&
2090 (proc_noremotehang(rep->r_procp)) != 0)
2091 slptimeo = hz;
2092 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2093 if (slpflag == PCATCH) {
2094 slpflag = 0;
2095 slptimeo = 2 * hz;
2096 }
2097 /*
2098 * Make sure while we slept that the mountpoint didn't go away.
2099 * nfs_sigintr and caller nfs_reply expect it intact.
2100 */
2101 if (!rep->r_nmp) {
2102 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2103 return (ENXIO); /* don't have lock until out of loop */
2104 }
2105 }
2106 /*
2107 * nfs_reply will handle it if reply already arrived.
2108 * (We may have slept or been preempted).
2109 */
2110 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2111 *statep |= NFSSTA_RCVLOCK;
2112 return (0);
2113 }
2114
2115 /*
2116 * Unlock the socket receive lock for others.
2117 */
2118 static void
2119 nfs_rcvunlock(struct nfsreq *rep)
2120 {
2121 int *statep;
2122
2123 if (rep->r_nmp == NULL)
2124 return;
2125 statep = &rep->r_nmp->nm_state;
2126
2127 FSDBG(533, statep, *statep, 0, 0);
2128 if ((*statep & NFSSTA_RCVLOCK) == 0)
2129 panic("nfs rcvunlock");
2130 *statep &= ~NFSSTA_RCVLOCK;
2131 if (*statep & NFSSTA_WANTRCV) {
2132 *statep &= ~NFSSTA_WANTRCV;
2133 wakeup((caddr_t)statep);
2134 }
2135 }
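/*
 * Sketch of the matching receive-lock pattern (nfs_reply works this
 * way): only one thread at a time drains the socket, and EALREADY
 * from nfs_rcvlock() means our reply was filled in while we waited,
 * so there is nothing left to receive.
 */
#if 0
	error = nfs_rcvlock(rep);
	if (error == EALREADY)
		return (0);	/* reply already arrived */
	if (error)
		return (error);
	/* ... receive from the socket, matching replies to requests by xid ... */
	nfs_rcvunlock(rep);
#endif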
2136
2137
2138 #ifndef NFS_NOSERVER
2139 /*
2140 * Socket upcall routine for the nfsd sockets.
2141 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2142 * Essentially, do as much as possible without blocking; otherwise punt
2143 * and it will be called again with MBUF_WAITOK from an nfsd.
2144 */
2145 void
2146 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2147 {
2148 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2149
2150 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2151 return;
2152
2153 lck_rw_lock_exclusive(&slp->ns_rwlock);
2154 nfsrv_rcv_locked(so, slp, waitflag);
2155 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2156 }
2157 void
2158 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2159 {
2160 mbuf_t m, mp, mhck, m2;
2161 int ns_flag=0, error;
2162 struct msghdr msg;
2163 size_t bytes_read;
2164
2165 if ((slp->ns_flag & SLP_VALID) == 0) {
2166 if (waitflag == MBUF_DONTWAIT)
2167 lck_rw_done(&slp->ns_rwlock);
2168 return;
2169 }
2170
2171 #ifdef notdef
2172 /*
2173 * Define this to test for nfsds handling this under heavy load.
2174 */
2175 if (waitflag == MBUF_DONTWAIT) {
2176 ns_flag = SLP_NEEDQ;
2177 goto dorecs;
2178 }
2179 #endif
2180 if (slp->ns_sotype == SOCK_STREAM) {
2181 /*
2182 * If there are already records on the queue, defer soreceive()
2183 * to an nfsd so that there is feedback to the TCP layer that
2184 * the nfs servers are heavily loaded.
2185 */
2186 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2187 ns_flag = SLP_NEEDQ;
2188 goto dorecs;
2189 }
2190
2191 /*
2192 * Do soreceive().
2193 */
2194 bytes_read = 1000000000; /* large cap: take whatever data is available */
2195 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2196 if (error || mp == NULL) {
2197 if (error == EWOULDBLOCK)
2198 ns_flag = SLP_NEEDQ;
2199 else
2200 ns_flag = SLP_DISCONN;
2201 goto dorecs;
2202 }
2203 m = mp;
2204 if (slp->ns_rawend) {
2205 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2206 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2207 slp->ns_cc += bytes_read;
2208 } else {
2209 slp->ns_raw = m;
2210 slp->ns_cc = bytes_read;
2211 }
2212 while ((m2 = mbuf_next(m)))
2213 m = m2;
2214 slp->ns_rawend = m;
2215
2216 /*
2217 * Now try and parse record(s) out of the raw stream data.
2218 */
2219 error = nfsrv_getstream(slp, waitflag);
2220 if (error) {
2221 if (error == EPERM)
2222 ns_flag = SLP_DISCONN;
2223 else
2224 ns_flag = SLP_NEEDQ;
2225 }
2226 } else {
2227 struct sockaddr_storage nam;
2228
2229 bzero(&msg, sizeof(msg));
2230 msg.msg_name = (caddr_t)&nam;
2231 msg.msg_namelen = sizeof(nam);
2232
2233 do {
2234 bytes_read = 1000000000; /* same large cap as the stream case */
2235 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2236 if (mp) {
2237 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2238 mbuf_setlen(mhck, nam.ss_len);
2239 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2240 m = mhck;
2241 if (mbuf_setnext(m, mp)) {
2242 /* trouble... just drop it */
2243 printf("nfsrv_rcv: mbuf_setnext failed\n");
2244 mbuf_free(mhck);
2245 m = mp;
2246 }
2247 } else {
2248 m = mp;
2249 }
2250 if (slp->ns_recend)
2251 mbuf_setnextpkt(slp->ns_recend, m);
2252 else
2253 slp->ns_rec = m;
2254 slp->ns_recend = m;
2255 mbuf_setnextpkt(m, NULL);
2256 }
2257 #if 0
2258 if (error) {
2259 /*
2260 * This may be needed in the future to support
2261 * non-byte-stream connection-oriented protocols
2262 * such as SCTP.
2263 */
2264 /*
2265 * This (slp->ns_sotype == SOCK_STREAM) should really
2266 * be a check for PR_CONNREQUIRED.
2267 */
2268 if ((slp->ns_sotype == SOCK_STREAM)
2269 && error != EWOULDBLOCK) {
2270 ns_flag = SLP_DISCONN;
2271 goto dorecs;
2272 }
2273 }
2274 #endif
2275 } while (mp);
2276 }
2277
2278 /*
2279 * Now try and process the request records, non-blocking.
2280 */
2281 dorecs:
2282 if (ns_flag)
2283 slp->ns_flag |= ns_flag;
2284 if (waitflag == MBUF_DONTWAIT) {
2285 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2286 lck_rw_done(&slp->ns_rwlock);
2287 if (wake && nfs_numnfsd) {
2288 lck_mtx_lock(nfsd_mutex);
2289 nfsrv_wakenfsd(slp);
2290 lck_mtx_unlock(nfsd_mutex);
2291 }
2292 }
2293 }
2294
2295 /*
2296 * Try and extract an RPC request from the mbuf data list received on a
2297 * stream socket. The "waitflag" argument indicates whether or not it
2298 * can sleep.
2299 */
2300 static int
2301 nfsrv_getstream(slp, waitflag)
2302 struct nfssvc_sock *slp;
2303 int waitflag;
2304 {
2305 mbuf_t m;
2306 char *cp1, *cp2, *mdata;
2307 int len, mlen, error;
2308 mbuf_t om, m2, recm;
2309 u_long recmark;
2310
2311 if (slp->ns_flag & SLP_GETSTREAM)
2312 panic("nfs getstream");
2313 slp->ns_flag |= SLP_GETSTREAM;
2314 for (;;) {
2315 if (slp->ns_reclen == 0) {
2316 if (slp->ns_cc < NFSX_UNSIGNED) {
2317 slp->ns_flag &= ~SLP_GETSTREAM;
2318 return (0);
2319 }
2320 m = slp->ns_raw;
2321 mdata = mbuf_data(m);
2322 mlen = mbuf_len(m);
2323 if (mlen >= NFSX_UNSIGNED) {
2324 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2325 mdata += NFSX_UNSIGNED;
2326 mlen -= NFSX_UNSIGNED;
2327 mbuf_setdata(m, mdata, mlen);
2328 } else {
2329 cp1 = (caddr_t)&recmark;
2330 cp2 = mdata;
2331 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2332 while (mlen == 0) {
2333 m = mbuf_next(m);
2334 cp2 = mbuf_data(m);
2335 mlen = mbuf_len(m);
2336 }
2337 *cp1++ = *cp2++;
2338 mlen--;
2339 mbuf_setdata(m, cp2, mlen);
2340 }
2341 }
2342 slp->ns_cc -= NFSX_UNSIGNED;
2343 recmark = ntohl(recmark);
2344 slp->ns_reclen = recmark & ~0x80000000;
2345 if (recmark & 0x80000000)
2346 slp->ns_flag |= SLP_LASTFRAG;
2347 else
2348 slp->ns_flag &= ~SLP_LASTFRAG;
2349 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2350 slp->ns_flag &= ~SLP_GETSTREAM;
2351 return (EPERM);
2352 }
2353 }
2354
2355 /*
2356 * Now get the record part.
2357 *
2358 * Note that slp->ns_reclen may be 0. Linux sometimes
2359 * generates 0-length RPCs.
2360 */
2361 recm = NULL;
2362 if (slp->ns_cc == slp->ns_reclen) {
2363 recm = slp->ns_raw;
2364 slp->ns_raw = slp->ns_rawend = NULL;
2365 slp->ns_cc = slp->ns_reclen = 0;
2366 } else if (slp->ns_cc > slp->ns_reclen) {
2367 len = 0;
2368 m = slp->ns_raw;
2369 mlen = mbuf_len(m);
2370 mdata = mbuf_data(m);
2371 om = NULL;
2372 while (len < slp->ns_reclen) {
2373 if ((len + mlen) > slp->ns_reclen) {
2374 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2375 slp->ns_flag &= ~SLP_GETSTREAM;
2376 return (EWOULDBLOCK);
2377 }
2378 if (om) {
2379 if (mbuf_setnext(om, m2)) {
2380 /* trouble... just drop it */
2381 printf("nfsrv_getstream: mbuf_setnext failed\n");
2382 mbuf_freem(m2);
2383 slp->ns_flag &= ~SLP_GETSTREAM;
2384 return (EWOULDBLOCK);
2385 }
2386 recm = slp->ns_raw;
2387 } else {
2388 recm = m2;
2389 }
2390 mdata += slp->ns_reclen - len;
2391 mlen -= slp->ns_reclen - len;
2392 mbuf_setdata(m, mdata, mlen);
2393 len = slp->ns_reclen;
2394 } else if ((len + mlen) == slp->ns_reclen) {
2395 om = m;
2396 len += mlen;
2397 m = mbuf_next(m);
2398 recm = slp->ns_raw;
2399 if (mbuf_setnext(om, NULL)) {
2400 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2401 slp->ns_flag &= ~SLP_GETSTREAM;
2402 return (EWOULDBLOCK);
2403 }
2404 mlen = mbuf_len(m);
2405 mdata = mbuf_data(m);
2406 } else {
2407 om = m;
2408 len += mlen;
2409 m = mbuf_next(m);
2410 mlen = mbuf_len(m);
2411 mdata = mbuf_data(m);
2412 }
2413 }
2414 slp->ns_raw = m;
2415 slp->ns_cc -= len;
2416 slp->ns_reclen = 0;
2417 } else {
2418 slp->ns_flag &= ~SLP_GETSTREAM;
2419 return (0);
2420 }
2421
2422 /*
2423 * Accumulate the fragments into a record.
2424 */
2425 if (slp->ns_frag == NULL) {
2426 slp->ns_frag = recm;
2427 } else {
2428 m = slp->ns_frag;
2429 while ((m2 = mbuf_next(m)))
2430 m = m2;
2431 if ((error = mbuf_setnext(m, recm)))
2432 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2433 }
2434 if (slp->ns_flag & SLP_LASTFRAG) {
2435 if (slp->ns_recend)
2436 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2437 else
2438 slp->ns_rec = slp->ns_frag;
2439 slp->ns_recend = slp->ns_frag;
2440 slp->ns_frag = NULL;
2441 }
2442 }
2443 }
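/*
 * For reference, the record mark nfsrv_getstream() parses is standard
 * RPC record marking (RFC 1831): a 4-byte big-endian word whose high
 * bit flags the last fragment of a record and whose low 31 bits give
 * the fragment length.  A stand-alone user-space sketch:
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static void
parse_recmark(const unsigned char hdr[4], uint32_t *lenp, int *lastp)
{
	uint32_t recmark;

	memcpy(&recmark, hdr, sizeof(recmark));
	recmark = ntohl(recmark);
	*lastp = (recmark & 0x80000000) != 0;	/* last-fragment bit */
	*lenp = recmark & ~0x80000000;		/* fragment length */
}
#endif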
2444
2445 /*
2446 * Parse an RPC header.
2447 */
2448 int
2449 nfsrv_dorec(slp, nfsd, ndp)
2450 struct nfssvc_sock *slp;
2451 struct nfsd *nfsd;
2452 struct nfsrv_descript **ndp;
2453 {
2454 mbuf_t m;
2455 mbuf_t nam;
2456 struct nfsrv_descript *nd;
2457 int error;
2458
2459 *ndp = NULL;
2460 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2461 return (ENOBUFS);
2462 MALLOC_ZONE(nd, struct nfsrv_descript *,
2463 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2464 if (!nd)
2465 return (ENOMEM);
2466 m = slp->ns_rec;
2467 slp->ns_rec = mbuf_nextpkt(m);
2468 if (slp->ns_rec)
2469 mbuf_setnextpkt(m, NULL);
2470 else
2471 slp->ns_recend = NULL;
2472 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2473 nam = m;
2474 m = mbuf_next(m);
2475 if ((error = mbuf_setnext(nam, NULL)))
2476 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2477 } else
2478 nam = NULL;
2479 nd->nd_md = nd->nd_mrep = m;
2480 nd->nd_nam2 = nam;
2481 nd->nd_dpos = mbuf_data(m);
2482 error = nfs_getreq(nd, nfsd, TRUE);
2483 if (error) {
2484 if (nam)
2485 mbuf_freem(nam);
2486 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2487 return (error);
2488 }
2489 *ndp = nd;
2490 nfsd->nfsd_nd = nd;
2491 return (0);
2492 }
2493
2494 /*
2495 * Parse an RPC request
2496 * - verify it
2497 * - fill in the cred struct.
2498 */
2499 int
2500 nfs_getreq(nd, nfsd, has_header)
2501 struct nfsrv_descript *nd;
2502 struct nfsd *nfsd;
2503 int has_header;
2504 {
2505 int len, i;
2506 u_long *tl;
2507 long t1;
2508 uio_t uiop;
2509 caddr_t dpos, cp2, cp;
2510 u_long nfsvers, auth_type;
2511 uid_t nickuid;
2512 int error = 0, ticklen;
2513 mbuf_t mrep, md;
2514 struct nfsuid *nuidp;
2515 uid_t user_id;
2516 gid_t group_id;
2517 int ngroups;
2518 struct ucred temp_cred;
2519 struct timeval tvin, tvout, now;
2520 char uio_buf[ UIO_SIZEOF(1) ];
2521 #if 0 /* until encrypted keys are implemented */
2522 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2523 #endif
2524
2525 nd->nd_cr = NULL;
2526
2527 mrep = nd->nd_mrep;
2528 md = nd->nd_md;
2529 dpos = nd->nd_dpos;
2530 if (has_header) {
2531 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2532 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2533 if (*tl++ != rpc_call) {
2534 mbuf_freem(mrep);
2535 return (EBADRPC);
2536 }
2537 } else
2538 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2539 nd->nd_repstat = 0;
2540 nd->nd_flag = 0;
2541 if (*tl++ != rpc_vers) {
2542 nd->nd_repstat = ERPCMISMATCH;
2543 nd->nd_procnum = NFSPROC_NOOP;
2544 return (0);
2545 }
2546 if (*tl != nfs_prog) {
2547 nd->nd_repstat = EPROGUNAVAIL;
2548 nd->nd_procnum = NFSPROC_NOOP;
2549 return (0);
2550 }
2551 tl++;
2552 nfsvers = fxdr_unsigned(u_long, *tl++);
2553 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2554 nd->nd_repstat = EPROGMISMATCH;
2555 nd->nd_procnum = NFSPROC_NOOP;
2556 return (0);
2557 }
2558 else if (nfsvers == NFS_VER3)
2559 nd->nd_flag = ND_NFSV3;
2560 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2561 if (nd->nd_procnum == NFSPROC_NULL)
2562 return (0);
2563 if ((nd->nd_procnum >= NFS_NPROCS) ||
2564 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2565 nd->nd_repstat = EPROCUNAVAIL;
2566 nd->nd_procnum = NFSPROC_NOOP;
2567 return (0);
2568 }
2569 if ((nd->nd_flag & ND_NFSV3) == 0)
2570 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2571 auth_type = *tl++;
2572 len = fxdr_unsigned(int, *tl++);
2573 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2574 mbuf_freem(mrep);
2575 return (EBADRPC);
2576 }
2577
2578 nd->nd_flag &= ~ND_KERBAUTH;
2579 /*
2580 * Handle auth_unix or auth_kerb.
2581 */
2582 if (auth_type == rpc_auth_unix) {
2583 len = fxdr_unsigned(int, *++tl);
2584 if (len < 0 || len > NFS_MAXNAMLEN) {
2585 mbuf_freem(mrep);
2586 return (EBADRPC);
2587 }
2588 bzero(&temp_cred, sizeof(temp_cred));
2589 nfsm_adv(nfsm_rndup(len));
2590 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2591 user_id = fxdr_unsigned(uid_t, *tl++);
2592 group_id = fxdr_unsigned(gid_t, *tl++);
2593 temp_cred.cr_groups[0] = group_id;
2594 len = fxdr_unsigned(int, *tl);
2595 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2596 mbuf_freem(mrep);
2597 return (EBADRPC);
2598 }
2599 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2600 for (i = 1; i <= len; i++)
2601 if (i < NGROUPS)
2602 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2603 else
2604 tl++;
2605 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2606 if (ngroups > 1)
2607 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2608 len = fxdr_unsigned(int, *++tl);
2609 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2610 mbuf_freem(mrep);
2611 return (EBADRPC);
2612 }
2613 temp_cred.cr_uid = user_id;
2614 temp_cred.cr_ngroups = ngroups;
2615 nd->nd_cr = kauth_cred_create(&temp_cred);
2616 if (nd->nd_cr == NULL) {
2617 nd->nd_repstat = ENOMEM;
2618 nd->nd_procnum = NFSPROC_NOOP;
2619 return (0);
2620 }
2621 if (len > 0)
2622 nfsm_adv(nfsm_rndup(len));
2623 } else if (auth_type == rpc_auth_kerb) {
2624 switch (fxdr_unsigned(int, *tl++)) {
2625 case RPCAKN_FULLNAME:
2626 ticklen = fxdr_unsigned(int, *tl);
2627 *((u_long *)nfsd->nfsd_authstr) = *tl;
2628 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2629 &uio_buf[0], sizeof(uio_buf));
2630 if (!uiop) {
2631 nd->nd_repstat = ENOMEM;
2632 nd->nd_procnum = NFSPROC_NOOP;
2633 return (0);
2634 }
2635
2636 // LP64todo - fix this
2637 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2638 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2639 mbuf_freem(mrep);
2640 return (EBADRPC);
2641 }
2642 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2643 // LP64todo - fix this
2644 nfsm_mtouio(uiop, uio_resid(uiop));
2645 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2646 if (*tl++ != rpc_auth_kerb ||
2647 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2648 printf("Bad kerb verifier\n");
2649 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2650 nd->nd_procnum = NFSPROC_NOOP;
2651 return (0);
2652 }
2653 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2654 tl = (u_long *)cp;
2655 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2656 printf("Not fullname kerb verifier\n");
2657 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2658 nd->nd_procnum = NFSPROC_NOOP;
2659 return (0);
2660 }
2661 cp += NFSX_UNSIGNED;
2662 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2663 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2664 nd->nd_flag |= ND_KERBFULL;
2665 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2666 break;
2667 case RPCAKN_NICKNAME:
2668 if (len != 2 * NFSX_UNSIGNED) {
2669 printf("Kerb nickname short\n");
2670 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2671 nd->nd_procnum = NFSPROC_NOOP;
2672 return (0);
2673 }
2674 nickuid = fxdr_unsigned(uid_t, *tl);
2675 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2676 if (*tl++ != rpc_auth_kerb ||
2677 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2678 printf("Kerb nick verifier bad\n");
2679 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2680 nd->nd_procnum = NFSPROC_NOOP;
2681 return (0);
2682 }
2683 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2684 tvin.tv_sec = *tl++;
2685 tvin.tv_usec = *tl;
2686
2687 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2688 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2689 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2690 (!nd->nd_nam2 ||
2691 netaddr_match(NU_NETFAM(nuidp),
2692 &nuidp->nu_haddr, nd->nd_nam2)))
2693 break;
2694 }
2695 if (!nuidp) {
2696 nd->nd_repstat =
2697 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2698 nd->nd_procnum = NFSPROC_NOOP;
2699 return (0);
2700 }
2701
2702 /*
2703 * Now, decrypt the timestamp using the session key
2704 * and validate it.
2705 */
2706 #if NFSKERB
2707 XXX
2708 #endif
2709
2710 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2711 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2712 microtime(&now);
2713 if (nuidp->nu_expire < now.tv_sec ||
2714 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2715 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2716 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2717 nuidp->nu_expire = 0;
2718 nd->nd_repstat =
2719 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2720 nd->nd_procnum = NFSPROC_NOOP;
2721 return (0);
2722 }
2723 bzero(&temp_cred, sizeof(temp_cred));
2724 ngroups = nuidp->nu_cr->cr_ngroups;
2725 for (i = 0; i < ngroups; i++)
2726 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2727 if (ngroups > 1)
2728 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2729
2730 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2731 temp_cred.cr_ngroups = ngroups;
2732 nd->nd_cr = kauth_cred_create(&temp_cred);
2733 if (!nd->nd_cr) {
2734 nd->nd_repstat = ENOMEM;
2735 nd->nd_procnum = NFSPROC_NOOP;
2736 return (0);
2737 }
2738 nd->nd_flag |= ND_KERBNICK;
2739 }
2740 } else {
2741 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2742 nd->nd_procnum = NFSPROC_NOOP;
2743 return (0);
2744 }
2745
2746 nd->nd_md = md;
2747 nd->nd_dpos = dpos;
2748 return (0);
2749 nfsmout:
2750 if (nd->nd_cr)
2751 kauth_cred_rele(nd->nd_cr);
2752 return (error);
2753 }
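/*
 * For reference, the AUTH_UNIX credential body that nfs_getreq() walks
 * above has the following XDR layout (RFC 1057); every length is a
 * 4-byte big-endian word and the machine name is padded to a 4-byte
 * boundary.  Shown as a sketch, not a wire-usable struct:
 */
#if 0
	struct auth_unix_body {
		uint32_t stamp;		/* arbitrary caller-chosen stamp */
		/* opaque machinename<255>: length word + padded bytes */
		uint32_t uid;		/* caller's effective uid */
		uint32_t gid;		/* caller's effective gid */
		/* uint32_t gids<16>: count word + that many gids */
	};
#endif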
2754
2755 /*
2756 * Search for a sleeping nfsd and wake it up.
2757 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2758 * running nfsds will go look for the work in the nfssvc_sock list.
2759 * Note: Must be called with nfsd_mutex held.
2760 */
2761 void
2762 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2763 {
2764 struct nfsd *nd;
2765
2766 if ((slp->ns_flag & SLP_VALID) == 0)
2767 return;
2768
2769 lck_rw_lock_exclusive(&slp->ns_rwlock);
2770
2771 if (nfsd_waiting) {
2772 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2773 if (nd->nfsd_flag & NFSD_WAITING) {
2774 nd->nfsd_flag &= ~NFSD_WAITING;
2775 if (nd->nfsd_slp)
2776 panic("nfsd wakeup");
2777 slp->ns_sref++;
2778 nd->nfsd_slp = slp;
2779 lck_rw_done(&slp->ns_rwlock);
2780 wakeup((caddr_t)nd);
2781 return;
2782 }
2783 }
2784 }
2785
2786 slp->ns_flag |= SLP_DOREC;
2787
2788 lck_rw_done(&slp->ns_rwlock);
2789
2790 nfsd_head_flag |= NFSD_CHECKSLP;
2791 }
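/*
 * Sketch of the nfsd side of this handshake (the real loop lives in
 * the nfssvc code): an idle nfsd marks itself NFSD_WAITING and sleeps
 * on its own descriptor, and nfsrv_wakenfsd() above hands it a socket
 * via nfsd_slp and wakes it.  Simplified, error handling omitted.
 */
#if 0
	lck_mtx_lock(nfsd_mutex);
	while ((nd->nfsd_slp == NULL) && !(nfsd_head_flag & NFSD_CHECKSLP)) {
		nd->nfsd_flag |= NFSD_WAITING;
		nfsd_waiting++;
		msleep((caddr_t)nd, nfsd_mutex, PSOCK | PCATCH, "nfsd", NULL);
		nfsd_waiting--;
	}
	lck_mtx_unlock(nfsd_mutex);
	/* ... service requests from nd->nfsd_slp ... */
#endif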
2792 #endif /* NFS_NOSERVER */
2793
2794 static int
2795 nfs_msg(proc_t p,
2796 const char *server,
2797 const char *msg,
2798 int error)
2799 {
2800 tpr_t tpr;
2801
2802 if (p)
2803 tpr = tprintf_open(p);
2804 else
2805 tpr = NULL;
2806 if (error)
2807 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2808 error);
2809 else
2810 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2811 tprintf_close(tpr);
2812 return (0);
2813 }
2814
2815 void
2816 nfs_down(nmp, proc, error, flags, msg)
2817 struct nfsmount *nmp;
2818 proc_t proc;
2819 int error, flags;
2820 const char *msg;
2821 {
2822 if (nmp == NULL)
2823 return;
2824 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2825 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2826 nmp->nm_state |= NFSSTA_TIMEO;
2827 }
2828 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2829 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2830 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2831 }
2832 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2833 }
2834
2835 void
2836 nfs_up(nmp, proc, flags, msg)
2837 struct nfsmount *nmp;
2838 proc_t proc;
2839 int flags;
2840 const char *msg;
2841 {
2842 if (nmp == NULL)
2843 return;
2844 if (msg)
2845 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2846 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2847 nmp->nm_state &= ~NFSSTA_TIMEO;
2848 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2849 }
2850 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2851 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2852 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2853 }
2854 }
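/*
 * Sketch of how nfs_down()/nfs_up() bracket an unresponsive-server
 * episode so that VQ_NOTRESP is signalled exactly once per transition
 * (the NFSSTA_TIMEO state bit suppresses duplicates).  The message
 * strings here are illustrative.
 */
#if 0
	/* a request has timed out: */
	nfs_down(nmp, proc, error, NFSSTA_TIMEO, "not responding");
	/* ... later, a reply finally arrives: */
	nfs_up(nmp, proc, NFSSTA_TIMEO, "is alive again");
#endif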
2855