]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_socket.c
f0ca1838efbb8125ccda5acb4d25857a3b2248e1
[apple/xnu.git] / bsd / nfs / nfs_socket.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1991, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
66 */
67
68 /*
69 * Socket operations for use by nfs
70 */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc.h>
75 #include <sys/kauth.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/kpi_mbuf.h>
79 #include <sys/malloc.h>
80 #include <sys/vnode.h>
81 #include <sys/domain.h>
82 #include <sys/protosw.h>
83 #include <sys/socket.h>
84 #include <sys/syslog.h>
85 #include <sys/tprintf.h>
86 #include <sys/uio_internal.h>
87 #include <libkern/OSAtomic.h>
88
89 #include <sys/time.h>
90 #include <kern/clock.h>
91 #include <kern/task.h>
92 #include <kern/thread.h>
93 #include <sys/user.h>
94
95 #include <netinet/in.h>
96 #include <netinet/tcp.h>
97
98 #include <nfs/rpcv2.h>
99 #include <nfs/nfsproto.h>
100 #include <nfs/nfs.h>
101 #include <nfs/xdr_subs.h>
102 #include <nfs/nfsm_subs.h>
103 #include <nfs/nfsmount.h>
104 #include <nfs/nfsnode.h>
105 #include <nfs/nfsrtt.h>
106
107 #include <sys/kdebug.h>
108
109 #define FSDBG(A, B, C, D, E) \
110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
111 (int)(B), (int)(C), (int)(D), (int)(E), 0)
112 #define FSDBG_TOP(A, B, C, D, E) \
113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
114 (int)(B), (int)(C), (int)(D), (int)(E), 0)
115 #define FSDBG_BOT(A, B, C, D, E) \
116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
117 (int)(B), (int)(C), (int)(D), (int)(E), 0)
118
119 /*
120 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
121 * Use the mean and mean deviation of rtt for the appropriate type of rpc
122 * for the frequent rpcs and a default for the others.
123 * The justification for doing "other" this way is that these rpcs
124 * happen so infrequently that timer est. would probably be stale.
125 * Also, since many of these rpcs are
126 * non-idempotent, a conservative timeout is desired.
127 * getattr, lookup - A+2D
128 * read, write - A+4D
129 * other - nm_timeo
130 */
131 #define NFS_RTO(n, t) \
132 ((t) == 0 ? (n)->nm_timeo : \
133 ((t) < 3 ? \
134 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
135 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
136 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
137 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
138 /*
139 * External data, mostly RPC constants in XDR form
140 */
141 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
142 rpc_msgaccepted, rpc_call, rpc_autherr,
143 rpc_auth_kerb;
144 extern u_long nfs_prog;
145 extern struct nfsstats nfsstats;
146 extern int nfsv3_procid[NFS_NPROCS];
147 extern int nfs_ticks;
148 extern u_long nfs_xidwrap;
149
150 /*
151 * Defines which timer to use for the procnum.
152 * 0 - default
153 * 1 - getattr
154 * 2 - lookup
155 * 3 - read
156 * 4 - write
157 */
/* Timer class for each RPC procedure number (see table above):
 * 0 = default, 1 = getattr, 2 = lookup, 3 = read, 4 = write. */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
};
161
162 /*
163 * There is a congestion window for outstanding rpcs maintained per mount
164 * point. The cwnd size is adjusted in roughly the way that:
165 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
166 * SIGCOMM '88". ACM, August 1988.
167 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
168 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
169 * of rpcs is in progress.
170 * (The sent count and cwnd are scaled for integer arith.)
171 * Variants of "slow start" were tried and were found to be too much of a
172 * performance hit (ave. rtt 3 times larger),
173 * I suspect due to the large rtt that nfs rpcs have.
174 */
175 #define NFS_CWNDSCALE 256
176 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
177 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
178 int nfsrtton = 0;
179 struct nfsrtt nfsrtt;
180
181 static int nfs_rcvlock(struct nfsreq *);
182 static void nfs_rcvunlock(struct nfsreq *);
183 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
184 static int nfs_reconnect(struct nfsreq *rep);
185 static void nfs_repdequeue(struct nfsreq *rep);
186
187 /* XXX */
188 boolean_t current_thread_aborted(void);
189 kern_return_t thread_terminate(thread_t);
190
191 #ifndef NFS_NOSERVER
192 static int nfsrv_getstream(struct nfssvc_sock *,int);
193
/*
 * NFS server-side dispatch table: one handler per RPC procedure
 * number, terminated by a no-op handler.
 */
int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
			    struct nfssvc_sock *slp,
			    proc_t procp,
			    mbuf_t *mreqp) = {
	nfsrv_null,
	nfsrv_getattr,
	nfsrv_setattr,
	nfsrv_lookup,
	nfsrv3_access,
	nfsrv_readlink,
	nfsrv_read,
	nfsrv_write,
	nfsrv_create,
	nfsrv_mkdir,
	nfsrv_symlink,
	nfsrv_mknod,
	nfsrv_remove,
	nfsrv_rmdir,
	nfsrv_rename,
	nfsrv_link,
	nfsrv_readdir,
	nfsrv_readdirplus,
	nfsrv_statfs,
	nfsrv_fsinfo,
	nfsrv_pathconf,
	nfsrv_commit,
	nfsrv_noop
};
222 #endif /* NFS_NOSERVER */
223
224
225 /*
226 * attempt to bind a socket to a reserved port
227 */
228 static int
229 nfs_bind_resv(struct nfsmount *nmp)
230 {
231 socket_t so = nmp->nm_so;
232 struct sockaddr_in sin;
233 int error;
234 u_short tport;
235
236 if (!so)
237 return (EINVAL);
238
239 sin.sin_len = sizeof (struct sockaddr_in);
240 sin.sin_family = AF_INET;
241 sin.sin_addr.s_addr = INADDR_ANY;
242 tport = IPPORT_RESERVED - 1;
243 sin.sin_port = htons(tport);
244
245 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
246 (--tport > IPPORT_RESERVED / 2))
247 sin.sin_port = htons(tport);
248 return (error);
249 }
250
251 /*
252 * variables for managing the nfs_bind_resv_thread
253 */
254 int nfs_resv_mounts = 0;
255 static int nfs_bind_resv_thread_state = 0;
256 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
257 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
258 lck_grp_t *nfs_bind_resv_lck_grp;
259 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
260 lck_attr_t *nfs_bind_resv_lck_attr;
261 lck_mtx_t *nfs_bind_resv_mutex;
262 struct nfs_bind_resv_request {
263 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
264 struct nfsmount *brr_nmp;
265 int brr_error;
266 };
267 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
268
269 /*
270 * thread to handle any reserved port bind requests
271 */
/*
 * thread to handle any reserved port bind requests
 */
static void
nfs_bind_resv_thread(void)
{
	struct nfs_bind_resv_request *brreq;

	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;

	/* Service bind requests for as long as any mounts need them. */
	while (nfs_resv_mounts > 0) {
		lck_mtx_lock(nfs_bind_resv_mutex);
		/*
		 * Drain the request queue, dropping the mutex around the
		 * (potentially blocking) bind itself.
		 */
		while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
			TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
			lck_mtx_unlock(nfs_bind_resv_mutex);
			brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
			/* requester is tsleep()ing on brreq in nfs_bind_resv_nopriv() */
			wakeup(brreq);
			lck_mtx_lock(nfs_bind_resv_mutex);
		}
		/* PDROP: the mutex is released while we sleep for more work */
		msleep((caddr_t)&nfs_bind_resv_request_queue,
		    nfs_bind_resv_mutex, PSOCK | PDROP,
		    "nfs_bind_resv_request_queue", 0);
	}

	/* back to INITTED so a later mount can restart the thread */
	nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
	(void) thread_terminate(current_thread());
}
296
297 int
298 nfs_bind_resv_thread_wake(void)
299 {
300 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
301 return (EIO);
302 wakeup(&nfs_bind_resv_request_queue);
303 return (0);
304 }
305
306 /*
307 * underprivileged procs call this to request nfs_bind_resv_thread
308 * to perform the reserved port binding for them.
309 */
310 static int
311 nfs_bind_resv_nopriv(struct nfsmount *nmp)
312 {
313 struct nfs_bind_resv_request brreq;
314 int error;
315
316 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
317 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
318 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
319 lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
320 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
321 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
322 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
323 TAILQ_INIT(&nfs_bind_resv_request_queue);
324 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
325 }
326 kernel_thread(kernel_task, nfs_bind_resv_thread);
327 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
328 }
329
330 brreq.brr_nmp = nmp;
331 brreq.brr_error = 0;
332
333 lck_mtx_lock(nfs_bind_resv_mutex);
334 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
335 lck_mtx_unlock(nfs_bind_resv_mutex);
336
337 error = nfs_bind_resv_thread_wake();
338 if (error) {
339 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
340 /* Note: we might be able to simply restart the thread */
341 return (error);
342 }
343
344 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
345
346 return (brreq.brr_error);
347 }
348
349 /*
350 * Initialize sockets and congestion for a new NFS connection.
351 * We do not free the sockaddr if error.
352 */
/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 *
 * Creates the socket, optionally binds it to a reserved port,
 * connects it (unless NFSMNT_NOCONN), sets receive/send timeouts
 * and buffer sizes, and seeds the mount's RTT/congestion-window
 * state.  On any failure, tears the socket down via nfs_disconnect()
 * and returns the error.
 */
int
nfs_connect(
	struct nfsmount *nmp,
	__unused struct nfsreq *rep)
{
	socket_t so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct timeval timeo;

	nmp->nm_so = 0;
	saddr = mbuf_data(nmp->nm_nam);
	/* create a socket matching the mount's address family/type/protocol */
	error = sock_socket(saddr->sa_family, nmp->nm_sotype,
			nmp->nm_soproto, 0, 0, &nmp->nm_so);
	if (error) {
		goto bad;
	}
	so = nmp->nm_so;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
		proc_t p;
		/*
		 * sobind() requires current_proc() to have superuser privs.
		 * If this bind is part of a reconnect, and the current proc
		 * doesn't have superuser privs, we hand the sobind() off to
		 * a kernel thread to process.
		 */
		if ((nmp->nm_state & NFSSTA_MOUNTED) &&
		    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
			/* request nfs_bind_resv_thread() to do bind */
			error = nfs_bind_resv_nopriv(nmp);
		} else {
			error = nfs_bind_resv(nmp);
		}
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_sotype == SOCK_STREAM) {
			/* stream sockets must be connected */
			error = ENOTCONN;
			goto bad;
		}
	} else {
		struct timeval tv;
		tv.tv_sec = 2;
		tv.tv_usec = 0;
		/* start a non-blocking connect, then poll for completion,
		 * checking for interruption between 2-second waits */
		error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
		if (error && error != EINPROGRESS) {
			goto bad;
		}

		while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
			if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
				goto bad;
			}
		}
	}

	/*
	 * Always time out on receive, this allows us to reconnect the
	 * socket to deal with network changes.
	 */
	timeo.tv_usec = 0;
	timeo.tv_sec = 2;
	/* NOTE(review): errors from these two timeout setsockopt calls are
	 * not checked; presumably a failed timeout setting is treated as
	 * non-fatal — confirm that is intentional. */
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
		timeo.tv_sec = 5;
	} else {
		timeo.tv_sec = 0;	/* hard, non-interruptible mounts: no send timeout */
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));

	/* size socket buffers from the mount's I/O sizes and readahead */
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	} else {
		int proto;
		int on = 1;

		sock_gettype(so, NULL, NULL, &proto);
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");

		// Assume that SOCK_STREAM always requires a connection
		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));

		if (proto == IPPROTO_TCP) {
			sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
		}

		/* streams need extra room for the RPC record mark (u_long) */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
			(nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
	}

	/* clamp reservations to the socket-buffer ceiling */
	if (sndreserve > NFS_MAXSOCKBUF)
		sndreserve = NFS_MAXSOCKBUF;
	if (rcvreserve > NFS_MAXSOCKBUF)
		rcvreserve = NFS_MAXSOCKBUF;
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
	if (error) {
		goto bad;
	}
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
	if (error) {
		goto bad;
	}

	sock_nointerrupt(so, 1);

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
		nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}
491
492 /*
493 * Reconnect routine:
494 * Called when a connection is broken on a reliable protocol.
495 * - clean up the old socket
496 * - nfs_connect() again
497 * - set R_MUSTRESEND for all outstanding requests on mount point
498 * If this fails the mount point is DEAD!
499 * nb: Must be called with the nfs_sndlock() set on the mount point.
500 */
501 static int
502 nfs_reconnect(struct nfsreq *rep)
503 {
504 struct nfsreq *rp;
505 struct nfsmount *nmp = rep->r_nmp;
506 int error;
507
508 nfs_disconnect(nmp);
509 while ((error = nfs_connect(nmp, rep))) {
510 if (error == EINTR || error == ERESTART)
511 return (EINTR);
512 if (error == EIO)
513 return (EIO);
514 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
515 "can not connect");
516 rep->r_flags |= R_TPRINTFMSG;
517 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
518 /* we're not yet completely mounted and */
519 /* we can't reconnect, so we fail */
520 return (error);
521 }
522 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
523 return (error);
524 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
525 }
526
527 /*
528 * Loop through outstanding request list and fix up all requests
529 * on old socket.
530 */
531 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
532 if (rp->r_nmp == nmp)
533 rp->r_flags |= R_MUSTRESEND;
534 }
535 return (0);
536 }
537
538 /*
539 * NFS disconnect. Clean up and unlink.
540 */
541 void
542 nfs_disconnect(struct nfsmount *nmp)
543 {
544 socket_t so;
545
546 if (nmp->nm_so) {
547 so = nmp->nm_so;
548 nmp->nm_so = 0;
549 sock_shutdown(so, 2);
550 sock_close(so);
551 }
552 }
553
554 /*
555 * This is the nfs send routine. For connection based socket types, it
556 * must be called with an nfs_sndlock() on the socket.
557 * "rep == NULL" indicates that it has been called from a server.
558 * For the client side:
559 * - return EINTR if the RPC is terminated, 0 otherwise
560 * - set R_MUSTRESEND if the send fails for any reason
561 * - do any cleanup required by recoverable socket errors (???)
562 * For the server side:
563 * - return EINTR or ERESTART if interrupted by a signal
564 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
565 * - do any cleanup required by recoverable socket errors (???)
566 */
567 int
568 nfs_send(so, nam, top, rep)
569 socket_t so;
570 mbuf_t nam;
571 mbuf_t top;
572 struct nfsreq *rep;
573 {
574 struct sockaddr *sendnam;
575 int error, error2, sotype, flags;
576 u_long xidqueued = 0;
577 struct nfsreq *rp;
578 char savenametolog[MAXPATHLEN];
579 struct msghdr msg;
580
581 if (rep) {
582 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
583 if (error) {
584 mbuf_freem(top);
585 return (error);
586 }
587 if ((so = rep->r_nmp->nm_so) == NULL) {
588 rep->r_flags |= R_MUSTRESEND;
589 mbuf_freem(top);
590 return (0);
591 }
592 rep->r_flags &= ~R_MUSTRESEND;
593 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
594 if (rp == rep)
595 break;
596 if (rp)
597 xidqueued = rp->r_xid;
598 }
599 sock_gettype(so, NULL, &sotype, NULL);
600 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
601 (nam == 0))
602 sendnam = (struct sockaddr *)0;
603 else
604 sendnam = mbuf_data(nam);
605
606 if (sotype == SOCK_SEQPACKET)
607 flags = MSG_EOR;
608 else
609 flags = 0;
610
611 /*
612 * Save the name here in case mount point goes away if we block.
613 * The name is using local stack and is large, but don't
614 * want to block if we malloc.
615 */
616 if (rep)
617 strncpy(savenametolog,
618 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
619 MAXPATHLEN - 1);
620 bzero(&msg, sizeof(msg));
621 msg.msg_name = (caddr_t)sendnam;
622 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
623 error = sock_sendmbuf(so, &msg, top, flags, NULL);
624
625 if (error) {
626 if (rep) {
627 if (xidqueued) {
628 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
629 if (rp == rep && rp->r_xid == xidqueued)
630 break;
631 if (!rp)
632 panic("nfs_send: error %d xid %x gone",
633 error, xidqueued);
634 }
635 log(LOG_INFO, "nfs send error %d for server %s\n",
636 error, savenametolog);
637 /*
638 * Deal with errors for the client side.
639 */
640 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
641 if (error2) {
642 error = error2;
643 } else {
644 rep->r_flags |= R_MUSTRESEND;
645 }
646 } else
647 log(LOG_INFO, "nfsd send error %d\n", error);
648
649 /*
650 * Handle any recoverable (soft) socket errors here. (???)
651 */
652 if (error != EINTR && error != ERESTART && error != EIO &&
653 error != EWOULDBLOCK && error != EPIPE) {
654 error = 0;
655 }
656 }
657 return (error);
658 }
659
660 /*
661 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
662 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
663 * Mark and consolidate the data into a new mbuf list.
664 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
665 * small mbufs.
666 * For SOCK_STREAM we must be very careful to read an entire record once
667 * we have read any of it, even if the system call has been interrupted.
668 */
static int
nfs_receive(struct nfsreq *rep, mbuf_t *mp)
{
	socket_t so;
	struct iovec_32 aio;
	mbuf_t m, mlast;
	u_long len, fraglen;
	int error, error2, sotype;
	proc_t p = current_proc();	/* XXX */
	struct msghdr msg;
	size_t rcvlen;
	int lastfragment;

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
			nfs_sndunlock(rep);
			if (error)
				return (error);
			/* reply already arrived while we slept */
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			/* socket is gone: reconnect before receiving */
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(rep);
				return (error);
			}
			goto tryagain;
		}
		/* resend our request if it was marked R_MUSTRESEND */
		while (rep->r_flags & R_MUSTRESEND) {
			error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
			if (!error) {
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
				error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			}
			/*
			 * we also hold rcv lock so rep is still
			 * legit this point
			 */
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep))) {
					nfs_sndunlock(rep);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(rep);
		if (sotype == SOCK_STREAM) {
			error = 0;
			len = 0;
			lastfragment = 0;
			mlast = NULL;
			/* read RPC record fragments until the final one */
			while (!error && !lastfragment) {
				/* first read the 4-byte record mark */
				aio.iov_base = (uintptr_t) &fraglen;
				aio.iov_len = sizeof(u_long);
				bzero(&msg, sizeof(msg));
				msg.msg_iov = (struct iovec *) &aio;
				msg.msg_iovlen = 1;
				do {
					error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */
						goto shutout;
					/* retry timeouts unless the request was interrupted */
					if (error == EWOULDBLOCK && rep) {
						error2 = nfs_sigintr(rep->r_nmp, rep, p);
						if (error2)
							error = error2;
					}
				} while (error == EWOULDBLOCK);
				if (!error && rcvlen < aio.iov_len) {
					/* only log a message if we got a partial word */
					if (rcvlen != 0)
						log(LOG_INFO,
						     "short receive (%d/%d) from nfs server %s\n",
						     rcvlen, sizeof(u_long),
						     vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
				}
				if (error)
					goto errout;
				/* high bit of the record mark flags the last fragment */
				lastfragment = ntohl(fraglen) & 0x80000000;
				fraglen = ntohl(fraglen) & ~0x80000000;
				len += fraglen;
				/*
				 * This is SERIOUS! We are out of sync with the sender
				 * and forcing a disconnect/reconnect is all I can do.
				 */
				if (len > NFS_MAXPACKET) {
					log(LOG_ERR, "%s (%d) from nfs server %s\n",
					    "impossible RPC record length", len,
					    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EFBIG;
					goto errout;
				}

				/* now read the fragment data itself */
				m = NULL;
				do {
					rcvlen = fraglen;
					error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
					if (!rep->r_nmp) /* if unmounted then bailout */ {
						goto shutout;
					}
				} while (error == EWOULDBLOCK || error == EINTR ||
					 error == ERESTART);

				if (!error && fraglen > rcvlen) {
					/* partial fragment: the stream is broken */
					log(LOG_INFO,
					    "short receive (%d/%d) from nfs server %s\n",
					    rcvlen, fraglen,
					    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
					error = EPIPE;
					mbuf_freem(m);
				}
				if (!error) {
					/* append this fragment to the reply chain */
					if (!*mp) {
						*mp = m;
						mlast = m;
					} else {
						error = mbuf_setnext(mlast, m);
						if (error) {
							printf("nfs_receive: mbuf_setnext failed %d\n", error);
							mbuf_freem(m);
						}
					}
					/* advance mlast to the tail of the chain */
					while (mbuf_next(mlast))
						mlast = mbuf_next(mlast);
				}
			}
		} else {
			/* connected non-stream socket (e.g. SOCK_SEQPACKET) */
			bzero(&msg, sizeof(msg));
			do {
				rcvlen = 100000000;
				error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
				if (!rep->r_nmp) /* if unmounted then bailout */ {
					goto shutout;
				}
				if (error == EWOULDBLOCK && rep) {
					error2 = nfs_sigintr(rep->r_nmp, rep, p);
					if (error2) {
						/* NOTE(review): returns directly, bypassing the
						 * shutout cleanup; presumably *mp is still NULL
						 * on this path — verify */
						return (error2);
					}
				}
			} while (error == EWOULDBLOCK);

			if ((msg.msg_flags & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len = rcvlen;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			/* unrecoverable stream error: force disconnect/reconnect
			 * and retry the whole receive */
			mbuf_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n", error,
				    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(rep);
			}
		}
	} else {
		/*
		 * We could have failed while rebinding the datagram socket
		 * so we need to attempt to rebind here.
		 */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			error = nfs_sndlock(rep);
			if (!error) {
				error = nfs_reconnect(rep);
				nfs_sndunlock(rep);
			}
			if (error)
				return (error);
			if (!rep->r_nmp) /* if unmounted then bailout */
				return (ENXIO);
			so = rep->r_nmp->nm_so;
		}
		bzero(&msg, sizeof(msg));
		len = 0;
		do {
			rcvlen = 1000000;
			error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
			if (!rep->r_nmp) /* if unmounted then bailout */
				goto shutout;
			if (error) {
				/* a signalled request wins over the socket error */
				error2 = nfs_sigintr(rep->r_nmp, rep, p);
				if (error2) {
					error = error2;
					goto shutout;
				}
			}
			/* Reconnect for all errors. We may be receiving
			 * soft/hard/blocking errors because of a network
			 * change.
			 * XXX: we should rate limit or delay this
			 * to once every N attempts or something.
			 * although TCP doesn't seem to.
			 */
			if (error) {
				error2 = nfs_sndlock(rep);
				if (!error2) {
					error2 = nfs_reconnect(rep);
					if (error2)
						error = error2;
					else if (!rep->r_nmp) /* if unmounted then bailout */
						error = ENXIO;
					else
						so = rep->r_nmp->nm_so;
					nfs_sndunlock(rep);
				} else {
					error = error2;
				}
			}
		} while (error == EWOULDBLOCK);
	}
shutout:
	/* on error, discard any partially-assembled reply */
	if (error) {
		mbuf_freem(*mp);
		*mp = NULL;
	}
	return (error);
}
925
926 /*
927 * Implement receipt of reply on a socket.
928 * We must search through the list of received datagrams matching them
929 * with outstanding requests using the xid, until ours is found.
930 */
931 /* ARGSUSED */
932 int
933 nfs_reply(myrep)
934 struct nfsreq *myrep;
935 {
936 struct nfsreq *rep;
937 struct nfsmount *nmp = myrep->r_nmp;
938 long t1;
939 mbuf_t mrep, md;
940 u_long rxid, *tl;
941 caddr_t dpos, cp2;
942 int error;
943
944 /*
945 * Loop around until we get our own reply
946 */
947 for (;;) {
948 /*
949 * Lock against other receivers so that I don't get stuck in
950 * sbwait() after someone else has received my reply for me.
951 * Also necessary for connection based protocols to avoid
952 * race conditions during a reconnect.
953 * If nfs_rcvlock() returns EALREADY, that means that
954 * the reply has already been recieved by another
955 * process and we can return immediately. In this
956 * case, the lock is not taken to avoid races with
957 * other processes.
958 */
959 error = nfs_rcvlock(myrep);
960 if (error == EALREADY)
961 return (0);
962 if (error)
963 return (error);
964
965 /*
966 * If we slept after putting bits otw, then reply may have
967 * arrived. In which case returning is required, or we
968 * would hang trying to nfs_receive an already received reply.
969 */
970 if (myrep->r_mrep != NULL) {
971 nfs_rcvunlock(myrep);
972 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
973 return (0);
974 }
975 /*
976 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
977 * is still intact by checks done in nfs_rcvlock.
978 */
979 error = nfs_receive(myrep, &mrep);
980 /*
981 * Bailout asap if nfsmount struct gone (unmounted).
982 */
983 if (!myrep->r_nmp) {
984 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
985 if (mrep)
986 mbuf_freem(mrep);
987 return (ENXIO);
988 }
989 if (error) {
990 FSDBG(530, myrep->r_xid, myrep, nmp, error);
991 nfs_rcvunlock(myrep);
992
993 /* Bailout asap if nfsmount struct gone (unmounted). */
994 if (!myrep->r_nmp) {
995 if (mrep)
996 mbuf_freem(mrep);
997 return (ENXIO);
998 }
999
1000 /*
1001 * Ignore routing errors on connectionless protocols??
1002 */
1003 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1004 if (nmp->nm_so) {
1005 int clearerror;
1006 int optlen = sizeof(clearerror);
1007 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1008 }
1009 continue;
1010 }
1011 if (mrep)
1012 mbuf_freem(mrep);
1013 return (error);
1014 }
1015
1016 /*
1017 * We assume all is fine, but if we did not have an error
1018 * and mrep is 0, better not dereference it. nfs_receive
1019 * calls soreceive which carefully sets error=0 when it got
1020 * errors on sbwait (tsleep). In most cases, I assume that's
1021 * so we could go back again. In tcp case, EPIPE is returned.
1022 * In udp, case nfs_receive gets back here with no error and no
1023 * mrep. Is the right fix to have soreceive check for process
1024 * aborted after sbwait and return something non-zero? Should
1025 * nfs_receive give an EPIPE? Too risky to play with those
1026 * two this late in game for a shutdown problem. Instead,
1027 * just check here and get out. (ekn)
1028 */
1029 if (!mrep) {
1030 nfs_rcvunlock(myrep);
1031 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1032 return (ENXIO); /* sounds good */
1033 }
1034
1035 /*
1036 * Get the xid and check that it is an rpc reply
1037 */
1038 md = mrep;
1039 dpos = mbuf_data(md);
1040 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1041 rxid = *tl++;
1042 if (*tl != rpc_reply) {
1043 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1044 mbuf_freem(mrep);
1045 nfsmout:
1046 if (nmp->nm_state & NFSSTA_RCVLOCK)
1047 nfs_rcvunlock(myrep);
1048 continue;
1049 }
1050
1051 /*
1052 * Loop through the request list to match up the reply
1053 * Iff no match, just drop the datagram
1054 */
1055 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1056 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1057 /* Found it.. */
1058 rep->r_mrep = mrep;
1059 rep->r_md = md;
1060 rep->r_dpos = dpos;
1061 /*
1062 * If we're tracking the round trip time
1063 * then we update the circular log here
1064 * with the stats from our current request.
1065 */
1066 if (nfsrtton) {
1067 struct rttl *rt;
1068
1069 rt = &nfsrtt.rttl[nfsrtt.pos];
1070 rt->proc = rep->r_procnum;
1071 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1072 rt->sent = nmp->nm_sent;
1073 rt->cwnd = nmp->nm_cwnd;
1074 if (proct[rep->r_procnum] == 0)
1075 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1076 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1077 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1078 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1079 microtime(&rt->tstamp); // XXX unused
1080 if (rep->r_flags & R_TIMING)
1081 rt->rtt = rep->r_rtt;
1082 else
1083 rt->rtt = 1000000;
1084 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1085 }
1086 /*
1087 * Update congestion window.
1088 * Do the additive increase of
1089 * one rpc/rtt.
1090 */
1091 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1092 nmp->nm_cwnd);
1093 if (nmp->nm_cwnd <= nmp->nm_sent) {
1094 nmp->nm_cwnd +=
1095 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1096 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1097 if (nmp->nm_cwnd > NFS_MAXCWND)
1098 nmp->nm_cwnd = NFS_MAXCWND;
1099 }
1100 if (rep->r_flags & R_SENT) {
1101 rep->r_flags &= ~R_SENT;
1102 nmp->nm_sent -= NFS_CWNDSCALE;
1103 }
1104 /*
1105 * Update rtt using a gain of 0.125 on the mean
1106 * and a gain of 0.25 on the deviation.
1107 */
1108 if (rep->r_flags & R_TIMING) {
1109 /*
1110 * Since the timer resolution of
1111 * NFS_HZ is so course, it can often
1112 * result in r_rtt == 0. Since
1113 * r_rtt == N means that the actual
1114 * rtt is between N+dt and N+2-dt ticks,
1115 * add 1.
1116 */
1117 if (proct[rep->r_procnum] == 0)
1118 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1119 t1 = rep->r_rtt + 1;
1120 t1 -= (NFS_SRTT(rep) >> 3);
1121 NFS_SRTT(rep) += t1;
1122 if (t1 < 0)
1123 t1 = -t1;
1124 t1 -= (NFS_SDRTT(rep) >> 2);
1125 NFS_SDRTT(rep) += t1;
1126 }
1127 nmp->nm_timeouts = 0;
1128 break;
1129 }
1130 }
1131 nfs_rcvunlock(myrep);
1132 /*
1133 * If not matched to a request, drop it.
1134 * If it's mine, get out.
1135 */
1136 if (rep == 0) {
1137 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1138 mbuf_freem(mrep);
1139 } else if (rep == myrep) {
1140 if (rep->r_mrep == NULL)
1141 panic("nfs_reply: nil r_mrep");
1142 return (0);
1143 }
1144 FSDBG(530, myrep->r_xid, myrep, rep,
1145 rep ? rep->r_xid : myrep->r_flags);
1146 }
1147 }
1148
1149 /*
1150 * nfs_request - goes something like this
1151 * - fill in request struct
1152 * - links it into list
1153 * - calls nfs_send() for first transmit
1154 * - calls nfs_receive() to get reply
1155 * - break down rpc header and return with nfs reply pointed to
1156 * by mrep or error
1157 * nb: always frees up mreq mbuf list
1158 */
int
nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
	vnode_t vp;		/* vnode the request is for (may be NULL if mp given) */
	mount_t mp;		/* mount point, used only when vp is NULL */
	mbuf_t mrest;		/* request body mbuf chain; always consumed */
	int procnum;		/* NFS procedure number */
	proc_t procp;		/* requesting process, for signal/tprintf checks */
	kauth_cred_t cred;	/* credentials used to build the RPC auth header */
	mbuf_t *mrp;		/* OUT: reply mbuf chain (on success) */
	mbuf_t *mdp;		/* OUT: current mbuf of the reply parse position */
	caddr_t *dposp;		/* OUT: current byte offset of the reply parse */
	u_int64_t *xidp;	/* OUT: 64-bit xid (low 32 = xid, high 32 = wrap count) */
{
	mbuf_t m, mrep, m2;
	struct nfsreq re, *rep;	/* request record lives on this stack frame */
	u_long *tl;
	int i;
	struct nfsmount *nmp;
	mbuf_t md, mheadend;
	char nickv[RPCX_NICKVERF];
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
	int verf_len, verf_type;
	u_long xid;
	char *auth_str, *verf_str;
	NFSKERBKEY_T key;		/* save session key */
	int nmsotype;
	struct timeval now;

	if (mrp)
		*mrp = NULL;
	if (xidp)
		*xidp = 0;
	nmp = VFSTONFS(mp);

	rep = &re;

	/* Prefer the vnode's mount; bail if unmounted or force-unmount timed out. */
	if (vp)
		nmp = VFSTONFS(vnode_mount(vp));
	if (nmp == NULL ||
	    (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		mbuf_freem(mrest);
		return (ENXIO);
	}
	nmsotype = nmp->nm_sotype;

	FSDBG_TOP(531, vp, procnum, nmp, rep);

	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;
	microuptime(&now);
	/* Backdate r_lastmsg so the first "not responding" tprintf uses the initial delay. */
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	/* Total up the length of the request body for the RPC header. */
	i = 0;
	m = mrest;
	while (m) {
		i += mbuf_len(m);
		m = mbuf_next(m);
	}
	mrest_len = i;

	/*
	 * Get the RPC header with authorization.
	 * Note: we re-fetch nmp after any call that may sleep, since a
	 * forced unmount can clear it out from under us.
	 */
kerbauth:
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (!nmp) {
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		mbuf_freem(mrest);
		return (ENXIO);
	}
	verf_str = auth_str = (char *)0;
	if (nmp->nm_flag & NFSMNT_KERB) {
		/* Kerberos: try the cached nickname auth first, fall back to full auth. */
		verf_str = nickv;
		verf_len = sizeof (nickv);
		auth_type = RPCAUTH_KERB4;
		bzero((caddr_t)key, sizeof (key));
		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
			&auth_len, verf_str, verf_len)) {
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!nmp) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (ENXIO);
			}
			error = nfs_getauth(nmp, rep, cred, &auth_str,
			    &auth_len, verf_str, &verf_len, key);
			nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
			if (!error && !nmp)
				error = ENXIO;
			if (error) {
				FSDBG_BOT(531, 2, vp, error, rep);
				mbuf_freem(mrest);
				return (error);
			}
		}
	} else {
		/* AUTH_UNIX: size the cred, clipping the group list at nm_numgrps. */
		auth_type = RPCAUTH_UNIX;
		if (cred->cr_ngroups < 1)
			panic("nfsreq nogrps");
		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
			5 * NFSX_UNSIGNED;
	}
	/* Prepend the RPC header; mrest is linked into m and owned by it from here. */
	error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
	if (auth_str)
		_FREE(auth_str, M_TEMP);
	if (error) {
		mbuf_freem(mrest);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}
	if (xidp)
		*xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 * High bit marks "last fragment"; low 31 bits are the length.
	 */
	if (nmsotype == SOCK_STREAM) {
		error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
		if (error) {
			mbuf_freem(m);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}
		*((u_long*)mbuf_data(m)) =
			htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	/* (Re)initialize per-attempt state; NFSERR_TRYLATER loops back here. */
	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
	if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
		   (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		   nmp->nm_sent < nmp->nm_cwnd)) {
		int connrequired = (nmp->nm_sotype == SOCK_STREAM);

		if (connrequired)
			error = nfs_sndlock(rep);

		/*
		 * Set the R_SENT before doing the send in case another thread
		 * processes the reply before the nfs_send returns here
		 */
		if (!error) {
			if ((rep->r_flags & R_MUSTRESEND) == 0) {
				FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
				      nmp->nm_cwnd);
				nmp->nm_sent += NFS_CWNDSCALE;
				rep->r_flags |= R_SENT;
			}

			/* Send a copy; r_mreq is kept for retransmission by the timer. */
			error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
			if (!error)
				error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (connrequired)
				nfs_sndunlock(rep);
		}
		nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
		if (error) {
			/* Send failed: undo the congestion-window charge. */
			if (nmp)
				nmp->nm_sent -= NFS_CWNDSCALE;
			rep->r_flags &= ~R_SENT;
		}
	} else {
		/* Deferred to the timer; r_rtt = -1 means "not yet sent". */
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	nfs_repdequeue(rep);

	nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		if (nmp) {
			FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
			nmp->nm_sent -= NFS_CWNDSCALE;
		}
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error)
		nfs_up(nmp, procp, NFSSTA_TIMEO,
		       (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (!error && !nmp)
		error = ENXIO;
	if (error) {
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 * (nfsm_dissect/nfsm_adv advance md/dpos and goto nfsmout on error)
	 */
	nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
			/* One free retry with fresh Kerberos credentials. */
			if (!failed_auth) {
				failed_auth++;
				/* Detach the old auth header before rebuilding. */
				error = mbuf_setnext(mheadend, NULL);
				mbuf_freem(mrep);
				mbuf_freem(rep->r_mreq);
				if (!error)
					goto kerbauth;
				printf("nfs_request: mbuf_setnext failed\n");
			} else
				error = EAUTH;
		} else
			error = EACCES;
		mbuf_freem(mrep);
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
		return (error);
	}

	/*
	 * Grab any Kerberos verifier, otherwise just throw it away.
	 */
	verf_type = fxdr_unsigned(int, *tl++);
	i = fxdr_unsigned(int, *tl);
	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
		if (error)
			goto nfsmout;
	} else if (i > 0)
		nfsm_adv(nfsm_rndup(i));
	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			/* NFSv3 JUKEBOX: back off (doubling, capped at 60s) and resend. */
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
				error == NFSERR_TRYLATER) {
				mbuf_freem(mrep);
				error = 0;
				microuptime(&now);
				waituntil = now.tv_sec + trylater_delay;
				while (now.tv_sec < waituntil) {
					tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
					microuptime(&now);
				}
				trylater_delay *= 2;
				if (trylater_delay > 60)
					trylater_delay = 60;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if ((error == ESTALE) && vp)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				/* V3: hand the reply back so the caller can parse post-op data. */
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else {
				mbuf_freem(mrep);
				error &= ~NFSERR_RETERR;
			}
			mbuf_freem(rep->r_mreq);
			FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
			return (error);
		}

		/* Success: transfer reply ownership to the caller. */
		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		mbuf_freem(rep->r_mreq);
		FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
		return (0);
	}
	mbuf_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	mbuf_freem(rep->r_mreq);
	FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
	return (error);
}
1494
1495 #ifndef NFS_NOSERVER
1496 /*
1497 * Generate the rpc reply header
1498 * siz arg. is used to decide if adding a cluster is worthwhile
1499 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
	int siz;			/* estimated reply payload size */
	struct nfsrv_descript *nd;	/* request descriptor (xid, flags, cred) */
	struct nfssvc_sock *slp;	/* server socket, for the Kerberos uid hash */
	int err;			/* RPC/NFS error to encode in the reply */
	mbuf_t *mrq;			/* OUT: head of the reply chain (may be NULL) */
	mbuf_t *mbp;			/* OUT: current mbuf for further appends */
	caddr_t *bposp;			/* OUT: current build position in *mbp */
{
	u_long *tl;
	mbuf_t mreq;
	caddr_t bpos;
	mbuf_t mb, mb2;			/* mb/mb2/bpos are used by the nfsm_build macro */
	int error, mlen;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mreq);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX nfsstat? */
		return (error);
	}
	mb = mreq;
	tl = mbuf_data(mreq);
	mlen = 6 * NFSX_UNSIGNED;
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl += 80/sizeof(*tl);	/* XXX max_hdr? XXX */
		mbuf_setdata(mreq, tl, mlen);
	} else {
		mbuf_setlen(mreq, mlen);
	}
	bpos = ((caddr_t)tl) + mlen;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		/* MSG_DENIED reply: either auth failure or RPC version mismatch. */
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			/* This reply is one word shorter than the pre-set length. */
			mlen -= NFSX_UNSIGNED;
			mbuf_setlen(mreq, mlen);
			bpos -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			/* low/high supported RPC versions (both 2) */
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/*
		 * For Kerberos authentication, we must send the nickname
		 * verifier back, otherwise just RPCAUTH_NULL.
		 */
		if (nd->nd_flag & ND_KERBFULL) {
			struct nfsuid *nuidp;
			struct timeval ktvin, ktvout;
			uid_t uid = kauth_cred_getuid(nd->nd_cr);

			/* Look up the cached session for this uid (and peer address). */
			lck_rw_lock_shared(&slp->ns_rwlock);
			for (nuidp = NUIDHASH(slp, uid)->lh_first;
			    nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
				    (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
				    &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (nuidp) {
				ktvin.tv_sec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
				ktvin.tv_usec =
				    txdr_unsigned(nuidp->nu_timestamp.tv_usec);

				/*
				 * Encrypt the timestamp in ecb mode using the
				 * session key.
				 */
#if NFSKERB
				XXX
#endif

				*tl++ = rpc_auth_kerb;
				*tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
				/*
				 * NOTE(review): tv_sec is written into the last
				 * pre-set word *before* nfsm_build switches tl to
				 * freshly appended space; this matches the
				 * historical BSD layout, but verify the ordering
				 * against the wire format if touching this code.
				 */
				*tl = ktvout.tv_sec;
				nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
				*tl++ = ktvout.tv_usec;
				*tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
			} else {
				/* No cached session: AUTH_NULL verifier. */
				*tl++ = 0;
				*tl++ = 0;
			}
			lck_rw_done(&slp->ns_rwlock);
		} else {
			/* AUTH_NULL verifier (flavor 0, length 0). */
			*tl++ = 0;
			*tl++ = 0;
		}
		/* Encode the accept status word (and any status-specific data). */
		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
			// XXX hard coded versions
			*tl++ = txdr_unsigned(2);
			*tl = txdr_unsigned(3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;	/* RPC_SUCCESS */
			if (err != NFSERR_RETVOID) {
				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	if (mrq != NULL)
		*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != NFSERR_RETVOID) {
		OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
	}
	return (0);
}
1645
1646
1647 #endif /* NFS_NOSERVER */
1648
1649
1650 /*
1651 * From FreeBSD 1.58, a Matt Dillon fix...
1652 * Flag a request as being about to terminate.
1653 * The nm_sent count is decremented now to avoid deadlocks when the process
1654 * in soreceive() hasn't yet managed to send its own request.
1655 */
1656 static void
1657 nfs_softterm(struct nfsreq *rep)
1658 {
1659
1660 rep->r_flags |= R_SOFTTERM;
1661 if (rep->r_flags & R_SENT) {
1662 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1663 rep->r_nmp->nm_cwnd);
1664 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1665 rep->r_flags &= ~R_SENT;
1666 }
1667 }
1668
/*
 * Funnel wrapper for nfs_timer(): acquire the kernel funnel for the
 * duration of the timer scan and release it before returning, since
 * the callout that invokes us does not hold it.
 */
void
nfs_timer_funnel(void * arg)
{
	(void) thread_funnel_set(kernel_flock, TRUE);
	nfs_timer(arg);
	(void) thread_funnel_set(kernel_flock, FALSE);

}
1677
1678 /*
1679 * Ensure rep isn't in use by the timer, then dequeue it.
1680 */
1681 static void
1682 nfs_repdequeue(struct nfsreq *rep)
1683 {
1684
1685 while ((rep->r_flags & R_BUSY)) {
1686 rep->r_flags |= R_WAITING;
1687 tsleep(rep, PSOCK, "repdeq", 0);
1688 }
1689 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1690 }
1691
1692 /*
1693 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1694 * free()'d out from under it.
1695 */
1696 static void
1697 nfs_repbusy(struct nfsreq *rep)
1698 {
1699
1700 if ((rep->r_flags & R_BUSY))
1701 panic("rep locked");
1702 rep->r_flags |= R_BUSY;
1703 }
1704
1705 /*
1706 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1707 */
1708 static struct nfsreq *
1709 nfs_repnext(struct nfsreq *rep)
1710 {
1711 struct nfsreq * nextrep;
1712
1713 if (rep == NULL)
1714 return (NULL);
1715 /*
1716 * We need to get and busy the next req before signalling the
1717 * current one, otherwise wakeup() may block us and we'll race to
1718 * grab the next req.
1719 */
1720 nextrep = TAILQ_NEXT(rep, r_chain);
1721 if (nextrep != NULL)
1722 nfs_repbusy(nextrep);
1723 /* unbusy and signal. */
1724 rep->r_flags &= ~R_BUSY;
1725 if ((rep->r_flags & R_WAITING)) {
1726 rep->r_flags &= ~R_WAITING;
1727 wakeup(rep);
1728 }
1729 return (nextrep);
1730 }
1731
1732 /*
1733 * Nfs timer routine
1734 * Scan the nfsreq list and retranmit any requests that have timed out
1735 * To avoid retransmission attempts on STREAM sockets (in the future) make
1736 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1737 */
void
nfs_timer(__unused void *arg)
{
	struct nfsreq *rep;
	mbuf_t m;
	socket_t so;
	struct nfsmount *nmp;
	int timeo;
	int error;
#ifndef NFS_NOSERVER
	struct nfssvc_sock *slp;
	u_quad_t cur_usec;
#endif /* NFS_NOSERVER */
	int flags, rexmit, cwnd, sent;
	u_long xid;
	struct timeval now;

	/* Walk the request queue hand-over-hand (busy current, then advance). */
	rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);
	microuptime(&now);
	for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
		nmp = rep->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		/* skip requests that are answered or already terminating */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp))
			continue;
		/* Rate-limited "server not responding" console message. */
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
				"not responding");
			rep->r_flags |= R_TPRINTFMSG;
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
				nfs_softterm(rep);
				continue;
			}
		}
		/* r_rtt >= 0 means the request is in flight; check its timeout. */
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			/* ensure 62.5 ms floor */
			while (16 * timeo < hz)
				timeo *= 2;
			/* exponential backoff per accumulated mount timeouts */
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < 8)
				nmp->nm_timeouts++;
		}
		/*
		 * Check for too many retransmits. This is never true for
		 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
		 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
		 */
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
			nfs_softterm(rep);
			continue;
		}
		/* Stream sockets: never retransmit here, just bump the counter. */
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}
		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
			struct msghdr msg;
			/*
			 * Iff first send, start timing
			 * else turn timing off, backoff timer
			 * and divide congestion window by 2.
			 * We update these *before* the send to avoid
			 * racing against receiving the reply.
			 * We save them so we can restore them on send error.
			 */
			flags = rep->r_flags;
			rexmit = rep->r_rexmit;
			cwnd = nmp->nm_cwnd;
			sent = nmp->nm_sent;
			xid = rep->r_xid;
			if (rep->r_flags & R_SENT) {
				/* retransmission: stop timing, halve the window */
				rep->r_flags &= ~R_TIMING;
				if (++rep->r_rexmit > NFS_MAXREXMIT)
					rep->r_rexmit = NFS_MAXREXMIT;
				nmp->nm_cwnd >>= 1;
				if (nmp->nm_cwnd < NFS_CWNDSCALE)
					nmp->nm_cwnd = NFS_CWNDSCALE;
				OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
			} else {
				/* first (deferred) send: charge the window now */
				rep->r_flags |= R_SENT;
				nmp->nm_sent += NFS_CWNDSCALE;
			}
			FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);

			bzero(&msg, sizeof(msg));
			/* unconnected socket: supply the destination address */
			if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
				msg.msg_name = mbuf_data(nmp->nm_nam);
				msg.msg_namelen = mbuf_len(nmp->nm_nam);
			}
			error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);

			FSDBG(535, xid, error, sent, cwnd);

			if (error) {
				/* Send failed: restore the saved accounting. */
				if (error == EWOULDBLOCK) {
					rep->r_flags = flags;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					rep->r_xid = xid;
				}
				else {
					/* clear a transient socket error if ignorable */
					if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
						int clearerror;
						int optlen = sizeof(clearerror);
						sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
					}
					rep->r_flags = flags | R_RESENDERR;
					rep->r_rexmit = rexmit;
					nmp->nm_cwnd = cwnd;
					nmp->nm_sent = sent;
					if (flags & R_SENT)
						OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
				}
			} else
				rep->r_rtt = 0;	/* restart timing from this send */
		}
	}
	microuptime(&now);
#ifndef NFS_NOSERVER
	/*
	 * Scan the write gathering queues for writes that need to be
	 * completed now.
	 */
	cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
	lck_mtx_lock(nfsd_mutex);
	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
		if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
			nfsrv_wakenfsd(slp);
	}
	/* Reap server sockets that have been dead for more than 5 seconds. */
	while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
		if ((slp->ns_timestamp + 5) > now.tv_sec)
			break;
		TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
		nfsrv_slpfree(slp);
	}
	lck_mtx_unlock(nfsd_mutex);
#endif /* NFS_NOSERVER */

	if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
		/*
		 * We haven't called nfs_buf_freeup() in a little while.
		 * So, see if we can free up any stale/unused bufs now.
		 */
		nfs_buf_freeup(1);
	}

	/* re-arm ourselves */
	timeout(nfs_timer_funnel, (void *)0, nfs_ticks);

}
1920
1921
1922 /*
1923 * Test for a termination condition pending on the process.
1924 * This is used to determine if we need to bail on a mount.
1925 * EIO is returned if there has been a soft timeout.
1926 * EINTR is returned if there is a signal pending that is not being ignored
1927 * and the mount is interruptable, or if we are a thread that is in the process
1928 * of cancellation (also SIGKILL posted).
1929 */
int
nfs_sigintr(nmp, rep, p)
	struct nfsmount *nmp;	/* mount to check (required) */
	struct nfsreq *rep;	/* outstanding request, or NULL */
	proc_t p;		/* process to check signals for, or NULL */
{
	sigset_t pending_sigs;
	int context_good = 0;
	struct nfsmount *repnmp;
	extern proc_t kernproc;

	if (nmp == NULL)
		return (ENXIO);
	if (rep != NULL) {
		repnmp = rep->r_nmp;
		/* we've had a forced unmount. */
		if (repnmp == NULL)
			return (ENXIO);
		/* request has timed out on a 'soft' mount. */
		if (rep->r_flags & R_SOFTTERM)
			return (EIO);
		/*
		 * We're in the progress of a force unmount and there's
		 * been a timeout we're dead and fail IO.
		 */
		if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
		    (NFSSTA_FORCE|NFSSTA_TIMEO))
			return (EIO);
		/* Someone is unmounting us, go soft and mark it. */
		if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
			/*
			 * NOTE(review): NFSMNT_SOFT is set on the request's
			 * mount (repnmp) while NFSSTA_FORCE is set on the
			 * caller-supplied nmp; these are normally the same
			 * mount, but confirm before relying on it.
			 */
			repnmp->nm_flag |= NFSMNT_SOFT;
			nmp->nm_state |= NFSSTA_FORCE;
		}
		/*
		 * If the mount is hung and we've requested not to hang
		 * on remote filesystems, then bail now.
		 */
		if (p != NULL && (proc_noremotehang(p)) != 0 &&
		    (repnmp->nm_state & NFSSTA_TIMEO) != 0)
			return (EIO);
	}
	/* XXX: is this valid? this probably should be an assertion. */
	if (p == NULL)
		return (0);

	/* If this thread belongs to the kernel task, the abort check is not needed. */
	if ((current_proc() != kernproc) && current_thread_aborted()) {
		return (EINTR);
	}
	/* mask off thread and process blocked signals. */

	pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
	if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
		return (EINTR);
	return (0);
}
1986
1987 /*
1988 * Lock a socket against others.
1989 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1990 * and also to avoid race conditions between the processes with nfs requests
1991 * in progress when a reconnect is necessary.
1992 */
int
nfs_sndlock(rep)
	struct nfsreq *rep;	/* request whose mount's send lock to take */
{
	int *statep;
	proc_t p;
	int error, slpflag = 0, slptimeo = 0;

	if (rep->r_nmp == NULL)
		return (ENXIO);
	statep = &rep->r_nmp->nm_state;

	p = rep->r_procp;
	/* interruptible mounts allow signals to break the wait */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	while (*statep & NFSSTA_SNDLOCK) {
		error = nfs_sigintr(rep->r_nmp, rep, p);
		if (error)
			return (error);
		*statep |= NFSSTA_WANTSND;
		/* poll rather than block forever if P_NOREMOTEHANG is set */
		if (p != NULL && (proc_noremotehang(p)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		/* after one interruptible sleep, fall back to timed polling */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and callers expect it intact.
		 */
		if (!rep->r_nmp)
			return (ENXIO); /* don't have lock until out of loop */
	}
	*statep |= NFSSTA_SNDLOCK;
	return (0);
}
2030
2031 /*
2032 * Unlock the stream socket for others.
2033 */
2034 void
2035 nfs_sndunlock(rep)
2036 struct nfsreq *rep;
2037 {
2038 int *statep;
2039
2040 if (rep->r_nmp == NULL)
2041 return;
2042 statep = &rep->r_nmp->nm_state;
2043 if ((*statep & NFSSTA_SNDLOCK) == 0)
2044 panic("nfs sndunlock");
2045 *statep &= ~NFSSTA_SNDLOCK;
2046 if (*statep & NFSSTA_WANTSND) {
2047 *statep &= ~NFSSTA_WANTSND;
2048 wakeup((caddr_t)statep);
2049 }
2050 }
2051
static int
nfs_rcvlock(struct nfsreq *rep)
{
	int *statep;
	int error, slpflag, slptimeo = 0;

	/* make sure we still have our mountpoint */
	if (!rep->r_nmp) {
		/* reply may already have been matched before the unmount */
		if (rep->r_mrep != NULL)
			return (EALREADY);
		return (ENXIO);
	}

	statep = &rep->r_nmp->nm_state;
	FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
	/* interruptible mounts allow signals to break the wait */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;
	while (*statep & NFSSTA_RCVLOCK) {
		if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
			return (error);
		} else if (rep->r_mrep != NULL) {
			/*
			 * Don't bother sleeping if reply already arrived
			 */
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
			return (EALREADY);
		}
		FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
		*statep |= NFSSTA_WANTRCV;
		/*
		 * We need to poll if we're P_NOREMOTEHANG so that we
		 * call nfs_sigintr periodically above.
		 */
		if (rep->r_procp != NULL &&
		    (proc_noremotehang(rep->r_procp)) != 0)
			slptimeo = hz;
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		/* after one interruptible sleep, fall back to timed polling */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
		/*
		 * Make sure while we slept that the mountpoint didn't go away.
		 * nfs_sigintr and caller nfs_reply expect it intact.
		 */
		if (!rep->r_nmp) {
			FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
			return (ENXIO); /* don't have lock until out of loop */
		}
	}
	/*
	 * nfs_reply will handle it if reply already arrived.
	 * (We may have slept or been preempted).
	 */
	FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
	*statep |= NFSSTA_RCVLOCK;
	return (0);
}
2113
2114 /*
2115 * Unlock the stream socket for others.
2116 */
2117 static void
2118 nfs_rcvunlock(struct nfsreq *rep)
2119 {
2120 int *statep;
2121
2122 if (rep->r_nmp == NULL)
2123 return;
2124 statep = &rep->r_nmp->nm_state;
2125
2126 FSDBG(533, statep, *statep, 0, 0);
2127 if ((*statep & NFSSTA_RCVLOCK) == 0)
2128 panic("nfs rcvunlock");
2129 *statep &= ~NFSSTA_RCVLOCK;
2130 if (*statep & NFSSTA_WANTRCV) {
2131 *statep &= ~NFSSTA_WANTRCV;
2132 wakeup((caddr_t)statep);
2133 }
2134 }
2135
2136
2137 #ifndef NFS_NOSERVER
2138 /*
2139 * Socket upcall routine for the nfsd sockets.
2140 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2141 * Essentially do as much as possible non-blocking, else punt and it will
2142 * be called with MBUF_WAITOK from an nfsd.
2143 */
2144 void
2145 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2146 {
2147 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2148
2149 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2150 return;
2151
2152 lck_rw_lock_exclusive(&slp->ns_rwlock);
2153 nfsrv_rcv_locked(so, slp, waitflag);
2154 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2155 }
/*
 * Receive processing for an nfsd socket; ns_rwlock must be held
 * exclusive on entry.  Drains available data from the socket and queues
 * complete RPC request records on slp->ns_rec for nfsd threads.
 *
 * For stream (TCP) sockets, raw bytes are appended to the ns_raw chain
 * and nfsrv_getstream() carves records out of them.  For datagram
 * sockets, each received mbuf chain is one record, prefixed with an
 * MBUF_TYPE_SONAME mbuf holding the sender's address.
 *
 * Lock contract: when called with waitflag == MBUF_DONTWAIT (from the
 * socket upcall), ns_rwlock is DROPPED before returning and, if work
 * was queued, a sleeping nfsd is woken.  With MBUF_WAITOK the lock is
 * left held for the caller.
 */
void
nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
{
	mbuf_t m, mp, mhck, m2;
	int ns_flag=0, error;
	struct msghdr msg;
	size_t bytes_read;

	/* Socket is being torn down; just honor the lock-drop contract. */
	if ((slp->ns_flag & SLP_VALID) == 0) {
		if (waitflag == MBUF_DONTWAIT)
			lck_rw_done(&slp->ns_rwlock);
		return;
	}

#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == MBUF_DONTWAIT) {
		ns_flag = SLP_NEEDQ;
		goto dorecs;
	}
#endif
	if (slp->ns_sotype == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
			ns_flag = SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().  The huge bytes_read value just means
		 * "take everything currently available".
		 */
		bytes_read = 1000000000;
		error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				ns_flag = SLP_NEEDQ;	/* nothing ready yet; retry later */
			else
				ns_flag = SLP_DISCONN;	/* real error: treat as disconnect */
			goto dorecs;
		}
		m = mp;
		/* Append the new data to the raw byte chain (ns_raw/ns_rawend). */
		if (slp->ns_rawend) {
			if ((error = mbuf_setnext(slp->ns_rawend, m)))
				panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
			slp->ns_cc += bytes_read;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = bytes_read;
		}
		/* Walk to the last mbuf so ns_rawend stays accurate. */
		while ((m2 = mbuf_next(m)))
			m = m2;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				ns_flag = SLP_DISCONN;	/* bogus record mark: drop connection */
			else
				ns_flag = SLP_NEEDQ;
		}
	} else {
		/* Datagram socket: each receive is one complete record. */
		struct sockaddr_storage nam;

		bzero(&msg, sizeof(msg));
		msg.msg_name = (caddr_t)&nam;
		msg.msg_namelen = sizeof(nam);

		do {
			bytes_read = 1000000000;
			error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
			if (mp) {
				/*
				 * Prepend an MBUF_TYPE_SONAME mbuf carrying the
				 * sender's address; if that fails, queue the
				 * record without it.
				 */
				if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
					mbuf_setlen(mhck, nam.ss_len);
					bcopy(&nam, mbuf_data(mhck), nam.ss_len);
					m = mhck;
					if (mbuf_setnext(m, mp)) {
						/* trouble... just drop it */
						printf("nfsrv_rcv: mbuf_setnext failed\n");
						mbuf_free(mhck);
						m = mp;
					}
				} else {
					m = mp;
				}
				/* Append the record to the ns_rec packet queue. */
				if (slp->ns_recend)
					mbuf_setnextpkt(slp->ns_recend, m);
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				mbuf_setnextpkt(m, NULL);
			}
#if 0
			if (error) {
				/*
				 * This may be needed in the future to support
				 * non-byte-stream connection-oriented protocols
				 * such as SCTP.
				 */
				/*
				 * This (slp->ns_sotype == SOCK_STREAM) should really
				 * be a check for PR_CONNREQUIRED.
				 */
				if ((slp->ns_sotype == SOCK_STREAM)
					&& error != EWOULDBLOCK) {
					ns_flag = SLP_DISCONN;
					goto dorecs;
				}
			}
#endif
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (ns_flag)
		slp->ns_flag |= ns_flag;
	if (waitflag == MBUF_DONTWAIT) {
		int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
		lck_rw_done(&slp->ns_rwlock);
		if (wake && nfs_numnfsd) {
			/* Work was queued: wake an nfsd to service it. */
			lck_mtx_lock(nfsd_mutex);
			nfsrv_wakenfsd(slp);
			lck_mtx_unlock(nfsd_mutex);
		}
	}
}
2293
/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 *
 * Consumes bytes from the raw chain (slp->ns_raw / ns_cc), using the
 * RPC record mark (NFSX_UNSIGNED bytes, network order, high bit set on
 * the last fragment) to find record boundaries, and accumulates
 * fragments on slp->ns_frag until a complete record can be appended to
 * the ns_rec packet queue.
 *
 * Returns 0 when no more complete records are available, EPERM on an
 * insane record length (caller treats this as a disconnect), or
 * EWOULDBLOCK if an mbuf operation failed.
 */
static int
nfsrv_getstream(slp, waitflag)
	struct nfssvc_sock *slp;
	int waitflag;
{
	mbuf_t m;
	char *cp1, *cp2, *mdata;
	int len, mlen, error;
	mbuf_t om, m2, recm;
	u_long recmark;

	/* SLP_GETSTREAM serializes this parser per socket. */
	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			/* Need a new record mark; wait until enough bytes arrive. */
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			mdata = mbuf_data(m);
			mlen = mbuf_len(m);
			if (mlen >= NFSX_UNSIGNED) {
				/* Mark is contained entirely in the first mbuf. */
				bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
				mdata += NFSX_UNSIGNED;
				mlen -= NFSX_UNSIGNED;
				mbuf_setdata(m, mdata, mlen);
			} else {
				/* Mark straddles mbufs; gather it a byte at a time. */
				cp1 = (caddr_t)&recmark;
				cp2 = mdata;
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (mlen == 0) {
						m = mbuf_next(m);
						cp2 = mbuf_data(m);
						mlen = mbuf_len(m);
					}
					*cp1++ = *cp2++;
					mlen--;
					mbuf_setdata(m, cp2, mlen);
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			/* High bit flags the last fragment; the rest is the length. */
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Raw chain is exactly one record: take it whole. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/* Split the record off the front of the raw chain. */
			len = 0;
			m = slp->ns_raw;
			mlen = mbuf_len(m);
			mdata = mbuf_data(m);
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + mlen) > slp->ns_reclen) {
					/* Record ends mid-mbuf: copy the head, keep the tail raw. */
					if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					if (om) {
						if (mbuf_setnext(om, m2)) {
							/* trouble... just drop it */
							printf("nfsrv_getstream: mbuf_setnext failed\n");
							mbuf_freem(m2);
							slp->ns_flag &= ~SLP_GETSTREAM;
							return (EWOULDBLOCK);
						}
						recm = slp->ns_raw;
					} else {
						recm = m2;
					}
					mdata += slp->ns_reclen - len;
					mlen -= slp->ns_reclen - len;
					mbuf_setdata(m, mdata, mlen);
					len = slp->ns_reclen;
				} else if ((len + mlen) == slp->ns_reclen) {
					/* Record ends exactly on an mbuf boundary. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					recm = slp->ns_raw;
					if (mbuf_setnext(om, NULL)) {
						printf("nfsrv_getstream: mbuf_setnext failed 2\n");
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				} else {
					/* Whole mbuf belongs to the record; keep walking. */
					om = m;
					len += mlen;
					m = mbuf_next(m);
					mlen = mbuf_len(m);
					mdata = mbuf_data(m);
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Not enough data buffered for this record yet. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		if (slp->ns_frag == NULL) {
			slp->ns_frag = recm;
		} else {
			m = slp->ns_frag;
			while ((m2 = mbuf_next(m)))
				m = m2;
			if ((error = mbuf_setnext(m, recm)))
				panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
		}
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Complete record: move it to the ns_rec packet queue. */
			if (slp->ns_recend)
				mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
2443
2444 /*
2445 * Parse an RPC header.
2446 */
2447 int
2448 nfsrv_dorec(slp, nfsd, ndp)
2449 struct nfssvc_sock *slp;
2450 struct nfsd *nfsd;
2451 struct nfsrv_descript **ndp;
2452 {
2453 mbuf_t m;
2454 mbuf_t nam;
2455 struct nfsrv_descript *nd;
2456 int error;
2457
2458 *ndp = NULL;
2459 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2460 return (ENOBUFS);
2461 MALLOC_ZONE(nd, struct nfsrv_descript *,
2462 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2463 if (!nd)
2464 return (ENOMEM);
2465 m = slp->ns_rec;
2466 slp->ns_rec = mbuf_nextpkt(m);
2467 if (slp->ns_rec)
2468 mbuf_setnextpkt(m, NULL);
2469 else
2470 slp->ns_recend = NULL;
2471 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2472 nam = m;
2473 m = mbuf_next(m);
2474 if ((error = mbuf_setnext(nam, NULL)))
2475 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2476 } else
2477 nam = NULL;
2478 nd->nd_md = nd->nd_mrep = m;
2479 nd->nd_nam2 = nam;
2480 nd->nd_dpos = mbuf_data(m);
2481 error = nfs_getreq(nd, nfsd, TRUE);
2482 if (error) {
2483 if (nam)
2484 mbuf_freem(nam);
2485 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2486 return (error);
2487 }
2488 *ndp = nd;
2489 nfsd->nfsd_nd = nd;
2490 return (0);
2491 }
2492
/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 *
 * Dissects the RPC call header from nd->nd_mrep/nd_md/nd_dpos, filling
 * in nd_retxid, nd_procnum and nd_flag, and builds a credential (nd_cr)
 * from the AUTH_UNIX or Kerberos credential data.  Protocol-level
 * problems are reported by setting nd_repstat, forcing nd_procnum to
 * NFSPROC_NOOP and returning 0 (so a reply is still generated);
 * malformed requests free the request mbufs and return EBADRPC.
 *
 * Note: the nfsm_* dissect macros jump to the "nfsmout" label on parse
 * errors, with "error" set.
 */
int
nfs_getreq(nd, nfsd, has_header)
	struct nfsrv_descript *nd;
	struct nfsd *nfsd;
	int has_header;
{
	int len, i;
	u_long *tl;
	long t1;
	uio_t uiop;
	caddr_t dpos, cp2, cp;
	u_long nfsvers, auth_type;
	uid_t nickuid;
	int error = 0, ticklen;
	mbuf_t mrep, md;
	struct nfsuid *nuidp;
	uid_t user_id;
	gid_t group_id;
	int ngroups;
	struct ucred temp_cred;
	struct timeval tvin, tvout, now;
	char uio_buf[ UIO_SIZEOF(1) ];
#if 0			/* until encrypted keys are implemented */
	NFSKERBKEYSCHED_T keys;	/* stores key schedule */
#endif

	nd->nd_cr = NULL;	/* no credential built yet */

	mrep = nd->nd_mrep;
	md = nd->nd_md;
	dpos = nd->nd_dpos;
	if (has_header) {
		/* xid + call direction, then the 8-word call body below */
		nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
		if (*tl++ != rpc_call) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	/* RPC version, program number, NFS version checks */
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_long, *tl++);
	if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	else if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/* Range-check the procedure number (v2 defines fewer procedures). */
	if ((nd->nd_procnum >= NFS_NPROCS) ||
		(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Map v2 procedure numbers onto the v3 numbering used internally. */
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);	/* credential body length */
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		mbuf_freem(mrep);
		return (EBADRPC);
	}

	nd->nd_flag &= ~ND_KERBAUTH;
	/*
	 * Handle auth_unix or auth_kerb.
	 */
	if (auth_type == rpc_auth_unix) {
		/* Machine-name length; the name itself is skipped below. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		bzero(&temp_cred, sizeof(temp_cred));
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
		user_id = fxdr_unsigned(uid_t, *tl++);
		group_id = fxdr_unsigned(gid_t, *tl++);
		temp_cred.cr_groups[0] = group_id;
		len = fxdr_unsigned(int, *tl);	/* supplementary gid count */
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
		/* Copy at most NGROUPS-1 supplementary gids; skip the rest. */
		for (i = 1; i <= len; i++)
			if (i < NGROUPS)
				temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
		if (ngroups > 1)
			nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
		/* Verifier length; the verifier body is skipped below. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			mbuf_freem(mrep);
			return (EBADRPC);
		}
		temp_cred.cr_uid = user_id;
		temp_cred.cr_ngroups = ngroups;
		nd->nd_cr = kauth_cred_create(&temp_cred);
		if (nd->nd_cr == NULL) {
			nd->nd_repstat = ENOMEM;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else if (auth_type == rpc_auth_kerb) {
		switch (fxdr_unsigned(int, *tl++)) {
		case RPCAKN_FULLNAME:
			ticklen = fxdr_unsigned(int, *tl);
			*((u_long *)nfsd->nfsd_authstr) = *tl;
			uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
				&uio_buf[0], sizeof(uio_buf));
			if (!uiop) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			// LP64todo - fix this
			nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
			if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
				mbuf_freem(mrep);
				return (EBADRPC);
			}
			/* Copy the ticket into nfsd_authstr for later verification. */
			uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
			// LP64todo - fix this
			nfsm_mtouio(uiop, uio_resid(uiop));
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
				printf("Bad kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
			tl = (u_long *)cp;
			if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
				printf("Not fullname kerb verifier\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			cp += NFSX_UNSIGNED;
			bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
			nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
			nd->nd_flag |= ND_KERBFULL;
			/* Defer: an nfsd must contact the auth server to finish. */
			nfsd->nfsd_flag |= NFSD_NEEDAUTH;
			break;
		case RPCAKN_NICKNAME:
			if (len != 2 * NFSX_UNSIGNED) {
				printf("Kerb nickname short\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nickuid = fxdr_unsigned(uid_t, *tl);
			nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
			if (*tl++ != rpc_auth_kerb ||
				fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
				printf("Kerb nick verifier bad\n");
				nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
			tvin.tv_sec = *tl++;
			tvin.tv_usec = *tl;

			/* Look up a cached credential for this nickname uid. */
			for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
				nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
				if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
					(!nd->nd_nam2 ||
					 netaddr_match(NU_NETFAM(nuidp),
					  &nuidp->nu_haddr, nd->nd_nam2)))
					break;
			}
			if (!nuidp) {
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTCRED);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}

			/*
			 * Now, decrypt the timestamp using the session key
			 * and validate it.
			 */
#if NFSKERB
			XXX
#endif

			/*
			 * NOTE(review): without NFSKERB, tvout is read here
			 * without ever being assigned (the decryption above is
			 * stubbed out) — confirm this path is compiled out or
			 * unreachable in practice.
			 */
			tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
			tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
			microtime(&now);
			/* Reject expired entries and stale/replayed timestamps. */
			if (nuidp->nu_expire < now.tv_sec ||
				nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
				(nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
				 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
				nuidp->nu_expire = 0;
				nd->nd_repstat =
					(NFSERR_AUTHERR|AUTH_REJECTVERF);
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			/* Build a credential from the cached nfsuid entry. */
			bzero(&temp_cred, sizeof(temp_cred));
			ngroups = nuidp->nu_cr->cr_ngroups;
			for (i = 0; i < ngroups; i++)
				temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
			if (ngroups > 1)
				nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);

			temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
			temp_cred.cr_ngroups = ngroups;
			nd->nd_cr = kauth_cred_create(&temp_cred);
			if (!nd->nd_cr) {
				nd->nd_repstat = ENOMEM;
				nd->nd_procnum = NFSPROC_NOOP;
				return (0);
			}
			nd->nd_flag |= ND_KERBNICK;
		};
	} else {
		/* Neither AUTH_UNIX nor AUTH_KERB: reject the credential. */
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	/* Hand the updated parse position back to the caller. */
	nd->nd_md = md;
	nd->nd_dpos = dpos;
	return (0);
nfsmout:
	/* Parse error from an nfsm_* macro: drop any credential we built. */
	if (nd->nd_cr)
		kauth_cred_rele(nd->nd_cr);
	return (error);
}
2753
2754 /*
2755 * Search for a sleeping nfsd and wake it up.
2756 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2757 * running nfsds will go look for the work in the nfssvc_sock list.
2758 * Note: Must be called with nfsd_mutex held.
2759 */
2760 void
2761 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2762 {
2763 struct nfsd *nd;
2764
2765 if ((slp->ns_flag & SLP_VALID) == 0)
2766 return;
2767
2768 lck_rw_lock_exclusive(&slp->ns_rwlock);
2769
2770 if (nfsd_waiting) {
2771 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2772 if (nd->nfsd_flag & NFSD_WAITING) {
2773 nd->nfsd_flag &= ~NFSD_WAITING;
2774 if (nd->nfsd_slp)
2775 panic("nfsd wakeup");
2776 slp->ns_sref++;
2777 nd->nfsd_slp = slp;
2778 lck_rw_done(&slp->ns_rwlock);
2779 wakeup((caddr_t)nd);
2780 return;
2781 }
2782 }
2783 }
2784
2785 slp->ns_flag |= SLP_DOREC;
2786
2787 lck_rw_done(&slp->ns_rwlock);
2788
2789 nfsd_head_flag |= NFSD_CHECKSLP;
2790 }
2791 #endif /* NFS_NOSERVER */
2792
2793 static int
2794 nfs_msg(proc_t p,
2795 const char *server,
2796 const char *msg,
2797 int error)
2798 {
2799 tpr_t tpr;
2800
2801 if (p)
2802 tpr = tprintf_open(p);
2803 else
2804 tpr = NULL;
2805 if (error)
2806 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2807 error);
2808 else
2809 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2810 tprintf_close(tpr);
2811 return (0);
2812 }
2813
2814 void
2815 nfs_down(nmp, proc, error, flags, msg)
2816 struct nfsmount *nmp;
2817 proc_t proc;
2818 int error, flags;
2819 const char *msg;
2820 {
2821 if (nmp == NULL)
2822 return;
2823 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2824 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2825 nmp->nm_state |= NFSSTA_TIMEO;
2826 }
2827 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2828 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2829 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2830 }
2831 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2832 }
2833
2834 void
2835 nfs_up(nmp, proc, flags, msg)
2836 struct nfsmount *nmp;
2837 proc_t proc;
2838 int flags;
2839 const char *msg;
2840 {
2841 if (nmp == NULL)
2842 return;
2843 if (msg)
2844 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2845 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2846 nmp->nm_state &= ~NFSSTA_TIMEO;
2847 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2848 }
2849 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2850 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2851 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2852 }
2853 }
2854