1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
24 /*
25 * Copyright (c) 1989, 1991, 1993, 1995
26 * The Regents of the University of California. All rights reserved.
27 *
28 * This code is derived from software contributed to Berkeley by
29 * Rick Macklem at The University of Guelph.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in the
38 * documentation and/or other materials provided with the distribution.
39 * 3. All advertising materials mentioning features or use of this software
40 * must display the following acknowledgement:
41 * This product includes software developed by the University of
42 * California, Berkeley and its contributors.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
60 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
61 */
62
63 /*
64 * Socket operations for use by nfs
65 */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>
70 #include <sys/kauth.h>
71 #include <sys/mount_internal.h>
72 #include <sys/kernel.h>
73 #include <sys/kpi_mbuf.h>
74 #include <sys/malloc.h>
75 #include <sys/vnode.h>
76 #include <sys/domain.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/syslog.h>
80 #include <sys/tprintf.h>
81 #include <sys/uio_internal.h>
82 #include <libkern/OSAtomic.h>
83
84 #include <sys/time.h>
85 #include <kern/clock.h>
86 #include <kern/task.h>
87 #include <kern/thread.h>
88 #include <sys/user.h>
89
90 #include <netinet/in.h>
91 #include <netinet/tcp.h>
92
93 #include <nfs/rpcv2.h>
94 #include <nfs/nfsproto.h>
95 #include <nfs/nfs.h>
96 #include <nfs/xdr_subs.h>
97 #include <nfs/nfsm_subs.h>
98 #include <nfs/nfsmount.h>
99 #include <nfs/nfsnode.h>
100 #include <nfs/nfsrtt.h>
101
102 #include <sys/kdebug.h>
103
104 #define FSDBG(A, B, C, D, E) \
105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
106 (int)(B), (int)(C), (int)(D), (int)(E), 0)
107 #define FSDBG_TOP(A, B, C, D, E) \
108 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
109 (int)(B), (int)(C), (int)(D), (int)(E), 0)
110 #define FSDBG_BOT(A, B, C, D, E) \
111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
112 (int)(B), (int)(C), (int)(D), (int)(E), 0)
113
114 /*
115 * Estimate rto for an nfs rpc sent via an unreliable datagram.
116 * Use the mean and mean deviation of rtt for the appropriate type of rpc
117 * for the frequent rpcs and a default for the others.
118 * The justification for doing "other" this way is that these rpcs
119 * happen so infrequently that timer est. would probably be stale.
120 * Also, since many of these rpcs are
121 * non-idempotent, a conservative timeout is desired.
122 * getattr, lookup - A+2D
123 * read, write - A+4D
124 * other - nm_timeo
125 */
126 #define NFS_RTO(n, t) \
127 ((t) == 0 ? (n)->nm_timeo : \
128 ((t) < 3 ? \
129 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
130 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
131 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
132 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
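/*
 * Illustrative sketch (not part of the original file): how the fixed-point
 * scaling behind NFS_RTO yields the "A+2D" / "A+4D" values described above.
 * nm_srtt is kept scaled by 8 and nm_sdrtt by 4 (see the smoothing code in
 * nfs_reply()), so the shifts here recover the unscaled estimates.
 */
#if 0
static int
example_rto(void)
{
	/* assume a smoothed rtt (A) of 4 ticks, stored scaled by 8, and a
	 * mean deviation (D) of 2 ticks, stored scaled by 4 */
	int srtt = 4 << 3;	/* 32 */
	int sdrtt = 2 << 2;	/* 8 */

	/* getattr/lookup timers (t == 1, 2): roughly A + 2D */
	int rto_small = ((((srtt) + 3) >> 2) + (sdrtt) + 1) >> 1;	/* 8 == 4 + 2*2 */

	/* read/write timers (t == 3, 4): roughly A + 4D */
	int rto_big = (((srtt) + 7) >> 3) + (sdrtt) + 1;		/* 13 ~= 4 + 4*2 */

	return (rto_small + rto_big);
}
#endif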
133 /*
134 * External data, mostly RPC constants in XDR form
135 */
136 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
137 rpc_msgaccepted, rpc_call, rpc_autherr,
138 rpc_auth_kerb;
139 extern u_long nfs_prog;
140 extern struct nfsstats nfsstats;
141 extern int nfsv3_procid[NFS_NPROCS];
142 extern int nfs_ticks;
143 extern u_long nfs_xidwrap;
144
145 /*
146 * Defines which timer to use for the procnum.
147 * 0 - default
148 * 1 - getattr
149 * 2 - lookup
150 * 3 - read
151 * 4 - write
152 */
153 static int proct[NFS_NPROCS] = {
154 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
155 };
156
157 /*
158 * There is a congestion window for outstanding rpcs maintained per mount
159 * point. The cwnd size is adjusted in roughly the way that:
160 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
161 * SIGCOMM '88". ACM, August 1988.
162 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
163 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
164 * of rpcs is in progress.
165 * (The sent count and cwnd are scaled for integer arith.)
166 * Variants of "slow start" were tried and were found to be too much of a
167 * performance hit (ave. rtt 3 times larger),
168 * I suspect due to the large rtt that nfs rpcs have.
169 */
170 #define NFS_CWNDSCALE 256
171 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
172 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
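/*
 * Illustrative sketch (not part of the original file) consolidating the
 * AIMD updates performed on nm_cwnd elsewhere in this file: nfs_reply()
 * grows the window by roughly one scaled rpc per window's worth of
 * replies, and nfs_timer() halves it on a retransmit timeout, clamping
 * at one scaled rpc.
 */
#if 0
	/* additive increase on reply, when a full window is in flight */
	if (nmp->nm_cwnd <= nmp->nm_sent) {
		nmp->nm_cwnd += (NFS_CWNDSCALE * NFS_CWNDSCALE +
		    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;	/* += ~CWNDSCALE^2/cwnd */
		if (nmp->nm_cwnd > NFS_MAXCWND)
			nmp->nm_cwnd = NFS_MAXCWND;
	}

	/* multiplicative decrease on retransmit timeout */
	nmp->nm_cwnd >>= 1;
	if (nmp->nm_cwnd < NFS_CWNDSCALE)
		nmp->nm_cwnd = NFS_CWNDSCALE;
#endif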
173 int nfsrtton = 0;
174 struct nfsrtt nfsrtt;
175
176 static int nfs_rcvlock(struct nfsreq *);
177 static void nfs_rcvunlock(struct nfsreq *);
178 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
179 static int nfs_reconnect(struct nfsreq *rep);
180 static void nfs_repdequeue(struct nfsreq *rep);
181
182 /* XXX */
183 boolean_t current_thread_aborted(void);
184 kern_return_t thread_terminate(thread_t);
185
186 #ifndef NFS_NOSERVER
187 static int nfsrv_getstream(struct nfssvc_sock *,int);
188
189 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
190 struct nfssvc_sock *slp,
191 proc_t procp,
192 mbuf_t *mreqp) = {
193 nfsrv_null,
194 nfsrv_getattr,
195 nfsrv_setattr,
196 nfsrv_lookup,
197 nfsrv3_access,
198 nfsrv_readlink,
199 nfsrv_read,
200 nfsrv_write,
201 nfsrv_create,
202 nfsrv_mkdir,
203 nfsrv_symlink,
204 nfsrv_mknod,
205 nfsrv_remove,
206 nfsrv_rmdir,
207 nfsrv_rename,
208 nfsrv_link,
209 nfsrv_readdir,
210 nfsrv_readdirplus,
211 nfsrv_statfs,
212 nfsrv_fsinfo,
213 nfsrv_pathconf,
214 nfsrv_commit,
215 nfsrv_noop
216 };
217 #endif /* NFS_NOSERVER */
218
219
220 /*
221 * attempt to bind a socket to a reserved port
222 */
223 static int
224 nfs_bind_resv(struct nfsmount *nmp)
225 {
226 socket_t so = nmp->nm_so;
227 struct sockaddr_in sin;
228 int error;
229 u_short tport;
230
231 if (!so)
232 return (EINVAL);
233
234 sin.sin_len = sizeof (struct sockaddr_in);
235 sin.sin_family = AF_INET;
236 sin.sin_addr.s_addr = INADDR_ANY;
237 tport = IPPORT_RESERVED - 1;
238 sin.sin_port = htons(tport);
239
240 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
241 (--tport > IPPORT_RESERVED / 2))
242 sin.sin_port = htons(tport);
243 return (error);
244 }
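/*
 * Illustrative sketch (not part of the original file): a minimal user-space
 * analogue of the descending reserved-port scan above, using the standard
 * BSD socket API. The kernel version differs mainly in using sock_bind()
 * on a socket_t; the port-walking logic is the same.
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>
#include <errno.h>

static int
bind_reserved(int fd)
{
	struct sockaddr_in sin;
	u_short tport = IPPORT_RESERVED - 1;
	int error;

	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	sin.sin_port = htons(tport);
	/* walk down from 1023 until a free reserved port is found */
	while ((error = bind(fd, (struct sockaddr *)&sin, sizeof(sin))) < 0 &&
	    errno == EADDRINUSE && --tport > IPPORT_RESERVED / 2)
		sin.sin_port = htons(tport);
	return (error ? errno : 0);
}
#endif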
245
246 /*
247 * variables for managing the nfs_bind_resv_thread
248 */
249 int nfs_resv_mounts = 0;
250 static int nfs_bind_resv_thread_state = 0;
251 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
252 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
253 lck_grp_t *nfs_bind_resv_lck_grp;
254 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
255 lck_attr_t *nfs_bind_resv_lck_attr;
256 lck_mtx_t *nfs_bind_resv_mutex;
257 struct nfs_bind_resv_request {
258 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
259 struct nfsmount *brr_nmp;
260 int brr_error;
261 };
262 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
263
264 /*
265 * thread to handle any reserved port bind requests
266 */
267 static void
268 nfs_bind_resv_thread(void)
269 {
270 struct nfs_bind_resv_request *brreq;
271
272 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
273
274 while (nfs_resv_mounts > 0) {
275 lck_mtx_lock(nfs_bind_resv_mutex);
276 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
277 TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
278 lck_mtx_unlock(nfs_bind_resv_mutex);
279 brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
280 wakeup(brreq);
281 lck_mtx_lock(nfs_bind_resv_mutex);
282 }
283 msleep((caddr_t)&nfs_bind_resv_request_queue,
284 nfs_bind_resv_mutex, PSOCK | PDROP,
285 "nfs_bind_resv_request_queue", 0);
286 }
287
288 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
289 (void) thread_terminate(current_thread());
290 }
291
292 int
293 nfs_bind_resv_thread_wake(void)
294 {
295 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
296 return (EIO);
297 wakeup(&nfs_bind_resv_request_queue);
298 return (0);
299 }
300
301 /*
302 * underprivileged procs call this to request nfs_bind_resv_thread
303 * to perform the reserved port binding for them.
304 */
305 static int
306 nfs_bind_resv_nopriv(struct nfsmount *nmp)
307 {
308 struct nfs_bind_resv_request brreq;
309 int error;
310
311 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
312 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
313 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
314 lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
315 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
316 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
317 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
318 TAILQ_INIT(&nfs_bind_resv_request_queue);
319 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
320 }
321 kernel_thread(kernel_task, nfs_bind_resv_thread);
322 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
323 }
324
325 brreq.brr_nmp = nmp;
326 brreq.brr_error = 0;
327
328 lck_mtx_lock(nfs_bind_resv_mutex);
329 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
330 lck_mtx_unlock(nfs_bind_resv_mutex);
331
332 error = nfs_bind_resv_thread_wake();
333 if (error) {
334 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
335 /* Note: we might be able to simply restart the thread */
336 return (error);
337 }
338
339 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
340
341 return (brreq.brr_error);
342 }
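/*
 * Illustrative sketch (not part of the original file) of the handoff
 * pattern used above: the unprivileged caller queues a request that lives
 * on its own stack, wakes the worker thread, and sleeps on the request
 * until the worker fills in brr_error.
 */
#if 0
	/* caller (nfs_bind_resv_nopriv) */
	lck_mtx_lock(nfs_bind_resv_mutex);
	TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
	lck_mtx_unlock(nfs_bind_resv_mutex);
	wakeup(&nfs_bind_resv_request_queue);	/* via nfs_bind_resv_thread_wake() */
	tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);	/* wait for result */

	/* worker (nfs_bind_resv_thread) */
	brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
	wakeup(brreq);				/* release the sleeping caller */
#endif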
343
344 /*
345 * Initialize sockets and congestion for a new NFS connection.
346 * We do not free the sockaddr if error.
347 */
348 int
349 nfs_connect(
350 struct nfsmount *nmp,
351 __unused struct nfsreq *rep)
352 {
353 socket_t so;
354 int error, rcvreserve, sndreserve;
355 struct sockaddr *saddr;
356 struct timeval timeo;
357
358 nmp->nm_so = 0;
359 saddr = mbuf_data(nmp->nm_nam);
360 error = sock_socket(saddr->sa_family, nmp->nm_sotype,
361 nmp->nm_soproto, 0, 0, &nmp->nm_so);
362 if (error) {
363 goto bad;
364 }
365 so = nmp->nm_so;
366
367 /*
368 * Some servers require that the client port be a reserved port number.
369 */
370 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
371 proc_t p;
372 /*
373 * sobind() requires current_proc() to have superuser privs.
374 * If this bind is part of a reconnect, and the current proc
375 * doesn't have superuser privs, we hand the sobind() off to
376 * a kernel thread to process.
377 */
378 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
379 (p = current_proc()) && suser(kauth_cred_get(), 0)) {
380 /* request nfs_bind_resv_thread() to do bind */
381 error = nfs_bind_resv_nopriv(nmp);
382 } else {
383 error = nfs_bind_resv(nmp);
384 }
385 if (error)
386 goto bad;
387 }
388
389 /*
390 * Protocols that do not require connections may be optionally left
391 * unconnected for servers that reply from a port other than NFS_PORT.
392 */
393 if (nmp->nm_flag & NFSMNT_NOCONN) {
394 if (nmp->nm_sotype == SOCK_STREAM) {
395 error = ENOTCONN;
396 goto bad;
397 }
398 } else {
399 struct timeval tv;
400 tv.tv_sec = 2;
401 tv.tv_usec = 0;
402 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
403 if (error && error != EINPROGRESS) {
404 goto bad;
405 }
406
407 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
408 if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
409 goto bad;
410 }
411 }
412 }
413
414 /*
415 * Always time out on receive; this allows us to reconnect the
416 * socket to deal with network changes.
417 */
418 timeo.tv_usec = 0;
419 timeo.tv_sec = 2;
420 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
421 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
422 timeo.tv_sec = 5;
423 } else {
424 timeo.tv_sec = 0;
425 }
426 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
427
428 if (nmp->nm_sotype == SOCK_DGRAM) {
429 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
430 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
431 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
432 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
433 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
434 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
435 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
436 } else {
437 int proto;
438 int on = 1;
439
440 sock_gettype(so, NULL, NULL, &proto);
441 if (nmp->nm_sotype != SOCK_STREAM)
442 panic("nfscon sotype");
443
444 // Assume that SOCK_STREAM always requires a connection
445 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
446
447 if (proto == IPPROTO_TCP) {
448 sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
449 }
450
451 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
452 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
453 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
454 }
455
456 if (sndreserve > NFS_MAXSOCKBUF)
457 sndreserve = NFS_MAXSOCKBUF;
458 if (rcvreserve > NFS_MAXSOCKBUF)
459 rcvreserve = NFS_MAXSOCKBUF;
460 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
461 if (error) {
462 goto bad;
463 }
464 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
465 if (error) {
466 goto bad;
467 }
468
469 sock_nointerrupt(so, 1);
470
471 /* Initialize other non-zero congestion variables */
472 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
473 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
474 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
475 nmp->nm_sdrtt[3] = 0;
476 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
477 nmp->nm_sent = 0;
478 FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
479 nmp->nm_timeouts = 0;
480 return (0);
481
482 bad:
483 nfs_disconnect(nmp);
484 return (error);
485 }
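/*
 * Illustrative worked example (not part of the original file) of the
 * socket buffer sizing above for a UDP mount. With nm_wsize == nm_rsize
 * == 8192 and nm_readahead == 4 (example values, not asserted defaults):
 *
 *	sndreserve = (8192 + NFS_MAXPKTHDR) * 3
 *	rcvreserve = (8192 + NFS_MAXPKTHDR) * (4 + 1)
 *
 * i.e. room for three outgoing requests and readahead+1 incoming replies,
 * both clipped to NFS_MAXSOCKBUF.
 */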
486
487 /*
488 * Reconnect routine:
489 * Called when a connection is broken on a reliable protocol.
490 * - clean up the old socket
491 * - nfs_connect() again
492 * - set R_MUSTRESEND for all outstanding requests on mount point
493 * If this fails the mount point is DEAD!
494 * nb: Must be called with the nfs_sndlock() set on the mount point.
495 */
496 static int
497 nfs_reconnect(struct nfsreq *rep)
498 {
499 struct nfsreq *rp;
500 struct nfsmount *nmp = rep->r_nmp;
501 int error;
502
503 nfs_disconnect(nmp);
504 while ((error = nfs_connect(nmp, rep))) {
505 if (error == EINTR || error == ERESTART)
506 return (EINTR);
507 if (error == EIO)
508 return (EIO);
509 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
510 "can not connect");
511 rep->r_flags |= R_TPRINTFMSG;
512 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
513 /* we're not yet completely mounted and */
514 /* we can't reconnect, so we fail */
515 return (error);
516 }
517 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
518 return (error);
519 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
520 }
521
522 /*
523 * Loop through outstanding request list and fix up all requests
524 * on old socket.
525 */
526 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
527 if (rp->r_nmp == nmp)
528 rp->r_flags |= R_MUSTRESEND;
529 }
530 return (0);
531 }
532
533 /*
534 * NFS disconnect. Clean up and unlink.
535 */
536 void
537 nfs_disconnect(struct nfsmount *nmp)
538 {
539 socket_t so;
540
541 if (nmp->nm_so) {
542 so = nmp->nm_so;
543 nmp->nm_so = 0;
544 sock_shutdown(so, 2);
545 sock_close(so);
546 }
547 }
548
549 /*
550 * This is the nfs send routine. For connection based socket types, it
551 * must be called with an nfs_sndlock() on the socket.
552 * "rep == NULL" indicates that it has been called from a server.
553 * For the client side:
554 * - return EINTR if the RPC is terminated, 0 otherwise
555 * - set R_MUSTRESEND if the send fails for any reason
556 * - do any cleanup required by recoverable socket errors (???)
557 * For the server side:
558 * - return EINTR or ERESTART if interrupted by a signal
559 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
560 * - do any cleanup required by recoverable socket errors (???)
561 */
562 int
563 nfs_send(so, nam, top, rep)
564 socket_t so;
565 mbuf_t nam;
566 mbuf_t top;
567 struct nfsreq *rep;
568 {
569 struct sockaddr *sendnam;
570 int error, error2, sotype, flags;
571 u_long xidqueued = 0;
572 struct nfsreq *rp;
573 char savenametolog[MAXPATHLEN];
574 struct msghdr msg;
575
576 if (rep) {
577 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
578 if (error) {
579 mbuf_freem(top);
580 return (error);
581 }
582 if ((so = rep->r_nmp->nm_so) == NULL) {
583 rep->r_flags |= R_MUSTRESEND;
584 mbuf_freem(top);
585 return (0);
586 }
587 rep->r_flags &= ~R_MUSTRESEND;
588 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
589 if (rp == rep)
590 break;
591 if (rp)
592 xidqueued = rp->r_xid;
593 }
594 sock_gettype(so, NULL, &sotype, NULL);
595 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
596 (nam == 0))
597 sendnam = (struct sockaddr *)0;
598 else
599 sendnam = mbuf_data(nam);
600
601 if (sotype == SOCK_SEQPACKET)
602 flags = MSG_EOR;
603 else
604 flags = 0;
605
606 /*
607 * Save the name here in case the mount point goes away while we block.
608 * The name uses the local stack and is large, but we don't
609 * want to risk blocking in malloc.
610 */
611 if (rep)
612 strncpy(savenametolog,
613 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
614 MAXPATHLEN - 1);
615 bzero(&msg, sizeof(msg));
616 msg.msg_name = (caddr_t)sendnam;
617 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
618 error = sock_sendmbuf(so, &msg, top, flags, NULL);
619
620 if (error) {
621 if (rep) {
622 if (xidqueued) {
623 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
624 if (rp == rep && rp->r_xid == xidqueued)
625 break;
626 if (!rp)
627 panic("nfs_send: error %d xid %x gone",
628 error, xidqueued);
629 }
630 log(LOG_INFO, "nfs send error %d for server %s\n",
631 error, savenametolog);
632 /*
633 * Deal with errors for the client side.
634 */
635 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
636 if (error2) {
637 error = error2;
638 } else {
639 rep->r_flags |= R_MUSTRESEND;
640 }
641 } else
642 log(LOG_INFO, "nfsd send error %d\n", error);
643
644 /*
645 * Handle any recoverable (soft) socket errors here. (???)
646 */
647 if (error != EINTR && error != ERESTART && error != EIO &&
648 error != EWOULDBLOCK && error != EPIPE) {
649 error = 0;
650 }
651 }
652 return (error);
653 }
654
655 /*
656 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
657 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
658 * Mark and consolidate the data into a new mbuf list.
659 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
660 * small mbufs.
661 * For SOCK_STREAM we must be very careful to read an entire record once
662 * we have read any of it, even if the system call has been interrupted.
663 */
664 static int
665 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
666 {
667 socket_t so;
668 struct iovec_32 aio;
669 mbuf_t m, mlast;
670 u_long len, fraglen;
671 int error, error2, sotype;
672 proc_t p = current_proc(); /* XXX */
673 struct msghdr msg;
674 size_t rcvlen;
675 int lastfragment;
676
677 /*
678 * Set up arguments for soreceive()
679 */
680 *mp = NULL;
681 sotype = rep->r_nmp->nm_sotype;
682
683 /*
684 * For reliable protocols, lock against other senders/receivers
685 * in case a reconnect is necessary.
686 * For SOCK_STREAM, first get the Record Mark to find out how much
687 * more there is to get.
688 * We must lock the socket against other receivers
689 * until we have an entire rpc request/reply.
690 */
691 if (sotype != SOCK_DGRAM) {
692 error = nfs_sndlock(rep);
693 if (error)
694 return (error);
695 tryagain:
696 /*
697 * Check for fatal errors and resending request.
698 */
699 /*
700 * Ugh: If a reconnect attempt just happened, nm_so
701 * would have changed. NULL indicates a failed
702 * attempt that has essentially shut down this
703 * mount point.
704 */
705 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
706 nfs_sndunlock(rep);
707 if (error)
708 return (error);
709 return (EINTR);
710 }
711 so = rep->r_nmp->nm_so;
712 if (!so) {
713 error = nfs_reconnect(rep);
714 if (error) {
715 nfs_sndunlock(rep);
716 return (error);
717 }
718 goto tryagain;
719 }
720 while (rep->r_flags & R_MUSTRESEND) {
721 error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
722 if (!error) {
723 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
724 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
725 }
726 /*
727 * we also hold the rcv lock, so rep is still
728 * legit at this point
729 */
730 if (error) {
731 if (error == EINTR || error == ERESTART ||
732 (error = nfs_reconnect(rep))) {
733 nfs_sndunlock(rep);
734 return (error);
735 }
736 goto tryagain;
737 }
738 }
739 nfs_sndunlock(rep);
740 if (sotype == SOCK_STREAM) {
741 error = 0;
742 len = 0;
743 lastfragment = 0;
744 mlast = NULL;
745 while (!error && !lastfragment) {
746 aio.iov_base = (uintptr_t) &fraglen;
747 aio.iov_len = sizeof(u_long);
748 bzero(&msg, sizeof(msg));
749 msg.msg_iov = (struct iovec *) &aio;
750 msg.msg_iovlen = 1;
751 do {
752 error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
753 if (!rep->r_nmp) /* if unmounted then bailout */
754 goto shutout;
755 if (error == EWOULDBLOCK && rep) {
756 error2 = nfs_sigintr(rep->r_nmp, rep, p);
757 if (error2)
758 error = error2;
759 }
760 } while (error == EWOULDBLOCK);
761 if (!error && rcvlen < aio.iov_len) {
762 /* only log a message if we got a partial word */
763 if (rcvlen != 0)
764 log(LOG_INFO,
765 "short receive (%d/%d) from nfs server %s\n",
766 rcvlen, sizeof(u_long),
767 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
768 error = EPIPE;
769 }
770 if (error)
771 goto errout;
772 lastfragment = ntohl(fraglen) & 0x80000000;
773 fraglen = ntohl(fraglen) & ~0x80000000;
774 len += fraglen;
775 /*
776 * This is SERIOUS! We are out of sync with the sender
777 * and forcing a disconnect/reconnect is all I can do.
778 */
779 if (len > NFS_MAXPACKET) {
780 log(LOG_ERR, "%s (%d) from nfs server %s\n",
781 "impossible RPC record length", len,
782 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
783 error = EFBIG;
784 goto errout;
785 }
786
787 m = NULL;
788 do {
789 rcvlen = fraglen;
790 error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
791 if (!rep->r_nmp) /* if unmounted then bailout */ {
792 goto shutout;
793 }
794 } while (error == EWOULDBLOCK || error == EINTR ||
795 error == ERESTART);
796
797 if (!error && fraglen > rcvlen) {
798 log(LOG_INFO,
799 "short receive (%d/%d) from nfs server %s\n",
800 rcvlen, fraglen,
801 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
802 error = EPIPE;
803 mbuf_freem(m);
804 }
805 if (!error) {
806 if (!*mp) {
807 *mp = m;
808 mlast = m;
809 } else {
810 error = mbuf_setnext(mlast, m);
811 if (error) {
812 printf("nfs_receive: mbuf_setnext failed %d\n", error);
813 mbuf_freem(m);
814 }
815 }
816 while (mbuf_next(mlast))
817 mlast = mbuf_next(mlast);
818 }
819 }
820 } else {
821 bzero(&msg, sizeof(msg));
822 do {
823 rcvlen = 100000000;
824 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
825 if (!rep->r_nmp) /* if unmounted then bailout */ {
826 goto shutout;
827 }
828 if (error == EWOULDBLOCK && rep) {
829 error2 = nfs_sigintr(rep->r_nmp, rep, p);
830 if (error2) {
831 return (error2);
832 }
833 }
834 } while (error == EWOULDBLOCK);
835
836 if ((msg.msg_flags & MSG_EOR) == 0)
837 printf("Egad!!\n");
838 if (!error && *mp == NULL)
839 error = EPIPE;
840 len = rcvlen;
841 }
842 errout:
843 if (error && error != EINTR && error != ERESTART) {
844 mbuf_freem(*mp);
845 *mp = NULL;
846 if (error != EPIPE)
847 log(LOG_INFO,
848 "receive error %d from nfs server %s\n", error,
849 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
850 error = nfs_sndlock(rep);
851 if (!error) {
852 error = nfs_reconnect(rep);
853 if (!error)
854 goto tryagain;
855 nfs_sndunlock(rep);
856 }
857 }
858 } else {
859 /*
860 * We could have failed while rebinding the datagram socket
861 * so we need to attempt to rebind here.
862 */
863 if ((so = rep->r_nmp->nm_so) == NULL) {
864 error = nfs_sndlock(rep);
865 if (!error) {
866 error = nfs_reconnect(rep);
867 nfs_sndunlock(rep);
868 }
869 if (error)
870 return (error);
871 if (!rep->r_nmp) /* if unmounted then bailout */
872 return (ENXIO);
873 so = rep->r_nmp->nm_so;
874 }
875 bzero(&msg, sizeof(msg));
876 len = 0;
877 do {
878 rcvlen = 1000000;
879 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
880 if (!rep->r_nmp) /* if unmounted then bailout */
881 goto shutout;
882 if (error) {
883 error2 = nfs_sigintr(rep->r_nmp, rep, p);
884 if (error2) {
885 error = error2;
886 goto shutout;
887 }
888 }
889 /* Reconnect for all errors. We may be receiving
890 * soft/hard/blocking errors because of a network
891 * change.
892 * XXX: we should rate limit or delay this
893 * to once every N attempts or something.
894 * although TCP doesn't seem to.
895 */
896 if (error) {
897 error2 = nfs_sndlock(rep);
898 if (!error2) {
899 error2 = nfs_reconnect(rep);
900 if (error2)
901 error = error2;
902 else if (!rep->r_nmp) /* if unmounted then bailout */
903 error = ENXIO;
904 else
905 so = rep->r_nmp->nm_so;
906 nfs_sndunlock(rep);
907 } else {
908 error = error2;
909 }
910 }
911 } while (error == EWOULDBLOCK);
912 }
913 shutout:
914 if (error) {
915 mbuf_freem(*mp);
916 *mp = NULL;
917 }
918 return (error);
919 }
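/*
 * Illustrative sketch (not part of the original file): the Sun RPC record
 * mark handled in the SOCK_STREAM path above. Each fragment is preceded by
 * a 4-byte big-endian word whose high bit flags the last fragment of the
 * record and whose low 31 bits give the fragment length.
 */
#if 0
static void
example_record_mark(u_long rawmark, u_long reqlen)
{
	/* receiving: decode the mark read from the stream */
	u_long mark = ntohl(rawmark);
	int lastfragment = (mark & 0x80000000) != 0;	/* high bit set? */
	u_long fraglen = mark & ~0x80000000;		/* low 31 bits: length */

	/* sending: a single-fragment record of reqlen bytes is prepended
	 * the same way (see the SOCK_STREAM case in nfs_request()) */
	u_long sendmark = htonl(0x80000000 | reqlen);
}
#endif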
920
921 /*
922 * Implement receipt of reply on a socket.
923 * We must search through the list of received datagrams matching them
924 * with outstanding requests using the xid, until ours is found.
925 */
926 /* ARGSUSED */
927 int
928 nfs_reply(myrep)
929 struct nfsreq *myrep;
930 {
931 struct nfsreq *rep;
932 struct nfsmount *nmp = myrep->r_nmp;
933 long t1;
934 mbuf_t mrep, md;
935 u_long rxid, *tl;
936 caddr_t dpos, cp2;
937 int error;
938
939 /*
940 * Loop around until we get our own reply
941 */
942 for (;;) {
943 /*
944 * Lock against other receivers so that I don't get stuck in
945 * sbwait() after someone else has received my reply for me.
946 * Also necessary for connection based protocols to avoid
947 * race conditions during a reconnect.
948 * If nfs_rcvlock() returns EALREADY, that means that
949 * the reply has already been received by another
950 * process and we can return immediately. In this
951 * case, the lock is not taken to avoid races with
952 * other processes.
953 */
954 error = nfs_rcvlock(myrep);
955 if (error == EALREADY)
956 return (0);
957 if (error)
958 return (error);
959
960 /*
961 * If we slept after putting bits otw, then the reply may have
962 * arrived, in which case we must return, or we
963 * would hang trying to nfs_receive an already received reply.
964 */
965 if (myrep->r_mrep != NULL) {
966 nfs_rcvunlock(myrep);
967 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
968 return (0);
969 }
970 /*
971 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
972 * is still intact by checks done in nfs_rcvlock.
973 */
974 error = nfs_receive(myrep, &mrep);
975 /*
976 * Bailout asap if nfsmount struct gone (unmounted).
977 */
978 if (!myrep->r_nmp) {
979 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
980 if (mrep)
981 mbuf_freem(mrep);
982 return (ENXIO);
983 }
984 if (error) {
985 FSDBG(530, myrep->r_xid, myrep, nmp, error);
986 nfs_rcvunlock(myrep);
987
988 /* Bailout asap if nfsmount struct gone (unmounted). */
989 if (!myrep->r_nmp) {
990 if (mrep)
991 mbuf_freem(mrep);
992 return (ENXIO);
993 }
994
995 /*
996 * Ignore routing errors on connectionless protocols??
997 */
998 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
999 if (nmp->nm_so) {
1000 int clearerror;
1001 int optlen = sizeof(clearerror);
1002 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1003 }
1004 continue;
1005 }
1006 if (mrep)
1007 mbuf_freem(mrep);
1008 return (error);
1009 }
1010
1011 /*
1012 * We assume all is fine, but if we did not have an error
1013 * and mrep is 0, better not dereference it. nfs_receive
1014 * calls soreceive which carefully sets error=0 when it got
1015 * errors on sbwait (tsleep). In most cases, I assume that's
1016 * so we could go back again. In tcp case, EPIPE is returned.
1017 * In the udp case, nfs_receive gets back here with no error and no
1018 * mrep. Is the right fix to have soreceive check for process
1019 * aborted after sbwait and return something non-zero? Should
1020 * nfs_receive give an EPIPE? Too risky to play with those
1021 * two this late in game for a shutdown problem. Instead,
1022 * just check here and get out. (ekn)
1023 */
1024 if (!mrep) {
1025 nfs_rcvunlock(myrep);
1026 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1027 return (ENXIO); /* sounds good */
1028 }
1029
1030 /*
1031 * Get the xid and check that it is an rpc reply
1032 */
1033 md = mrep;
1034 dpos = mbuf_data(md);
1035 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1036 rxid = *tl++;
1037 if (*tl != rpc_reply) {
1038 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1039 mbuf_freem(mrep);
1040 nfsmout:
1041 if (nmp->nm_state & NFSSTA_RCVLOCK)
1042 nfs_rcvunlock(myrep);
1043 continue;
1044 }
1045
1046 /*
1047 * Loop through the request list to match up the reply
1048 * Iff no match, just drop the datagram
1049 */
1050 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1051 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1052 /* Found it.. */
1053 rep->r_mrep = mrep;
1054 rep->r_md = md;
1055 rep->r_dpos = dpos;
1056 /*
1057 * If we're tracking the round trip time
1058 * then we update the circular log here
1059 * with the stats from our current request.
1060 */
1061 if (nfsrtton) {
1062 struct rttl *rt;
1063
1064 rt = &nfsrtt.rttl[nfsrtt.pos];
1065 rt->proc = rep->r_procnum;
1066 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1067 rt->sent = nmp->nm_sent;
1068 rt->cwnd = nmp->nm_cwnd;
1069 if (proct[rep->r_procnum] == 0)
1070 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1071 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1072 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1073 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1074 microtime(&rt->tstamp); // XXX unused
1075 if (rep->r_flags & R_TIMING)
1076 rt->rtt = rep->r_rtt;
1077 else
1078 rt->rtt = 1000000;
1079 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1080 }
1081 /*
1082 * Update congestion window.
1083 * Do the additive increase of
1084 * one rpc/rtt.
1085 */
1086 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1087 nmp->nm_cwnd);
1088 if (nmp->nm_cwnd <= nmp->nm_sent) {
1089 nmp->nm_cwnd +=
1090 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1091 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1092 if (nmp->nm_cwnd > NFS_MAXCWND)
1093 nmp->nm_cwnd = NFS_MAXCWND;
1094 }
1095 if (rep->r_flags & R_SENT) {
1096 rep->r_flags &= ~R_SENT;
1097 nmp->nm_sent -= NFS_CWNDSCALE;
1098 }
1099 /*
1100 * Update rtt using a gain of 0.125 on the mean
1101 * and a gain of 0.25 on the deviation.
1102 */
1103 if (rep->r_flags & R_TIMING) {
1104 /*
1105 * Since the timer resolution of
1106 * NFS_HZ is so coarse, it can often
1107 * result in r_rtt == 0. Since
1108 * r_rtt == N means that the actual
1109 * rtt is between N+dt and N+2-dt ticks,
1110 * add 1.
1111 */
1112 if (proct[rep->r_procnum] == 0)
1113 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1114 t1 = rep->r_rtt + 1;
1115 t1 -= (NFS_SRTT(rep) >> 3);
1116 NFS_SRTT(rep) += t1;
1117 if (t1 < 0)
1118 t1 = -t1;
1119 t1 -= (NFS_SDRTT(rep) >> 2);
1120 NFS_SDRTT(rep) += t1;
1121 }
1122 nmp->nm_timeouts = 0;
1123 break;
1124 }
1125 }
1126 nfs_rcvunlock(myrep);
1127 /*
1128 * If not matched to a request, drop it.
1129 * If it's mine, get out.
1130 */
1131 if (rep == 0) {
1132 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1133 mbuf_freem(mrep);
1134 } else if (rep == myrep) {
1135 if (rep->r_mrep == NULL)
1136 panic("nfs_reply: nil r_mrep");
1137 return (0);
1138 }
1139 FSDBG(530, myrep->r_xid, myrep, rep,
1140 rep ? rep->r_xid : myrep->r_flags);
1141 }
1142 }
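/*
 * Illustrative worked example (not part of the original file) of one round
 * of the rtt smoothing above, with srtt scaled by 8 and sdrtt scaled by 4.
 * Assume a stored srtt of 32 (~4 ticks), a stored sdrtt of 8 (~2 ticks),
 * and a measured r_rtt of 7.
 */
#if 0
	long t1 = 7 + 1;	/* r_rtt + 1 */
	t1 -= (32 >> 3);	/* minus srtt/8: t1 = 4, the error term */
	/* srtt += t1 -> 36: the mean moved 1/8 of the way toward 8 ticks */
	if (t1 < 0)
		t1 = -t1;
	t1 -= (8 >> 2);		/* minus sdrtt/4: t1 = 2 */
	/* sdrtt += t1 -> 10: the deviation moved 1/4 of the way toward 4 */
#endif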
1143
1144 /*
1145 * nfs_request - goes something like this
1146 * - fill in request struct
1147 * - links it into list
1148 * - calls nfs_send() for first transmit
1149 * - calls nfs_receive() to get reply
1150 * - break down rpc header and return with nfs reply pointed to
1151 * by mrep or error
1152 * nb: always frees up mreq mbuf list
1153 */
1154 int
1155 nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
1156 vnode_t vp;
1157 mount_t mp;
1158 mbuf_t mrest;
1159 int procnum;
1160 proc_t procp;
1161 kauth_cred_t cred;
1162 mbuf_t *mrp;
1163 mbuf_t *mdp;
1164 caddr_t *dposp;
1165 u_int64_t *xidp;
1166 {
1167 mbuf_t m, mrep, m2;
1168 struct nfsreq re, *rep;
1169 u_long *tl;
1170 int i;
1171 struct nfsmount *nmp;
1172 mbuf_t md, mheadend;
1173 char nickv[RPCX_NICKVERF];
1174 time_t waituntil;
1175 caddr_t dpos, cp2;
1176 int t1, error = 0, mrest_len, auth_len, auth_type;
1177 int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1178 int verf_len, verf_type;
1179 u_long xid;
1180 char *auth_str, *verf_str;
1181 NFSKERBKEY_T key; /* save session key */
1182 int nmsotype;
1183 struct timeval now;
1184
1185 if (mrp)
1186 *mrp = NULL;
1187 if (xidp)
1188 *xidp = 0;
1189 nmp = VFSTONFS(mp);
1190
1191 rep = &re;
1192
1193 if (vp)
1194 nmp = VFSTONFS(vnode_mount(vp));
1195 if (nmp == NULL ||
1196 (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1197 (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1198 mbuf_freem(mrest);
1199 return (ENXIO);
1200 }
1201 nmsotype = nmp->nm_sotype;
1202
1203 FSDBG_TOP(531, vp, procnum, nmp, rep);
1204
1205 rep->r_nmp = nmp;
1206 rep->r_vp = vp;
1207 rep->r_procp = procp;
1208 rep->r_procnum = procnum;
1209 microuptime(&now);
1210 rep->r_lastmsg = now.tv_sec -
1211 ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1212 i = 0;
1213 m = mrest;
1214 while (m) {
1215 i += mbuf_len(m);
1216 m = mbuf_next(m);
1217 }
1218 mrest_len = i;
1219
1220 /*
1221 * Get the RPC header with authorization.
1222 */
1223 kerbauth:
1224 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1225 if (!nmp) {
1226 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1227 mbuf_freem(mrest);
1228 return (ENXIO);
1229 }
1230 verf_str = auth_str = (char *)0;
1231 if (nmp->nm_flag & NFSMNT_KERB) {
1232 verf_str = nickv;
1233 verf_len = sizeof (nickv);
1234 auth_type = RPCAUTH_KERB4;
1235 bzero((caddr_t)key, sizeof (key));
1236 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1237 &auth_len, verf_str, verf_len)) {
1238 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1239 if (!nmp) {
1240 FSDBG_BOT(531, 2, vp, error, rep);
1241 mbuf_freem(mrest);
1242 return (ENXIO);
1243 }
1244 error = nfs_getauth(nmp, rep, cred, &auth_str,
1245 &auth_len, verf_str, &verf_len, key);
1246 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1247 if (!error && !nmp)
1248 error = ENXIO;
1249 if (error) {
1250 FSDBG_BOT(531, 2, vp, error, rep);
1251 mbuf_freem(mrest);
1252 return (error);
1253 }
1254 }
1255 } else {
1256 auth_type = RPCAUTH_UNIX;
1257 if (cred->cr_ngroups < 1)
1258 panic("nfsreq nogrps");
1259 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1260 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1261 5 * NFSX_UNSIGNED;
1262 }
1263 error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1264 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1265 if (auth_str)
1266 _FREE(auth_str, M_TEMP);
1267 if (error) {
1268 mbuf_freem(mrest);
1269 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1270 return (error);
1271 }
1272 if (xidp)
1273 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1274
1275 /*
1276 * For stream protocols, insert a Sun RPC Record Mark.
1277 */
1278 if (nmsotype == SOCK_STREAM) {
1279 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1280 if (error) {
1281 mbuf_freem(m);
1282 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1283 return (error);
1284 }
1285 *((u_long*)mbuf_data(m)) =
1286 htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1287 }
1288 rep->r_mreq = m;
1289 rep->r_xid = xid;
1290 tryagain:
1291 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1292 if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1293 rep->r_retry = nmp->nm_retry;
1294 else
1295 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1296 rep->r_rtt = rep->r_rexmit = 0;
1297 if (proct[procnum] > 0)
1298 rep->r_flags = R_TIMING;
1299 else
1300 rep->r_flags = 0;
1301 rep->r_mrep = NULL;
1302
1303 /*
1304 * Do the client side RPC.
1305 */
1306 OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1307 /*
1308 * Chain request into list of outstanding requests. Be sure
1309 * to put it LAST so timer finds oldest requests first.
1310 */
1311 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1312
1313 /*
1314 * If backing off another request or avoiding congestion, don't
1315 * send this one now but let timer do it. If not timing a request,
1316 * do it now.
1317 */
1318 if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1319 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1320 nmp->nm_sent < nmp->nm_cwnd)) {
1321 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1322
1323 if (connrequired)
1324 error = nfs_sndlock(rep);
1325
1326 /*
1327 * Set the R_SENT before doing the send in case another thread
1328 * processes the reply before the nfs_send returns here
1329 */
1330 if (!error) {
1331 if ((rep->r_flags & R_MUSTRESEND) == 0) {
1332 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1333 nmp->nm_cwnd);
1334 nmp->nm_sent += NFS_CWNDSCALE;
1335 rep->r_flags |= R_SENT;
1336 }
1337
1338 error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1339 if (!error)
1340 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1341 if (connrequired)
1342 nfs_sndunlock(rep);
1343 }
1344 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1345 if (error) {
1346 if (nmp)
1347 nmp->nm_sent -= NFS_CWNDSCALE;
1348 rep->r_flags &= ~R_SENT;
1349 }
1350 } else {
1351 rep->r_rtt = -1;
1352 }
1353
1354 /*
1355 * Wait for the reply from our send or the timer's.
1356 */
1357 if (!error || error == EPIPE)
1358 error = nfs_reply(rep);
1359
1360 /*
1361 * RPC done, unlink the request.
1362 */
1363 nfs_repdequeue(rep);
1364
1365 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1366
1367 /*
1368 * Decrement the outstanding request count.
1369 */
1370 if (rep->r_flags & R_SENT) {
1371 rep->r_flags &= ~R_SENT; /* paranoia */
1372 if (nmp) {
1373 FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1374 nmp->nm_sent -= NFS_CWNDSCALE;
1375 }
1376 }
1377
1378 /*
1379 * If there was a successful reply and a tprintf msg,
1380 * tprintf a response.
1381 */
1382 if (!error)
1383 nfs_up(nmp, procp, NFSSTA_TIMEO,
1384 (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1385 mrep = rep->r_mrep;
1386 md = rep->r_md;
1387 dpos = rep->r_dpos;
1388 if (!error && !nmp)
1389 error = ENXIO;
1390 if (error) {
1391 mbuf_freem(rep->r_mreq);
1392 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1393 return (error);
1394 }
1395
1396 /*
1397 * break down the rpc header and check if ok
1398 */
1399 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1400 if (*tl++ == rpc_msgdenied) {
1401 if (*tl == rpc_mismatch)
1402 error = EOPNOTSUPP;
1403 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1404 if (!failed_auth) {
1405 failed_auth++;
1406 error = mbuf_setnext(mheadend, NULL);
1407 mbuf_freem(mrep);
1408 mbuf_freem(rep->r_mreq);
1409 if (!error)
1410 goto kerbauth;
1411 printf("nfs_request: mbuf_setnext failed\n");
1412 } else
1413 error = EAUTH;
1414 } else
1415 error = EACCES;
1416 mbuf_freem(mrep);
1417 mbuf_freem(rep->r_mreq);
1418 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1419 return (error);
1420 }
1421
1422 /*
1423 * Grab any Kerberos verifier, otherwise just throw it away.
1424 */
1425 verf_type = fxdr_unsigned(int, *tl++);
1426 i = fxdr_unsigned(int, *tl);
1427 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1428 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1429 if (error)
1430 goto nfsmout;
1431 } else if (i > 0)
1432 nfsm_adv(nfsm_rndup(i));
1433 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1434 /* 0 == ok */
1435 if (*tl == 0) {
1436 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1437 if (*tl != 0) {
1438 error = fxdr_unsigned(int, *tl);
1439 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1440 error == NFSERR_TRYLATER) {
1441 mbuf_freem(mrep);
1442 error = 0;
1443 microuptime(&now);
1444 waituntil = now.tv_sec + trylater_delay;
1445 while (now.tv_sec < waituntil) {
1446 tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1447 microuptime(&now);
1448 }
1449 trylater_delay *= 2;
1450 if (trylater_delay > 60)
1451 trylater_delay = 60;
1452 goto tryagain;
1453 }
1454
1455 /*
1456 * If the File Handle was stale, invalidate the
1457 * lookup cache, just in case.
1458 */
1459 if ((error == ESTALE) && vp)
1460 cache_purge(vp);
1461 if (nmp->nm_flag & NFSMNT_NFSV3) {
1462 *mrp = mrep;
1463 *mdp = md;
1464 *dposp = dpos;
1465 error |= NFSERR_RETERR;
1466 } else {
1467 mbuf_freem(mrep);
1468 error &= ~NFSERR_RETERR;
1469 }
1470 mbuf_freem(rep->r_mreq);
1471 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1472 return (error);
1473 }
1474
1475 *mrp = mrep;
1476 *mdp = md;
1477 *dposp = dpos;
1478 mbuf_freem(rep->r_mreq);
1479 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1480 return (0);
1481 }
1482 mbuf_freem(mrep);
1483 error = EPROTONOSUPPORT;
1484 nfsmout:
1485 mbuf_freem(rep->r_mreq);
1486 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1487 return (error);
1488 }
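/*
 * Illustrative sketch (not part of the original file): the NFSERR_TRYLATER
 * backoff in nfs_request() above. Each "try later" reply doubles the delay
 * before the retry, capped at 60 seconds.
 */
#if 0
	int trylater_delay = NFS_TRYLATERDEL;	/* initial delay, seconds */
	for (;;) {
		/* ... sleep trylater_delay seconds and resend the request;
		 * if the server again replies NFSERR_TRYLATER: */
		trylater_delay *= 2;
		if (trylater_delay > 60)
			trylater_delay = 60;
	}
#endif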
1489
1490 #ifndef NFS_NOSERVER
1491 /*
1492 * Generate the rpc reply header
1493 * siz arg. is used to decide if adding a cluster is worthwhile
1494 */
1495 int
1496 nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
1497 int siz;
1498 struct nfsrv_descript *nd;
1499 struct nfssvc_sock *slp;
1500 int err;
1501 mbuf_t *mrq;
1502 mbuf_t *mbp;
1503 caddr_t *bposp;
1504 {
1505 u_long *tl;
1506 mbuf_t mreq;
1507 caddr_t bpos;
1508 mbuf_t mb, mb2;
1509 int error, mlen;
1510
1511 /*
1512 * If this is a big reply, use a cluster else
1513 * try and leave leading space for the lower level headers.
1514 */
1515 siz += RPC_REPLYSIZ;
1516 if (siz >= nfs_mbuf_minclsize) {
1517 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1518 } else {
1519 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1520 }
1521 if (error) {
1522 /* unable to allocate packet */
1523 /* XXX nfsstat? */
1524 return (error);
1525 }
1526 mb = mreq;
1527 tl = mbuf_data(mreq);
1528 mlen = 6 * NFSX_UNSIGNED;
1529 if (siz < nfs_mbuf_minclsize) {
1530 /* leave space for lower level headers */
1531 tl += 80/sizeof(*tl); /* XXX max_hdr? XXX */
1532 mbuf_setdata(mreq, tl, mlen);
1533 } else {
1534 mbuf_setlen(mreq, mlen);
1535 }
1536 bpos = ((caddr_t)tl) + mlen;
1537 *tl++ = txdr_unsigned(nd->nd_retxid);
1538 *tl++ = rpc_reply;
1539 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1540 *tl++ = rpc_msgdenied;
1541 if (err & NFSERR_AUTHERR) {
1542 *tl++ = rpc_autherr;
1543 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1544 mlen -= NFSX_UNSIGNED;
1545 mbuf_setlen(mreq, mlen);
1546 bpos -= NFSX_UNSIGNED;
1547 } else {
1548 *tl++ = rpc_mismatch;
1549 *tl++ = txdr_unsigned(RPC_VER2);
1550 *tl = txdr_unsigned(RPC_VER2);
1551 }
1552 } else {
1553 *tl++ = rpc_msgaccepted;
1554
1555 /*
1556 * For Kerberos authentication, we must send the nickname
1557 * verifier back, otherwise just RPCAUTH_NULL.
1558 */
1559 if (nd->nd_flag & ND_KERBFULL) {
1560 struct nfsuid *nuidp;
1561 struct timeval ktvin, ktvout;
1562 uid_t uid = kauth_cred_getuid(nd->nd_cr);
1563
1564 lck_rw_lock_shared(&slp->ns_rwlock);
1565 for (nuidp = NUIDHASH(slp, uid)->lh_first;
1566 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1567 if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1568 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1569 &nuidp->nu_haddr, nd->nd_nam2)))
1570 break;
1571 }
1572 if (nuidp) {
1573 ktvin.tv_sec =
1574 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1575 ktvin.tv_usec =
1576 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1577
1578 /*
1579 * Encrypt the timestamp in ecb mode using the
1580 * session key.
1581 */
1582 #if NFSKERB
1583 XXX
1584 #endif
1585
1586 *tl++ = rpc_auth_kerb;
1587 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1588 *tl = ktvout.tv_sec;
1589 nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1590 *tl++ = ktvout.tv_usec;
1591 *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1592 } else {
1593 *tl++ = 0;
1594 *tl++ = 0;
1595 }
1596 lck_rw_done(&slp->ns_rwlock);
1597 } else {
1598 *tl++ = 0;
1599 *tl++ = 0;
1600 }
1601 switch (err) {
1602 case EPROGUNAVAIL:
1603 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1604 break;
1605 case EPROGMISMATCH:
1606 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1607 nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1608 // XXX hard coded versions
1609 *tl++ = txdr_unsigned(2);
1610 *tl = txdr_unsigned(3);
1611 break;
1612 case EPROCUNAVAIL:
1613 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1614 break;
1615 case EBADRPC:
1616 *tl = txdr_unsigned(RPC_GARBAGE);
1617 break;
1618 default:
1619 *tl = 0;
1620 if (err != NFSERR_RETVOID) {
1621 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1622 if (err)
1623 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1624 else
1625 *tl = 0;
1626 }
1627 break;
1628 }
1629 }
1630
1631 if (mrq != NULL)
1632 *mrq = mreq;
1633 *mbp = mb;
1634 *bposp = bpos;
1635 if (err != 0 && err != NFSERR_RETVOID) {
1636 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1637 }
1638 return (0);
1639 }
1640
1641
1642 #endif /* NFS_NOSERVER */
1643
1644
1645 /*
1646 * From FreeBSD 1.58, a Matt Dillon fix...
1647 * Flag a request as being about to terminate.
1648 * The nm_sent count is decremented now to avoid deadlocks when the process
1649 * in soreceive() hasn't yet managed to send its own request.
1650 */
1651 static void
1652 nfs_softterm(struct nfsreq *rep)
1653 {
1654
1655 rep->r_flags |= R_SOFTTERM;
1656 if (rep->r_flags & R_SENT) {
1657 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1658 rep->r_nmp->nm_cwnd);
1659 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1660 rep->r_flags &= ~R_SENT;
1661 }
1662 }
1663
1664 void
1665 nfs_timer_funnel(void * arg)
1666 {
1667 (void) thread_funnel_set(kernel_flock, TRUE);
1668 nfs_timer(arg);
1669 (void) thread_funnel_set(kernel_flock, FALSE);
1670
1671 }
1672
1673 /*
1674 * Ensure rep isn't in use by the timer, then dequeue it.
1675 */
1676 static void
1677 nfs_repdequeue(struct nfsreq *rep)
1678 {
1679
1680 while ((rep->r_flags & R_BUSY)) {
1681 rep->r_flags |= R_WAITING;
1682 tsleep(rep, PSOCK, "repdeq", 0);
1683 }
1684 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1685 }
1686
1687 /*
1688 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
1689 * free()'d out from under it.
1690 */
1691 static void
1692 nfs_repbusy(struct nfsreq *rep)
1693 {
1694
1695 if ((rep->r_flags & R_BUSY))
1696 panic("rep locked");
1697 rep->r_flags |= R_BUSY;
1698 }
1699
1700 /*
1701 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1702 */
1703 static struct nfsreq *
1704 nfs_repnext(struct nfsreq *rep)
1705 {
1706 struct nfsreq * nextrep;
1707
1708 if (rep == NULL)
1709 return (NULL);
1710 /*
1711 * We need to get and busy the next req before signalling the
1712 * current one, otherwise wakeup() may block us and we'll race to
1713 * grab the next req.
1714 */
1715 nextrep = TAILQ_NEXT(rep, r_chain);
1716 if (nextrep != NULL)
1717 nfs_repbusy(nextrep);
1718 /* unbusy and signal. */
1719 rep->r_flags &= ~R_BUSY;
1720 if ((rep->r_flags & R_WAITING)) {
1721 rep->r_flags &= ~R_WAITING;
1722 wakeup(rep);
1723 }
1724 return (nextrep);
1725 }
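/*
 * Illustrative sketch (not part of the original file): how nfs_timer()
 * uses nfs_repbusy()/nfs_repnext() to walk nfs_reqq hand-over-hand, so
 * the request being examined can't be freed out from under it.
 */
#if 0
	struct nfsreq *rep = TAILQ_FIRST(&nfs_reqq);
	if (rep != NULL)
		nfs_repbusy(rep);		/* pin the first request */
	for (; rep != NULL; rep = nfs_repnext(rep)) {
		/* rep is busied here; nfs_repnext() busies the successor
		 * before unbusying (and waking any waiter on) rep */
	}
#endif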
1726
1727 /*
1728 * Nfs timer routine
1729 * Scan the nfsreq list and retransmit any requests that have timed out.
1730 * To avoid retransmission attempts on STREAM sockets (in the future) make
1731 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1732 */
1733 void
1734 nfs_timer(__unused void *arg)
1735 {
1736 struct nfsreq *rep;
1737 mbuf_t m;
1738 socket_t so;
1739 struct nfsmount *nmp;
1740 int timeo;
1741 int error;
1742 #ifndef NFS_NOSERVER
1743 struct nfssvc_sock *slp;
1744 u_quad_t cur_usec;
1745 #endif /* NFS_NOSERVER */
1746 int flags, rexmit, cwnd, sent;
1747 u_long xid;
1748 struct timeval now;
1749
1750 rep = TAILQ_FIRST(&nfs_reqq);
1751 if (rep != NULL)
1752 nfs_repbusy(rep);
1753 microuptime(&now);
1754 for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1755 nmp = rep->r_nmp;
1756 if (!nmp) /* unmounted */
1757 continue;
1758 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1759 continue;
1760 if (nfs_sigintr(nmp, rep, rep->r_procp))
1761 continue;
1762 if (nmp->nm_tprintf_initial_delay != 0 &&
1763 (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1764 rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1765 rep->r_lastmsg = now.tv_sec;
1766 nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1767 "not responding");
1768 rep->r_flags |= R_TPRINTFMSG;
1769 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1770 /* we're not yet completely mounted and */
1771 /* we can't complete an RPC, so we fail */
1772 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1773 nfs_softterm(rep);
1774 continue;
1775 }
1776 }
1777 if (rep->r_rtt >= 0) {
1778 rep->r_rtt++;
1779 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1780 timeo = nmp->nm_timeo;
1781 else
1782 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1783 /* ensure 62.5 ms floor */
1784 while (16 * timeo < hz)
1785 timeo *= 2;
1786 if (nmp->nm_timeouts > 0)
1787 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1788 if (rep->r_rtt <= timeo)
1789 continue;
1790 if (nmp->nm_timeouts < 8)
1791 nmp->nm_timeouts++;
1792 }
1793 /*
1794 * Check for too many retransmits. This is never true for
1795 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1796 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1797 */
1798 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1799 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1800 nfs_softterm(rep);
1801 continue;
1802 }
1803 if (nmp->nm_sotype != SOCK_DGRAM) {
1804 if (++rep->r_rexmit > NFS_MAXREXMIT)
1805 rep->r_rexmit = NFS_MAXREXMIT;
1806 continue;
1807 }
1808 if ((so = nmp->nm_so) == NULL)
1809 continue;
1810
1811 /*
1812 * If there is enough space and the window allows,
1813 * resend it.
1814 * Set r_rtt to -1 in case we fail to send it now.
1815 */
1816 rep->r_rtt = -1;
1817 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1818 (rep->r_flags & R_SENT) ||
1819 nmp->nm_sent < nmp->nm_cwnd) &&
1820 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1821 struct msghdr msg;
1822 /*
1823 * Iff first send, start timing
1824 * else turn timing off, backoff timer
1825 * and divide congestion window by 2.
1826 * We update these *before* the send to avoid
1827 * racing against receiving the reply.
1828 * We save them so we can restore them on send error.
1829 */
1830 flags = rep->r_flags;
1831 rexmit = rep->r_rexmit;
1832 cwnd = nmp->nm_cwnd;
1833 sent = nmp->nm_sent;
1834 xid = rep->r_xid;
1835 if (rep->r_flags & R_SENT) {
1836 rep->r_flags &= ~R_TIMING;
1837 if (++rep->r_rexmit > NFS_MAXREXMIT)
1838 rep->r_rexmit = NFS_MAXREXMIT;
1839 nmp->nm_cwnd >>= 1;
1840 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1841 nmp->nm_cwnd = NFS_CWNDSCALE;
1842 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1843 } else {
1844 rep->r_flags |= R_SENT;
1845 nmp->nm_sent += NFS_CWNDSCALE;
1846 }
1847 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1848
1849 bzero(&msg, sizeof(msg));
1850 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1851 msg.msg_name = mbuf_data(nmp->nm_nam);
1852 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1853 }
1854 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1855
1856 FSDBG(535, xid, error, sent, cwnd);
1857
1858 if (error) {
1859 if (error == EWOULDBLOCK) {
1860 rep->r_flags = flags;
1861 rep->r_rexmit = rexmit;
1862 nmp->nm_cwnd = cwnd;
1863 nmp->nm_sent = sent;
1864 rep->r_xid = xid;
1865 }
1866 else {
1867 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1868 int clearerror;
1869 int optlen = sizeof(clearerror);
1870 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1871 }
1872 rep->r_flags = flags | R_RESENDERR;
1873 rep->r_rexmit = rexmit;
1874 nmp->nm_cwnd = cwnd;
1875 nmp->nm_sent = sent;
1876 if (flags & R_SENT)
1877 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1878 }
1879 } else
1880 rep->r_rtt = 0;
1881 }
1882 }
1883 microuptime(&now);
1884 #ifndef NFS_NOSERVER
1885 /*
1886 * Scan the write gathering queues for writes that need to be
1887 * completed now.
1888 */
1889 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1890 lck_mtx_lock(nfsd_mutex);
1891 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1892 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1893 nfsrv_wakenfsd(slp);
1894 }
1895 lck_mtx_unlock(nfsd_mutex);
1896 #endif /* NFS_NOSERVER */
1897
1898 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1899 /*
1900 * We haven't called nfs_buf_freeup() in a little while.
1901 * So, see if we can free up any stale/unused bufs now.
1902 */
1903 nfs_buf_freeup(1);
1904 }
1905
1906 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1907
1908 }
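/*
 * Illustrative worked example (not part of the original file) of the
 * timeout test in nfs_timer() above. With hz == 100 and a computed rto of
 * 3 ticks, the "while (16 * timeo < hz)" loop doubles 3 -> 6 -> 12 so that
 * timeo reaches hz/16 ticks (the 62.5 ms floor). With two prior timeouts
 * on the mount, nfs_backoff[1] == 4 then scales that to 48 ticks, and the
 * request is retransmitted only once r_rtt exceeds it.
 */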
1909
1910
1911 /*
1912 * Test for a termination condition pending on the process.
1913 * This is used to determine if we need to bail on a mount.
1914 * EIO is returned if there has been a soft timeout.
1915 * EINTR is returned if there is a signal pending that is not being ignored
1916 * and the mount is interruptible, or if we are a thread that is in the process
1917 * of cancellation (also SIGKILL posted).
1918 */
1919 int
1920 nfs_sigintr(nmp, rep, p)
1921 struct nfsmount *nmp;
1922 struct nfsreq *rep;
1923 proc_t p;
1924 {
1925 sigset_t pending_sigs;
1926 int context_good = 0;
1927 struct nfsmount *repnmp;
1928 extern proc_t kernproc;
1929
1930 if (nmp == NULL)
1931 return (ENXIO);
1932 if (rep != NULL) {
1933 repnmp = rep->r_nmp;
1934 /* we've had a forced unmount. */
1935 if (repnmp == NULL)
1936 return (ENXIO);
1937 /* request has timed out on a 'soft' mount. */
1938 if (rep->r_flags & R_SOFTTERM)
1939 return (EIO);
1940 /*
1941 * We're in the middle of a forced unmount and there's
1942 * been a timeout; we're dead and fail I/O.
1943 */
1944 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1945 (NFSSTA_FORCE|NFSSTA_TIMEO))
1946 return (EIO);
1947 /* Someone is unmounting us, go soft and mark it. */
1948 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1949 repnmp->nm_flag |= NFSMNT_SOFT;
1950 nmp->nm_state |= NFSSTA_FORCE;
1951 }
1952 /*
1953 * If the mount is hung and we've requested not to hang
1954 * on remote filesystems, then bail now.
1955 */
1956 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1957 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1958 return (EIO);
1959 }
1960 /* XXX: is this valid? this probably should be an assertion. */
1961 if (p == NULL)
1962 return (0);
1963
1964 /* If this thread belongs to the kernel task, the abort check is not needed. */
1965 if ((current_proc() != kernproc) && current_thread_aborted()) {
1966 return (EINTR);
1967 }
1968 /* Mask off signals that are blocked by the thread or the process. */
1969
1970 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1971 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1972 return (EINTR);
1973 return (0);
1974 }
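
/*
 * Usage sketch (example only, not in the original source): code that
 * blocks waiting on NFS state re-checks nfs_sigintr() each time around
 * its sleep loop, so a soft timeout (EIO), a pending signal on an
 * interruptible mount (EINTR), or a vanished mount (ENXIO) aborts the
 * wait.  ex_wait_for_reply is a hypothetical caller:
 */
#if 0	/* example only */
static int
ex_wait_for_reply(struct nfsmount *nmp, struct nfsreq *rep, proc_t p)
{
	int error;

	while (rep->r_mrep == NULL) {		/* reply not yet arrived */
		if ((error = nfs_sigintr(nmp, rep, p)))
			return (error);		/* EIO, EINTR, or ENXIO */
		tsleep((caddr_t)rep, PZERO - 1, "exwait", hz);
	}
	return (0);
}
#endif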
1975
1976 /*
1977 * Lock a socket against others.
1978 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1979 * and also to avoid race conditions between the processes with nfs requests
1980 * in progress when a reconnect is necessary.
1981 */
1982 int
1983 nfs_sndlock(rep)
1984 struct nfsreq *rep;
1985 {
1986 int *statep;
1987 proc_t p;
1988 int error, slpflag = 0, slptimeo = 0;
1989
1990 if (rep->r_nmp == NULL)
1991 return (ENXIO);
1992 statep = &rep->r_nmp->nm_state;
1993
1994 p = rep->r_procp;
1995 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1996 slpflag = PCATCH;
1997 while (*statep & NFSSTA_SNDLOCK) {
1998 error = nfs_sigintr(rep->r_nmp, rep, p);
1999 if (error)
2000 return (error);
2001 *statep |= NFSSTA_WANTSND;
2002 if (p != NULL && (proc_noremotehang(p)) != 0)
2003 slptimeo = hz;
2004 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2005 if (slpflag == PCATCH) {
2006 slpflag = 0;
2007 slptimeo = 2 * hz;
2008 }
2009 /*
2010 * Make sure while we slept that the mountpoint didn't go away.
2011 * nfs_sigintr and callers expect it intact.
2012 */
2013 if (!rep->r_nmp)
2014 return (ENXIO); /* don't have lock until out of loop */
2015 }
2016 *statep |= NFSSTA_SNDLOCK;
2017 return (0);
2018 }
2019
2020 /*
2021 * Unlock the stream socket for others.
2022 */
2023 void
2024 nfs_sndunlock(rep)
2025 struct nfsreq *rep;
2026 {
2027 int *statep;
2028
2029 if (rep->r_nmp == NULL)
2030 return;
2031 statep = &rep->r_nmp->nm_state;
2032 if ((*statep & NFSSTA_SNDLOCK) == 0)
2033 panic("nfs sndunlock");
2034 *statep &= ~NFSSTA_SNDLOCK;
2035 if (*statep & NFSSTA_WANTSND) {
2036 *statep &= ~NFSSTA_WANTSND;
2037 wakeup((caddr_t)statep);
2038 }
2039 }
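
/*
 * The send lock above is the classic BSD flag-based sleep lock: a LOCKED
 * bit guards the resource, a WANTED bit records that a sleeper exists,
 * and tsleep()/wakeup() on the address of the flag word provide the
 * rendezvous.  A minimal sketch of the bare pattern (hypothetical names,
 * no signal or timeout handling):
 */
#if 0	/* example only */
#define EX_LOCKED	0x01
#define EX_WANTED	0x02

static void
ex_flag_lock(int *statep)
{
	while (*statep & EX_LOCKED) {
		*statep |= EX_WANTED;
		tsleep((caddr_t)statep, PZERO - 1, "exlck", 0);
	}
	*statep |= EX_LOCKED;
}

static void
ex_flag_unlock(int *statep)
{
	*statep &= ~EX_LOCKED;
	if (*statep & EX_WANTED) {
		*statep &= ~EX_WANTED;
		wakeup((caddr_t)statep);
	}
}
#endif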
2040
2041 static int
2042 nfs_rcvlock(struct nfsreq *rep)
2043 {
2044 int *statep;
2045 int error, slpflag, slptimeo = 0;
2046
2047 /* make sure we still have our mountpoint */
2048 if (!rep->r_nmp) {
2049 if (rep->r_mrep != NULL)
2050 return (EALREADY);
2051 return (ENXIO);
2052 }
2053
2054 statep = &rep->r_nmp->nm_state;
2055 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2056 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2057 slpflag = PCATCH;
2058 else
2059 slpflag = 0;
2060 while (*statep & NFSSTA_RCVLOCK) {
2061 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2062 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2063 return (error);
2064 } else if (rep->r_mrep != NULL) {
2065 /*
2066 * Don't bother sleeping if reply already arrived
2067 */
2068 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2069 return (EALREADY);
2070 }
2071 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2072 *statep |= NFSSTA_WANTRCV;
2073 /*
2074 * We need to poll if we're P_NOREMOTEHANG so that we
2075 * call nfs_sigintr periodically above.
2076 */
2077 if (rep->r_procp != NULL &&
2078 (proc_noremotehang(rep->r_procp)) != 0)
2079 slptimeo = hz;
2080 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2081 if (slpflag == PCATCH) {
2082 slpflag = 0;
2083 slptimeo = 2 * hz;
2084 }
2085 /*
2086 * Make sure while we slept that the mountpoint didn't go away.
2087 * nfs_sigintr and caller nfs_reply expect it intact.
2088 */
2089 if (!rep->r_nmp) {
2090 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2091 return (ENXIO); /* don't have lock until out of loop */
2092 }
2093 }
2094 /*
2095 * nfs_reply will handle it if reply already arrived.
2096 * (We may have slept or been preempted).
2097 */
2098 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2099 *statep |= NFSSTA_RCVLOCK;
2100 return (0);
2101 }
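
/*
 * Note on the sleep parameters used above (illustration, not original
 * code): the first tsleep() uses PCATCH so a signal wakes the sleeper
 * immediately; after the first wakeup the loop drops PCATCH and bounds
 * every later sleep at 2 seconds, which guarantees the nfs_sigintr()
 * check at the top of the loop still runs periodically.  The same shape,
 * reduced to its skeleton with hypothetical names:
 */
#if 0	/* example only */
static void
ex_poll_until_clear(int *statep, int flagbit, int interruptible)
{
	int slpflag = interruptible ? PCATCH : 0;
	int slptimeo = 0;

	while (*statep & flagbit) {
		/* a real caller would check nfs_sigintr() here */
		tsleep((caddr_t)statep, slpflag | (PZERO - 1), "expoll", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;		/* only catch signals once */
			slptimeo = 2 * hz;	/* then poll every 2 seconds */
		}
	}
}
#endif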
2102
2103 /*
2104 * Unlock the socket receive lock for others.
2105 */
2106 static void
2107 nfs_rcvunlock(struct nfsreq *rep)
2108 {
2109 int *statep;
2110
2111 if (rep->r_nmp == NULL)
2112 return;
2113 statep = &rep->r_nmp->nm_state;
2114
2115 FSDBG(533, statep, *statep, 0, 0);
2116 if ((*statep & NFSSTA_RCVLOCK) == 0)
2117 panic("nfs rcvunlock");
2118 *statep &= ~NFSSTA_RCVLOCK;
2119 if (*statep & NFSSTA_WANTRCV) {
2120 *statep &= ~NFSSTA_WANTRCV;
2121 wakeup((caddr_t)statep);
2122 }
2123 }
2124
2125
2126 #ifndef NFS_NOSERVER
2127 /*
2128 * Socket upcall routine for the nfsd sockets.
2129 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2130 * Essentially do as much as possible non-blocking; otherwise punt, and
2131 * this routine will be called again with MBUF_WAITOK from an nfsd.
2132 */
2133 void
2134 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2135 {
2136 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2137
2138 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2139 return;
2140
2141 lck_rw_lock_exclusive(&slp->ns_rwlock);
2142 nfsrv_rcv_locked(so, slp, waitflag);
2143 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2144 }
2145 void
2146 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2147 {
2148 mbuf_t m, mp, mhck, m2;
2149 int ns_flag=0, error;
2150 struct msghdr msg;
2151 size_t bytes_read;
2152
2153 if ((slp->ns_flag & SLP_VALID) == 0) {
2154 if (waitflag == MBUF_DONTWAIT)
2155 lck_rw_done(&slp->ns_rwlock);
2156 return;
2157 }
2158
2159 #ifdef notdef
2160 /*
2161 * Define this to test how the nfsds handle this case under heavy load.
2162 */
2163 if (waitflag == MBUF_DONTWAIT) {
2164 ns_flag = SLP_NEEDQ;
2165 goto dorecs;
2166 }
2167 #endif
2168 if (slp->ns_sotype == SOCK_STREAM) {
2169 /*
2170 * If there are already records on the queue, defer soreceive()
2171 * to an nfsd so that there is feedback to the TCP layer that
2172 * the nfs servers are heavily loaded.
2173 */
2174 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2175 ns_flag = SLP_NEEDQ;
2176 goto dorecs;
2177 }
2178
2179 /*
2180 * Do soreceive().
2181 */
2182 bytes_read = 1000000000;
2183 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2184 if (error || mp == NULL) {
2185 if (error == EWOULDBLOCK)
2186 ns_flag = SLP_NEEDQ;
2187 else
2188 ns_flag = SLP_DISCONN;
2189 goto dorecs;
2190 }
2191 m = mp;
2192 if (slp->ns_rawend) {
2193 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2194 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2195 slp->ns_cc += bytes_read;
2196 } else {
2197 slp->ns_raw = m;
2198 slp->ns_cc = bytes_read;
2199 }
2200 while ((m2 = mbuf_next(m)))
2201 m = m2;
2202 slp->ns_rawend = m;
2203
2204 /*
2205 * Now try to parse record(s) out of the raw stream data.
2206 */
2207 error = nfsrv_getstream(slp, waitflag);
2208 if (error) {
2209 if (error == EPERM)
2210 ns_flag = SLP_DISCONN;
2211 else
2212 ns_flag = SLP_NEEDQ;
2213 }
2214 } else {
2215 struct sockaddr_storage nam;
2216
2217 bzero(&msg, sizeof(msg));
2218 msg.msg_name = (caddr_t)&nam;
2219 msg.msg_namelen = sizeof(nam);
2220
2221 do {
2222 bytes_read = 1000000000;
2223 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2224 if (mp) {
2225 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2226 mbuf_setlen(mhck, nam.ss_len);
2227 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2228 m = mhck;
2229 if (mbuf_setnext(m, mp)) {
2230 /* trouble... just drop it */
2231 printf("nfsrv_rcv: mbuf_setnext failed\n");
2232 mbuf_free(mhck);
2233 m = mp;
2234 }
2235 } else {
2236 m = mp;
2237 }
2238 if (slp->ns_recend)
2239 mbuf_setnextpkt(slp->ns_recend, m);
2240 else
2241 slp->ns_rec = m;
2242 slp->ns_recend = m;
2243 mbuf_setnextpkt(m, NULL);
2244 }
2245 #if 0
2246 if (error) {
2247 /*
2248 * This may be needed in the future to support
2249 * non-byte-stream connection-oriented protocols
2250 * such as SCTP.
2251 */
2252 /*
2253 * This (slp->ns_sotype == SOCK_STREAM) should really
2254 * be a check for PR_CONNREQUIRED.
2255 */
2256 if ((slp->ns_sotype == SOCK_STREAM)
2257 && error != EWOULDBLOCK) {
2258 ns_flag = SLP_DISCONN;
2259 goto dorecs;
2260 }
2261 }
2262 #endif
2263 } while (mp);
2264 }
2265
2266 /*
2267 * Now try to process the request records, non-blocking.
2268 */
2269 dorecs:
2270 if (ns_flag)
2271 slp->ns_flag |= ns_flag;
2272 if (waitflag == MBUF_DONTWAIT) {
2273 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2274 lck_rw_done(&slp->ns_rwlock);
2275 if (wake && nfs_numnfsd) {
2276 lck_mtx_lock(nfsd_mutex);
2277 nfsrv_wakenfsd(slp);
2278 lck_mtx_unlock(nfsd_mutex);
2279 }
2280 }
2281 }
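
/*
 * For datagram sockets, the receive loop above tags each record with its
 * source: the sender's sockaddr is copied into a leading MBUF_TYPE_SONAME
 * mbuf and the payload chained behind it, so nfsrv_dorec() can split the
 * address back off later.  A sketch of that record construction
 * (hypothetical helper, example only; falls back to the bare payload if
 * the address mbuf can't be set up):
 */
#if 0	/* example only */
static mbuf_t
ex_tag_record(struct sockaddr_storage *nam, mbuf_t payload)
{
	mbuf_t mhck;

	if (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) != 0)
		return (payload);
	mbuf_setlen(mhck, nam->ss_len);
	bcopy(nam, mbuf_data(mhck), nam->ss_len);
	if (mbuf_setnext(mhck, payload)) {	/* chain: [SONAME] -> [data] */
		mbuf_free(mhck);
		return (payload);
	}
	return (mhck);
}
#endif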
2282
2283 /*
2284 * Try to extract an RPC request from the mbuf data list received on a
2285 * stream socket. The "waitflag" argument indicates whether or not it
2286 * can sleep.
2287 */
2288 static int
2289 nfsrv_getstream(slp, waitflag)
2290 struct nfssvc_sock *slp;
2291 int waitflag;
2292 {
2293 mbuf_t m;
2294 char *cp1, *cp2, *mdata;
2295 int len, mlen, error;
2296 mbuf_t om, m2, recm;
2297 u_long recmark;
2298
2299 if (slp->ns_flag & SLP_GETSTREAM)
2300 panic("nfs getstream");
2301 slp->ns_flag |= SLP_GETSTREAM;
2302 for (;;) {
2303 if (slp->ns_reclen == 0) {
2304 if (slp->ns_cc < NFSX_UNSIGNED) {
2305 slp->ns_flag &= ~SLP_GETSTREAM;
2306 return (0);
2307 }
2308 m = slp->ns_raw;
2309 mdata = mbuf_data(m);
2310 mlen = mbuf_len(m);
2311 if (mlen >= NFSX_UNSIGNED) {
2312 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2313 mdata += NFSX_UNSIGNED;
2314 mlen -= NFSX_UNSIGNED;
2315 mbuf_setdata(m, mdata, mlen);
2316 } else {
2317 cp1 = (caddr_t)&recmark;
2318 cp2 = mdata;
2319 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2320 while (mlen == 0) {
2321 m = mbuf_next(m);
2322 cp2 = mbuf_data(m);
2323 mlen = mbuf_len(m);
2324 }
2325 *cp1++ = *cp2++;
2326 mlen--;
2327 mbuf_setdata(m, cp2, mlen);
2328 }
2329 }
2330 slp->ns_cc -= NFSX_UNSIGNED;
2331 recmark = ntohl(recmark);
2332 slp->ns_reclen = recmark & ~0x80000000;
2333 if (recmark & 0x80000000)
2334 slp->ns_flag |= SLP_LASTFRAG;
2335 else
2336 slp->ns_flag &= ~SLP_LASTFRAG;
2337 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2338 slp->ns_flag &= ~SLP_GETSTREAM;
2339 return (EPERM);
2340 }
2341 }
2342
2343 /*
2344 * Now get the record part.
2345 *
2346 * Note that slp->ns_reclen may be 0. Linux sometimes
2347 * generates 0-length RPCs.
2348 */
2349 recm = NULL;
2350 if (slp->ns_cc == slp->ns_reclen) {
2351 recm = slp->ns_raw;
2352 slp->ns_raw = slp->ns_rawend = NULL;
2353 slp->ns_cc = slp->ns_reclen = 0;
2354 } else if (slp->ns_cc > slp->ns_reclen) {
2355 len = 0;
2356 m = slp->ns_raw;
2357 mlen = mbuf_len(m);
2358 mdata = mbuf_data(m);
2359 om = NULL;
2360 while (len < slp->ns_reclen) {
2361 if ((len + mlen) > slp->ns_reclen) {
2362 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2363 slp->ns_flag &= ~SLP_GETSTREAM;
2364 return (EWOULDBLOCK);
2365 }
2366 if (om) {
2367 if (mbuf_setnext(om, m2)) {
2368 /* trouble... just drop it */
2369 printf("nfsrv_getstream: mbuf_setnext failed\n");
2370 mbuf_freem(m2);
2371 slp->ns_flag &= ~SLP_GETSTREAM;
2372 return (EWOULDBLOCK);
2373 }
2374 recm = slp->ns_raw;
2375 } else {
2376 recm = m2;
2377 }
2378 mdata += slp->ns_reclen - len;
2379 mlen -= slp->ns_reclen - len;
2380 mbuf_setdata(m, mdata, mlen);
2381 len = slp->ns_reclen;
2382 } else if ((len + mlen) == slp->ns_reclen) {
2383 om = m;
2384 len += mlen;
2385 m = mbuf_next(m);
2386 recm = slp->ns_raw;
2387 if (mbuf_setnext(om, NULL)) {
2388 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2389 slp->ns_flag &= ~SLP_GETSTREAM;
2390 return (EWOULDBLOCK);
2391 }
2392 mlen = mbuf_len(m);
2393 mdata = mbuf_data(m);
2394 } else {
2395 om = m;
2396 len += mlen;
2397 m = mbuf_next(m);
2398 mlen = mbuf_len(m);
2399 mdata = mbuf_data(m);
2400 }
2401 }
2402 slp->ns_raw = m;
2403 slp->ns_cc -= len;
2404 slp->ns_reclen = 0;
2405 } else {
2406 slp->ns_flag &= ~SLP_GETSTREAM;
2407 return (0);
2408 }
2409
2410 /*
2411 * Accumulate the fragments into a record.
2412 */
2413 if (slp->ns_frag == NULL) {
2414 slp->ns_frag = recm;
2415 } else {
2416 m = slp->ns_frag;
2417 while ((m2 = mbuf_next(m)))
2418 m = m2;
2419 if ((error = mbuf_setnext(m, recm)))
2420 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2421 }
2422 if (slp->ns_flag & SLP_LASTFRAG) {
2423 if (slp->ns_recend)
2424 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2425 else
2426 slp->ns_rec = slp->ns_frag;
2427 slp->ns_recend = slp->ns_frag;
2428 slp->ns_frag = NULL;
2429 }
2430 }
2431 }
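
/*
 * RPC-over-TCP framing recap (example only, not original code): each
 * fragment on the stream is preceded by a 4-byte big-endian record mark
 * whose high bit flags the last fragment of a record and whose low 31
 * bits give the fragment length.  nfsrv_getstream() above peels these
 * marks off the raw byte stream; decoding one mark in isolation:
 */
#if 0	/* example only */
static void
ex_decode_recmark(u_long recmark_be, u_long *fraglen, int *lastfrag)
{
	u_long recmark = ntohl(recmark_be);	/* wire order -> host order */

	*lastfrag = (recmark & 0x80000000) != 0;
	*fraglen = recmark & ~0x80000000;	/* length of this fragment */
}
#endif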
2432
2433 /*
2434 * Dequeue the next request record and parse its RPC header.
2435 */
2436 int
2437 nfsrv_dorec(slp, nfsd, ndp)
2438 struct nfssvc_sock *slp;
2439 struct nfsd *nfsd;
2440 struct nfsrv_descript **ndp;
2441 {
2442 mbuf_t m;
2443 mbuf_t nam;
2444 struct nfsrv_descript *nd;
2445 int error;
2446
2447 *ndp = NULL;
2448 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2449 return (ENOBUFS);
2450 MALLOC_ZONE(nd, struct nfsrv_descript *,
2451 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2452 if (!nd)
2453 return (ENOMEM);
2454 m = slp->ns_rec;
2455 slp->ns_rec = mbuf_nextpkt(m);
2456 if (slp->ns_rec)
2457 mbuf_setnextpkt(m, NULL);
2458 else
2459 slp->ns_recend = NULL;
2460 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2461 nam = m;
2462 m = mbuf_next(m);
2463 if ((error = mbuf_setnext(nam, NULL)))
2464 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2465 } else
2466 nam = NULL;
2467 nd->nd_md = nd->nd_mrep = m;
2468 nd->nd_nam2 = nam;
2469 nd->nd_dpos = mbuf_data(m);
2470 error = nfs_getreq(nd, nfsd, TRUE);
2471 if (error) {
2472 if (nam)
2473 mbuf_freem(nam);
2474 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2475 return (error);
2476 }
2477 *ndp = nd;
2478 nfsd->nfsd_nd = nd;
2479 return (0);
2480 }
2481
2482 /*
2483 * Parse an RPC request
2484 * - verify it
2485 * - fill in the cred struct.
2486 */
2487 int
2488 nfs_getreq(nd, nfsd, has_header)
2489 struct nfsrv_descript *nd;
2490 struct nfsd *nfsd;
2491 int has_header;
2492 {
2493 int len, i;
2494 u_long *tl;
2495 long t1;
2496 uio_t uiop;
2497 caddr_t dpos, cp2, cp;
2498 u_long nfsvers, auth_type;
2499 uid_t nickuid;
2500 int error = 0, ticklen;
2501 mbuf_t mrep, md;
2502 struct nfsuid *nuidp;
2503 uid_t user_id;
2504 gid_t group_id;
2505 int ngroups;
2506 struct ucred temp_cred;
2507 struct timeval tvin, tvout, now;
2508 char uio_buf[ UIO_SIZEOF(1) ];
2509 #if 0 /* until encrypted keys are implemented */
2510 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2511 #endif
2512
2513 nd->nd_cr = NULL;
2514
2515 mrep = nd->nd_mrep;
2516 md = nd->nd_md;
2517 dpos = nd->nd_dpos;
2518 if (has_header) {
2519 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2520 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2521 if (*tl++ != rpc_call) {
2522 mbuf_freem(mrep);
2523 return (EBADRPC);
2524 }
2525 } else
2526 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2527 nd->nd_repstat = 0;
2528 nd->nd_flag = 0;
2529 if (*tl++ != rpc_vers) {
2530 nd->nd_repstat = ERPCMISMATCH;
2531 nd->nd_procnum = NFSPROC_NOOP;
2532 return (0);
2533 }
2534 if (*tl != nfs_prog) {
2535 nd->nd_repstat = EPROGUNAVAIL;
2536 nd->nd_procnum = NFSPROC_NOOP;
2537 return (0);
2538 }
2539 tl++;
2540 nfsvers = fxdr_unsigned(u_long, *tl++);
2541 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2542 nd->nd_repstat = EPROGMISMATCH;
2543 nd->nd_procnum = NFSPROC_NOOP;
2544 return (0);
2545 }
2546 else if (nfsvers == NFS_VER3)
2547 nd->nd_flag = ND_NFSV3;
2548 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2549 if (nd->nd_procnum == NFSPROC_NULL)
2550 return (0);
2551 if ((nd->nd_procnum >= NFS_NPROCS) ||
2552 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2553 nd->nd_repstat = EPROCUNAVAIL;
2554 nd->nd_procnum = NFSPROC_NOOP;
2555 return (0);
2556 }
2557 if ((nd->nd_flag & ND_NFSV3) == 0)
2558 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2559 auth_type = *tl++;
2560 len = fxdr_unsigned(int, *tl++);
2561 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2562 mbuf_freem(mrep);
2563 return (EBADRPC);
2564 }
2565
2566 nd->nd_flag &= ~ND_KERBAUTH;
2567 /*
2568 * Handle auth_unix or auth_kerb.
2569 */
2570 if (auth_type == rpc_auth_unix) {
2571 len = fxdr_unsigned(int, *++tl);
2572 if (len < 0 || len > NFS_MAXNAMLEN) {
2573 mbuf_freem(mrep);
2574 return (EBADRPC);
2575 }
2576 bzero(&temp_cred, sizeof(temp_cred));
2577 nfsm_adv(nfsm_rndup(len));
2578 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2579 user_id = fxdr_unsigned(uid_t, *tl++);
2580 group_id = fxdr_unsigned(gid_t, *tl++);
2581 temp_cred.cr_groups[0] = group_id;
2582 len = fxdr_unsigned(int, *tl);
2583 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2584 mbuf_freem(mrep);
2585 return (EBADRPC);
2586 }
2587 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2588 for (i = 1; i <= len; i++)
2589 if (i < NGROUPS)
2590 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2591 else
2592 tl++;
2593 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2594 if (ngroups > 1)
2595 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2596 len = fxdr_unsigned(int, *++tl);
2597 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2598 mbuf_freem(mrep);
2599 return (EBADRPC);
2600 }
2601 temp_cred.cr_uid = user_id;
2602 temp_cred.cr_ngroups = ngroups;
2603 nd->nd_cr = kauth_cred_create(&temp_cred);
2604 if (nd->nd_cr == NULL) {
2605 nd->nd_repstat = ENOMEM;
2606 nd->nd_procnum = NFSPROC_NOOP;
2607 return (0);
2608 }
2609 if (len > 0)
2610 nfsm_adv(nfsm_rndup(len));
2611 } else if (auth_type == rpc_auth_kerb) {
2612 switch (fxdr_unsigned(int, *tl++)) {
2613 case RPCAKN_FULLNAME:
2614 ticklen = fxdr_unsigned(int, *tl);
2615 *((u_long *)nfsd->nfsd_authstr) = *tl;
2616 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2617 &uio_buf[0], sizeof(uio_buf));
2618 if (!uiop) {
2619 nd->nd_repstat = ENOMEM;
2620 nd->nd_procnum = NFSPROC_NOOP;
2621 return (0);
2622 }
2623
2624 // LP64todo - fix this
2625 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2626 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2627 mbuf_freem(mrep);
2628 return (EBADRPC);
2629 }
2630 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2631 // LP64todo - fix this
2632 nfsm_mtouio(uiop, uio_resid(uiop));
2633 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2634 if (*tl++ != rpc_auth_kerb ||
2635 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2636 printf("Bad kerb verifier\n");
2637 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2638 nd->nd_procnum = NFSPROC_NOOP;
2639 return (0);
2640 }
2641 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2642 tl = (u_long *)cp;
2643 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2644 printf("Not fullname kerb verifier\n");
2645 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2646 nd->nd_procnum = NFSPROC_NOOP;
2647 return (0);
2648 }
2649 cp += NFSX_UNSIGNED;
2650 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2651 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2652 nd->nd_flag |= ND_KERBFULL;
2653 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2654 break;
2655 case RPCAKN_NICKNAME:
2656 if (len != 2 * NFSX_UNSIGNED) {
2657 printf("Kerb nickname short\n");
2658 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2659 nd->nd_procnum = NFSPROC_NOOP;
2660 return (0);
2661 }
2662 nickuid = fxdr_unsigned(uid_t, *tl);
2663 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2664 if (*tl++ != rpc_auth_kerb ||
2665 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2666 printf("Kerb nick verifier bad\n");
2667 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2668 nd->nd_procnum = NFSPROC_NOOP;
2669 return (0);
2670 }
2671 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2672 tvin.tv_sec = *tl++;
2673 tvin.tv_usec = *tl;
2674
2675 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2676 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2677 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2678 (!nd->nd_nam2 ||
2679 netaddr_match(NU_NETFAM(nuidp),
2680 &nuidp->nu_haddr, nd->nd_nam2)))
2681 break;
2682 }
2683 if (!nuidp) {
2684 nd->nd_repstat =
2685 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2686 nd->nd_procnum = NFSPROC_NOOP;
2687 return (0);
2688 }
2689
2690 /*
2691 * Now, decrypt the timestamp using the session key
2692 * and validate it.
2693 */
2694 #if NFSKERB
2695 XXX
2696 #endif
2697
2698 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2699 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2700 microtime(&now);
2701 if (nuidp->nu_expire < now.tv_sec ||
2702 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2703 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2704 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2705 nuidp->nu_expire = 0;
2706 nd->nd_repstat =
2707 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2708 nd->nd_procnum = NFSPROC_NOOP;
2709 return (0);
2710 }
2711 bzero(&temp_cred, sizeof(temp_cred));
2712 ngroups = nuidp->nu_cr->cr_ngroups;
2713 for (i = 0; i < ngroups; i++)
2714 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2715 if (ngroups > 1)
2716 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2717
2718 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2719 temp_cred.cr_ngroups = ngroups;
2720 nd->nd_cr = kauth_cred_create(&temp_cred);
2721 if (!nd->nd_cr) {
2722 nd->nd_repstat = ENOMEM;
2723 nd->nd_procnum = NFSPROC_NOOP;
2724 return (0);
2725 }
2726 nd->nd_flag |= ND_KERBNICK;
2727 }
2728 } else {
2729 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2730 nd->nd_procnum = NFSPROC_NOOP;
2731 return (0);
2732 }
2733
2734 nd->nd_md = md;
2735 nd->nd_dpos = dpos;
2736 return (0);
2737 nfsmout:
2738 if (nd->nd_cr)
2739 kauth_cred_rele(nd->nd_cr);
2740 return (error);
2741 }
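
/*
 * For reference (annotation, not original code): the AUTH_UNIX credential
 * body that the rpc_auth_unix branch above walks is laid out in XDR as a
 * stamp, a machine-name string, the uid and gid, and a counted array of
 * supplementary gids.  A sketch of the same walk over a flat buffer of
 * 4-byte XDR words, assuming the caller has already bounds-checked it:
 */
#if 0	/* example only */
static void
ex_parse_auth_unix(const u_long *tl, uid_t *uidp, gid_t *gidp)
{
	u_long namelen;

	tl++;					/* skip the stamp */
	namelen = ntohl(*tl++);			/* machine-name length */
	tl += (namelen + 3) / 4;		/* skip name, XDR-rounded */
	*uidp = (uid_t)ntohl(*tl++);
	*gidp = (gid_t)ntohl(*tl++);
	/* next word: gid count, followed by that many gids */
}
#endif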
2742
2743 /*
2744 * Search for a sleeping nfsd and wake it up.
2745 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2746 * running nfsds will go look for the work in the nfssvc_sock list.
2747 * Note: Must be called with nfsd_mutex held.
2748 */
2749 void
2750 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2751 {
2752 struct nfsd *nd;
2753
2754 if ((slp->ns_flag & SLP_VALID) == 0)
2755 return;
2756
2757 lck_rw_lock_exclusive(&slp->ns_rwlock);
2758
2759 if (nfsd_waiting) {
2760 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2761 if (nd->nfsd_flag & NFSD_WAITING) {
2762 nd->nfsd_flag &= ~NFSD_WAITING;
2763 if (nd->nfsd_slp)
2764 panic("nfsd wakeup");
2765 slp->ns_sref++;
2766 nd->nfsd_slp = slp;
2767 lck_rw_done(&slp->ns_rwlock);
2768 wakeup((caddr_t)nd);
2769 return;
2770 }
2771 }
2772 }
2773
2774 slp->ns_flag |= SLP_DOREC;
2775
2776 lck_rw_done(&slp->ns_rwlock);
2777
2778 nfsd_head_flag |= NFSD_CHECKSLP;
2779 }
2780 #endif /* NFS_NOSERVER */
2781
2782 static int
2783 nfs_msg(proc_t p,
2784 const char *server,
2785 const char *msg,
2786 int error)
2787 {
2788 tpr_t tpr;
2789
2790 if (p)
2791 tpr = tprintf_open(p);
2792 else
2793 tpr = NULL;
2794 if (error)
2795 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2796 error);
2797 else
2798 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2799 tprintf_close(tpr);
2800 return (0);
2801 }
2802
2803 void
2804 nfs_down(nmp, proc, error, flags, msg)
2805 struct nfsmount *nmp;
2806 proc_t proc;
2807 int error, flags;
2808 const char *msg;
2809 {
2810 if (nmp == NULL)
2811 return;
2812 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2813 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2814 nmp->nm_state |= NFSSTA_TIMEO;
2815 }
2816 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2817 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2818 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2819 }
2820 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2821 }
2822
2823 void
2824 nfs_up(nmp, proc, flags, msg)
2825 struct nfsmount *nmp;
2826 proc_t proc;
2827 int flags;
2828 const char *msg;
2829 {
2830 if (nmp == NULL)
2831 return;
2832 if (msg)
2833 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2834 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2835 nmp->nm_state &= ~NFSSTA_TIMEO;
2836 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2837 }
2838 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2839 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2840 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2841 }
2842 }
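
/*
 * nfs_down()/nfs_up() above implement edge-triggered reporting: a state
 * bit (NFSSTA_TIMEO or NFSSTA_LOCKTIMEO) remembers that the "server not
 * responding" event has been raised, so vfs_event_signal() fires only on
 * each transition, not on every timeout or reply.  The bare pattern, with
 * a hypothetical flag (example only):
 */
#if 0	/* example only */
#define EX_DOWN	0x01

static void
ex_mark_down(int *state)
{
	if (!(*state & EX_DOWN)) {	/* first timeout of this outage */
		*state |= EX_DOWN;
		/* raise "not responding" event here */
	}
}

static void
ex_mark_up(int *state)
{
	if (*state & EX_DOWN) {		/* first sign of life after outage */
		*state &= ~EX_DOWN;
		/* raise "responding again" event here */
	}
}
#endif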
2843