1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1991, 1993, 1995
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
60 */
61
62 /*
63 * Socket operations for use by nfs
64 */
65
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/proc.h>
69 #include <sys/kauth.h>
70 #include <sys/mount_internal.h>
71 #include <sys/kernel.h>
72 #include <sys/kpi_mbuf.h>
73 #include <sys/malloc.h>
74 #include <sys/vnode.h>
75 #include <sys/domain.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/syslog.h>
79 #include <sys/tprintf.h>
80 #include <sys/uio_internal.h>
81 #include <libkern/OSAtomic.h>
82
83 #include <sys/time.h>
84 #include <kern/clock.h>
85 #include <kern/task.h>
86 #include <kern/thread.h>
87 #include <sys/user.h>
88
89 #include <netinet/in.h>
90 #include <netinet/tcp.h>
91
92 #include <nfs/rpcv2.h>
93 #include <nfs/nfsproto.h>
94 #include <nfs/nfs.h>
95 #include <nfs/xdr_subs.h>
96 #include <nfs/nfsm_subs.h>
97 #include <nfs/nfsmount.h>
98 #include <nfs/nfsnode.h>
99 #include <nfs/nfsrtt.h>
100
101 #include <sys/kdebug.h>
102
103 #define FSDBG(A, B, C, D, E) \
104 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
105 (int)(B), (int)(C), (int)(D), (int)(E), 0)
106 #define FSDBG_TOP(A, B, C, D, E) \
107 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
108 (int)(B), (int)(C), (int)(D), (int)(E), 0)
109 #define FSDBG_BOT(A, B, C, D, E) \
110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
111 (int)(B), (int)(C), (int)(D), (int)(E), 0)
112
113 /*
114 * Estimate rto for an nfs rpc sent via an unreliable datagram.
115 * Use the mean and mean deviation of rtt for the appropriate type of rpc
116 * for the frequent rpcs and a default for the others.
117 * The justification for doing "other" this way is that these rpcs
118 * happen so infrequently that timer est. would probably be stale.
119 * Also, since many of these rpcs are
120 * non-idempotent, a conservative timeout is desired.
121 * getattr, lookup - A+2D
122 * read, write - A+4D
123 * other - nm_timeo
124 */
125 #define NFS_RTO(n, t) \
126 ((t) == 0 ? (n)->nm_timeo : \
127 ((t) < 3 ? \
128 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
129 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
130 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
131 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
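/*
 * Worked example of NFS_RTO (illustrative numbers, not taken from a real
 * mount): nm_srtt[] holds roughly 8 x the smoothed rtt and nm_sdrtt[]
 * roughly 4 x the smoothed deviation, so with nm_srtt[0] == 40 and
 * nm_sdrtt[0] == 8 (about a 5-tick mean A and a 2-tick deviation D):
 *
 *	timer class 1 (getattr):  ((((40+3)>>2) + 8 + 1) >> 1) == 9  ~ A+2D
 *	timer class 3 (read):     (((40+7)>>3) + 8 + 1)        == 14 ~ A+4D
 *
 * matching the A+2D / A+4D policy described above (the +1s round up).
 */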
132 /*
133 * External data, mostly RPC constants in XDR form
134 */
135 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
136 rpc_msgaccepted, rpc_call, rpc_autherr,
137 rpc_auth_kerb;
138 extern u_long nfs_prog;
139 extern struct nfsstats nfsstats;
140 extern int nfsv3_procid[NFS_NPROCS];
141 extern int nfs_ticks;
142 extern u_long nfs_xidwrap;
143
144 /*
145 * Defines which timer to use for the procnum.
146 * 0 - default
147 * 1 - getattr
148 * 2 - lookup
149 * 3 - read
150 * 4 - write
151 */
152 static int proct[NFS_NPROCS] = {
153 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
154 };
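/*
 * For example, proct[NFSPROC_GETATTR] == 1 and proct[NFSPROC_LOOKUP] == 2
 * select the per-class estimators indexed by NFS_SRTT/NFS_SDRTT above,
 * reads (including readdir/readdirplus) share class 3, writes use class 4,
 * and the zero entries fall back to the static nm_timeo via NFS_RTO.
 */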
155
156 /*
157 * There is a congestion window for outstanding rpcs maintained per mount
158 * point. The cwnd size is adjusted in roughly the way that:
159 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
160 * SIGCOMM '88". ACM, August 1988.
161 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
162 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
163 * of rpcs is in progress.
164 * (The sent count and cwnd are scaled for integer arith.)
165 * Variants of "slow start" were tried and were found to be too much of a
166 * performance hit (ave. rtt 3 times larger);
167 * I suspect due to the large rtt that nfs rpcs have.
168 */
169 #define NFS_CWNDSCALE 256
170 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
171 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
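/*
 * Scaled arithmetic example (hypothetical window state): NFS_MAXCWND of
 * 32 * 256 allows up to 32 outstanding rpcs. At nm_cwnd == 4096 (16 rpcs),
 * each reply (while a full window is in progress) adds
 * (256*256 + 4096/2) / 4096 == 16, so a full window of replies grows the
 * window by about one rpc (256) per round trip. A retransmit timeout
 * halves nm_cwnd, and nfs_backoff[] further scales the retransmit timeout
 * by 2..256 for consecutive timeouts (nm_timeouts is capped at 8 in
 * nfs_timer()).
 */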
172 int nfsrtton = 0;
173 struct nfsrtt nfsrtt;
174
175 static int nfs_rcvlock(struct nfsreq *);
176 static void nfs_rcvunlock(struct nfsreq *);
177 static int nfs_receive(struct nfsreq *rep, mbuf_t *mp);
178 static int nfs_reconnect(struct nfsreq *rep);
179 static void nfs_repdequeue(struct nfsreq *rep);
180
181 /* XXX */
182 boolean_t current_thread_aborted(void);
183 kern_return_t thread_terminate(thread_t);
184
185 #ifndef NFS_NOSERVER
186 static int nfsrv_getstream(struct nfssvc_sock *,int);
187
188 int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
189 struct nfssvc_sock *slp,
190 proc_t procp,
191 mbuf_t *mreqp) = {
192 nfsrv_null,
193 nfsrv_getattr,
194 nfsrv_setattr,
195 nfsrv_lookup,
196 nfsrv3_access,
197 nfsrv_readlink,
198 nfsrv_read,
199 nfsrv_write,
200 nfsrv_create,
201 nfsrv_mkdir,
202 nfsrv_symlink,
203 nfsrv_mknod,
204 nfsrv_remove,
205 nfsrv_rmdir,
206 nfsrv_rename,
207 nfsrv_link,
208 nfsrv_readdir,
209 nfsrv_readdirplus,
210 nfsrv_statfs,
211 nfsrv_fsinfo,
212 nfsrv_pathconf,
213 nfsrv_commit,
214 nfsrv_noop
215 };
216 #endif /* NFS_NOSERVER */
217
218
219 /*
220 * attempt to bind a socket to a reserved port
221 */
222 static int
223 nfs_bind_resv(struct nfsmount *nmp)
224 {
225 socket_t so = nmp->nm_so;
226 struct sockaddr_in sin;
227 int error;
228 u_short tport;
229
230 if (!so)
231 return (EINVAL);
232
233 sin.sin_len = sizeof (struct sockaddr_in);
234 sin.sin_family = AF_INET;
235 sin.sin_addr.s_addr = INADDR_ANY;
236 tport = IPPORT_RESERVED - 1;
237 sin.sin_port = htons(tport);
238
239 while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
240 (--tport > IPPORT_RESERVED / 2))
241 sin.sin_port = htons(tport);
242 return (error);
243 }
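/*
 * For example, with the usual IPPORT_RESERVED of 1024 the loop above
 * offers ports 1023, 1022, ... down to 513, retrying only while
 * sock_bind() fails with EADDRINUSE; any other error (or exhausting
 * the reserved range) is returned to the caller.
 */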
244
245 /*
246 * variables for managing the nfs_bind_resv_thread
247 */
248 int nfs_resv_mounts = 0;
249 static int nfs_bind_resv_thread_state = 0;
250 #define NFS_BIND_RESV_THREAD_STATE_INITTED 1
251 #define NFS_BIND_RESV_THREAD_STATE_RUNNING 2
252 lck_grp_t *nfs_bind_resv_lck_grp;
253 lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
254 lck_attr_t *nfs_bind_resv_lck_attr;
255 lck_mtx_t *nfs_bind_resv_mutex;
256 struct nfs_bind_resv_request {
257 TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
258 struct nfsmount *brr_nmp;
259 int brr_error;
260 };
261 static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
262
263 /*
264 * thread to handle any reserved port bind requests
265 */
266 static void
267 nfs_bind_resv_thread(void)
268 {
269 struct nfs_bind_resv_request *brreq;
270
271 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
272
273 while (nfs_resv_mounts > 0) {
274 lck_mtx_lock(nfs_bind_resv_mutex);
275 while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
276 TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
277 lck_mtx_unlock(nfs_bind_resv_mutex);
278 brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
279 wakeup(brreq);
280 lck_mtx_lock(nfs_bind_resv_mutex);
281 }
282 msleep((caddr_t)&nfs_bind_resv_request_queue,
283 nfs_bind_resv_mutex, PSOCK | PDROP,
284 "nfs_bind_resv_request_queue", 0);
285 }
286
287 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
288 (void) thread_terminate(current_thread());
289 }
290
291 int
292 nfs_bind_resv_thread_wake(void)
293 {
294 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
295 return (EIO);
296 wakeup(&nfs_bind_resv_request_queue);
297 return (0);
298 }
299
300 /*
301 * underprivileged procs call this to request nfs_bind_resv_thread
302 * to perform the reserved port binding for them.
303 */
304 static int
305 nfs_bind_resv_nopriv(struct nfsmount *nmp)
306 {
307 struct nfs_bind_resv_request brreq;
308 int error;
309
310 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
311 if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
312 nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
313 lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
314 nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
315 nfs_bind_resv_lck_attr = lck_attr_alloc_init();
316 nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
317 TAILQ_INIT(&nfs_bind_resv_request_queue);
318 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
319 }
320 kernel_thread(kernel_task, nfs_bind_resv_thread);
321 nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
322 }
323
324 brreq.brr_nmp = nmp;
325 brreq.brr_error = 0;
326
327 lck_mtx_lock(nfs_bind_resv_mutex);
328 TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
329 lck_mtx_unlock(nfs_bind_resv_mutex);
330
331 error = nfs_bind_resv_thread_wake();
332 if (error) {
333 TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
334 /* Note: we might be able to simply restart the thread */
335 return (error);
336 }
337
338 tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
339
340 return (brreq.brr_error);
341 }
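/*
 * The above is a simple producer/consumer handshake: the unprivileged
 * caller queues its request and tsleep()s on the request's address,
 * while nfs_bind_resv_thread() dequeues it, performs the privileged
 * sock_bind(), fills in brr_error, and wakeup()s the sleeping caller.
 */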
342
343 /*
344 * Initialize sockets and congestion for a new NFS connection.
345 * We do not free the sockaddr on error.
346 */
347 int
348 nfs_connect(
349 struct nfsmount *nmp,
350 __unused struct nfsreq *rep)
351 {
352 socket_t so;
353 int error, rcvreserve, sndreserve;
354 struct sockaddr *saddr;
355 struct timeval timeo;
356
357 nmp->nm_so = 0;
358 saddr = mbuf_data(nmp->nm_nam);
359 error = sock_socket(saddr->sa_family, nmp->nm_sotype,
360 nmp->nm_soproto, 0, 0, &nmp->nm_so);
361 if (error) {
362 goto bad;
363 }
364 so = nmp->nm_so;
365
366 /*
367 * Some servers require that the client port be a reserved port number.
368 */
369 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
370 proc_t p;
371 /*
372 * sobind() requires current_proc() to have superuser privs.
373 * If this bind is part of a reconnect, and the current proc
374 * doesn't have superuser privs, we hand the sobind() off to
375 * a kernel thread to process.
376 */
377 if ((nmp->nm_state & NFSSTA_MOUNTED) &&
378 (p = current_proc()) && suser(kauth_cred_get(), 0)) {
379 /* request nfs_bind_resv_thread() to do bind */
380 error = nfs_bind_resv_nopriv(nmp);
381 } else {
382 error = nfs_bind_resv(nmp);
383 }
384 if (error)
385 goto bad;
386 }
387
388 /*
389 * Protocols that do not require connections may be optionally left
390 * unconnected for servers that reply from a port other than NFS_PORT.
391 */
392 if (nmp->nm_flag & NFSMNT_NOCONN) {
393 if (nmp->nm_sotype == SOCK_STREAM) {
394 error = ENOTCONN;
395 goto bad;
396 }
397 } else {
398 struct timeval tv;
399 tv.tv_sec = 2;
400 tv.tv_usec = 0;
401 error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
402 if (error && error != EINPROGRESS) {
403 goto bad;
404 }
405
406 while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
407 if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
408 goto bad;
409 }
410 }
411 }
412
413 /*
414 * Always time out on receive; this allows us to reconnect the
415 * socket to deal with network changes.
416 */
417 timeo.tv_usec = 0;
418 timeo.tv_sec = 2;
419 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
420 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
421 timeo.tv_sec = 5;
422 } else {
423 timeo.tv_sec = 0;
424 }
425 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
426
427 if (nmp->nm_sotype == SOCK_DGRAM) {
428 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
429 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
430 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
431 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
432 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
433 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
434 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
435 } else {
436 int proto;
437 int on = 1;
438
439 sock_gettype(so, NULL, NULL, &proto);
440 if (nmp->nm_sotype != SOCK_STREAM)
441 panic("nfscon sotype");
442
443 // Assume that SOCK_STREAM always requires a connection
444 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
445
446 if (proto == IPPROTO_TCP) {
447 sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
448 }
449
450 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
451 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
452 (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
453 }
454
455 if (sndreserve > NFS_MAXSOCKBUF)
456 sndreserve = NFS_MAXSOCKBUF;
457 if (rcvreserve > NFS_MAXSOCKBUF)
458 rcvreserve = NFS_MAXSOCKBUF;
459 error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
460 if (error) {
461 goto bad;
462 }
463 error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
464 if (error) {
465 goto bad;
466 }
467
468 sock_nointerrupt(so, 1);
469
470 /* Initialize other non-zero congestion variables */
471 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
472 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
473 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
474 nmp->nm_sdrtt[3] = 0;
475 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
476 nmp->nm_sent = 0;
477 FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
478 nmp->nm_timeouts = 0;
479 return (0);
480
481 bad:
482 nfs_disconnect(nmp);
483 return (error);
484 }
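/*
 * Buffer sizing sketch with hypothetical mount options (NFS_MAXPKTHDR is
 * assumed to be a few hundred bytes of RPC/NFS header slop): a UDP mount
 * with nm_wsize == nm_rsize == 8192 and nm_readahead == 4 requests
 *
 *	sndreserve = (8192 + NFS_MAXPKTHDR) * 3        (~3 requests)
 *	rcvreserve = (8192 + NFS_MAXPKTHDR) * (4 + 1)  (~5 replies)
 *
 * both clamped to NFS_MAXSOCKBUF; the TCP case adds sizeof(u_long) per
 * request for the record mark prepended in nfs_request().
 */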
485
486 /*
487 * Reconnect routine:
488 * Called when a connection is broken on a reliable protocol.
489 * - clean up the old socket
490 * - nfs_connect() again
491 * - set R_MUSTRESEND for all outstanding requests on mount point
492 * If this fails the mount point is DEAD!
493 * nb: Must be called with the nfs_sndlock() set on the mount point.
494 */
495 static int
496 nfs_reconnect(struct nfsreq *rep)
497 {
498 struct nfsreq *rp;
499 struct nfsmount *nmp = rep->r_nmp;
500 int error;
501
502 nfs_disconnect(nmp);
503 while ((error = nfs_connect(nmp, rep))) {
504 if (error == EINTR || error == ERESTART)
505 return (EINTR);
506 if (error == EIO)
507 return (EIO);
508 nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
509 "can not connect");
510 rep->r_flags |= R_TPRINTFMSG;
511 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
512 /* we're not yet completely mounted and */
513 /* we can't reconnect, so we fail */
514 return (error);
515 }
516 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
517 return (error);
518 tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
519 }
520
521 /*
522 * Loop through outstanding request list and fix up all requests
523 * on old socket.
524 */
525 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
526 if (rp->r_nmp == nmp)
527 rp->r_flags |= R_MUSTRESEND;
528 }
529 return (0);
530 }
531
532 /*
533 * NFS disconnect. Clean up and unlink.
534 */
535 void
536 nfs_disconnect(struct nfsmount *nmp)
537 {
538 socket_t so;
539
540 if (nmp->nm_so) {
541 so = nmp->nm_so;
542 nmp->nm_so = 0;
543 sock_shutdown(so, 2);
544 sock_close(so);
545 }
546 }
547
548 /*
549 * This is the nfs send routine. For connection based socket types, it
550 * must be called with an nfs_sndlock() on the socket.
551 * "rep == NULL" indicates that it has been called from a server.
552 * For the client side:
553 * - return EINTR if the RPC is terminated, 0 otherwise
554 * - set R_MUSTRESEND if the send fails for any reason
555 * - do any cleanup required by recoverable socket errors (???)
556 * For the server side:
557 * - return EINTR or ERESTART if interrupted by a signal
558 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
559 * - do any cleanup required by recoverable socket errors (???)
560 */
561 int
562 nfs_send(so, nam, top, rep)
563 socket_t so;
564 mbuf_t nam;
565 mbuf_t top;
566 struct nfsreq *rep;
567 {
568 struct sockaddr *sendnam;
569 int error, error2, sotype, flags;
570 u_long xidqueued = 0;
571 struct nfsreq *rp;
572 char savenametolog[MAXPATHLEN];
573 struct msghdr msg;
574
575 if (rep) {
576 error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
577 if (error) {
578 mbuf_freem(top);
579 return (error);
580 }
581 if ((so = rep->r_nmp->nm_so) == NULL) {
582 rep->r_flags |= R_MUSTRESEND;
583 mbuf_freem(top);
584 return (0);
585 }
586 rep->r_flags &= ~R_MUSTRESEND;
587 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
588 if (rp == rep)
589 break;
590 if (rp)
591 xidqueued = rp->r_xid;
592 }
593 sock_gettype(so, NULL, &sotype, NULL);
594 if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
595 (nam == 0))
596 sendnam = (struct sockaddr *)0;
597 else
598 sendnam = mbuf_data(nam);
599
600 if (sotype == SOCK_SEQPACKET)
601 flags = MSG_EOR;
602 else
603 flags = 0;
604
605 /*
606 * Save the name here in case the mount point goes away while we block.
607 * The name lives on the local stack and is large, but we don't
608 * want to risk blocking in malloc.
609 */
610 if (rep)
611 strncpy(savenametolog,
612 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
613 MAXPATHLEN - 1);
614 bzero(&msg, sizeof(msg));
615 msg.msg_name = (caddr_t)sendnam;
616 msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
617 error = sock_sendmbuf(so, &msg, top, flags, NULL);
618
619 if (error) {
620 if (rep) {
621 if (xidqueued) {
622 TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
623 if (rp == rep && rp->r_xid == xidqueued)
624 break;
625 if (!rp)
626 panic("nfs_send: error %d xid %x gone",
627 error, xidqueued);
628 }
629 log(LOG_INFO, "nfs send error %d for server %s\n",
630 error, savenametolog);
631 /*
632 * Deal with errors for the client side.
633 */
634 error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
635 if (error2) {
636 error = error2;
637 } else {
638 rep->r_flags |= R_MUSTRESEND;
639 }
640 } else
641 log(LOG_INFO, "nfsd send error %d\n", error);
642
643 /*
644 * Handle any recoverable (soft) socket errors here. (???)
645 */
646 if (error != EINTR && error != ERESTART && error != EIO &&
647 error != EWOULDBLOCK && error != EPIPE) {
648 error = 0;
649 }
650 }
651 return (error);
652 }
653
654 /*
655 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
656 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
657 * Mark and consolidate the data into a new mbuf list.
658 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
659 * small mbufs.
660 * For SOCK_STREAM we must be very careful to read an entire record once
661 * we have read any of it, even if the system call has been interrupted.
662 */
663 static int
664 nfs_receive(struct nfsreq *rep, mbuf_t *mp)
665 {
666 socket_t so;
667 struct iovec_32 aio;
668 mbuf_t m, mlast;
669 u_long len, fraglen;
670 int error, error2, sotype;
671 proc_t p = current_proc(); /* XXX */
672 struct msghdr msg;
673 size_t rcvlen;
674 int lastfragment;
675
676 /*
677 * Set up arguments for soreceive()
678 */
679 *mp = NULL;
680 sotype = rep->r_nmp->nm_sotype;
681
682 /*
683 * For reliable protocols, lock against other senders/receivers
684 * in case a reconnect is necessary.
685 * For SOCK_STREAM, first get the Record Mark to find out how much
686 * more there is to get.
687 * We must lock the socket against other receivers
688 * until we have an entire rpc request/reply.
689 */
690 if (sotype != SOCK_DGRAM) {
691 error = nfs_sndlock(rep);
692 if (error)
693 return (error);
694 tryagain:
695 /*
696 * Check for fatal errors and resend the request if needed.
697 */
698 /*
699 * Ugh: If a reconnect attempt just happened, nm_so
700 * would have changed. NULL indicates a failed
701 * attempt that has essentially shut down this
702 * mount point.
703 */
704 if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
705 nfs_sndunlock(rep);
706 if (error)
707 return (error);
708 return (EINTR);
709 }
710 so = rep->r_nmp->nm_so;
711 if (!so) {
712 error = nfs_reconnect(rep);
713 if (error) {
714 nfs_sndunlock(rep);
715 return (error);
716 }
717 goto tryagain;
718 }
719 while (rep->r_flags & R_MUSTRESEND) {
720 error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
721 if (!error) {
722 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
723 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
724 }
725 /*
726 * we also hold the rcv lock so rep is still
727 * legit at this point
728 */
729 if (error) {
730 if (error == EINTR || error == ERESTART ||
731 (error = nfs_reconnect(rep))) {
732 nfs_sndunlock(rep);
733 return (error);
734 }
735 goto tryagain;
736 }
737 }
738 nfs_sndunlock(rep);
739 if (sotype == SOCK_STREAM) {
740 error = 0;
741 len = 0;
742 lastfragment = 0;
743 mlast = NULL;
744 while (!error && !lastfragment) {
745 aio.iov_base = (uintptr_t) &fraglen;
746 aio.iov_len = sizeof(u_long);
747 bzero(&msg, sizeof(msg));
748 msg.msg_iov = (struct iovec *) &aio;
749 msg.msg_iovlen = 1;
750 do {
751 error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
752 if (!rep->r_nmp) /* if unmounted then bailout */
753 goto shutout;
754 if (error == EWOULDBLOCK && rep) {
755 error2 = nfs_sigintr(rep->r_nmp, rep, p);
756 if (error2)
757 error = error2;
758 }
759 } while (error == EWOULDBLOCK);
760 if (!error && rcvlen < aio.iov_len) {
761 /* only log a message if we got a partial word */
762 if (rcvlen != 0)
763 log(LOG_INFO,
764 "short receive (%d/%d) from nfs server %s\n",
765 rcvlen, sizeof(u_long),
766 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
767 error = EPIPE;
768 }
769 if (error)
770 goto errout;
771 lastfragment = ntohl(fraglen) & 0x80000000;
772 fraglen = ntohl(fraglen) & ~0x80000000;
773 len += fraglen;
774 /*
775 * This is SERIOUS! We are out of sync with the sender
776 * and forcing a disconnect/reconnect is all I can do.
777 */
778 if (len > NFS_MAXPACKET) {
779 log(LOG_ERR, "%s (%d) from nfs server %s\n",
780 "impossible RPC record length", len,
781 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
782 error = EFBIG;
783 goto errout;
784 }
785
786 m = NULL;
787 do {
788 rcvlen = fraglen;
789 error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
790 if (!rep->r_nmp) /* if unmounted then bailout */ {
791 goto shutout;
792 }
793 } while (error == EWOULDBLOCK || error == EINTR ||
794 error == ERESTART);
795
796 if (!error && fraglen > rcvlen) {
797 log(LOG_INFO,
798 "short receive (%d/%d) from nfs server %s\n",
799 rcvlen, fraglen,
800 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
801 error = EPIPE;
802 mbuf_freem(m);
803 }
804 if (!error) {
805 if (!*mp) {
806 *mp = m;
807 mlast = m;
808 } else {
809 error = mbuf_setnext(mlast, m);
810 if (error) {
811 printf("nfs_receive: mbuf_setnext failed %d\n", error);
812 mbuf_freem(m);
813 }
814 }
815 while (mbuf_next(mlast))
816 mlast = mbuf_next(mlast);
817 }
818 }
819 } else {
820 bzero(&msg, sizeof(msg));
821 do {
822 rcvlen = 100000000;
823 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
824 if (!rep->r_nmp) /* if unmounted then bailout */ {
825 goto shutout;
826 }
827 if (error == EWOULDBLOCK && rep) {
828 error2 = nfs_sigintr(rep->r_nmp, rep, p);
829 if (error2) {
830 return (error2);
831 }
832 }
833 } while (error == EWOULDBLOCK);
834
835 if ((msg.msg_flags & MSG_EOR) == 0)
836 printf("Egad!!\n");
837 if (!error && *mp == NULL)
838 error = EPIPE;
839 len = rcvlen;
840 }
841 errout:
842 if (error && error != EINTR && error != ERESTART) {
843 mbuf_freem(*mp);
844 *mp = NULL;
845 if (error != EPIPE)
846 log(LOG_INFO,
847 "receive error %d from nfs server %s\n", error,
848 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
849 error = nfs_sndlock(rep);
850 if (!error) {
851 error = nfs_reconnect(rep);
852 if (!error)
853 goto tryagain;
854 nfs_sndunlock(rep);
855 }
856 }
857 } else {
858 /*
859 * We could have failed while rebinding the datagram socket
860 * so we need to attempt to rebind here.
861 */
862 if ((so = rep->r_nmp->nm_so) == NULL) {
863 error = nfs_sndlock(rep);
864 if (!error) {
865 error = nfs_reconnect(rep);
866 nfs_sndunlock(rep);
867 }
868 if (error)
869 return (error);
870 if (!rep->r_nmp) /* if unmounted then bailout */
871 return (ENXIO);
872 so = rep->r_nmp->nm_so;
873 }
874 bzero(&msg, sizeof(msg));
875 len = 0;
876 do {
877 rcvlen = 1000000;
878 error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
879 if (!rep->r_nmp) /* if unmounted then bailout */
880 goto shutout;
881 if (error) {
882 error2 = nfs_sigintr(rep->r_nmp, rep, p);
883 if (error2) {
884 error = error2;
885 goto shutout;
886 }
887 }
888 /* Reconnect for all errors. We may be receiving
889 * soft/hard/blocking errors because of a network
890 * change.
891 * XXX: we should rate limit or delay this
892 * to once every N attempts or something,
893 * although TCP doesn't seem to.
894 */
895 if (error) {
896 error2 = nfs_sndlock(rep);
897 if (!error2) {
898 error2 = nfs_reconnect(rep);
899 if (error2)
900 error = error2;
901 else if (!rep->r_nmp) /* if unmounted then bailout */
902 error = ENXIO;
903 else
904 so = rep->r_nmp->nm_so;
905 nfs_sndunlock(rep);
906 } else {
907 error = error2;
908 }
909 }
910 } while (error == EWOULDBLOCK);
911 }
912 shutout:
913 if (error) {
914 mbuf_freem(*mp);
915 *mp = NULL;
916 }
917 return (error);
918 }
919
920 /*
921 * Implement receipt of reply on a socket.
922 * We must search through the list of received datagrams matching them
923 * with outstanding requests using the xid, until ours is found.
924 */
925 /* ARGSUSED */
926 int
927 nfs_reply(myrep)
928 struct nfsreq *myrep;
929 {
930 struct nfsreq *rep;
931 struct nfsmount *nmp = myrep->r_nmp;
932 long t1;
933 mbuf_t mrep, md;
934 u_long rxid, *tl;
935 caddr_t dpos, cp2;
936 int error;
937
938 /*
939 * Loop around until we get our own reply
940 */
941 for (;;) {
942 /*
943 * Lock against other receivers so that I don't get stuck in
944 * sbwait() after someone else has received my reply for me.
945 * Also necessary for connection based protocols to avoid
946 * race conditions during a reconnect.
947 * If nfs_rcvlock() returns EALREADY, that means that
948 * the reply has already been received by another
949 * process and we can return immediately. In this
950 * case, the lock is not taken to avoid races with
951 * other processes.
952 */
953 error = nfs_rcvlock(myrep);
954 if (error == EALREADY)
955 return (0);
956 if (error)
957 return (error);
958
959 /*
960 * If we slept after putting bits otw, then reply may have
961 * arrived. In which case returning is required, or we
962 * would hang trying to nfs_receive an already received reply.
963 */
964 if (myrep->r_mrep != NULL) {
965 nfs_rcvunlock(myrep);
966 FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
967 return (0);
968 }
969 /*
970 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
971 * is still intact by checks done in nfs_rcvlock.
972 */
973 error = nfs_receive(myrep, &mrep);
974 /*
975 * Bailout asap if nfsmount struct gone (unmounted).
976 */
977 if (!myrep->r_nmp) {
978 FSDBG(530, myrep->r_xid, myrep, nmp, -2);
979 if (mrep)
980 mbuf_freem(mrep);
981 return (ENXIO);
982 }
983 if (error) {
984 FSDBG(530, myrep->r_xid, myrep, nmp, error);
985 nfs_rcvunlock(myrep);
986
987 /* Bailout asap if nfsmount struct gone (unmounted). */
988 if (!myrep->r_nmp) {
989 if (mrep)
990 mbuf_freem(mrep);
991 return (ENXIO);
992 }
993
994 /*
995 * Ignore routing errors on connectionless protocols??
996 */
997 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
998 if (nmp->nm_so) {
999 int clearerror;
1000 int optlen = sizeof(clearerror);
1001 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1002 }
1003 continue;
1004 }
1005 if (mrep)
1006 mbuf_freem(mrep);
1007 return (error);
1008 }
1009
1010 /*
1011 * We assume all is fine, but if we did not have an error
1012 * and mrep is 0, better not dereference it. nfs_receive
1013 * calls soreceive which carefully sets error=0 when it got
1014 * errors on sbwait (tsleep). In most cases, I assume that's
1015 * so we could go back again. In the tcp case, EPIPE is returned.
1016 * In the udp case, nfs_receive gets back here with no error and no
1017 * mrep. Is the right fix to have soreceive check for process
1018 * aborted after sbwait and return something non-zero? Should
1019 * nfs_receive give an EPIPE? Too risky to play with those
1020 * two this late in game for a shutdown problem. Instead,
1021 * just check here and get out. (ekn)
1022 */
1023 if (!mrep) {
1024 nfs_rcvunlock(myrep);
1025 FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1026 return (ENXIO); /* sounds good */
1027 }
1028
1029 /*
1030 * Get the xid and check that it is an rpc reply
1031 */
1032 md = mrep;
1033 dpos = mbuf_data(md);
1034 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1035 rxid = *tl++;
1036 if (*tl != rpc_reply) {
1037 OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1038 mbuf_freem(mrep);
1039 nfsmout:
1040 if (nmp->nm_state & NFSSTA_RCVLOCK)
1041 nfs_rcvunlock(myrep);
1042 continue;
1043 }
1044
1045 /*
1046 * Loop through the request list to match up the reply
1047 * Iff no match, just drop the datagram
1048 */
1049 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1050 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1051 /* Found it.. */
1052 rep->r_mrep = mrep;
1053 rep->r_md = md;
1054 rep->r_dpos = dpos;
1055 /*
1056 * If we're tracking the round trip time
1057 * then we update the circular log here
1058 * with the stats from our current request.
1059 */
1060 if (nfsrtton) {
1061 struct rttl *rt;
1062
1063 rt = &nfsrtt.rttl[nfsrtt.pos];
1064 rt->proc = rep->r_procnum;
1065 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1066 rt->sent = nmp->nm_sent;
1067 rt->cwnd = nmp->nm_cwnd;
1068 if (proct[rep->r_procnum] == 0)
1069 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1070 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1071 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1072 rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1073 microtime(&rt->tstamp); // XXX unused
1074 if (rep->r_flags & R_TIMING)
1075 rt->rtt = rep->r_rtt;
1076 else
1077 rt->rtt = 1000000;
1078 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1079 }
1080 /*
1081 * Update congestion window.
1082 * Do the additive increase of
1083 * one rpc/rtt.
1084 */
1085 FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1086 nmp->nm_cwnd);
1087 if (nmp->nm_cwnd <= nmp->nm_sent) {
1088 nmp->nm_cwnd +=
1089 (NFS_CWNDSCALE * NFS_CWNDSCALE +
1090 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1091 if (nmp->nm_cwnd > NFS_MAXCWND)
1092 nmp->nm_cwnd = NFS_MAXCWND;
1093 }
1094 if (rep->r_flags & R_SENT) {
1095 rep->r_flags &= ~R_SENT;
1096 nmp->nm_sent -= NFS_CWNDSCALE;
1097 }
1098 /*
1099 * Update rtt using a gain of 0.125 on the mean
1100 * and a gain of 0.25 on the deviation.
1101 */
1102 if (rep->r_flags & R_TIMING) {
1103 /*
1104 * Since the timer resolution of
1105 * NFS_HZ is so coarse, it can often
1106 * result in r_rtt == 0. Since
1107 * r_rtt == N means that the actual
1108 * rtt is between N+dt and N+2-dt ticks,
1109 * add 1.
1110 */
1111 if (proct[rep->r_procnum] == 0)
1112 panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1113 t1 = rep->r_rtt + 1;
1114 t1 -= (NFS_SRTT(rep) >> 3);
1115 NFS_SRTT(rep) += t1;
1116 if (t1 < 0)
1117 t1 = -t1;
1118 t1 -= (NFS_SDRTT(rep) >> 2);
1119 NFS_SDRTT(rep) += t1;
1120 }
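			/*
			 * Worked example with hypothetical values: if
			 * NFS_SRTT(rep) == 40 and NFS_SDRTT(rep) == 8
			 * (mean ~5 ticks, deviation ~2) and the measured
			 * r_rtt is 2, the sample is 3: t1 = 3 - (40>>3) = -2,
			 * so the srtt moves 1/8 of the error (40 -> 38), and
			 * |t1| - (8>>2) == 0 leaves the deviation unchanged,
			 * i.e. the 0.125/0.25 gains noted above.
			 */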
1121 nmp->nm_timeouts = 0;
1122 break;
1123 }
1124 }
1125 nfs_rcvunlock(myrep);
1126 /*
1127 * If not matched to a request, drop it.
1128 * If it's mine, get out.
1129 */
1130 if (rep == 0) {
1131 OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1132 mbuf_freem(mrep);
1133 } else if (rep == myrep) {
1134 if (rep->r_mrep == NULL)
1135 panic("nfs_reply: nil r_mrep");
1136 return (0);
1137 }
1138 FSDBG(530, myrep->r_xid, myrep, rep,
1139 rep ? rep->r_xid : myrep->r_flags);
1140 }
1141 }
1142
1143 /*
1144 * nfs_request - goes something like this
1145 * - fill in request struct
1146 * - links it into list
1147 * - calls nfs_send() for first transmit
1148 * - calls nfs_receive() to get reply
1149 * - break down rpc header and return with nfs reply pointed to
1150 * by mrep or error
1151 * nb: always frees up mreq mbuf list
1152 */
1153 int
1154 nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
1155 vnode_t vp;
1156 mount_t mp;
1157 mbuf_t mrest;
1158 int procnum;
1159 proc_t procp;
1160 kauth_cred_t cred;
1161 mbuf_t *mrp;
1162 mbuf_t *mdp;
1163 caddr_t *dposp;
1164 u_int64_t *xidp;
1165 {
1166 mbuf_t m, mrep, m2;
1167 struct nfsreq re, *rep;
1168 u_long *tl;
1169 int i;
1170 struct nfsmount *nmp;
1171 mbuf_t md, mheadend;
1172 char nickv[RPCX_NICKVERF];
1173 time_t waituntil;
1174 caddr_t dpos, cp2;
1175 int t1, error = 0, mrest_len, auth_len, auth_type;
1176 int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1177 int verf_len, verf_type;
1178 u_long xid;
1179 char *auth_str, *verf_str;
1180 NFSKERBKEY_T key; /* save session key */
1181 int nmsotype;
1182 struct timeval now;
1183
1184 if (mrp)
1185 *mrp = NULL;
1186 if (xidp)
1187 *xidp = 0;
1188 nmp = VFSTONFS(mp);
1189
1190 rep = &re;
1191
1192 if (vp)
1193 nmp = VFSTONFS(vnode_mount(vp));
1194 if (nmp == NULL ||
1195 (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1196 (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1197 mbuf_freem(mrest);
1198 return (ENXIO);
1199 }
1200 nmsotype = nmp->nm_sotype;
1201
1202 FSDBG_TOP(531, vp, procnum, nmp, rep);
1203
1204 rep->r_nmp = nmp;
1205 rep->r_vp = vp;
1206 rep->r_procp = procp;
1207 rep->r_procnum = procnum;
1208 microuptime(&now);
1209 rep->r_lastmsg = now.tv_sec -
1210 ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1211 i = 0;
1212 m = mrest;
1213 while (m) {
1214 i += mbuf_len(m);
1215 m = mbuf_next(m);
1216 }
1217 mrest_len = i;
1218
1219 /*
1220 * Get the RPC header with authorization.
1221 */
1222 kerbauth:
1223 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1224 if (!nmp) {
1225 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1226 mbuf_freem(mrest);
1227 return (ENXIO);
1228 }
1229 verf_str = auth_str = (char *)0;
1230 if (nmp->nm_flag & NFSMNT_KERB) {
1231 verf_str = nickv;
1232 verf_len = sizeof (nickv);
1233 auth_type = RPCAUTH_KERB4;
1234 bzero((caddr_t)key, sizeof (key));
1235 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1236 &auth_len, verf_str, verf_len)) {
1237 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1238 if (!nmp) {
1239 FSDBG_BOT(531, 2, vp, error, rep);
1240 mbuf_freem(mrest);
1241 return (ENXIO);
1242 }
1243 error = nfs_getauth(nmp, rep, cred, &auth_str,
1244 &auth_len, verf_str, &verf_len, key);
1245 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1246 if (!error && !nmp)
1247 error = ENXIO;
1248 if (error) {
1249 FSDBG_BOT(531, 2, vp, error, rep);
1250 mbuf_freem(mrest);
1251 return (error);
1252 }
1253 }
1254 } else {
1255 auth_type = RPCAUTH_UNIX;
1256 if (cred->cr_ngroups < 1)
1257 panic("nfsreq nogrps");
1258 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1259 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1260 5 * NFSX_UNSIGNED;
1261 }
1262 error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1263 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1264 if (auth_str)
1265 _FREE(auth_str, M_TEMP);
1266 if (error) {
1267 mbuf_freem(mrest);
1268 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1269 return (error);
1270 }
1271 if (xidp)
1272 *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1273
1274 /*
1275 * For stream protocols, insert a Sun RPC Record Mark.
1276 */
1277 if (nmsotype == SOCK_STREAM) {
1278 error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1279 if (error) {
1280 mbuf_freem(m);
1281 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1282 return (error);
1283 }
1284 *((u_long*)mbuf_data(m)) =
1285 htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1286 }
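	/*
	 * The record mark is one big-endian word (RFC 1831 record marking):
	 * the high bit flags the last fragment and the low 31 bits give the
	 * fragment length. E.g. a 1024-byte request sent as a single (final)
	 * fragment is prefixed with htonl(0x80000000 | 1024) == 0x80000400;
	 * subtracting NFSX_UNSIGNED above keeps the mark itself out of the
	 * advertised length.
	 */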
1287 rep->r_mreq = m;
1288 rep->r_xid = xid;
1289 tryagain:
1290 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1291 if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1292 rep->r_retry = nmp->nm_retry;
1293 else
1294 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1295 rep->r_rtt = rep->r_rexmit = 0;
1296 if (proct[procnum] > 0)
1297 rep->r_flags = R_TIMING;
1298 else
1299 rep->r_flags = 0;
1300 rep->r_mrep = NULL;
1301
1302 /*
1303 * Do the client side RPC.
1304 */
1305 OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1306 /*
1307 * Chain request into list of outstanding requests. Be sure
1308 * to put it LAST so timer finds oldest requests first.
1309 */
1310 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1311
1312 /*
1313 * If backing off another request or avoiding congestion, don't
1314 * send this one now but let timer do it. If not timing a request,
1315 * do it now.
1316 */
1317 if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1318 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1319 nmp->nm_sent < nmp->nm_cwnd)) {
1320 int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1321
1322 if (connrequired)
1323 error = nfs_sndlock(rep);
1324
1325 /*
1326 * Set the R_SENT before doing the send in case another thread
1327 * processes the reply before the nfs_send returns here
1328 */
1329 if (!error) {
1330 if ((rep->r_flags & R_MUSTRESEND) == 0) {
1331 FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1332 nmp->nm_cwnd);
1333 nmp->nm_sent += NFS_CWNDSCALE;
1334 rep->r_flags |= R_SENT;
1335 }
1336
1337 error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1338 if (!error)
1339 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1340 if (connrequired)
1341 nfs_sndunlock(rep);
1342 }
1343 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1344 if (error) {
1345 if (nmp)
1346 nmp->nm_sent -= NFS_CWNDSCALE;
1347 rep->r_flags &= ~R_SENT;
1348 }
1349 } else {
1350 rep->r_rtt = -1;
1351 }
1352
1353 /*
1354 * Wait for the reply from our send or the timer's.
1355 */
1356 if (!error || error == EPIPE)
1357 error = nfs_reply(rep);
1358
1359 /*
1360 * RPC done, unlink the request.
1361 */
1362 nfs_repdequeue(rep);
1363
1364 nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1365
1366 /*
1367 * Decrement the outstanding request count.
1368 */
1369 if (rep->r_flags & R_SENT) {
1370 rep->r_flags &= ~R_SENT; /* paranoia */
1371 if (nmp) {
1372 FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1373 nmp->nm_sent -= NFS_CWNDSCALE;
1374 }
1375 }
1376
1377 /*
1378 * If there was a successful reply, and a tprintf msg was previously
1379 * printed, tprintf an "is alive again" response.
1380 */
1381 if (!error)
1382 nfs_up(nmp, procp, NFSSTA_TIMEO,
1383 (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1384 mrep = rep->r_mrep;
1385 md = rep->r_md;
1386 dpos = rep->r_dpos;
1387 if (!error && !nmp)
1388 error = ENXIO;
1389 if (error) {
1390 mbuf_freem(rep->r_mreq);
1391 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1392 return (error);
1393 }
1394
1395 /*
1396 * break down the rpc header and check if ok
1397 */
1398 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1399 if (*tl++ == rpc_msgdenied) {
1400 if (*tl == rpc_mismatch)
1401 error = EOPNOTSUPP;
1402 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1403 if (!failed_auth) {
1404 failed_auth++;
1405 error = mbuf_setnext(mheadend, NULL);
1406 mbuf_freem(mrep);
1407 mbuf_freem(rep->r_mreq);
1408 if (!error)
1409 goto kerbauth;
1410 printf("nfs_request: mbuf_setnext failed\n");
1411 } else
1412 error = EAUTH;
1413 } else
1414 error = EACCES;
1415 mbuf_freem(mrep);
1416 mbuf_freem(rep->r_mreq);
1417 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1418 return (error);
1419 }
1420
1421 /*
1422 * Grab any Kerberos verifier, otherwise just throw it away.
1423 */
1424 verf_type = fxdr_unsigned(int, *tl++);
1425 i = fxdr_unsigned(int, *tl);
1426 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1427 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1428 if (error)
1429 goto nfsmout;
1430 } else if (i > 0)
1431 nfsm_adv(nfsm_rndup(i));
1432 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1433 /* 0 == ok */
1434 if (*tl == 0) {
1435 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1436 if (*tl != 0) {
1437 error = fxdr_unsigned(int, *tl);
1438 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1439 error == NFSERR_TRYLATER) {
1440 mbuf_freem(mrep);
1441 error = 0;
1442 microuptime(&now);
1443 waituntil = now.tv_sec + trylater_delay;
1444 while (now.tv_sec < waituntil) {
1445 tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1446 microuptime(&now);
1447 }
1448 trylater_delay *= 2;
1449 if (trylater_delay > 60)
1450 trylater_delay = 60;
1451 goto tryagain;
1452 }
1453
1454 /*
1455 * If the File Handle was stale, invalidate the
1456 * lookup cache, just in case.
1457 */
1458 if ((error == ESTALE) && vp)
1459 cache_purge(vp);
1460 if (nmp->nm_flag & NFSMNT_NFSV3) {
1461 *mrp = mrep;
1462 *mdp = md;
1463 *dposp = dpos;
1464 error |= NFSERR_RETERR;
1465 } else {
1466 mbuf_freem(mrep);
1467 error &= ~NFSERR_RETERR;
1468 }
1469 mbuf_freem(rep->r_mreq);
1470 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1471 return (error);
1472 }
1473
1474 *mrp = mrep;
1475 *mdp = md;
1476 *dposp = dpos;
1477 mbuf_freem(rep->r_mreq);
1478 FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1479 return (0);
1480 }
1481 mbuf_freem(mrep);
1482 error = EPROTONOSUPPORT;
1483 nfsmout:
1484 mbuf_freem(rep->r_mreq);
1485 FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1486 return (error);
1487 }
1488
1489 #ifndef NFS_NOSERVER
1490 /*
1491 * Generate the rpc reply header
1492 * siz arg. is used to decide if adding a cluster is worthwhile
1493 */
1494 int
1495 nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
1496 int siz;
1497 struct nfsrv_descript *nd;
1498 struct nfssvc_sock *slp;
1499 int err;
1500 mbuf_t *mrq;
1501 mbuf_t *mbp;
1502 caddr_t *bposp;
1503 {
1504 u_long *tl;
1505 mbuf_t mreq;
1506 caddr_t bpos;
1507 mbuf_t mb, mb2;
1508 int error, mlen;
1509
1510 /*
1511 * If this is a big reply, use a cluster; else
1512 * try to leave leading space for the lower level headers.
1513 */
1514 siz += RPC_REPLYSIZ;
1515 if (siz >= nfs_mbuf_minclsize) {
1516 error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1517 } else {
1518 error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1519 }
1520 if (error) {
1521 /* unable to allocate packet */
1522 /* XXX nfsstat? */
1523 return (error);
1524 }
1525 mb = mreq;
1526 tl = mbuf_data(mreq);
1527 mlen = 6 * NFSX_UNSIGNED;
1528 if (siz < nfs_mbuf_minclsize) {
1529 /* leave space for lower level headers */
1530 tl += 80/sizeof(*tl); /* XXX max_hdr? XXX */
1531 mbuf_setdata(mreq, tl, mlen);
1532 } else {
1533 mbuf_setlen(mreq, mlen);
1534 }
1535 bpos = ((caddr_t)tl) + mlen;
1536 *tl++ = txdr_unsigned(nd->nd_retxid);
1537 *tl++ = rpc_reply;
1538 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1539 *tl++ = rpc_msgdenied;
1540 if (err & NFSERR_AUTHERR) {
1541 *tl++ = rpc_autherr;
1542 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1543 mlen -= NFSX_UNSIGNED;
1544 mbuf_setlen(mreq, mlen);
1545 bpos -= NFSX_UNSIGNED;
1546 } else {
1547 *tl++ = rpc_mismatch;
1548 *tl++ = txdr_unsigned(RPC_VER2);
1549 *tl = txdr_unsigned(RPC_VER2);
1550 }
1551 } else {
1552 *tl++ = rpc_msgaccepted;
1553
1554 /*
1555 * For Kerberos authentication, we must send the nickname
1556 * verifier back, otherwise just RPCAUTH_NULL.
1557 */
1558 if (nd->nd_flag & ND_KERBFULL) {
1559 struct nfsuid *nuidp;
1560 struct timeval ktvin, ktvout;
1561 uid_t uid = kauth_cred_getuid(nd->nd_cr);
1562
1563 lck_rw_lock_shared(&slp->ns_rwlock);
1564 for (nuidp = NUIDHASH(slp, uid)->lh_first;
1565 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1566 if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1567 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1568 &nuidp->nu_haddr, nd->nd_nam2)))
1569 break;
1570 }
1571 if (nuidp) {
1572 ktvin.tv_sec =
1573 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1574 ktvin.tv_usec =
1575 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1576
1577 /*
1578 * Encrypt the timestamp in ecb mode using the
1579 * session key.
1580 */
1581 #if NFSKERB
1582 XXX
1583 #endif
1584
1585 *tl++ = rpc_auth_kerb;
1586 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1587 *tl = ktvout.tv_sec;
1588 nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1589 *tl++ = ktvout.tv_usec;
1590 *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1591 } else {
1592 *tl++ = 0;
1593 *tl++ = 0;
1594 }
1595 lck_rw_done(&slp->ns_rwlock);
1596 } else {
1597 *tl++ = 0;
1598 *tl++ = 0;
1599 }
1600 switch (err) {
1601 case EPROGUNAVAIL:
1602 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1603 break;
1604 case EPROGMISMATCH:
1605 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1606 nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1607 // XXX hard coded versions
1608 *tl++ = txdr_unsigned(2);
1609 *tl = txdr_unsigned(3);
1610 break;
1611 case EPROCUNAVAIL:
1612 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1613 break;
1614 case EBADRPC:
1615 *tl = txdr_unsigned(RPC_GARBAGE);
1616 break;
1617 default:
1618 *tl = 0;
1619 if (err != NFSERR_RETVOID) {
1620 nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1621 if (err)
1622 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1623 else
1624 *tl = 0;
1625 }
1626 break;
1627 }
1628 }
1629
1630 if (mrq != NULL)
1631 *mrq = mreq;
1632 *mbp = mb;
1633 *bposp = bpos;
1634 if (err != 0 && err != NFSERR_RETVOID) {
1635 OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1636 }
1637 return (0);
1638 }
1639
1640
1641 #endif /* NFS_NOSERVER */
1642
1643
1644 /*
1645 * From FreeBSD 1.58, a Matt Dillon fix...
1646 * Flag a request as being about to terminate.
1647 * The nm_sent count is decremented now to avoid deadlocks when the process
1648 * in soreceive() hasn't yet managed to send its own request.
1649 */
1650 static void
1651 nfs_softterm(struct nfsreq *rep)
1652 {
1653
1654 rep->r_flags |= R_SOFTTERM;
1655 if (rep->r_flags & R_SENT) {
1656 FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1657 rep->r_nmp->nm_cwnd);
1658 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1659 rep->r_flags &= ~R_SENT;
1660 }
1661 }
1662
1663 void
1664 nfs_timer_funnel(void * arg)
1665 {
1666 (void) thread_funnel_set(kernel_flock, TRUE);
1667 nfs_timer(arg);
1668 (void) thread_funnel_set(kernel_flock, FALSE);
1669
1670 }
1671
1672 /*
1673 * Ensure rep isn't in use by the timer, then dequeue it.
1674 */
1675 static void
1676 nfs_repdequeue(struct nfsreq *rep)
1677 {
1678
1679 while ((rep->r_flags & R_BUSY)) {
1680 rep->r_flags |= R_WAITING;
1681 tsleep(rep, PSOCK, "repdeq", 0);
1682 }
1683 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1684 }
1685
1686 /*
1687 * Busy (lock) an nfsreq, used by the nfs timer to make sure it's not
1688 * free()'d out from under it.
1689 */
1690 static void
1691 nfs_repbusy(struct nfsreq *rep)
1692 {
1693
1694 if ((rep->r_flags & R_BUSY))
1695 panic("rep locked");
1696 rep->r_flags |= R_BUSY;
1697 }
1698
1699 /*
1700 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1701 */
1702 static struct nfsreq *
1703 nfs_repnext(struct nfsreq *rep)
1704 {
1705 struct nfsreq * nextrep;
1706
1707 if (rep == NULL)
1708 return (NULL);
1709 /*
1710 * We need to get and busy the next req before signalling the
1711 * current one, otherwise wakeup() may block us and we'll race to
1712 * grab the next req.
1713 */
1714 nextrep = TAILQ_NEXT(rep, r_chain);
1715 if (nextrep != NULL)
1716 nfs_repbusy(nextrep);
1717 /* unbusy and signal. */
1718 rep->r_flags &= ~R_BUSY;
1719 if ((rep->r_flags & R_WAITING)) {
1720 rep->r_flags &= ~R_WAITING;
1721 wakeup(rep);
1722 }
1723 return (nextrep);
1724 }
1725
1726 /*
1727 * Nfs timer routine
1728 * Scan the nfsreq list and retransmit any requests that have timed out.
1729 * To avoid retransmission attempts on STREAM sockets (in the future) make
1730 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1731 */
1732 void
1733 nfs_timer(__unused void *arg)
1734 {
1735 struct nfsreq *rep;
1736 mbuf_t m;
1737 socket_t so;
1738 struct nfsmount *nmp;
1739 int timeo;
1740 int error;
1741 #ifndef NFS_NOSERVER
1742 struct nfssvc_sock *slp;
1743 u_quad_t cur_usec;
1744 #endif /* NFS_NOSERVER */
1745 int flags, rexmit, cwnd, sent;
1746 u_long xid;
1747 struct timeval now;
1748
1749 rep = TAILQ_FIRST(&nfs_reqq);
1750 if (rep != NULL)
1751 nfs_repbusy(rep);
1752 microuptime(&now);
1753 for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1754 nmp = rep->r_nmp;
1755 if (!nmp) /* unmounted */
1756 continue;
1757 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1758 continue;
1759 if (nfs_sigintr(nmp, rep, rep->r_procp))
1760 continue;
1761 if (nmp->nm_tprintf_initial_delay != 0 &&
1762 (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1763 rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1764 rep->r_lastmsg = now.tv_sec;
1765 nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1766 "not responding");
1767 rep->r_flags |= R_TPRINTFMSG;
1768 if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1769 /* we're not yet completely mounted and */
1770 /* we can't complete an RPC, so we fail */
1771 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1772 nfs_softterm(rep);
1773 continue;
1774 }
1775 }
1776 if (rep->r_rtt >= 0) {
1777 rep->r_rtt++;
1778 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1779 timeo = nmp->nm_timeo;
1780 else
1781 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1782 /* ensure 62.5 ms floor */
1783 while (16 * timeo < hz)
1784 timeo *= 2;
1785 if (nmp->nm_timeouts > 0)
1786 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1787 if (rep->r_rtt <= timeo)
1788 continue;
1789 if (nmp->nm_timeouts < 8)
1790 nmp->nm_timeouts++;
1791 }
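		/*
		 * Example of the arithmetic above (assuming hz == 100): the
		 * doubling loop enforces a floor of hz/16 (62.5 ms), so a
		 * 1-tick rto becomes 8 ticks; after three consecutive
		 * timeouts, nfs_backoff[2] == 8 stretches that to 64 ticks
		 * before the next retransmit.
		 */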
1792 /*
1793 * Check for too many retransmits. This is never true for
1794 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1795 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1796 */
1797 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1798 OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1799 nfs_softterm(rep);
1800 continue;
1801 }
1802 if (nmp->nm_sotype != SOCK_DGRAM) {
1803 if (++rep->r_rexmit > NFS_MAXREXMIT)
1804 rep->r_rexmit = NFS_MAXREXMIT;
1805 continue;
1806 }
1807 if ((so = nmp->nm_so) == NULL)
1808 continue;
1809
1810 /*
1811 * If there is enough space and the window allows..
1812 * Resend it
1813 * Set r_rtt to -1 in case we fail to send it now.
1814 */
1815 rep->r_rtt = -1;
1816 if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1817 (rep->r_flags & R_SENT) ||
1818 nmp->nm_sent < nmp->nm_cwnd) &&
1819 (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1820 struct msghdr msg;
1821 /*
1822 * Iff first send, start timing
1823 * else turn timing off, backoff timer
1824 * and divide congestion window by 2.
1825 * We update these *before* the send to avoid
1826 * racing against receiving the reply.
1827 * We save them so we can restore them on send error.
1828 */
1829 flags = rep->r_flags;
1830 rexmit = rep->r_rexmit;
1831 cwnd = nmp->nm_cwnd;
1832 sent = nmp->nm_sent;
1833 xid = rep->r_xid;
1834 if (rep->r_flags & R_SENT) {
1835 rep->r_flags &= ~R_TIMING;
1836 if (++rep->r_rexmit > NFS_MAXREXMIT)
1837 rep->r_rexmit = NFS_MAXREXMIT;
1838 nmp->nm_cwnd >>= 1;
1839 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1840 nmp->nm_cwnd = NFS_CWNDSCALE;
1841 OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1842 } else {
1843 rep->r_flags |= R_SENT;
1844 nmp->nm_sent += NFS_CWNDSCALE;
1845 }
1846 FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1847
1848 bzero(&msg, sizeof(msg));
1849 if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1850 msg.msg_name = mbuf_data(nmp->nm_nam);
1851 msg.msg_namelen = mbuf_len(nmp->nm_nam);
1852 }
1853 error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1854
1855 FSDBG(535, xid, error, sent, cwnd);
1856
1857 if (error) {
1858 if (error == EWOULDBLOCK) {
1859 rep->r_flags = flags;
1860 rep->r_rexmit = rexmit;
1861 nmp->nm_cwnd = cwnd;
1862 nmp->nm_sent = sent;
1863 rep->r_xid = xid;
1864 }
1865 else {
1866 if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1867 int clearerror;
1868 int optlen = sizeof(clearerror);
1869 sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1870 }
1871 rep->r_flags = flags | R_RESENDERR;
1872 rep->r_rexmit = rexmit;
1873 nmp->nm_cwnd = cwnd;
1874 nmp->nm_sent = sent;
1875 if (flags & R_SENT)
1876 OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1877 }
1878 } else
1879 rep->r_rtt = 0;
1880 }
1881 }
1882 microuptime(&now);
1883 #ifndef NFS_NOSERVER
1884 /*
1885 * Scan the write gathering queues for writes that need to be
1886 * completed now.
1887 */
1888 cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1889 lck_mtx_lock(nfsd_mutex);
1890 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1891 if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1892 nfsrv_wakenfsd(slp);
1893 }
1894 while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1895 if ((slp->ns_timestamp + 5) > now.tv_sec)
1896 break;
1897 TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1898 nfsrv_slpfree(slp);
1899 }
1900 lck_mtx_unlock(nfsd_mutex);
1901 #endif /* NFS_NOSERVER */
1902
1903 if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1904 /*
1905 * We haven't called nfs_buf_freeup() in a little while.
1906 * So, see if we can free up any stale/unused bufs now.
1907 */
1908 nfs_buf_freeup(1);
1909 }
1910
1911 timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1912
1913 }
1914
1915
1916 /*
1917 * Test for a termination condition pending on the process.
1918 * This is used to determine if we need to bail on a mount.
1919 * EIO is returned if there has been a soft timeout.
1920 * EINTR is returned if there is a signal pending that is not being ignored
1921 * and the mount is interruptible, or if we are a thread that is in the process
1922 * of cancellation (also SIGKILL posted).
1923 */
1924 int
1925 nfs_sigintr(nmp, rep, p)
1926 struct nfsmount *nmp;
1927 struct nfsreq *rep;
1928 proc_t p;
1929 {
1930 sigset_t pending_sigs;
1931 int context_good = 0;
1932 struct nfsmount *repnmp;
1933 extern proc_t kernproc;
1934
1935 if (nmp == NULL)
1936 return (ENXIO);
1937 if (rep != NULL) {
1938 repnmp = rep->r_nmp;
1939 /* we've had a forced unmount. */
1940 if (repnmp == NULL)
1941 return (ENXIO);
1942 /* request has timed out on a 'soft' mount. */
1943 if (rep->r_flags & R_SOFTTERM)
1944 return (EIO);
1945 /*
1946 * We're in the process of a force unmount and there's
1947 * been a timeout; we're dead and fail I/O.
1948 */
1949 if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1950 (NFSSTA_FORCE|NFSSTA_TIMEO))
1951 return (EIO);
1952 /* Someone is unmounting us, go soft and mark it. */
1953 if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1954 repnmp->nm_flag |= NFSMNT_SOFT;
1955 nmp->nm_state |= NFSSTA_FORCE;
1956 }
1957 /*
1958 * If the mount is hung and we've requested not to hang
1959 * on remote filesystems, then bail now.
1960 */
1961 if (p != NULL && (proc_noremotehang(p)) != 0 &&
1962 (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1963 return (EIO);
1964 }
1965 /* XXX: is this valid? this probably should be an assertion. */
1966 if (p == NULL)
1967 return (0);
1968
1969 /* If this thread belongs to the kernel task, the abort check is not needed. */
1970 if ((current_proc() != kernproc) && current_thread_aborted()) {
1971 return (EINTR);
1972 }
1973 /* Check for pending signals, masking off any blocked by the thread or process. */
1974
1975 pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1976 if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1977 return (EINTR);
1978 return (0);
1979 }
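
#if 0
/*
 * Illustrative sketch (not compiled): the pattern callers are expected
 * to follow.  nfs_sndlock() and nfs_rcvlock() below both re-check
 * nfs_sigintr() each time around their sleep loops so that a pending
 * signal or a soft-mount timeout can break a hung request instead of
 * sleeping forever.  The loop condition is elided here.
 */
static int
sketch_interruptible_wait(struct nfsreq *rep)
{
	int error;

	while (/* resource still held by someone else */ 0) {
		error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
		if (error)
			return (error);	/* EIO, EINTR, or ENXIO: give up */
		/* tsleep() briefly here, then re-test */
	}
	return (0);
}
#endif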
1980
1981 /*
1982 * Lock a socket against others.
1983 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1984 * and also to avoid race conditions between the processes with nfs requests
1985 * in progress when a reconnect is necessary.
1986 */
1987 int
1988 nfs_sndlock(rep)
1989 struct nfsreq *rep;
1990 {
1991 int *statep;
1992 proc_t p;
1993 int error, slpflag = 0, slptimeo = 0;
1994
1995 if (rep->r_nmp == NULL)
1996 return (ENXIO);
1997 statep = &rep->r_nmp->nm_state;
1998
1999 p = rep->r_procp;
2000 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2001 slpflag = PCATCH;
2002 while (*statep & NFSSTA_SNDLOCK) {
2003 error = nfs_sigintr(rep->r_nmp, rep, p);
2004 if (error)
2005 return (error);
2006 *statep |= NFSSTA_WANTSND;
2007 if (p != NULL && (proc_noremotehang(p)) != 0)
2008 slptimeo = hz;
2009 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2010 if (slpflag == PCATCH) {
2011 slpflag = 0;
2012 slptimeo = 2 * hz;
2013 }
2014 /*
2015 * Make sure while we slept that the mountpoint didn't go away.
2016 * nfs_sigintr and callers expect it intact.
2017 */
2018 if (!rep->r_nmp)
2019 return (ENXIO); /* don't have lock until out of loop */
2020 }
2021 *statep |= NFSSTA_SNDLOCK;
2022 return (0);
2023 }
2024
2025 /*
2026 * Release the send lock so others may use the socket.
2027 */
2028 void
2029 nfs_sndunlock(rep)
2030 struct nfsreq *rep;
2031 {
2032 int *statep;
2033
2034 if (rep->r_nmp == NULL)
2035 return;
2036 statep = &rep->r_nmp->nm_state;
2037 if ((*statep & NFSSTA_SNDLOCK) == 0)
2038 panic("nfs sndunlock");
2039 *statep &= ~NFSSTA_SNDLOCK;
2040 if (*statep & NFSSTA_WANTSND) {
2041 *statep &= ~NFSSTA_WANTSND;
2042 wakeup((caddr_t)statep);
2043 }
2044 }
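
#if 0
/*
 * Illustrative sketch (not compiled): the intended bracketing of the
 * send lock.  A sender serializes on nfs_sndlock(), transmits the
 * complete RPC record, and releases with nfs_sndunlock(), so that two
 * requests can never interleave their bytes on a STREAM socket.
 */
static int
sketch_serialized_send(struct nfsreq *rep)
{
	int error;

	if ((error = nfs_sndlock(rep)))
		return (error);
	/* ... sock_sendmbuf() the whole record while holding the lock ... */
	nfs_sndunlock(rep);
	return (0);
}
#endif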
2045
2046 static int
2047 nfs_rcvlock(struct nfsreq *rep)
2048 {
2049 int *statep;
2050 int error, slpflag, slptimeo = 0;
2051
2052 /* make sure we still have our mountpoint */
2053 if (!rep->r_nmp) {
2054 if (rep->r_mrep != NULL)
2055 return (EALREADY);
2056 return (ENXIO);
2057 }
2058
2059 statep = &rep->r_nmp->nm_state;
2060 FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2061 if (rep->r_nmp->nm_flag & NFSMNT_INT)
2062 slpflag = PCATCH;
2063 else
2064 slpflag = 0;
2065 while (*statep & NFSSTA_RCVLOCK) {
2066 if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2067 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2068 return (error);
2069 } else if (rep->r_mrep != NULL) {
2070 /*
2071 * Don't bother sleeping if reply already arrived
2072 */
2073 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2074 return (EALREADY);
2075 }
2076 FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2077 *statep |= NFSSTA_WANTRCV;
2078 /*
2079 * We need to poll if we're P_NOREMOTEHANG so that the
2080 * nfs_sigintr check above runs periodically.
2081 */
2082 if (rep->r_procp != NULL &&
2083 (proc_noremotehang(rep->r_procp)) != 0)
2084 slptimeo = hz;
2085 tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2086 if (slpflag == PCATCH) {
2087 slpflag = 0;
2088 slptimeo = 2 * hz;
2089 }
2090 /*
2091 * Make sure while we slept that the mountpoint didn't go away.
2092 * nfs_sigintr and caller nfs_reply expect it intact.
2093 */
2094 if (!rep->r_nmp) {
2095 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2096 return (ENXIO); /* don't have lock until out of loop */
2097 }
2098 }
2099 /*
2100 * nfs_reply will handle it if reply already arrived.
2101 * (We may have slept or been preempted).
2102 */
2103 FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2104 *statep |= NFSSTA_RCVLOCK;
2105 return (0);
2106 }
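
#if 0
/*
 * Illustrative sketch (not compiled): how a caller treats the EALREADY
 * return from nfs_rcvlock() above -- the reply arrived while we were
 * waiting for the lock, so there is nothing left to receive and the
 * caller can consume rep->r_mrep directly.
 */
static int
sketch_take_rcvlock(struct nfsreq *rep)
{
	int error = nfs_rcvlock(rep);

	if (error == EALREADY)
		return (0);	/* reply is already in rep->r_mrep */
	return (error);		/* 0 means we now hold NFSSTA_RCVLOCK */
}
#endif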
2107
2108 /*
2109 * Release the receive lock so others may use the socket.
2110 */
2111 static void
2112 nfs_rcvunlock(struct nfsreq *rep)
2113 {
2114 int *statep;
2115
2116 if (rep->r_nmp == NULL)
2117 return;
2118 statep = &rep->r_nmp->nm_state;
2119
2120 FSDBG(533, statep, *statep, 0, 0);
2121 if ((*statep & NFSSTA_RCVLOCK) == 0)
2122 panic("nfs rcvunlock");
2123 *statep &= ~NFSSTA_RCVLOCK;
2124 if (*statep & NFSSTA_WANTRCV) {
2125 *statep &= ~NFSSTA_WANTRCV;
2126 wakeup((caddr_t)statep);
2127 }
2128 }
2129
2130
2131 #ifndef NFS_NOSERVER
2132 /*
2133 * Socket upcall routine for the nfsd sockets.
2134 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2135 * Essentially do as much as possible without blocking; otherwise punt,
2136 * and this will be called again with MBUF_WAITOK from an nfsd.
2137 */
2138 void
2139 nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2140 {
2141 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2142
2143 if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2144 return;
2145
2146 lck_rw_lock_exclusive(&slp->ns_rwlock);
2147 nfsrv_rcv_locked(so, slp, waitflag);
2148 /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2149 }
2150 void
2151 nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2152 {
2153 mbuf_t m, mp, mhck, m2;
2154 int ns_flag=0, error;
2155 struct msghdr msg;
2156 size_t bytes_read;
2157
2158 if ((slp->ns_flag & SLP_VALID) == 0) {
2159 if (waitflag == MBUF_DONTWAIT)
2160 lck_rw_done(&slp->ns_rwlock);
2161 return;
2162 }
2163
2164 #ifdef notdef
2165 /*
2166 * Define this to test how the nfsds handle this case under heavy load.
2167 */
2168 if (waitflag == MBUF_DONTWAIT) {
2169 ns_flag = SLP_NEEDQ;
2170 goto dorecs;
2171 }
2172 #endif
2173 if (slp->ns_sotype == SOCK_STREAM) {
2174 /*
2175 * If there are already records on the queue, defer soreceive()
2176 * to an nfsd so that there is feedback to the TCP layer that
2177 * the nfs servers are heavily loaded.
2178 */
2179 if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2180 ns_flag = SLP_NEEDQ;
2181 goto dorecs;
2182 }
2183
2184 /*
2185 * Do soreceive().
2186 */
2187 bytes_read = 1000000000;
2188 error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2189 if (error || mp == NULL) {
2190 if (error == EWOULDBLOCK)
2191 ns_flag = SLP_NEEDQ;
2192 else
2193 ns_flag = SLP_DISCONN;
2194 goto dorecs;
2195 }
2196 m = mp;
2197 if (slp->ns_rawend) {
2198 if ((error = mbuf_setnext(slp->ns_rawend, m)))
2199 panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2200 slp->ns_cc += bytes_read;
2201 } else {
2202 slp->ns_raw = m;
2203 slp->ns_cc = bytes_read;
2204 }
2205 while ((m2 = mbuf_next(m)))
2206 m = m2;
2207 slp->ns_rawend = m;
2208
2209 /*
2210 * Now try and parse record(s) out of the raw stream data.
2211 */
2212 error = nfsrv_getstream(slp, waitflag);
2213 if (error) {
2214 if (error == EPERM)
2215 ns_flag = SLP_DISCONN;
2216 else
2217 ns_flag = SLP_NEEDQ;
2218 }
2219 } else {
2220 struct sockaddr_storage nam;
2221
2222 bzero(&msg, sizeof(msg));
2223 msg.msg_name = (caddr_t)&nam;
2224 msg.msg_namelen = sizeof(nam);
2225
2226 do {
2227 bytes_read = 1000000000;
2228 error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2229 if (mp) {
2230 if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2231 mbuf_setlen(mhck, nam.ss_len);
2232 bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2233 m = mhck;
2234 if (mbuf_setnext(m, mp)) {
2235 /* trouble... just drop it */
2236 printf("nfsrv_rcv: mbuf_setnext failed\n");
2237 mbuf_free(mhck);
2238 m = mp;
2239 }
2240 } else {
2241 m = mp;
2242 }
2243 if (slp->ns_recend)
2244 mbuf_setnextpkt(slp->ns_recend, m);
2245 else
2246 slp->ns_rec = m;
2247 slp->ns_recend = m;
2248 mbuf_setnextpkt(m, NULL);
2249 }
2250 #if 0
2251 if (error) {
2252 /*
2253 * This may be needed in the future to support
2254 * non-byte-stream connection-oriented protocols
2255 * such as SCTP.
2256 */
2257 /*
2258 * This (slp->ns_sotype == SOCK_STREAM) should really
2259 * be a check for PR_CONNREQUIRED.
2260 */
2261 if ((slp->ns_sotype == SOCK_STREAM)
2262 && error != EWOULDBLOCK) {
2263 ns_flag = SLP_DISCONN;
2264 goto dorecs;
2265 }
2266 }
2267 #endif
2268 } while (mp);
2269 }
2270
2271 /*
2272 * Now try and process the request records, non-blocking.
2273 */
2274 dorecs:
2275 if (ns_flag)
2276 slp->ns_flag |= ns_flag;
2277 if (waitflag == MBUF_DONTWAIT) {
2278 int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2279 lck_rw_done(&slp->ns_rwlock);
2280 if (wake && nfs_numnfsd) {
2281 lck_mtx_lock(nfsd_mutex);
2282 nfsrv_wakenfsd(slp);
2283 lck_mtx_unlock(nfsd_mutex);
2284 }
2285 }
2286 }
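
#if 0
/*
 * Illustrative sketch (not compiled): the dispatch decision the upcall
 * above makes.  Under MBUF_DONTWAIT (called from the socket layer) it
 * must not block, so any work that could block is deferred to an nfsd
 * thread by setting an ns_flag bit; leaving data queued in the socket
 * also lets TCP build back-pressure against server overload.
 */
static int
sketch_upcall_disposition(int waitflag, int records_already_queued)
{
	if ((waitflag == MBUF_DONTWAIT) && records_already_queued)
		return (SLP_NEEDQ);	/* defer the receive to an nfsd */
	return (0);			/* safe to receive inline */
}
#endif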
2287
2288 /*
2289 * Try and extract an RPC request from the mbuf data list received on a
2290 * stream socket. The "waitflag" argument indicates whether or not it
2291 * can sleep.
2292 */
2293 static int
2294 nfsrv_getstream(slp, waitflag)
2295 struct nfssvc_sock *slp;
2296 int waitflag;
2297 {
2298 mbuf_t m;
2299 char *cp1, *cp2, *mdata;
2300 int len, mlen, error;
2301 mbuf_t om, m2, recm;
2302 u_long recmark;
2303
2304 if (slp->ns_flag & SLP_GETSTREAM)
2305 panic("nfs getstream");
2306 slp->ns_flag |= SLP_GETSTREAM;
2307 for (;;) {
2308 if (slp->ns_reclen == 0) {
2309 if (slp->ns_cc < NFSX_UNSIGNED) {
2310 slp->ns_flag &= ~SLP_GETSTREAM;
2311 return (0);
2312 }
2313 m = slp->ns_raw;
2314 mdata = mbuf_data(m);
2315 mlen = mbuf_len(m);
2316 if (mlen >= NFSX_UNSIGNED) {
2317 bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2318 mdata += NFSX_UNSIGNED;
2319 mlen -= NFSX_UNSIGNED;
2320 mbuf_setdata(m, mdata, mlen);
2321 } else {
2322 cp1 = (caddr_t)&recmark;
2323 cp2 = mdata;
2324 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2325 while (mlen == 0) {
2326 m = mbuf_next(m);
2327 cp2 = mbuf_data(m);
2328 mlen = mbuf_len(m);
2329 }
2330 *cp1++ = *cp2++;
2331 mlen--;
2332 mbuf_setdata(m, cp2, mlen);
2333 }
2334 }
2335 slp->ns_cc -= NFSX_UNSIGNED;
2336 recmark = ntohl(recmark);
2337 slp->ns_reclen = recmark & ~0x80000000;
2338 if (recmark & 0x80000000)
2339 slp->ns_flag |= SLP_LASTFRAG;
2340 else
2341 slp->ns_flag &= ~SLP_LASTFRAG;
2342 if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2343 slp->ns_flag &= ~SLP_GETSTREAM;
2344 return (EPERM);
2345 }
2346 }
2347
2348 /*
2349 * Now get the record part.
2350 *
2351 * Note that slp->ns_reclen may be 0. Linux sometimes
2352 * generates 0-length RPCs.
2353 */
2354 recm = NULL;
2355 if (slp->ns_cc == slp->ns_reclen) {
2356 recm = slp->ns_raw;
2357 slp->ns_raw = slp->ns_rawend = NULL;
2358 slp->ns_cc = slp->ns_reclen = 0;
2359 } else if (slp->ns_cc > slp->ns_reclen) {
2360 len = 0;
2361 m = slp->ns_raw;
2362 mlen = mbuf_len(m);
2363 mdata = mbuf_data(m);
2364 om = NULL;
2365 while (len < slp->ns_reclen) {
2366 if ((len + mlen) > slp->ns_reclen) {
2367 if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2368 slp->ns_flag &= ~SLP_GETSTREAM;
2369 return (EWOULDBLOCK);
2370 }
2371 if (om) {
2372 if (mbuf_setnext(om, m2)) {
2373 /* trouble... just drop it */
2374 printf("nfsrv_getstream: mbuf_setnext failed\n");
2375 mbuf_freem(m2);
2376 slp->ns_flag &= ~SLP_GETSTREAM;
2377 return (EWOULDBLOCK);
2378 }
2379 recm = slp->ns_raw;
2380 } else {
2381 recm = m2;
2382 }
2383 mdata += slp->ns_reclen - len;
2384 mlen -= slp->ns_reclen - len;
2385 mbuf_setdata(m, mdata, mlen);
2386 len = slp->ns_reclen;
2387 } else if ((len + mlen) == slp->ns_reclen) {
2388 om = m;
2389 len += mlen;
2390 m = mbuf_next(m);
2391 recm = slp->ns_raw;
2392 if (mbuf_setnext(om, NULL)) {
2393 printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2394 slp->ns_flag &= ~SLP_GETSTREAM;
2395 return (EWOULDBLOCK);
2396 }
2397 mlen = mbuf_len(m);
2398 mdata = mbuf_data(m);
2399 } else {
2400 om = m;
2401 len += mlen;
2402 m = mbuf_next(m);
2403 mlen = mbuf_len(m);
2404 mdata = mbuf_data(m);
2405 }
2406 }
2407 slp->ns_raw = m;
2408 slp->ns_cc -= len;
2409 slp->ns_reclen = 0;
2410 } else {
2411 slp->ns_flag &= ~SLP_GETSTREAM;
2412 return (0);
2413 }
2414
2415 /*
2416 * Accumulate the fragments into a record.
2417 */
2418 if (slp->ns_frag == NULL) {
2419 slp->ns_frag = recm;
2420 } else {
2421 m = slp->ns_frag;
2422 while ((m2 = mbuf_next(m)))
2423 m = m2;
2424 if ((error = mbuf_setnext(m, recm)))
2425 panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2426 }
2427 if (slp->ns_flag & SLP_LASTFRAG) {
2428 if (slp->ns_recend)
2429 mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2430 else
2431 slp->ns_rec = slp->ns_frag;
2432 slp->ns_recend = slp->ns_frag;
2433 slp->ns_frag = NULL;
2434 }
2435 }
2436 }
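
#if 0
/*
 * Illustrative sketch (not compiled): decoding the RPC record mark that
 * nfsrv_getstream() strips from the head of each fragment.  Per the
 * RPC record-marking standard (RFC 1831), the mark is one big-endian
 * 32-bit word: the high bit flags the final fragment of a record and
 * the low 31 bits carry the fragment length.
 */
static void
sketch_decode_recmark(u_long netmark, u_long *fraglen, int *lastfrag)
{
	u_long mark = ntohl(netmark);

	*lastfrag = (mark & 0x80000000) != 0;
	*fraglen = mark & ~0x80000000;
}
#endif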
2437
2438 /*
2439 * Dequeue the next request record from a server socket and parse its RPC header.
2440 */
2441 int
2442 nfsrv_dorec(slp, nfsd, ndp)
2443 struct nfssvc_sock *slp;
2444 struct nfsd *nfsd;
2445 struct nfsrv_descript **ndp;
2446 {
2447 mbuf_t m;
2448 mbuf_t nam;
2449 struct nfsrv_descript *nd;
2450 int error;
2451
2452 *ndp = NULL;
2453 if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2454 return (ENOBUFS);
2455 MALLOC_ZONE(nd, struct nfsrv_descript *,
2456 sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2457 if (!nd)
2458 return (ENOMEM);
2459 m = slp->ns_rec;
2460 slp->ns_rec = mbuf_nextpkt(m);
2461 if (slp->ns_rec)
2462 mbuf_setnextpkt(m, NULL);
2463 else
2464 slp->ns_recend = NULL;
2465 if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2466 nam = m;
2467 m = mbuf_next(m);
2468 if ((error = mbuf_setnext(nam, NULL)))
2469 panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2470 } else
2471 nam = NULL;
2472 nd->nd_md = nd->nd_mrep = m;
2473 nd->nd_nam2 = nam;
2474 nd->nd_dpos = mbuf_data(m);
2475 error = nfs_getreq(nd, nfsd, TRUE);
2476 if (error) {
2477 if (nam)
2478 mbuf_freem(nam);
2479 FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC);
2480 return (error);
2481 }
2482 *ndp = nd;
2483 nfsd->nfsd_nd = nd;
2484 return (0);
2485 }
2486
2487 /*
2488 * Parse an RPC request
2489 * - verify it
2490 * - fill in the cred struct.
2491 */
2492 int
2493 nfs_getreq(nd, nfsd, has_header)
2494 struct nfsrv_descript *nd;
2495 struct nfsd *nfsd;
2496 int has_header;
2497 {
2498 int len, i;
2499 u_long *tl;
2500 long t1;
2501 uio_t uiop;
2502 caddr_t dpos, cp2, cp;
2503 u_long nfsvers, auth_type;
2504 uid_t nickuid;
2505 int error = 0, ticklen;
2506 mbuf_t mrep, md;
2507 struct nfsuid *nuidp;
2508 uid_t user_id;
2509 gid_t group_id;
2510 int ngroups;
2511 struct ucred temp_cred;
2512 struct timeval tvin, tvout, now;
2513 char uio_buf[ UIO_SIZEOF(1) ];
2514 #if 0 /* until encrypted keys are implemented */
2515 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2516 #endif
2517
2518 nd->nd_cr = NULL;
2519
2520 mrep = nd->nd_mrep;
2521 md = nd->nd_md;
2522 dpos = nd->nd_dpos;
2523 if (has_header) {
2524 nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2525 nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2526 if (*tl++ != rpc_call) {
2527 mbuf_freem(mrep);
2528 return (EBADRPC);
2529 }
2530 } else
2531 nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2532 nd->nd_repstat = 0;
2533 nd->nd_flag = 0;
2534 if (*tl++ != rpc_vers) {
2535 nd->nd_repstat = ERPCMISMATCH;
2536 nd->nd_procnum = NFSPROC_NOOP;
2537 return (0);
2538 }
2539 if (*tl != nfs_prog) {
2540 nd->nd_repstat = EPROGUNAVAIL;
2541 nd->nd_procnum = NFSPROC_NOOP;
2542 return (0);
2543 }
2544 tl++;
2545 nfsvers = fxdr_unsigned(u_long, *tl++);
2546 if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2547 nd->nd_repstat = EPROGMISMATCH;
2548 nd->nd_procnum = NFSPROC_NOOP;
2549 return (0);
2550 }
2551 else if (nfsvers == NFS_VER3)
2552 nd->nd_flag = ND_NFSV3;
2553 nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2554 if (nd->nd_procnum == NFSPROC_NULL)
2555 return (0);
2556 if ((nd->nd_procnum >= NFS_NPROCS) ||
2557 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2558 nd->nd_repstat = EPROCUNAVAIL;
2559 nd->nd_procnum = NFSPROC_NOOP;
2560 return (0);
2561 }
2562 if ((nd->nd_flag & ND_NFSV3) == 0)
2563 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2564 auth_type = *tl++;
2565 len = fxdr_unsigned(int, *tl++);
2566 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2567 mbuf_freem(mrep);
2568 return (EBADRPC);
2569 }
2570
2571 nd->nd_flag &= ~ND_KERBAUTH;
2572 /*
2573 * Handle auth_unix or auth_kerb.
2574 */
2575 if (auth_type == rpc_auth_unix) {
2576 len = fxdr_unsigned(int, *++tl);
2577 if (len < 0 || len > NFS_MAXNAMLEN) {
2578 mbuf_freem(mrep);
2579 return (EBADRPC);
2580 }
2581 bzero(&temp_cred, sizeof(temp_cred));
2582 nfsm_adv(nfsm_rndup(len));
2583 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2584 user_id = fxdr_unsigned(uid_t, *tl++);
2585 group_id = fxdr_unsigned(gid_t, *tl++);
2586 temp_cred.cr_groups[0] = group_id;
2587 len = fxdr_unsigned(int, *tl);
2588 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2589 mbuf_freem(mrep);
2590 return (EBADRPC);
2591 }
2592 nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2593 for (i = 1; i <= len; i++)
2594 if (i < NGROUPS)
2595 temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2596 else
2597 tl++;
2598 ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2599 if (ngroups > 1)
2600 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2601 len = fxdr_unsigned(int, *++tl);
2602 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2603 mbuf_freem(mrep);
2604 return (EBADRPC);
2605 }
2606 temp_cred.cr_uid = user_id;
2607 temp_cred.cr_ngroups = ngroups;
2608 nd->nd_cr = kauth_cred_create(&temp_cred);
2609 if (nd->nd_cr == NULL) {
2610 nd->nd_repstat = ENOMEM;
2611 nd->nd_procnum = NFSPROC_NOOP;
2612 return (0);
2613 }
2614 if (len > 0)
2615 nfsm_adv(nfsm_rndup(len));
2616 } else if (auth_type == rpc_auth_kerb) {
2617 switch (fxdr_unsigned(int, *tl++)) {
2618 case RPCAKN_FULLNAME:
2619 ticklen = fxdr_unsigned(int, *tl);
2620 *((u_long *)nfsd->nfsd_authstr) = *tl;
2621 uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2622 &uio_buf[0], sizeof(uio_buf));
2623 if (!uiop) {
2624 nd->nd_repstat = ENOMEM;
2625 nd->nd_procnum = NFSPROC_NOOP;
2626 return (0);
2627 }
2628
2629 // LP64todo - fix this
2630 nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2631 if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2632 mbuf_freem(mrep);
2633 return (EBADRPC);
2634 }
2635 uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2636 // LP64todo - fix this
2637 nfsm_mtouio(uiop, uio_resid(uiop));
2638 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2639 if (*tl++ != rpc_auth_kerb ||
2640 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2641 printf("Bad kerb verifier\n");
2642 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2643 nd->nd_procnum = NFSPROC_NOOP;
2644 return (0);
2645 }
2646 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2647 tl = (u_long *)cp;
2648 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2649 printf("Not fullname kerb verifier\n");
2650 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2651 nd->nd_procnum = NFSPROC_NOOP;
2652 return (0);
2653 }
2654 cp += NFSX_UNSIGNED;
2655 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2656 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2657 nd->nd_flag |= ND_KERBFULL;
2658 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2659 break;
2660 case RPCAKN_NICKNAME:
2661 if (len != 2 * NFSX_UNSIGNED) {
2662 printf("Kerb nickname short\n");
2663 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2664 nd->nd_procnum = NFSPROC_NOOP;
2665 return (0);
2666 }
2667 nickuid = fxdr_unsigned(uid_t, *tl);
2668 nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2669 if (*tl++ != rpc_auth_kerb ||
2670 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2671 printf("Kerb nick verifier bad\n");
2672 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2673 nd->nd_procnum = NFSPROC_NOOP;
2674 return (0);
2675 }
2676 nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2677 tvin.tv_sec = *tl++;
2678 tvin.tv_usec = *tl;
2679
2680 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2681 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2682 if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2683 (!nd->nd_nam2 ||
2684 netaddr_match(NU_NETFAM(nuidp),
2685 &nuidp->nu_haddr, nd->nd_nam2)))
2686 break;
2687 }
2688 if (!nuidp) {
2689 nd->nd_repstat =
2690 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2691 nd->nd_procnum = NFSPROC_NOOP;
2692 return (0);
2693 }
2694
2695 /*
2696 * Now, decrypt the timestamp using the session key
2697 * and validate it.
2698 */
2699 #if NFSKERB
2700 XXX
2701 #endif
2702
2703 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2704 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2705 microtime(&now);
2706 if (nuidp->nu_expire < now.tv_sec ||
2707 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2708 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2709 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2710 nuidp->nu_expire = 0;
2711 nd->nd_repstat =
2712 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2713 nd->nd_procnum = NFSPROC_NOOP;
2714 return (0);
2715 }
2716 bzero(&temp_cred, sizeof(temp_cred));
2717 ngroups = nuidp->nu_cr->cr_ngroups;
2718 for (i = 0; i < ngroups; i++)
2719 temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2720 if (ngroups > 1)
2721 nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2722
2723 temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2724 temp_cred.cr_ngroups = ngroups;
2725 nd->nd_cr = kauth_cred_create(&temp_cred);
2726 if (!nd->nd_cr) {
2727 nd->nd_repstat = ENOMEM;
2728 nd->nd_procnum = NFSPROC_NOOP;
2729 return (0);
2730 }
2731 nd->nd_flag |= ND_KERBNICK;
2732 }
2733 } else {
2734 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2735 nd->nd_procnum = NFSPROC_NOOP;
2736 return (0);
2737 }
2738
2739 nd->nd_md = md;
2740 nd->nd_dpos = dpos;
2741 return (0);
2742 nfsmout:
2743 if (nd->nd_cr)
2744 kauth_cred_rele(nd->nd_cr);
2745 return (error);
2746 }
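
#if 0
/*
 * Illustrative sketch (not compiled): the fixed prefix of the ONC RPC
 * call header that nfs_getreq() walks above, laid out as a struct for
 * reference.  Every field is a 32-bit XDR word in network byte order;
 * the credential and verifier bodies that follow are variable-length,
 * so they are noted only in comments.  The struct name is hypothetical.
 */
struct sketch_rpc_call_prefix {
	u_long	xid;		/* transaction id (stored in nd_retxid) */
	u_long	msg_type;	/* must be CALL (rpc_call above) */
	u_long	rpcvers;	/* must match rpc_vers */
	u_long	prog;		/* must match nfs_prog */
	u_long	vers;		/* NFS_VER2 or NFS_VER3 */
	u_long	proc;		/* procedure number (nd_procnum) */
	u_long	cred_flavor;	/* auth_type: auth_unix or auth_kerb */
	u_long	cred_len;	/* credential body length, <= RPCAUTH_MAXSIZ */
	/*
	 * Credential body follows (for AUTH_UNIX: stamp, opaque machine
	 * name, uid, gid, gid list), then verifier flavor, length, body.
	 */
};
#endif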
2747
2748 /*
2749 * Search for a sleeping nfsd and wake it up.
2750 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2751 * running nfsds will go look for the work in the nfssvc_sock list.
2752 * Note: Must be called with nfsd_mutex held.
2753 */
2754 void
2755 nfsrv_wakenfsd(struct nfssvc_sock *slp)
2756 {
2757 struct nfsd *nd;
2758
2759 if ((slp->ns_flag & SLP_VALID) == 0)
2760 return;
2761
2762 lck_rw_lock_exclusive(&slp->ns_rwlock);
2763
2764 if (nfsd_waiting) {
2765 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2766 if (nd->nfsd_flag & NFSD_WAITING) {
2767 nd->nfsd_flag &= ~NFSD_WAITING;
2768 if (nd->nfsd_slp)
2769 panic("nfsd wakeup");
2770 slp->ns_sref++;
2771 nd->nfsd_slp = slp;
2772 lck_rw_done(&slp->ns_rwlock);
2773 wakeup((caddr_t)nd);
2774 return;
2775 }
2776 }
2777 }
2778
2779 slp->ns_flag |= SLP_DOREC;
2780
2781 lck_rw_done(&slp->ns_rwlock);
2782
2783 nfsd_head_flag |= NFSD_CHECKSLP;
2784 }
2785 #endif /* NFS_NOSERVER */
2786
2787 static int
2788 nfs_msg(proc_t p,
2789 const char *server,
2790 const char *msg,
2791 int error)
2792 {
2793 tpr_t tpr;
2794
2795 if (p)
2796 tpr = tprintf_open(p);
2797 else
2798 tpr = NULL;
2799 if (error)
2800 tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2801 error);
2802 else
2803 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2804 tprintf_close(tpr);
2805 return (0);
2806 }
2807
2808 void
2809 nfs_down(nmp, proc, error, flags, msg)
2810 struct nfsmount *nmp;
2811 proc_t proc;
2812 int error, flags;
2813 const char *msg;
2814 {
2815 if (nmp == NULL)
2816 return;
2817 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2818 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2819 nmp->nm_state |= NFSSTA_TIMEO;
2820 }
2821 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2822 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2823 nmp->nm_state |= NFSSTA_LOCKTIMEO;
2824 }
2825 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2826 }
2827
2828 void
2829 nfs_up(nmp, proc, flags, msg)
2830 struct nfsmount *nmp;
2831 proc_t proc;
2832 int flags;
2833 const char *msg;
2834 {
2835 if (nmp == NULL)
2836 return;
2837 if (msg)
2838 nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2839 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2840 nmp->nm_state &= ~NFSSTA_TIMEO;
2841 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2842 }
2843 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2844 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2845 vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2846 }
2847 }
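
#if 0
/*
 * Illustrative sketch (not compiled): how a caller brackets an
 * unresponsive server with nfs_down()/nfs_up().  The NFSSTA_* flag
 * selects which vfs event fires (VQ_NOTRESP vs. VQ_NOTRESPLOCK), and
 * the nm_state bit keeps a repeated nfs_down() from signalling the
 * same event twice.  The message strings here are only examples.
 */
static void
sketch_server_down_up(struct nfsmount *nmp, proc_t p)
{
	nfs_down(nmp, p, ETIMEDOUT, NFSSTA_TIMEO, "not responding");
	/* ... keep retransmitting until the server answers ... */
	nfs_up(nmp, p, NFSSTA_TIMEO, "is alive again");
}
#endif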
2848