]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_usrreq.c
xnu-792.12.6.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_usrreq.c
1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * Copyright (c) 1982, 1986, 1988, 1993
32 * The Regents of the University of California. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the University of
45 * California, Berkeley and its contributors.
46 * 4. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
63 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.9 2001/08/22 00:59:12 silby Exp $
64 */
65
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/kernel.h>
70 #include <sys/sysctl.h>
71 #include <sys/mbuf.h>
72 #if INET6
73 #include <sys/domain.h>
74 #endif /* INET6 */
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/protosw.h>
78
79 #include <net/if.h>
80 #include <net/route.h>
81
82 #include <netinet/in.h>
83 #include <netinet/in_systm.h>
84 #if INET6
85 #include <netinet/ip6.h>
86 #endif
87 #include <netinet/in_pcb.h>
88 #if INET6
89 #include <netinet6/in6_pcb.h>
90 #endif
91 #include <netinet/in_var.h>
92 #include <netinet/ip_var.h>
93 #if INET6
94 #include <netinet6/ip6_var.h>
95 #endif
96 #include <netinet/tcp.h>
97 #include <netinet/tcp_fsm.h>
98 #include <netinet/tcp_seq.h>
99 #include <netinet/tcp_timer.h>
100 #include <netinet/tcp_var.h>
101 #include <netinet/tcpip.h>
102 #if TCPDEBUG
103 #include <netinet/tcp_debug.h>
104 #endif
105
106 #if IPSEC
107 #include <netinet6/ipsec.h>
108 #endif /*IPSEC*/
109
/*
 * TCP protocol interface to socket abstraction.
 */
extern	char *tcpstates[];	/* XXX ??? */

/* File-local helpers defined further down in this file. */
static int	tcp_attach(struct socket *, struct proc *);
static int	tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *);
#if INET6
static int	tcp6_connect(struct tcpcb *, struct sockaddr *, struct proc *);
#endif /* INET6 */
static struct tcpcb *
		tcp_disconnect(struct tcpcb *);
static struct tcpcb *
		tcp_usrclosed(struct tcpcb *);

/*
 * Tracing hooks used by every pru_*() handler.  They rely on locals
 * named `tp' and `so' being in scope at the point of use.
 *
 *   TCPDEBUG0      declares the saved-state variable `ostate'
 *   TCPDEBUG1()    records tp->t_state (or 0 when tp is NULL) in ostate
 *   TCPDEBUG2(req) calls tcp_trace() when tp is set and SO_DEBUG is on
 *
 * TCPDEBUG2 is wrapped in do { } while (0) so that it behaves as a single
 * statement; a bare `if' expansion would capture a following `else'.
 * With TCPDEBUG disabled all three expand to nothing.
 */
#if TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
		if (tp && (so->so_options & SO_DEBUG))			\
			tcp_trace(TA_USER, ostate, tp, 0, 0, req);	\
	} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
135
136 /*
137 * TCP attaches to socket via pru_attach(), reserving space,
138 * and an internet control block.
139 */
140 static int
141 tcp_usr_attach(struct socket *so, int proto, struct proc *p)
142 {
143 int error;
144 struct inpcb *inp = sotoinpcb(so);
145 struct tcpcb *tp = 0;
146 TCPDEBUG0;
147
148 TCPDEBUG1();
149 if (inp) {
150 error = EISCONN;
151 goto out;
152 }
153
154 error = tcp_attach(so, p);
155 if (error)
156 goto out;
157
158 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
159 so->so_linger = TCP_LINGERTIME * hz;
160 tp = sototcpcb(so);
161 out:
162 TCPDEBUG2(PRU_ATTACH);
163 return error;
164 }
165
166 /*
167 * pru_detach() detaches the TCP protocol from the socket.
168 * If the protocol state is non-embryonic, then can't
169 * do this directly: have to initiate a pru_disconnect(),
170 * which may finish later; embryonic TCB's can just
171 * be discarded here.
172 */
173 static int
174 tcp_usr_detach(struct socket *so)
175 {
176 int error = 0;
177 struct inpcb *inp = sotoinpcb(so);
178 struct tcpcb *tp;
179 TCPDEBUG0;
180
181 if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
182 return EINVAL; /* XXX */
183 }
184 #if 1
185 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
186 #endif
187 tp = intotcpcb(inp);
188 /* In case we got disconnected from the peer */
189 if (tp == 0)
190 goto out;
191 TCPDEBUG1();
192 tp = tcp_disconnect(tp);
193 out:
194 TCPDEBUG2(PRU_DETACH);
195 return error;
196 }
197
/*
 * Shared prologue/epilogue for the pru_*() handlers.  Both macros expect
 * the enclosing function to have locals `inp', `tp', `so' and `error'.
 *
 * COMMON_START() declares the trace state, returns EINVAL from the
 * enclosing function when the PCB is missing or dead, and otherwise
 * loads `tp' from the PCB and records its state for tracing.
 *
 * COMMON_END(req) supplies the `out:' label, emits the trace record and
 * returns `error'.  The trailing unreachable `goto out' presumably exists
 * so the label counts as used in handlers that never jump to it.
 */
#define	COMMON_START()						\
	TCPDEBUG0;						\
	do {							\
		if (inp == NULL ||				\
		    inp->inp_state == INPCB_STATE_DEAD)		\
			return EINVAL;				\
		tp = intotcpcb(inp);				\
		TCPDEBUG1();					\
	} while (0)

#define	COMMON_END(req)	out: TCPDEBUG2(req); return error; goto out
208
209
210 /*
211 * Give the socket an address.
212 */
213 static int
214 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
215 {
216 int error = 0;
217 struct inpcb *inp = sotoinpcb(so);
218 struct tcpcb *tp;
219 struct sockaddr_in *sinp;
220
221 COMMON_START();
222
223 /*
224 * Must check for multicast addresses and disallow binding
225 * to them.
226 */
227 sinp = (struct sockaddr_in *)nam;
228 if (sinp->sin_family == AF_INET &&
229 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
230 error = EAFNOSUPPORT;
231 goto out;
232 }
233 error = in_pcbbind(inp, nam, p);
234 if (error)
235 goto out;
236 COMMON_END(PRU_BIND);
237
238 }
239
#if INET6
/*
 * pru_bind() handler for IPv6 sockets.
 *
 * AF_INET6 multicast addresses are refused with EAFNOSUPPORT.  The PCB
 * is marked IPv6-only first; unless IN6P_IPV6_V6ONLY is set, binding to
 * the unspecified address re-enables IPv4 as well, and binding to a
 * v4-mapped address turns the PCB into an IPv4 one and binds through
 * in_pcbbind() with the converted sockaddr_in.  All other addresses go
 * to in6_pcbbind().
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	int error = 0;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)nam;

	COMMON_START();

	/* TCP cannot be bound to a multicast address. */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	inp->inp_vflag = (inp->inp_vflag & ~INP_IPV4) | INP_IPV6;

	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) {
			/* Wildcard bind: accept both families. */
			inp->inp_vflag |= INP_IPV4;
		} else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
			struct sockaddr_in sin4;

			in6_sin6_2_sin(&sin4, sin6p);
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, (struct sockaddr *)&sin4, p);
			goto out;
		}
	}

	error = in6_pcbbind(inp, nam, p);
	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
282
283 /*
284 * Prepare to accept connections.
285 */
286 static int
287 tcp_usr_listen(struct socket *so, struct proc *p)
288 {
289 int error = 0;
290 struct inpcb *inp = sotoinpcb(so);
291 struct tcpcb *tp;
292
293 COMMON_START();
294 if (inp->inp_lport == 0)
295 error = in_pcbbind(inp, (struct sockaddr *)0, p);
296 if (error == 0)
297 tp->t_state = TCPS_LISTEN;
298 COMMON_END(PRU_LISTEN);
299 }
300
#if INET6
/*
 * pru_listen() handler for IPv6 sockets.
 *
 * When no local port is assigned yet, INP_IPV4 is cleared, set again
 * unless IN6P_IPV6_V6ONLY is on, and the PCB is bound through
 * in6_pcbbind() with a null address.  The tcpcb enters TCPS_LISTEN only
 * when no error occurred.
 */
static int
tcp6_usr_listen(struct socket *so, struct proc *p)
{
	int error = 0;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;

	COMMON_START();
	if (inp->inp_lport == 0) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			inp->inp_vflag &= ~INP_IPV4;
		else
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, (struct sockaddr *)0, p);
	}
	if (!error)
		tp->t_state = TCPS_LISTEN;
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
321
322 /*
323 * Initiate connection to peer.
324 * Create a template for use in transmissions on this connection.
325 * Enter SYN_SENT state, and mark socket as connecting.
326 * Start keep-alive timer, and seed output sequence space.
327 * Send initial segment on connection.
328 */
329 static int
330 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
331 {
332 int error = 0;
333 struct inpcb *inp = sotoinpcb(so);
334 struct tcpcb *tp;
335 struct sockaddr_in *sinp;
336
337 COMMON_START();
338
339 /*
340 * Must disallow TCP ``connections'' to multicast addresses.
341 */
342 sinp = (struct sockaddr_in *)nam;
343 if (sinp->sin_family == AF_INET
344 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
345 error = EAFNOSUPPORT;
346 goto out;
347 }
348
349 #ifndef __APPLE__
350 prison_remote_ip(p, 0, &sinp->sin_addr.s_addr);
351 #endif
352
353 if ((error = tcp_connect(tp, nam, p)) != 0)
354 goto out;
355 error = tcp_output(tp);
356 COMMON_END(PRU_CONNECT);
357 }
358
#if INET6
/*
 * pru_connect() handler for IPv6 sockets.
 *
 * AF_INET6 multicast destinations are refused with EAFNOSUPPORT.  A
 * v4-mapped destination is converted to a sockaddr_in and connected
 * through tcp_connect() with the PCB switched to IPv4, unless
 * IN6P_IPV6_V6ONLY is set, in which case EINVAL is returned directly
 * (without reaching the trace hook).  Every other destination goes
 * through tcp6_connect().  On success tcp_output() sends the initial
 * segment.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	int error = 0;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)nam;

	COMMON_START();

	/* TCP ``connections'' to multicast addresses are not allowed. */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin4;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			return (EINVAL);

		in6_sin6_2_sin(&sin4, sin6p);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		error = tcp_connect(tp, (struct sockaddr *)&sin4, p);
	} else {
		inp->inp_vflag &= ~INP_IPV4;
		inp->inp_vflag |= INP_IPV6;
		error = tcp6_connect(tp, nam, p);
	}
	if (error == 0)
		error = tcp_output(tp);
	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
404
405 /*
406 * Initiate disconnect from peer.
407 * If connection never passed embryonic stage, just drop;
408 * else if don't need to let data drain, then can just drop anyways,
409 * else have to begin TCP shutdown process: mark socket disconnecting,
410 * drain unread data, state switch to reflect user close, and
411 * send segment (e.g. FIN) to peer. Socket will be really disconnected
412 * when peer sends FIN and acks ours.
413 *
414 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
415 */
416 static int
417 tcp_usr_disconnect(struct socket *so)
418 {
419 int error = 0;
420 struct inpcb *inp = sotoinpcb(so);
421 struct tcpcb *tp;
422
423 #if 1
424 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
425 #endif
426 COMMON_START();
427 /* In case we got disconnected from the peer */
428 if (tp == 0)
429 goto out;
430 tp = tcp_disconnect(tp);
431 COMMON_END(PRU_DISCONNECT);
432 }
433
434 /*
435 * Accept a connection. Essentially all the work is
436 * done at higher levels; just return the address
437 * of the peer, storing through addr.
438 */
439 static int
440 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
441 {
442 int error = 0;
443 struct inpcb *inp = sotoinpcb(so);
444 struct tcpcb *tp = NULL;
445 TCPDEBUG0;
446
447 if (so->so_state & SS_ISDISCONNECTED) {
448 error = ECONNABORTED;
449 goto out;
450 }
451 if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
452 return (EINVAL);
453 }
454 tp = intotcpcb(inp);
455 TCPDEBUG1();
456 in_setpeeraddr(so, nam);
457 COMMON_END(PRU_ACCEPT);
458 }
459
#if INET6
/*
 * pru_accept() handler for IPv6 sockets: hands back the peer's address
 * through `nam' using in6_mapped_peeraddr().
 *
 * Returns ECONNABORTED when the socket is already marked disconnected,
 * EINVAL when it has no live PCB, and 0 otherwise.
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	int error = 0;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;
	TCPDEBUG0;

	if ((so->so_state & SS_ISDISCONNECTED) != 0) {
		error = ECONNABORTED;
	} else {
		if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
			return (EINVAL);
		tp = intotcpcb(inp);
		TCPDEBUG1();
		in6_mapped_peeraddr(so, nam);
	}
	COMMON_END(PRU_ACCEPT);
}
#endif /* INET6 */
482 /*
483 * Mark the connection as being incapable of further output.
484 */
485 static int
486 tcp_usr_shutdown(struct socket *so)
487 {
488 int error = 0;
489 struct inpcb *inp = sotoinpcb(so);
490 struct tcpcb *tp;
491
492 COMMON_START();
493 socantsendmore(so);
494 /* In case we got disconnected from the peer */
495 if (tp == 0)
496 goto out;
497 tp = tcp_usrclosed(tp);
498 if (tp)
499 error = tcp_output(tp);
500 COMMON_END(PRU_SHUTDOWN);
501 }
502
503 /*
504 * After a receive, possibly send window update to peer.
505 */
506 static int
507 tcp_usr_rcvd(struct socket *so, int flags)
508 {
509 int error = 0;
510 struct inpcb *inp = sotoinpcb(so);
511 struct tcpcb *tp;
512
513 COMMON_START();
514 /* In case we got disconnected from the peer */
515 if (tp == 0)
516 goto out;
517 tcp_output(tp);
518 COMMON_END(PRU_RCVD);
519 }
520
521 /*
522 * Do a send by putting data in output queue and updating urgent
523 * marker if URG set. Possibly send more data. Unlike the other
524 * pru_*() routines, the mbuf chains are our responsibility. We
525 * must either enqueue them or free them. The other pru_* routines
526 * generally are caller-frees.
527 */
528 static int
529 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
530 struct sockaddr *nam, struct mbuf *control, struct proc *p)
531 {
532 int error = 0;
533 struct inpcb *inp = sotoinpcb(so);
534 struct tcpcb *tp;
535 #if INET6
536 int isipv6;
537 #endif
538 TCPDEBUG0;
539
540 if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD) {
541 /*
542 * OOPS! we lost a race, the TCP session got reset after
543 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
544 * network interrupt in the non-splnet() section of sosend().
545 */
546 if (m)
547 m_freem(m);
548 if (control)
549 m_freem(control);
550 error = ECONNRESET; /* XXX EPIPE? */
551 tp = NULL;
552 TCPDEBUG1();
553 goto out;
554 }
555 #if INET6
556 isipv6 = nam && nam->sa_family == AF_INET6;
557 #endif /* INET6 */
558 tp = intotcpcb(inp);
559 TCPDEBUG1();
560 if (control) {
561 /* TCP doesn't do control messages (rights, creds, etc) */
562 if (control->m_len) {
563 m_freem(control);
564 if (m)
565 m_freem(m);
566 error = EINVAL;
567 goto out;
568 }
569 m_freem(control); /* empty control, just free it */
570 }
571 if(!(flags & PRUS_OOB)) {
572 sbappend(&so->so_snd, m);
573 if (nam && tp->t_state < TCPS_SYN_SENT) {
574 /*
575 * Do implied connect if not yet connected,
576 * initialize window to default value, and
577 * initialize maxseg/maxopd using peer's cached
578 * MSS.
579 */
580 #if INET6
581 if (isipv6)
582 error = tcp6_connect(tp, nam, p);
583 else
584 #endif /* INET6 */
585 error = tcp_connect(tp, nam, p);
586 if (error)
587 goto out;
588 tp->snd_wnd = TTCP_CLIENT_SND_WND;
589 tcp_mss(tp, -1);
590 }
591
592 if (flags & PRUS_EOF) {
593 /*
594 * Close the send side of the connection after
595 * the data is sent.
596 */
597 socantsendmore(so);
598 tp = tcp_usrclosed(tp);
599 }
600 if (tp != NULL) {
601 if (flags & PRUS_MORETOCOME)
602 tp->t_flags |= TF_MORETOCOME;
603 error = tcp_output(tp);
604 if (flags & PRUS_MORETOCOME)
605 tp->t_flags &= ~TF_MORETOCOME;
606 }
607 } else {
608 if (sbspace(&so->so_snd) < -512) {
609 m_freem(m);
610 error = ENOBUFS;
611 goto out;
612 }
613 /*
614 * According to RFC961 (Assigned Protocols),
615 * the urgent pointer points to the last octet
616 * of urgent data. We continue, however,
617 * to consider it to indicate the first octet
618 * of data past the urgent section.
619 * Otherwise, snd_up should be one lower.
620 */
621 sbappend(&so->so_snd, m);
622 if (nam && tp->t_state < TCPS_SYN_SENT) {
623 /*
624 * Do implied connect if not yet connected,
625 * initialize window to default value, and
626 * initialize maxseg/maxopd using peer's cached
627 * MSS.
628 */
629 #if INET6
630 if (isipv6)
631 error = tcp6_connect(tp, nam, p);
632 else
633 #endif /* INET6 */
634 error = tcp_connect(tp, nam, p);
635 if (error)
636 goto out;
637 tp->snd_wnd = TTCP_CLIENT_SND_WND;
638 tcp_mss(tp, -1);
639 }
640 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
641 tp->t_force = 1;
642 error = tcp_output(tp);
643 tp->t_force = 0;
644 }
645 COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
646 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
647 }
648
649 /*
650 * Abort the TCP.
651 */
652 static int
653 tcp_usr_abort(struct socket *so)
654 {
655 int error = 0;
656 struct inpcb *inp = sotoinpcb(so);
657 struct tcpcb *tp;
658
659 COMMON_START();
660 /* In case we got disconnected from the peer */
661 if (tp == 0)
662 goto out;
663 tp = tcp_drop(tp, ECONNABORTED);
664 so->so_usecount--;
665 COMMON_END(PRU_ABORT);
666 }
667
668 /*
669 * Receive out-of-band data.
670 */
671 static int
672 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
673 {
674 int error = 0;
675 struct inpcb *inp = sotoinpcb(so);
676 struct tcpcb *tp;
677
678 COMMON_START();
679 if ((so->so_oobmark == 0 &&
680 (so->so_state & SS_RCVATMARK) == 0) ||
681 so->so_options & SO_OOBINLINE ||
682 tp->t_oobflags & TCPOOB_HADDATA) {
683 error = EINVAL;
684 goto out;
685 }
686 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
687 error = EWOULDBLOCK;
688 goto out;
689 }
690 m->m_len = 1;
691 *mtod(m, caddr_t) = tp->t_iobc;
692 if ((flags & MSG_PEEK) == 0)
693 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
694 COMMON_END(PRU_RCVOOB);
695 }
696
697 /* xxx - should be const */
698 struct pr_usrreqs tcp_usrreqs = {
699 tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
700 tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
701 tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd,
702 tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
703 in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp
704 };
705
#if INET6
/*
 * User-request dispatch table for TCP over IPv6.  Same positional layout
 * as the IPv4 table; only the accept, bind, connect, control, listen and
 * address slots use IPv6-specific handlers.  The slot labels below are
 * derived from the handler names; verify them against the declaration
 * of struct pr_usrreqs.
 */
struct pr_usrreqs tcp6_usrreqs = {
	tcp_usr_abort,		/* abort */
	tcp6_usr_accept,	/* accept */
	tcp_usr_attach,		/* attach */
	tcp6_usr_bind,		/* bind */
	tcp6_usr_connect,	/* connect */
	pru_connect2_notsupp,	/* connect2 (not supported) */
	in6_control,		/* control */
	tcp_usr_detach,		/* detach */
	tcp_usr_disconnect,	/* disconnect */
	tcp6_usr_listen,	/* listen */
	in6_mapped_peeraddr,	/* peer address */
	tcp_usr_rcvd,		/* rcvd */
	tcp_usr_rcvoob,		/* rcvoob */
	tcp_usr_send,		/* send */
	pru_sense_null,		/* sense */
	tcp_usr_shutdown,	/* shutdown */
	in6_mapped_sockaddr,	/* socket address */
	sosend,			/* sosend */
	soreceive,		/* soreceive */
	pru_sopoll_notsupp	/* sopoll (not supported) */
};
#endif /* INET6 */
715
716 /*
717 * Common subroutine to open a TCP connection to remote host specified
718 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
719 * port number if needed. Call in_pcbladdr to do the routing and to choose
720 * a local host address (interface). If there is an existing incarnation
721 * of the same connection in TIME-WAIT state and if the remote host was
722 * sending CC options and if the connection duration was < MSL, then
723 * truncate the previous TIME-WAIT state and proceed.
724 * Initialize connection parameters and enter SYN-SENT state.
725 */
726 static int
727 tcp_connect(tp, nam, p)
728 register struct tcpcb *tp;
729 struct sockaddr *nam;
730 struct proc *p;
731 {
732 struct inpcb *inp = tp->t_inpcb, *oinp;
733 struct socket *so = inp->inp_socket;
734 struct tcpcb *otp;
735 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
736 struct sockaddr_in *ifaddr;
737 struct rmxp_tao *taop;
738 struct rmxp_tao tao_noncached;
739 int error;
740
741 if (inp->inp_lport == 0) {
742 error = in_pcbbind(inp, (struct sockaddr *)0, p);
743 if (error)
744 return error;
745 }
746
747 /*
748 * Cannot simply call in_pcbconnect, because there might be an
749 * earlier incarnation of this same connection still in
750 * TIME_WAIT state, creating an ADDRINUSE error.
751 */
752 error = in_pcbladdr(inp, nam, &ifaddr);
753 if (error)
754 return error;
755
756 tcp_unlock(inp->inp_socket, 0, 0);
757 oinp = in_pcblookup_hash(inp->inp_pcbinfo,
758 sin->sin_addr, sin->sin_port,
759 inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
760 : ifaddr->sin_addr,
761 inp->inp_lport, 0, NULL);
762
763 tcp_lock(inp->inp_socket, 0, 0);
764 if (oinp) {
765 if (oinp != inp) /* 4143933: avoid deadlock if inp == oinp */
766 tcp_lock(oinp->inp_socket, 1, 0);
767 if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) {
768 if (oinp != inp)
769 tcp_unlock(oinp->inp_socket, 1, 0);
770 goto skip_oinp;
771 }
772
773 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
774 otp->t_state == TCPS_TIME_WAIT &&
775 otp->t_starttime < tcp_msl &&
776 (otp->t_flags & TF_RCVD_CC))
777 otp = tcp_close(otp);
778 else {
779 printf("tcp_connect: inp=%x err=EADDRINUSE\n", inp);
780 if (oinp != inp)
781 tcp_unlock(oinp->inp_socket, 1, 0);
782 return EADDRINUSE;
783 }
784 if (oinp != inp)
785 tcp_unlock(oinp->inp_socket, 1, 0);
786 }
787 skip_oinp:
788 if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr->sin_addr.s_addr :
789 inp->inp_laddr.s_addr) == sin->sin_addr.s_addr &&
790 inp->inp_lport == sin->sin_port)
791 return EINVAL;
792 if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
793 /*lock inversion issue, mostly with udp multicast packets */
794 socket_unlock(inp->inp_socket, 0);
795 lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
796 socket_lock(inp->inp_socket, 0);
797 }
798 if (inp->inp_laddr.s_addr == INADDR_ANY)
799 inp->inp_laddr = ifaddr->sin_addr;
800 inp->inp_faddr = sin->sin_addr;
801 inp->inp_fport = sin->sin_port;
802 in_pcbrehash(inp);
803 lck_rw_done(inp->inp_pcbinfo->mtx);
804
805 /* Compute window scaling to request. */
806 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
807 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
808 tp->request_r_scale++;
809
810 soisconnecting(so);
811 tcpstat.tcps_connattempt++;
812 tp->t_state = TCPS_SYN_SENT;
813 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
814 tp->iss = tcp_new_isn(tp);
815 tcp_sendseqinit(tp);
816
817 /*
818 * Generate a CC value for this connection and
819 * check whether CC or CCnew should be used.
820 */
821 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
822 taop = &tao_noncached;
823 bzero(taop, sizeof(*taop));
824 }
825
826 tp->cc_send = CC_INC(tcp_ccgen);
827 if (taop->tao_ccsent != 0 &&
828 CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
829 taop->tao_ccsent = tp->cc_send;
830 } else {
831 taop->tao_ccsent = 0;
832 tp->t_flags |= TF_SENDCCNEW;
833 }
834
835 return 0;
836 }
837
#if INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
 * host specified by the struct sockaddr_in6 in `nam'.  Calls
 * in6_pcbbind() to assign a local port if needed and in6_pcbladdr() to
 * choose the local address, recycles a matching TIME-WAIT incarnation
 * when allowed, then initializes the connection parameters and enters
 * SYN-SENT state.
 *
 *   tp   tcpcb of the connecting socket
 *   nam  destination address (struct sockaddr_in6)
 *   p    calling process, passed to in6_pcbbind()
 *
 * Returns 0 on success, EADDRINUSE when a conflicting PCB exists that
 * cannot be recycled, or the error from in6_pcbbind()/in6_pcbladdr().
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *otp;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct in6_addr addr6;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
	int error;

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, (struct sockaddr *)0, p);
		if (error)
			return error;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error)
		return error;
	/* The socket is unlocked for the duration of the hash lookup. */
	tcp_unlock(inp->inp_socket, 0, 0);
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
				  &sin6->sin6_addr, sin6->sin6_port,
				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
				  ? &addr6
				  : &inp->in6p_laddr,
				  inp->inp_lport,  0, NULL);
	tcp_lock(inp->inp_socket, 0, 0);
	/*
	 * NOTE(review): unlike tcp_connect(), this path neither locks
	 * oinp's socket nor calls in_pcb_checkstate() on it -- confirm
	 * whether that difference is intended.
	 */
	if (oinp) {
		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
		    otp->t_state == TCPS_TIME_WAIT &&
		    otp->t_starttime < tcp_msl &&
		    (otp->t_flags & TF_RCVD_CC))
			otp = tcp_close(otp);
		else
			return EADDRINUSE;
	}
	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
		/*lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
		socket_lock(inp->inp_socket, 0);
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	/* The masked flow info is an integer: test against 0, not NULL. */
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->in6p_flowinfo = sin6->sin6_flowinfo;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);

	/* Compute window scaling to request. */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tp->t_timer[TCPT_KEEP] = tcp_keepinit;
	tp->iss = tcp_new_isn(tp);
	tcp_sendseqinit(tp);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 */
	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
		/* No cache entry: work on a zeroed scratch copy instead. */
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent != 0 &&
	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = tp->cc_send;
	} else {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	}

	return 0;
}
#endif /* INET6 */
933
934 /*
935 * The new sockopt interface makes it possible for us to block in the
936 * copyin/out step (if we take a page fault). Taking a page fault at
937 * splnet() is probably a Bad Thing. (Since sockets and pcbs both now
938 * use TSM, there probably isn't any need for this function to run at
939 * splnet() any more. This needs more examination.)
940 */
941 int
942 tcp_ctloutput(so, sopt)
943 struct socket *so;
944 struct sockopt *sopt;
945 {
946 int error, opt, optval;
947 struct inpcb *inp;
948 struct tcpcb *tp;
949
950 error = 0;
951 inp = sotoinpcb(so);
952 if (inp == NULL) {
953 return (ECONNRESET);
954 }
955 if (sopt->sopt_level != IPPROTO_TCP) {
956 #if INET6
957 if (INP_CHECK_SOCKAF(so, AF_INET6))
958 error = ip6_ctloutput(so, sopt);
959 else
960 #endif /* INET6 */
961 error = ip_ctloutput(so, sopt);
962 return (error);
963 }
964 tp = intotcpcb(inp);
965 if (tp == NULL) {
966 return (ECONNRESET);
967 }
968
969 switch (sopt->sopt_dir) {
970 case SOPT_SET:
971 switch (sopt->sopt_name) {
972 case TCP_NODELAY:
973 case TCP_NOOPT:
974 case TCP_NOPUSH:
975 error = sooptcopyin(sopt, &optval, sizeof optval,
976 sizeof optval);
977 if (error)
978 break;
979
980 switch (sopt->sopt_name) {
981 case TCP_NODELAY:
982 opt = TF_NODELAY;
983 break;
984 case TCP_NOOPT:
985 opt = TF_NOOPT;
986 break;
987 case TCP_NOPUSH:
988 opt = TF_NOPUSH;
989 break;
990 default:
991 opt = 0; /* dead code to fool gcc */
992 break;
993 }
994
995 if (optval)
996 tp->t_flags |= opt;
997 else
998 tp->t_flags &= ~opt;
999 break;
1000
1001 case TCP_MAXSEG:
1002 error = sooptcopyin(sopt, &optval, sizeof optval,
1003 sizeof optval);
1004 if (error)
1005 break;
1006
1007 if (optval > 0 && optval <= tp->t_maxseg &&
1008 optval + 40 >= tcp_minmss)
1009 tp->t_maxseg = optval;
1010 else
1011 error = EINVAL;
1012 break;
1013
1014 case TCP_KEEPALIVE:
1015 error = sooptcopyin(sopt, &optval, sizeof optval,
1016 sizeof optval);
1017 if (error)
1018 break;
1019 if (optval < 0)
1020 error = EINVAL;
1021 else
1022 tp->t_keepidle = optval * PR_SLOWHZ;
1023 break;
1024
1025 default:
1026 error = ENOPROTOOPT;
1027 break;
1028 }
1029 break;
1030
1031 case SOPT_GET:
1032 switch (sopt->sopt_name) {
1033 case TCP_NODELAY:
1034 optval = tp->t_flags & TF_NODELAY;
1035 break;
1036 case TCP_MAXSEG:
1037 optval = tp->t_maxseg;
1038 break;
1039 case TCP_KEEPALIVE:
1040 optval = tp->t_keepidle / PR_SLOWHZ;
1041 break;
1042 case TCP_NOOPT:
1043 optval = tp->t_flags & TF_NOOPT;
1044 break;
1045 case TCP_NOPUSH:
1046 optval = tp->t_flags & TF_NOPUSH;
1047 break;
1048 default:
1049 error = ENOPROTOOPT;
1050 break;
1051 }
1052 if (error == 0)
1053 error = sooptcopyout(sopt, &optval, sizeof optval);
1054 break;
1055 }
1056 return (error);
1057 }
1058
1059 /*
1060 * tcp_sendspace and tcp_recvspace are the default send and receive window
1061 * sizes, respectively. These are obsolescent (this information should
1062 * be set by the route).
1063 */
1064 u_long tcp_sendspace = 1024*16;
1065 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1066 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1067 u_long tcp_recvspace = 1024*16;
1068 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1069 &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1070
1071 __private_extern__ int tcp_sockthreshold = 256;
1072 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW,
1073 &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold");
1074
1075 #define TCP_INCREASED_SPACE 65535 /* Automatically increase tcp send/rcv space to this value */
1076 /*
1077 * Attach TCP protocol to socket, allocating
1078 * internet protocol control block, tcp control block,
1079 * bufer space, and entering LISTEN state if to accept connections.
1080 */
1081 static int
1082 tcp_attach(so, p)
1083 struct socket *so;
1084 struct proc *p;
1085 {
1086 register struct tcpcb *tp;
1087 struct inpcb *inp;
1088 int error;
1089 #if INET6
1090 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL;
1091 #endif
1092
1093 error = in_pcballoc(so, &tcbinfo, p);
1094 if (error)
1095 return (error);
1096
1097 inp = sotoinpcb(so);
1098
1099 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1100 /*
1101 * The goal is to let clients have large send/rcv default windows (TCP_INCREASED_SPACE)
1102 * while not hogging mbuf space for servers. This is done by watching a threshold
1103 * of tcpcbs in use and bumping the default send and rcvspace only if under that threshold.
1104 * The theory being that busy servers have a lot more active tcpcbs and don't want the potential
1105 * memory penalty of having much larger sockbuffs. The sysctl allows to fine tune that threshold value. */
1106
1107 if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold)
1108 error = soreserve(so, MAX(TCP_INCREASED_SPACE, tcp_sendspace), MAX(TCP_INCREASED_SPACE,tcp_recvspace));
1109 else
1110 error = soreserve(so, tcp_sendspace, tcp_recvspace);
1111 if (error)
1112 return (error);
1113 }
1114
1115 #if INET6
1116 if (isipv6) {
1117 inp->inp_vflag |= INP_IPV6;
1118 inp->in6p_hops = -1; /* use kernel default */
1119 }
1120 else
1121 #endif /* INET6 */
1122 inp->inp_vflag |= INP_IPV4;
1123 tp = tcp_newtcpcb(inp);
1124 if (tp == 0) {
1125 int nofd = so->so_state & SS_NOFDREF; /* XXX */
1126
1127 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
1128 #if INET6
1129 if (isipv6)
1130 in6_pcbdetach(inp);
1131 else
1132 #endif /* INET6 */
1133 in_pcbdetach(inp);
1134 so->so_state |= nofd;
1135 return (ENOBUFS);
1136 }
1137 tp->t_state = TCPS_CLOSED;
1138 return (0);
1139 }
1140
1141 /*
1142 * Initiate (or continue) disconnect.
1143 * If embryonic state, just send reset (once).
1144 * If in ``let data drain'' option and linger null, just drop.
1145 * Otherwise (hard), mark socket disconnecting and drop
1146 * current input data; switch states based on user close, and
1147 * send segment to peer (with FIN).
1148 */
1149 static struct tcpcb *
1150 tcp_disconnect(tp)
1151 register struct tcpcb *tp;
1152 {
1153 struct socket *so = tp->t_inpcb->inp_socket;
1154
1155 if (tp->t_state < TCPS_ESTABLISHED)
1156 tp = tcp_close(tp);
1157 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1158 tp = tcp_drop(tp, 0);
1159 else {
1160 soisdisconnecting(so);
1161 sbflush(&so->so_rcv);
1162 tp = tcp_usrclosed(tp);
1163 if (tp)
1164 (void) tcp_output(tp);
1165 }
1166 return (tp);
1167 }
1168
1169 /*
1170 * User issued close, and wish to trail through shutdown states:
1171 * if never received SYN, just forget it. If got a SYN from peer,
1172 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1173 * If already got a FIN from peer, then almost done; go to LAST_ACK
1174 * state. In all other cases, have already sent FIN to peer (e.g.
1175 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1176 * for peer to send FIN or not respond to keep-alives, etc.
1177 * We can let the user exit from the close as soon as the FIN is acked.
1178 */
1179 static struct tcpcb *
1180 tcp_usrclosed(tp)
1181 register struct tcpcb *tp;
1182 {
1183
1184 switch (tp->t_state) {
1185
1186 case TCPS_CLOSED:
1187 case TCPS_LISTEN:
1188 tp->t_state = TCPS_CLOSED;
1189 tp = tcp_close(tp);
1190 break;
1191
1192 case TCPS_SYN_SENT:
1193 case TCPS_SYN_RECEIVED:
1194 tp->t_flags |= TF_NEEDFIN;
1195 break;
1196
1197 case TCPS_ESTABLISHED:
1198 tp->t_state = TCPS_FIN_WAIT_1;
1199 break;
1200
1201 case TCPS_CLOSE_WAIT:
1202 tp->t_state = TCPS_LAST_ACK;
1203 break;
1204 }
1205 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1206 soisdisconnected(tp->t_inpcb->inp_socket);
1207 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
1208 if (tp->t_state == TCPS_FIN_WAIT_2)
1209 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1210 }
1211 return (tp);
1212 }
1213