git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* Copyright (c) 1982, 1986, 1988, 1993
	30	* The Regents of the University of California. All rights reserved.
	31	*
	32	* Redistribution and use in source and binary forms, with or without
	33	* modification, are permitted provided that the following conditions
	34	* are met:
	35	* 1. Redistributions of source code must retain the above copyright
	36	* notice, this list of conditions and the following disclaimer.
	37	* 2. Redistributions in binary form must reproduce the above copyright
	38	* notice, this list of conditions and the following disclaimer in the
	39	* documentation and/or other materials provided with the distribution.
	40	* 3. All advertising materials mentioning features or use of this software
	41	* must display the following acknowledgement:
	42	* This product includes software developed by the University of
	43	* California, Berkeley and its contributors.
	44	* 4. Neither the name of the University nor the names of its contributors
	45	* may be used to endorse or promote products derived from this software
	46	* without specific prior written permission.
	47	*
	48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	58	* SUCH DAMAGE.
	59	*
	60	* From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
	61	* $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.9 2001/08/22 00:59:12 silby Exp $
	62	*/
	63
	64
	65	#include <sys/param.h>
	66	#include <sys/systm.h>
	67	#include <sys/kernel.h>
	68	#include <sys/sysctl.h>
	69	#include <sys/mbuf.h>
	70	#if INET6
	71	#include <sys/domain.h>
	72	#endif /* INET6 */
	73	#include <sys/socket.h>
	74	#include <sys/socketvar.h>
	75	#include <sys/protosw.h>
	76
	77	#include <net/if.h>
	78	#include <net/route.h>
	79
	80	#include <netinet/in.h>
	81	#include <netinet/in_systm.h>
	82	#if INET6
	83	#include <netinet/ip6.h>
	84	#endif
	85	#include <netinet/in_pcb.h>
	86	#if INET6
	87	#include <netinet6/in6_pcb.h>
	88	#endif
	89	#include <netinet/in_var.h>
	90	#include <netinet/ip_var.h>
	91	#if INET6
	92	#include <netinet6/ip6_var.h>
	93	#endif
	94	#include <netinet/tcp.h>
	95	#include <netinet/tcp_fsm.h>
	96	#include <netinet/tcp_seq.h>
	97	#include <netinet/tcp_timer.h>
	98	#include <netinet/tcp_var.h>
	99	#include <netinet/tcpip.h>
	100	#if TCPDEBUG
	101	#include <netinet/tcp_debug.h>
	102	#endif
	103
	104	#if IPSEC
	105	#include <netinet6/ipsec.h>
	106	#endif /*IPSEC*/
	107
	/*
	 * TCP protocol interface to socket abstraction.
	 */
	extern	char *tcpstates[];	/* XXX ??? */

	/*
	 * Forward declarations of the file-local helpers defined later in
	 * this file.  All of them operate on pointers; the parameter types
	 * match the definitions further down.
	 */
	static int	tcp_attach(struct socket *so, struct proc *p);
	static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
			    struct proc *p);
	#if INET6
	static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
			    struct proc *p);
	#endif /* INET6 */
	static struct tcpcb *
			tcp_disconnect(struct tcpcb *tp);
	static struct tcpcb *
			tcp_usrclosed(struct tcpcb *tp);
	122
	/*
	 * Debug-trace helpers shared by the pru_* handlers in this file.
	 *
	 *   TCPDEBUG0      declares the local `ostate' (must sit among the
	 *                  declarations of the enclosing function).
	 *   TCPDEBUG1()    snapshots tp->t_state into `ostate' (0 if tp is NULL).
	 *   TCPDEBUG2(req) calls tcp_trace() when tp is non-NULL and the socket
	 *                  has SO_DEBUG set.
	 *
	 * The macros rely on locals named `tp' and `so' being in scope.  With
	 * TCPDEBUG disabled all three expand to nothing.
	 *
	 * TCPDEBUG2 is wrapped in do { } while (0) so that it behaves as a
	 * single statement and cannot capture a following `else'.
	 */
	#if TCPDEBUG
	#define	TCPDEBUG0	int ostate = 0
	#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
	#define	TCPDEBUG2(req)	do { \
					if (tp && (so->so_options & SO_DEBUG)) \
						tcp_trace(TA_USER, ostate, tp, 0, 0, req); \
				} while (0)
	#else
	#define	TCPDEBUG0
	#define	TCPDEBUG1()
	#define	TCPDEBUG2(req)
	#endif
	133
	134	/*
	135	* TCP attaches to socket via pru_attach(), reserving space,
	136	* and an internet control block.
	137	*/
	138	static int
	139	tcp_usr_attach(struct socket so, int proto, struct proc p)
	140	{
	141	int error;
	142	struct inpcb *inp = sotoinpcb(so);
	143	struct tcpcb *tp = 0;
	144	TCPDEBUG0;
	145
	146	TCPDEBUG1();
	147	if (inp) {
	148	error = EISCONN;
	149	goto out;
	150	}
	151
	152	error = tcp_attach(so, p);
	153	if (error)
	154	goto out;
	155
	156	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
	157	so->so_linger = TCP_LINGERTIME * hz;
	158	tp = sototcpcb(so);
	159	out:
	160	TCPDEBUG2(PRU_ATTACH);
	161	return error;
	162	}
	163
	164	/*
	165	* pru_detach() detaches the TCP protocol from the socket.
	166	* If the protocol state is non-embryonic, then can't
	167	* do this directly: have to initiate a pru_disconnect(),
	168	* which may finish later; embryonic TCB's can just
	169	* be discarded here.
	170	*/
	171	static int
	172	tcp_usr_detach(struct socket *so)
	173	{
	174	int error = 0;
	175	struct inpcb *inp = sotoinpcb(so);
	176	struct tcpcb *tp;
	177	TCPDEBUG0;
	178
	179	if (inp == 0 \|\| (inp->inp_state == INPCB_STATE_DEAD)) {
	180	return EINVAL; /* XXX */
	181	}
	182	#if 1
	183	lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	184	#endif
	185	tp = intotcpcb(inp);
	186	/* In case we got disconnected from the peer */
	187	if (tp == 0)
	188	goto out;
	189	TCPDEBUG1();
	190	tp = tcp_disconnect(tp);
	191	out:
	192	TCPDEBUG2(PRU_DETACH);
	193	return error;
	194	}
	195
	/*
	 * Shared prologue/epilogue for the pru_* handlers below.
	 *
	 * COMMON_START() expects locals `inp' and `tp' in the caller.  It
	 * returns EINVAL from the enclosing function when there is no PCB or
	 * the PCB is marked dead; otherwise it loads `tp' from the PCB.  Note
	 * that `tp' may still be NULL afterwards.  Because it begins with
	 * TCPDEBUG0 (a declaration when TCPDEBUG is on) it must directly
	 * follow the caller's own declarations.
	 *
	 * COMMON_END(req) defines the `out' label, emits the debug trace and
	 * returns `error'.  The trailing `goto out' only exists so the label
	 * counts as used in handlers that never jump to it.
	 */
	#define	COMMON_START()	TCPDEBUG0; \
				do { \
					if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { \
						return EINVAL; \
					} \
					tp = intotcpcb(inp); \
					TCPDEBUG1(); \
				} while(0)

	#define	COMMON_END(req)	out: TCPDEBUG2(req); return error; goto out
	206
	207
	208	/*
	209	* Give the socket an address.
	210	*/
	211	static int
	212	tcp_usr_bind(struct socket so, struct sockaddr nam, struct proc *p)
	213	{
	214	int error = 0;
	215	struct inpcb *inp = sotoinpcb(so);
	216	struct tcpcb *tp;
	217	struct sockaddr_in *sinp;
	218
	219	COMMON_START();
	220
	221	/*
	222	* Must check for multicast addresses and disallow binding
	223	* to them.
	224	*/
	225	sinp = (struct sockaddr_in *)nam;
	226	if (sinp->sin_family == AF_INET &&
	227	IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
	228	error = EAFNOSUPPORT;
	229	goto out;
	230	}
	231	error = in_pcbbind(inp, nam, p);
	232	if (error)
	233	goto out;
	234	COMMON_END(PRU_BIND);
	235
	236	}
	237
	#if INET6
	/*
	 * IPv6 flavour of tcp_usr_bind().
	 *
	 * Rejects AF_INET6 multicast addresses with EAFNOSUPPORT.  Unless the
	 * PCB has IN6P_IPV6_V6ONLY set, an unspecified address leaves the PCB
	 * flagged for both IPv4 and IPv6, and a v4-mapped address is converted
	 * to a sockaddr_in and bound through in_pcbbind() as IPv4 only.
	 * Everything else is bound through in6_pcbbind().
	 */
	static int
	tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
	{
		int error = 0;
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp;
		struct sockaddr_in6 *sin6p;

		COMMON_START();

		/*
		 * Must check for multicast addresses and disallow binding
		 * to them.
		 */
		sin6p = (struct sockaddr_in6 *)nam;
		if (sin6p->sin6_family == AF_INET6 &&
		    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
			error = EAFNOSUPPORT;
			goto out;
		}
		inp->inp_vflag &= ~INP_IPV4;
		inp->inp_vflag |= INP_IPV6;
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
				inp->inp_vflag |= INP_IPV4;
			else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
				struct sockaddr_in sin;

				in6_sin6_2_sin(&sin, sin6p);
				inp->inp_vflag |= INP_IPV4;
				inp->inp_vflag &= ~INP_IPV6;
				error = in_pcbbind(inp, (struct sockaddr *)&sin, p);
				goto out;
			}
		}
		error = in6_pcbbind(inp, nam, p);
		COMMON_END(PRU_BIND);
	}
	#endif /* INET6 */
	280
	281	/*
	282	* Prepare to accept connections.
	283	*/
	284	static int
	285	tcp_usr_listen(struct socket so, struct proc p)
	286	{
	287	int error = 0;
	288	struct inpcb *inp = sotoinpcb(so);
	289	struct tcpcb *tp;
	290
	291	COMMON_START();
	292	if (inp->inp_lport == 0)
	293	error = in_pcbbind(inp, (struct sockaddr *)0, p);
	294	if (error == 0)
	295	tp->t_state = TCPS_LISTEN;
	296	COMMON_END(PRU_LISTEN);
	297	}
	298
	#if INET6
	/*
	 * IPv6 flavour of tcp_usr_listen().
	 *
	 * When no local port is assigned yet, the IPv4 flag on the PCB is
	 * cleared and then set again only if IN6P_IPV6_V6ONLY is off, before
	 * in6_pcbbind() picks a port.  On success the TCB moves to LISTEN.
	 */
	static int
	tcp6_usr_listen(struct socket *so, struct proc *p)
	{
		int error = 0;
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp;

		COMMON_START();
		if (inp->inp_lport == 0) {
			inp->inp_vflag &= ~INP_IPV4;
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
				inp->inp_vflag |= INP_IPV4;
			error = in6_pcbbind(inp, (struct sockaddr *)0, p);
		}
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		COMMON_END(PRU_LISTEN);
	}
	#endif /* INET6 */
	319
	320	/*
	321	* Initiate connection to peer.
	322	* Create a template for use in transmissions on this connection.
	323	* Enter SYN_SENT state, and mark socket as connecting.
	324	* Start keep-alive timer, and seed output sequence space.
	325	* Send initial segment on connection.
	326	*/
	327	static int
	328	tcp_usr_connect(struct socket so, struct sockaddr nam, struct proc *p)
	329	{
	330	int error = 0;
	331	struct inpcb *inp = sotoinpcb(so);
	332	struct tcpcb *tp;
	333	struct sockaddr_in *sinp;
	334
	335	COMMON_START();
	336
	337	/*
	338	* Must disallow TCP ``connections'' to multicast addresses.
	339	*/
	340	sinp = (struct sockaddr_in *)nam;
	341	if (sinp->sin_family == AF_INET
	342	&& IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
	343	error = EAFNOSUPPORT;
	344	goto out;
	345	}
	346
	347	#ifndef __APPLE__
	348	prison_remote_ip(p, 0, &sinp->sin_addr.s_addr);
	349	#endif
	350
	351	if ((error = tcp_connect(tp, nam, p)) != 0)
	352	goto out;
	353	error = tcp_output(tp);
	354	COMMON_END(PRU_CONNECT);
	355	}
	356
	#if INET6
	/*
	 * IPv6 flavour of tcp_usr_connect().
	 *
	 * AF_INET6 multicast destinations are rejected with EAFNOSUPPORT.
	 * A v4-mapped destination is converted to a sockaddr_in and connected
	 * through tcp_connect() with the PCB flagged IPv4-only; that is
	 * refused with EINVAL when the PCB has IN6P_IPV6_V6ONLY set.  All
	 * other destinations go through tcp6_connect().
	 *
	 * Every failure after COMMON_START() leaves through `out' so the
	 * debug trace in COMMON_END() is emitted consistently.
	 */
	static int
	tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
	{
		int error = 0;
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp;
		struct sockaddr_in6 *sin6p;

		COMMON_START();

		/*
		 * Must disallow TCP ``connections'' to multicast addresses.
		 */
		sin6p = (struct sockaddr_in6 *)nam;
		if (sin6p->sin6_family == AF_INET6
		    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
			error = EAFNOSUPPORT;
			goto out;
		}

		if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
			struct sockaddr_in sin;

			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
				error = EINVAL;
				goto out;
			}

			in6_sin6_2_sin(&sin, sin6p);
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0)
				goto out;
			error = tcp_output(tp);
			goto out;
		}
		inp->inp_vflag &= ~INP_IPV4;
		inp->inp_vflag |= INP_IPV6;
		if ((error = tcp6_connect(tp, nam, p)) != 0)
			goto out;
		error = tcp_output(tp);
		COMMON_END(PRU_CONNECT);
	}
	#endif /* INET6 */
	402
	403	/*
	404	* Initiate disconnect from peer.
	405	* If connection never passed embryonic stage, just drop;
	406	* else if don't need to let data drain, then can just drop anyways,
	407	* else have to begin TCP shutdown process: mark socket disconnecting,
	408	* drain unread data, state switch to reflect user close, and
	409	* send segment (e.g. FIN) to peer. Socket will be really disconnected
	410	* when peer sends FIN and acks ours.
	411	*
	412	* SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	413	*/
	414	static int
	415	tcp_usr_disconnect(struct socket *so)
	416	{
	417	int error = 0;
	418	struct inpcb *inp = sotoinpcb(so);
	419	struct tcpcb *tp;
	420
	421	#if 1
	422	lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	423	#endif
	424	COMMON_START();
	425	/* In case we got disconnected from the peer */
	426	if (tp == 0)
	427	goto out;
	428	tp = tcp_disconnect(tp);
	429	COMMON_END(PRU_DISCONNECT);
	430	}
	431
	432	/*
	433	* Accept a connection. Essentially all the work is
	434	* done at higher levels; just return the address
	435	* of the peer, storing through addr.
	436	*/
	437	static int
	438	tcp_usr_accept(struct socket so, struct sockaddr *nam)
	439	{
	440	int error = 0;
	441	struct inpcb *inp = sotoinpcb(so);
	442	struct tcpcb *tp = NULL;
	443	TCPDEBUG0;
	444
	445	if (so->so_state & SS_ISDISCONNECTED) {
	446	error = ECONNABORTED;
	447	goto out;
	448	}
	449	if (inp == 0 \|\| (inp->inp_state == INPCB_STATE_DEAD)) {
	450	return (EINVAL);
	451	}
	452	tp = intotcpcb(inp);
	453	TCPDEBUG1();
	454	in_setpeeraddr(so, nam);
	455	COMMON_END(PRU_ACCEPT);
	456	}
	457
	#if INET6
	/*
	 * IPv6 flavour of tcp_usr_accept(): same checks, but the peer address
	 * is produced by in6_mapped_peeraddr().
	 */
	static int
	tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
	{
		int error = 0;
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = NULL;
		TCPDEBUG0;

		if (so->so_state & SS_ISDISCONNECTED) {
			error = ECONNABORTED;
			goto out;
		}
		if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD) {
			return (EINVAL);
		}
		tp = intotcpcb(inp);
		TCPDEBUG1();
		in6_mapped_peeraddr(so, nam);
		COMMON_END(PRU_ACCEPT);
	}
	#endif /* INET6 */
	480	/*
	481	* Mark the connection as being incapable of further output.
	482	*/
	483	static int
	484	tcp_usr_shutdown(struct socket *so)
	485	{
	486	int error = 0;
	487	struct inpcb *inp = sotoinpcb(so);
	488	struct tcpcb *tp;
	489
	490	COMMON_START();
	491	socantsendmore(so);
	492	/* In case we got disconnected from the peer */
	493	if (tp == 0)
	494	goto out;
	495	tp = tcp_usrclosed(tp);
	496	if (tp)
	497	error = tcp_output(tp);
	498	COMMON_END(PRU_SHUTDOWN);
	499	}
	500
	501	/*
	502	* After a receive, possibly send window update to peer.
	503	*/
	504	static int
	505	tcp_usr_rcvd(struct socket *so, int flags)
	506	{
	507	int error = 0;
	508	struct inpcb *inp = sotoinpcb(so);
	509	struct tcpcb *tp;
	510
	511	COMMON_START();
	512	/* In case we got disconnected from the peer */
	513	if (tp == 0)
	514	goto out;
	515	tcp_output(tp);
	516	COMMON_END(PRU_RCVD);
	517	}
	518
	519	/*
	520	* Do a send by putting data in output queue and updating urgent
	521	* marker if URG set. Possibly send more data. Unlike the other
	522	* pru_*() routines, the mbuf chains are our responsibility. We
	523	* must either enqueue them or free them. The other pru_* routines
	524	* generally are caller-frees.
	525	*/
	526	static int
	527	tcp_usr_send(struct socket so, int flags, struct mbuf m,
	528	struct sockaddr nam, struct mbuf control, struct proc *p)
	529	{
	530	int error = 0;
	531	struct inpcb *inp = sotoinpcb(so);
	532	struct tcpcb *tp;
	533	#if INET6
	534	int isipv6;
	535	#endif
	536	TCPDEBUG0;
	537
	538	if (inp == NULL \|\| inp->inp_state == INPCB_STATE_DEAD) {
	539	/*
	540	* OOPS! we lost a race, the TCP session got reset after
	541	* we checked SS_CANTSENDMORE, eg: while doing uiomove or a
	542	* network interrupt in the non-splnet() section of sosend().
	543	*/
	544	if (m)
	545	m_freem(m);
	546	if (control)
	547	m_freem(control);
	548	error = ECONNRESET; /* XXX EPIPE? */
	549	tp = NULL;
	550	TCPDEBUG1();
	551	goto out;
	552	}
	553	#if INET6
	554	isipv6 = nam && nam->sa_family == AF_INET6;
	555	#endif /* INET6 */
	556	tp = intotcpcb(inp);
	557	TCPDEBUG1();
	558	if (control) {
	559	/* TCP doesn't do control messages (rights, creds, etc) */
	560	if (control->m_len) {
	561	m_freem(control);
	562	if (m)
	563	m_freem(m);
	564	error = EINVAL;
	565	goto out;
	566	}
	567	m_freem(control); /* empty control, just free it */
	568	}
	569	if(!(flags & PRUS_OOB)) {
	570	sbappend(&so->so_snd, m);
	571	if (nam && tp->t_state < TCPS_SYN_SENT) {
	572	/*
	573	* Do implied connect if not yet connected,
	574	* initialize window to default value, and
	575	* initialize maxseg/maxopd using peer's cached
	576	* MSS.
	577	*/
	578	#if INET6
	579	if (isipv6)
	580	error = tcp6_connect(tp, nam, p);
	581	else
	582	#endif /* INET6 */
	583	error = tcp_connect(tp, nam, p);
	584	if (error)
	585	goto out;
	586	tp->snd_wnd = TTCP_CLIENT_SND_WND;
	587	tcp_mss(tp, -1);
	588	}
	589
	590	if (flags & PRUS_EOF) {
	591	/*
	592	* Close the send side of the connection after
	593	* the data is sent.
	594	*/
	595	socantsendmore(so);
	596	tp = tcp_usrclosed(tp);
	597	}
	598	if (tp != NULL) {
	599	if (flags & PRUS_MORETOCOME)
	600	tp->t_flags \|= TF_MORETOCOME;
	601	error = tcp_output(tp);
	602	if (flags & PRUS_MORETOCOME)
	603	tp->t_flags &= ~TF_MORETOCOME;
	604	}
	605	} else {
	606	if (sbspace(&so->so_snd) < -512) {
	607	m_freem(m);
	608	error = ENOBUFS;
	609	goto out;
	610	}
	611	/*
	612	* According to RFC961 (Assigned Protocols),
	613	* the urgent pointer points to the last octet
	614	* of urgent data. We continue, however,
	615	* to consider it to indicate the first octet
	616	* of data past the urgent section.
	617	* Otherwise, snd_up should be one lower.
	618	*/
	619	sbappend(&so->so_snd, m);
	620	if (nam && tp->t_state < TCPS_SYN_SENT) {
	621	/*
	622	* Do implied connect if not yet connected,
	623	* initialize window to default value, and
	624	* initialize maxseg/maxopd using peer's cached
	625	* MSS.
	626	*/
	627	#if INET6
	628	if (isipv6)
	629	error = tcp6_connect(tp, nam, p);
	630	else
	631	#endif /* INET6 */
	632	error = tcp_connect(tp, nam, p);
	633	if (error)
	634	goto out;
	635	tp->snd_wnd = TTCP_CLIENT_SND_WND;
	636	tcp_mss(tp, -1);
	637	}
	638	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	639	tp->t_force = 1;
	640	error = tcp_output(tp);
	641	tp->t_force = 0;
	642	}
	643	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
	644	((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
	645	}
	646
	647	/*
	648	* Abort the TCP.
	649	*/
	650	static int
	651	tcp_usr_abort(struct socket *so)
	652	{
	653	int error = 0;
	654	struct inpcb *inp = sotoinpcb(so);
	655	struct tcpcb *tp;
	656
	657	COMMON_START();
	658	/* In case we got disconnected from the peer */
	659	if (tp == 0)
	660	goto out;
	661	tp = tcp_drop(tp, ECONNABORTED);
	662	so->so_usecount--;
	663	COMMON_END(PRU_ABORT);
	664	}
	665
	666	/*
	667	* Receive out-of-band data.
	668	*/
	669	static int
	670	tcp_usr_rcvoob(struct socket so, struct mbuf m, int flags)
	671	{
	672	int error = 0;
	673	struct inpcb *inp = sotoinpcb(so);
	674	struct tcpcb *tp;
	675
	676	COMMON_START();
	677	if ((so->so_oobmark == 0 &&
	678	(so->so_state & SS_RCVATMARK) == 0) \|\|
	679	so->so_options & SO_OOBINLINE \|\|
	680	tp->t_oobflags & TCPOOB_HADDATA) {
	681	error = EINVAL;
	682	goto out;
	683	}
	684	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
	685	error = EWOULDBLOCK;
	686	goto out;
	687	}
	688	m->m_len = 1;
	689	*mtod(m, caddr_t) = tp->t_iobc;
	690	if ((flags & MSG_PEEK) == 0)
	691	tp->t_oobflags ^= (TCPOOB_HAVEDATA \| TCPOOB_HADDATA);
	692	COMMON_END(PRU_RCVOOB);
	693	}
	694
	695	/* xxx - should be const */
	696	struct pr_usrreqs tcp_usrreqs = {
	697	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
	698	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
	699	tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd,
	700	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
	701	in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp
	702	};
	703
	#if INET6
	/*
	 * User-request dispatch table for TCP over IPv6.  Same positional
	 * layout as tcp_usrreqs; the address-family specific slots use the
	 * tcp6_ and in6_ variants, the remaining handlers are shared.
	 */
	struct pr_usrreqs tcp6_usrreqs = {
		tcp_usr_abort,
		tcp6_usr_accept,
		tcp_usr_attach,
		tcp6_usr_bind,
		tcp6_usr_connect,
		pru_connect2_notsupp,
		in6_control,
		tcp_usr_detach,
		tcp_usr_disconnect,
		tcp6_usr_listen,
		in6_mapped_peeraddr,
		tcp_usr_rcvd,
		tcp_usr_rcvoob,
		tcp_usr_send,
		pru_sense_null,
		tcp_usr_shutdown,
		in6_mapped_sockaddr,
		sosend,
		soreceive,
		pru_sopoll_notsupp
	};
	#endif /* INET6 */
	713
	714	/*
	715	* Common subroutine to open a TCP connection to remote host specified
	716	* by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
	717	* port number if needed. Call in_pcbladdr to do the routing and to choose
	718	* a local host address (interface). If there is an existing incarnation
	719	* of the same connection in TIME-WAIT state and if the remote host was
	720	* sending CC options and if the connection duration was < MSL, then
	721	* truncate the previous TIME-WAIT state and proceed.
	722	* Initialize connection parameters and enter SYN-SENT state.
	723	*/
	724	static int
	725	tcp_connect(tp, nam, p)
	726	register struct tcpcb *tp;
	727	struct sockaddr *nam;
	728	struct proc *p;
	729	{
	730	struct inpcb inp = tp->t_inpcb, oinp;
	731	struct socket *so = inp->inp_socket;
	732	struct tcpcb *otp;
	733	struct sockaddr_in sin = (struct sockaddr_in )nam;
	734	struct sockaddr_in *ifaddr;
	735	struct rmxp_tao *taop;
	736	struct rmxp_tao tao_noncached;
	737	int error;
	738
	739	if (inp->inp_lport == 0) {
	740	error = in_pcbbind(inp, (struct sockaddr *)0, p);
	741	if (error)
	742	return error;
	743	}
	744
	745	/*
	746	* Cannot simply call in_pcbconnect, because there might be an
	747	* earlier incarnation of this same connection still in
	748	* TIME_WAIT state, creating an ADDRINUSE error.
	749	*/
	750	error = in_pcbladdr(inp, nam, &ifaddr);
	751	if (error)
	752	return error;
	753
	754	tcp_unlock(inp->inp_socket, 0, 0);
	755	oinp = in_pcblookup_hash(inp->inp_pcbinfo,
	756	sin->sin_addr, sin->sin_port,
	757	inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
	758	: ifaddr->sin_addr,
	759	inp->inp_lport, 0, NULL);
	760
	761	tcp_lock(inp->inp_socket, 0, 0);
	762	if (oinp) {
	763	if (oinp != inp) /* 4143933: avoid deadlock if inp == oinp */
	764	tcp_lock(oinp->inp_socket, 1, 0);
	765	if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) {
	766	if (oinp != inp)
	767	tcp_unlock(oinp->inp_socket, 1, 0);
	768	goto skip_oinp;
	769	}
	770
	771	if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
	772	otp->t_state == TCPS_TIME_WAIT &&
	773	otp->t_starttime < tcp_msl &&
	774	(otp->t_flags & TF_RCVD_CC))
	775	otp = tcp_close(otp);
	776	else {
	777	printf("tcp_connect: inp=%x err=EADDRINUSE\n", inp);
	778	if (oinp != inp)
	779	tcp_unlock(oinp->inp_socket, 1, 0);
	780	return EADDRINUSE;
	781	}
	782	if (oinp != inp)
	783	tcp_unlock(oinp->inp_socket, 1, 0);
	784	}
	785	skip_oinp:
	786	if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr->sin_addr.s_addr :
	787	inp->inp_laddr.s_addr) == sin->sin_addr.s_addr &&
	788	inp->inp_lport == sin->sin_port)
	789	return EINVAL;
	790	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
	791	/lock inversion issue, mostly with udp multicast packets /
	792	socket_unlock(inp->inp_socket, 0);
	793	lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
	794	socket_lock(inp->inp_socket, 0);
	795	}
	796	if (inp->inp_laddr.s_addr == INADDR_ANY)
	797	inp->inp_laddr = ifaddr->sin_addr;
	798	inp->inp_faddr = sin->sin_addr;
	799	inp->inp_fport = sin->sin_port;
	800	in_pcbrehash(inp);
	801	lck_rw_done(inp->inp_pcbinfo->mtx);
	802
	803	/* Compute window scaling to request. */
	804	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	805	(TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
	806	tp->request_r_scale++;
	807
	808	soisconnecting(so);
	809	tcpstat.tcps_connattempt++;
	810	tp->t_state = TCPS_SYN_SENT;
	811	tp->t_timer[TCPT_KEEP] = tcp_keepinit;
	812	tp->iss = tcp_new_isn(tp);
	813	tcp_sendseqinit(tp);
	814
	815	/*
	816	* Generate a CC value for this connection and
	817	* check whether CC or CCnew should be used.
	818	*/
	819	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
	820	taop = &tao_noncached;
	821	bzero(taop, sizeof(*taop));
	822	}
	823
	824	tp->cc_send = CC_INC(tcp_ccgen);
	825	if (taop->tao_ccsent != 0 &&
	826	CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
	827	taop->tao_ccsent = tp->cc_send;
	828	} else {
	829	taop->tao_ccsent = 0;
	830	tp->t_flags \|= TF_SENDCCNEW;
	831	}
	832
	833	return 0;
	834	}
	835
	#if INET6
	/*
	 * IPv6 counterpart of tcp_connect(): bind a local port if needed,
	 * choose the local address with in6_pcbladdr(), check for an existing
	 * incarnation of the same connection, then fill in the foreign
	 * address/port (and flow info when supplied) and enter SYN-SENT.
	 *
	 * Returns 0 on success, EADDRINUSE when a conflicting connection
	 * exists that cannot be recycled, or the error from in6_pcbbind() /
	 * in6_pcbladdr().
	 *
	 * NOTE(review): unlike tcp_connect(), this path neither locks `oinp'
	 * nor calls in_pcb_checkstate() on it after the hash lookup - confirm
	 * whether in6_pcblookup_hash() hands back a reference that needs
	 * releasing.
	 */
	static int
	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p)
	{
		struct inpcb *inp = tp->t_inpcb, *oinp;
		struct socket *so = inp->inp_socket;
		struct tcpcb *otp;
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
		struct in6_addr addr6;
		struct rmxp_tao *taop;
		struct rmxp_tao tao_noncached;
		int error;

		if (inp->inp_lport == 0) {
			error = in6_pcbbind(inp, (struct sockaddr *)0, p);
			if (error)
				return error;
		}

		/*
		 * Cannot simply call in_pcbconnect, because there might be an
		 * earlier incarnation of this same connection still in
		 * TIME_WAIT state, creating an ADDRINUSE error.
		 */
		error = in6_pcbladdr(inp, nam, &addr6);
		if (error)
			return error;
		tcp_unlock(inp->inp_socket, 0, 0);
		oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
					  &sin6->sin6_addr, sin6->sin6_port,
					  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
					  ? &addr6
					  : &inp->in6p_laddr,
					  inp->inp_lport, 0, NULL);
		tcp_lock(inp->inp_socket, 0, 0);
		if (oinp) {
			if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
			    otp->t_state == TCPS_TIME_WAIT &&
			    otp->t_starttime < tcp_msl &&
			    (otp->t_flags & TF_RCVD_CC))
				otp = tcp_close(otp);
			else
				return EADDRINUSE;
		}
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/* lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
			inp->in6p_laddr = addr6;
		inp->in6p_faddr = sin6->sin6_addr;
		inp->inp_fport = sin6->sin6_port;
		/* Flow info is an integer bit-field: compare against 0, not NULL. */
		if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
			inp->in6p_flowinfo = sin6->sin6_flowinfo;
		in_pcbrehash(inp);
		lck_rw_done(inp->inp_pcbinfo->mtx);

		/* Compute window scaling to request.  */
		while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
			tp->request_r_scale++;

		soisconnecting(so);
		tcpstat.tcps_connattempt++;
		tp->t_state = TCPS_SYN_SENT;
		tp->t_timer[TCPT_KEEP] = tcp_keepinit;
		tp->iss = tcp_new_isn(tp);
		tcp_sendseqinit(tp);

		/*
		 * Generate a CC value for this connection and
		 * check whether CC or CCnew should be used.
		 */
		if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
			/* No cache entry: work on a zeroed local scratch copy. */
			taop = &tao_noncached;
			bzero(taop, sizeof(*taop));
		}

		tp->cc_send = CC_INC(tcp_ccgen);
		if (taop->tao_ccsent != 0 &&
		    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
			taop->tao_ccsent = tp->cc_send;
		} else {
			taop->tao_ccsent = 0;
			tp->t_flags |= TF_SENDCCNEW;
		}

		return 0;
	}
	#endif /* INET6 */
	931
	932	/*
	933	* The new sockopt interface makes it possible for us to block in the
	934	* copyin/out step (if we take a page fault). Taking a page fault at
	935	* splnet() is probably a Bad Thing. (Since sockets and pcbs both now
	936	* use TSM, there probably isn't any need for this function to run at
	937	* splnet() any more. This needs more examination.)
	938	*/
	939	int
	940	tcp_ctloutput(so, sopt)
	941	struct socket *so;
	942	struct sockopt *sopt;
	943	{
	944	int error, opt, optval;
	945	struct inpcb *inp;
	946	struct tcpcb *tp;
	947
	948	error = 0;
	949	inp = sotoinpcb(so);
	950	if (inp == NULL) {
	951	return (ECONNRESET);
	952	}
	953	if (sopt->sopt_level != IPPROTO_TCP) {
	954	#if INET6
	955	if (INP_CHECK_SOCKAF(so, AF_INET6))
	956	error = ip6_ctloutput(so, sopt);
	957	else
	958	#endif /* INET6 */
	959	error = ip_ctloutput(so, sopt);
	960	return (error);
	961	}
	962	tp = intotcpcb(inp);
	963	if (tp == NULL) {
	964	return (ECONNRESET);
	965	}
	966
	967	switch (sopt->sopt_dir) {
	968	case SOPT_SET:
	969	switch (sopt->sopt_name) {
	970	case TCP_NODELAY:
	971	case TCP_NOOPT:
	972	case TCP_NOPUSH:
	973	error = sooptcopyin(sopt, &optval, sizeof optval,
	974	sizeof optval);
	975	if (error)
	976	break;
	977
	978	switch (sopt->sopt_name) {
	979	case TCP_NODELAY:
	980	opt = TF_NODELAY;
	981	break;
	982	case TCP_NOOPT:
	983	opt = TF_NOOPT;
	984	break;
	985	case TCP_NOPUSH:
	986	opt = TF_NOPUSH;
	987	break;
	988	default:
	989	opt = 0; /* dead code to fool gcc */
	990	break;
	991	}
	992
	993	if (optval)
	994	tp->t_flags \|= opt;
	995	else
	996	tp->t_flags &= ~opt;
	997	break;
	998
	999	case TCP_MAXSEG:
	1000	error = sooptcopyin(sopt, &optval, sizeof optval,
	1001	sizeof optval);
	1002	if (error)
	1003	break;
	1004
	1005	if (optval > 0 && optval <= tp->t_maxseg &&
	1006	optval + 40 >= tcp_minmss)
	1007	tp->t_maxseg = optval;
	1008	else
	1009	error = EINVAL;
	1010	break;
	1011
	1012	case TCP_KEEPALIVE:
	1013	error = sooptcopyin(sopt, &optval, sizeof optval,
	1014	sizeof optval);
	1015	if (error)
	1016	break;
	1017	if (optval < 0)
	1018	error = EINVAL;
	1019	else
	1020	tp->t_keepidle = optval * PR_SLOWHZ;
	1021	break;
	1022
	1023	default:
	1024	error = ENOPROTOOPT;
	1025	break;
	1026	}
	1027	break;
	1028
	1029	case SOPT_GET:
	1030	switch (sopt->sopt_name) {
	1031	case TCP_NODELAY:
	1032	optval = tp->t_flags & TF_NODELAY;
	1033	break;
	1034	case TCP_MAXSEG:
	1035	optval = tp->t_maxseg;
	1036	break;
	1037	case TCP_KEEPALIVE:
	1038	optval = tp->t_keepidle / PR_SLOWHZ;
	1039	break;
	1040	case TCP_NOOPT:
	1041	optval = tp->t_flags & TF_NOOPT;
	1042	break;
	1043	case TCP_NOPUSH:
	1044	optval = tp->t_flags & TF_NOPUSH;
	1045	break;
	1046	default:
	1047	error = ENOPROTOOPT;
	1048	break;
	1049	}
	1050	if (error == 0)
	1051	error = sooptcopyout(sopt, &optval, sizeof optval);
	1052	break;
	1053	}
	1054	return (error);
	1055	}
	1056
	1057	/*
	1058	* tcp_sendspace and tcp_recvspace are the default send and receive window
	1059	* sizes, respectively. These are obsolescent (this information should
	1060	* be set by the route).
	1061	*/
	1062	u_long tcp_sendspace = 1024*16;
	1063	SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
	1064	&tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
	1065	u_long tcp_recvspace = 1024*16;
	1066	SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
	1067	&tcp_recvspace , 0, "Maximum incoming TCP datagram size");
	1068
	1069	__private_extern__ int tcp_sockthreshold = 256;
	1070	SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW,
	1071	&tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold");
	1072
	1073	#define TCP_INCREASED_SPACE 65535 /* Automatically increase tcp send/rcv space to this value */
	1074	/*
	1075	* Attach TCP protocol to socket, allocating
	1076	* internet protocol control block, tcp control block,
	1077	* bufer space, and entering LISTEN state if to accept connections.
	1078	*/
	1079	static int
	1080	tcp_attach(so, p)
	1081	struct socket *so;
	1082	struct proc *p;
	1083	{
	1084	register struct tcpcb *tp;
	1085	struct inpcb *inp;
	1086	int error;
	1087	#if INET6
	1088	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL;
	1089	#endif
	1090
	1091	error = in_pcballoc(so, &tcbinfo, p);
	1092	if (error)
	1093	return (error);
	1094
	1095	inp = sotoinpcb(so);
	1096
	1097	if (so->so_snd.sb_hiwat == 0 \|\| so->so_rcv.sb_hiwat == 0) {
	1098	/*
	1099	* The goal is to let clients have large send/rcv default windows (TCP_INCREASED_SPACE)
	1100	* while not hogging mbuf space for servers. This is done by watching a threshold
	1101	* of tcpcbs in use and bumping the default send and rcvspace only if under that threshold.
	1102	* The theory being that busy servers have a lot more active tcpcbs and don't want the potential
	1103	* memory penalty of having much larger sockbuffs. The sysctl allows to fine tune that threshold value. */
	1104
	1105	if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold)
	1106	error = soreserve(so, MAX(TCP_INCREASED_SPACE, tcp_sendspace), MAX(TCP_INCREASED_SPACE,tcp_recvspace));
	1107	else
	1108	error = soreserve(so, tcp_sendspace, tcp_recvspace);
	1109	if (error)
	1110	return (error);
	1111	}
	1112
	1113	#if INET6
	1114	if (isipv6) {
	1115	inp->inp_vflag \|= INP_IPV6;
	1116	inp->in6p_hops = -1; /* use kernel default */
	1117	}
	1118	else
	1119	#endif /* INET6 */
	1120	inp->inp_vflag \|= INP_IPV4;
	1121	tp = tcp_newtcpcb(inp);
	1122	if (tp == 0) {
	1123	int nofd = so->so_state & SS_NOFDREF; /* XXX */
	1124
	1125	so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
	1126	#if INET6
	1127	if (isipv6)
	1128	in6_pcbdetach(inp);
	1129	else
	1130	#endif /* INET6 */
	1131	in_pcbdetach(inp);
	1132	so->so_state \|= nofd;
	1133	return (ENOBUFS);
	1134	}
	1135	tp->t_state = TCPS_CLOSED;
	1136	return (0);
	1137	}
	1138
	1139	/*
	1140	* Initiate (or continue) disconnect.
	1141	* If embryonic state, just send reset (once).
	1142	* If in ``let data drain'' option and linger null, just drop.
	1143	* Otherwise (hard), mark socket disconnecting and drop
	1144	* current input data; switch states based on user close, and
	1145	* send segment to peer (with FIN).
	1146	*/
	1147	static struct tcpcb *
	1148	tcp_disconnect(tp)
	1149	register struct tcpcb *tp;
	1150	{
	1151	struct socket *so = tp->t_inpcb->inp_socket;
	1152
	1153	if (tp->t_state < TCPS_ESTABLISHED)
	1154	tp = tcp_close(tp);
	1155	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
	1156	tp = tcp_drop(tp, 0);
	1157	else {
	1158	soisdisconnecting(so);
	1159	sbflush(&so->so_rcv);
	1160	tp = tcp_usrclosed(tp);
	1161	if (tp)
	1162	(void) tcp_output(tp);
	1163	}
	1164	return (tp);
	1165	}
	1166
	1167	/*
	1168	* User issued close, and wish to trail through shutdown states:
	1169	* if never received SYN, just forget it. If got a SYN from peer,
	1170	* but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
	1171	* If already got a FIN from peer, then almost done; go to LAST_ACK
	1172	* state. In all other cases, have already sent FIN to peer (e.g.
	1173	* after PRU_SHUTDOWN), and just have to play tedious game waiting
	1174	* for peer to send FIN or not respond to keep-alives, etc.
	1175	* We can let the user exit from the close as soon as the FIN is acked.
	1176	*/
	1177	static struct tcpcb *
	1178	tcp_usrclosed(tp)
	1179	register struct tcpcb *tp;
	1180	{
	1181
	1182	switch (tp->t_state) {
	1183
	1184	case TCPS_CLOSED:
	1185	case TCPS_LISTEN:
	1186	tp->t_state = TCPS_CLOSED;
	1187	tp = tcp_close(tp);
	1188	break;
	1189
	1190	case TCPS_SYN_SENT:
	1191	case TCPS_SYN_RECEIVED:
	1192	tp->t_flags \|= TF_NEEDFIN;
	1193	break;
	1194
	1195	case TCPS_ESTABLISHED:
	1196	tp->t_state = TCPS_FIN_WAIT_1;
	1197	break;
	1198
	1199	case TCPS_CLOSE_WAIT:
	1200	tp->t_state = TCPS_LAST_ACK;
	1201	break;
	1202	}
	1203	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
	1204	soisdisconnected(tp->t_inpcb->inp_socket);
	1205	/* To prevent the connection hanging in FIN_WAIT_2 forever. */
	1206	if (tp->t_state == TCPS_FIN_WAIT_2)
	1207	tp->t_timer[TCPT_2MSL] = tcp_maxidle;
	1208	}
	1209	return (tp);
	1210	}
	1211