git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2015 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
	30	* The Regents of the University of California. All rights reserved.
	31	*
	32	* Redistribution and use in source and binary forms, with or without
	33	* modification, are permitted provided that the following conditions
	34	* are met:
	35	* 1. Redistributions of source code must retain the above copyright
	36	* notice, this list of conditions and the following disclaimer.
	37	* 2. Redistributions in binary form must reproduce the above copyright
	38	* notice, this list of conditions and the following disclaimer in the
	39	* documentation and/or other materials provided with the distribution.
	40	* 3. All advertising materials mentioning features or use of this software
	41	* must display the following acknowledgement:
	42	* This product includes software developed by the University of
	43	* California, Berkeley and its contributors.
	44	* 4. Neither the name of the University nor the names of its contributors
	45	* may be used to endorse or promote products derived from this software
	46	* without specific prior written permission.
	47	*
	48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	58	* SUCH DAMAGE.
	59	*
	60	* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
	61	* $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $
	62	*/
	63	/*
	64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
	65	* support for mandatory and extensible security protections. This notice
	66	* is included in support of clause 2.2 (b) of the Apple Public License,
	67	* Version 2.0.
	68	*/
	69
	70	#include <sys/param.h>
	71	#include <sys/systm.h>
	72	#include <sys/callout.h>
	73	#include <sys/kernel.h>
	74	#include <sys/sysctl.h>
	75	#include <sys/malloc.h>
	76	#include <sys/mbuf.h>
	77	#include <sys/domain.h>
	78	#include <sys/proc.h>
	79	#include <sys/kauth.h>
	80	#include <sys/socket.h>
	81	#include <sys/socketvar.h>
	82	#include <sys/protosw.h>
	83	#include <sys/random.h>
	84	#include <sys/syslog.h>
	85	#include <sys/mcache.h>
	86	#include <kern/locks.h>
	87	#include <kern/zalloc.h>
	88
	89	#include <dev/random/randomdev.h>
	90
	91	#include <net/route.h>
	92	#include <net/if.h>
	93	#include <net/content_filter.h>
	94
	95	#define tcp_minmssoverload fring
	96	#define _IP_VHL
	97	#include <netinet/in.h>
	98	#include <netinet/in_systm.h>
	99	#include <netinet/ip.h>
	100	#include <netinet/ip_icmp.h>
	101	#if INET6
	102	#include <netinet/ip6.h>
	103	#endif
	104	#include <netinet/in_pcb.h>
	105	#if INET6
	106	#include <netinet6/in6_pcb.h>
	107	#endif
	108	#include <netinet/in_var.h>
	109	#include <netinet/ip_var.h>
	110	#include <netinet/icmp_var.h>
	111	#if INET6
	112	#include <netinet6/ip6_var.h>
	113	#endif
	114	#include <netinet/tcp.h>
	115	#include <netinet/tcp_fsm.h>
	116	#include <netinet/tcp_seq.h>
	117	#include <netinet/tcp_timer.h>
	118	#include <netinet/tcp_var.h>
	119	#include <netinet/tcp_cc.h>
	120	#include <netinet/tcp_cache.h>
	121	#include <kern/thread_call.h>
	122
	123	#if INET6
	124	#include <netinet6/tcp6_var.h>
	125	#endif
	126	#include <netinet/tcpip.h>
	127	#if TCPDEBUG
	128	#include <netinet/tcp_debug.h>
	129	#endif
	130	#include <netinet6/ip6protosw.h>
	131
	132	#if IPSEC
	133	#include <netinet6/ipsec.h>
	134	#if INET6
	135	#include <netinet6/ipsec6.h>
	136	#endif
	137	#endif /* IPSEC */
	138
	139	#if NECP
	140	#include <net/necp.h>
	141	#endif /* NECP */
	142
	143	#undef tcp_minmssoverload
	144
	145	#if CONFIG_MACF_NET
	146	#include <security/mac_framework.h>
	147	#endif /* CONFIG_MACF_NET */
	148
	149	#include <corecrypto/ccaes.h>
	150	#include <libkern/crypto/aes.h>
	151	#include <libkern/crypto/md5.h>
	152	#include <sys/kdebug.h>
	153	#include <mach/sdt.h>
	154
	155	#include <netinet/lro_ext.h>
	156
	157	#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) \| 2))
	158
	159	extern int tcp_lq_overflow;
	160
	161	extern struct tcptimerlist tcp_timer_list;
	162	extern struct tcptailq tcp_tw_tailq;
	163
	164	int tcp_mssdflt = TCP_MSS;
	165	SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW \| CTLFLAG_LOCKED,
	166	&tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
	167
	168	#if INET6
	169	int tcp_v6mssdflt = TCP6_MSS;
	170	SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	171	CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_v6mssdflt , 0,
	172	"Default TCP Maximum Segment Size for IPv6");
	173	#endif
	174
	175	extern int tcp_do_autorcvbuf;
	176
	177	int tcp_sysctl_fastopenkey(struct sysctl_oid , void , int ,
	178	struct sysctl_req *);
	179	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key,
	180	CTLTYPE_STRING \| CTLFLAG_WR,
	181	0 , 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
	182
	183	/* Current count of half-open TFO connections */
	184	int tcp_tfo_halfcnt = 0;
	185
	186	/* Maximum of half-open TFO connection backlog */
	187	int tcp_tfo_backlog = 10;
	188	SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_backlog, CTLFLAG_RW \| CTLFLAG_LOCKED,
	189	&tcp_tfo_backlog, 0, "Backlog queue for half-open TFO connections");
	190
	191	int tcp_fastopen = TCP_FASTOPEN_CLIENT \| TCP_FASTOPEN_SERVER;
	192	SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen, CTLFLAG_RW \| CTLFLAG_LOCKED,
	193	&tcp_fastopen, 0, "Enable TCP Fastopen (RFC 7413)");
	194
	195	int tcp_tfo_fallback_min = 10;
	196	SYSCTL_INT(_net_inet_tcp, OID_AUTO, fastopen_fallback_min, CTLFLAG_RW \| CTLFLAG_LOCKED,
	197	&tcp_tfo_fallback_min, 0, "Mininum number of trials without TFO when in fallback mode");
	198
	199	/*
	200	* Minimum MSS we accept and use. This prevents DoS attacks where
	201	* we are forced to a ridiculous low MSS like 20 and send hundreds
	202	* of packets instead of one. The effect scales with the available
	203	* bandwidth and quickly saturates the CPU and network interface
	204	* with packet generation and sending. Set to zero to disable MINMSS
	205	* checking. This setting prevents us from sending too small packets.
	206	*/
	207	int tcp_minmss = TCP_MINMSS;
	208	SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW \| CTLFLAG_LOCKED,
	209	&tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
	210	int tcp_do_rfc1323 = 1;
	211	#if (DEVELOPMENT \|\| DEBUG)
	212	SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323,
	213	CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_do_rfc1323 , 0,
	214	"Enable rfc1323 (high performance TCP) extensions");
	215	#endif /* (DEVELOPMENT \|\| DEBUG) */
	216
	217	// Not used
	218	static int tcp_do_rfc1644 = 0;
	219	SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW \| CTLFLAG_LOCKED,
	220	&tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
	221
	222	static int do_tcpdrain = 0;
	223	SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW \| CTLFLAG_LOCKED, &do_tcpdrain, 0,
	224	"Enable tcp_drain routine for extra help when low on mbufs");
	225
	226	SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD \| CTLFLAG_LOCKED,
	227	&tcbinfo.ipi_count, 0, "Number of active PCBs");
	228
	229	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount,
	230	CTLFLAG_RD \| CTLFLAG_LOCKED,
	231	&tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state");
	232
	233	static int icmp_may_rst = 1;
	234	SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW \| CTLFLAG_LOCKED, &icmp_may_rst, 0,
	235	"Certain ICMP unreachable messages may abort connections in SYN_SENT");
	236
	237	static int tcp_strict_rfc1948 = 0;
	238	static int tcp_isn_reseed_interval = 0;
	239	#if (DEVELOPMENT \|\| DEBUG)
	240	SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948,
	241	CTLFLAG_RW \| CTLFLAG_LOCKED,
	242	&tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
	243
	244	SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval,
	245	CTLFLAG_RW \| CTLFLAG_LOCKED,
	246	&tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
	247	#endif /* (DEVELOPMENT \|\| DEBUG) */
	248
	249	int tcp_TCPTV_MIN = 100; /* 100ms minimum RTT */
	250	SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW \| CTLFLAG_LOCKED,
	251	&tcp_TCPTV_MIN, 0, "min rtt value allowed");
	252
	253	int tcp_rexmt_slop = TCPTV_REXMTSLOP;
	254	SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW,
	255	&tcp_rexmt_slop, 0, "Slop added to retransmit timeout");
	256
	257	__private_extern__ int tcp_use_randomport = 0;
	258	SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW \| CTLFLAG_LOCKED,
	259	&tcp_use_randomport, 0, "Randomize TCP port numbers");
	260
	261	__private_extern__ int tcp_win_scale = 3;
	262	SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor,
	263	CTLFLAG_RW \| CTLFLAG_LOCKED,
	264	&tcp_win_scale, 0, "Window scaling factor");
	265
	266	static void tcp_cleartaocache(void);
	267	static void tcp_notify(struct inpcb *, int);
	268
	269	struct zone *sack_hole_zone;
	270	struct zone *tcp_reass_zone;
	271	struct zone *tcp_bwmeas_zone;
	272	struct zone *tcp_rxt_seg_zone;
	273
	274	extern int slowlink_wsize; /* window correction for slow links */
	275	extern int path_mtu_discovery;
	276
	277	extern u_int32_t tcp_autorcvbuf_max;
	278	extern u_int32_t tcp_autorcvbuf_inc_shift;
	279	static void tcp_sbrcv_grow_rwin(struct tcpcb tp, struct sockbuf sb);
	280
	281	#define TCP_BWMEAS_BURST_MINSIZE 6
	282	#define TCP_BWMEAS_BURST_MAXSIZE 25
	283
	284	static uint32_t bwmeas_elm_size;
	285
	286	/*
	287	* Target size of TCP PCB hash tables. Must be a power of two.
	288	*
	289	* Note that this can be overridden by the kernel environment
	290	* variable net.inet.tcp.tcbhashsize
	291	*/
	292	#ifndef TCBHASHSIZE
	293	#define TCBHASHSIZE CONFIG_TCBHASHSIZE
	294	#endif
	295
	296	__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE;
	297	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD \| CTLFLAG_LOCKED,
	298	&tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
	299
	300	/*
	301	* This is the actual shape of what we allocate using the zone
	302	* allocator. Doing it this way allows us to protect both structures
	303	* using the same generation count, and also eliminates the overhead
	304	* of allocating tcpcbs separately. By hiding the structure here,
	305	* we avoid changing most of the rest of the code (although it needs
	306	* to be changed, eventually, for greater efficiency).
	307	*/
	308	#define ALIGNMENT 32
	309	struct inp_tp {
	310	struct inpcb inp;
	311	struct tcpcb tcb __attribute__((aligned(ALIGNMENT)));
	312	};
	313	#undef ALIGNMENT
	314
	315	int get_inpcb_str_size(void);
	316	int get_tcp_str_size(void);
	317
	318	static void tcpcb_to_otcpcb(struct tcpcb , struct otcpcb );
	319
	320	static lck_attr_t tcp_uptime_mtx_attr = NULL; / mutex attributes */
	321	static lck_grp_t tcp_uptime_mtx_grp = NULL; / mutex group definition */
	322	static lck_grp_attr_t tcp_uptime_mtx_grp_attr = NULL; / mutex group attributes */
	323	int tcp_notsent_lowat_check(struct socket *so);
	324
	325	static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */
	326
	327	void
	328	tcp_tfo_gen_cookie(struct inpcb inp, u_char out, size_t blk_size)
	329	{
	330	u_char in[CCAES_BLOCK_SIZE];
	331	#if INET6
	332	int isipv6 = inp->inp_vflag & INP_IPV6;
	333	#endif
	334
	335	VERIFY(blk_size == CCAES_BLOCK_SIZE);
	336
	337	bzero(&in[0], CCAES_BLOCK_SIZE);
	338	bzero(&out[0], CCAES_BLOCK_SIZE);
	339
	340	#if INET6
	341	if (isipv6)
	342	memcpy(in, &inp->in6p_faddr, sizeof(struct in6_addr));
	343	else
	344	#endif /* INET6 */
	345	memcpy(in, &inp->inp_faddr, sizeof(struct in_addr));
	346
	347	aes_encrypt_cbc(in, NULL, 1, out, &tfo_ctx);
	348	}
	349
	350	__private_extern__ int
	351	tcp_sysctl_fastopenkey(__unused struct sysctl_oid oidp, __unused void arg1,
	352	__unused int arg2, struct sysctl_req *req)
	353	{
	354	int error = 0;
	355	/* TFO-key is expressed as a string in hex format (+1 to account for \0 char) */
	356	char keystring[TCP_FASTOPEN_KEYLEN * 2 + 1];
	357	u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
	358	int i;
	359
	360	/* -1, because newlen is len without the terminating \0 character */
	361	if (req->newlen != (sizeof(keystring) - 1)) {
	362	error = EINVAL;
	363	goto exit;
	364	}
	365
	366	/* sysctl_io_string copies keystring into the oldptr of the sysctl_req.
	367	* Make sure everything is zero, to avoid putting garbage in there or
	368	* leaking the stack.
	369	*/
	370	bzero(keystring, sizeof(keystring));
	371
	372	error = sysctl_io_string(req, keystring, sizeof(keystring), 0, NULL);
	373	if (error)
	374	goto exit;
	375
	376	for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
	377	/* We jump over the keystring in 8-character (4 byte in hex) steps */
	378	if (sscanf(&keystring[i * 8], "%8x", &key[i]) != 1) {
	379	error = EINVAL;
	380	goto exit;
	381	}
	382	}
	383
	384	aes_encrypt_key128((u_char *)key, &tfo_ctx);
	385
	386	exit:
	387	return (error);
	388	}
	389
	390	int get_inpcb_str_size(void)
	391	{
	392	return sizeof(struct inpcb);
	393	}
	394
	395	int get_tcp_str_size(void)
	396	{
	397	return sizeof(struct tcpcb);
	398	}
	399
	400	int tcp_freeq(struct tcpcb *tp);
	401
	402	static int scale_to_powerof2(int size);
	403
	404	/*
	405	* This helper routine returns one of the following scaled value of size:
	406	* 1. Rounded down power of two value of size if the size value passed as
	407	* argument is not a power of two and the rounded up value overflows.
	408	* OR
	409	* 2. Rounded up power of two value of size if the size value passed as
	410	* argument is not a power of two and the rounded up value does not overflow
	411	* OR
	412	* 3. Same value as argument size if it is already a power of two.
	413	*/
	414	static int scale_to_powerof2(int size) {
	415	/* Handle special case of size = 0 */
	416	int ret = size ? size : 1;
	417
	418	if (!powerof2(ret)) {
	419	while(!powerof2(size)) {
	420	/*
	421	* Clear out least significant
	422	* set bit till size is left with
	423	* its highest set bit at which point
	424	* it is rounded down power of two.
	425	*/
	426	size = size & (size -1);
	427	}
	428
	429	/* Check for overflow when rounding up */
	430	if (0 == (size << 1)) {
	431	ret = size;
	432	} else {
	433	ret = size << 1;
	434	}
	435	}
	436
	437	return ret;
	438	}
	439
	440	static void
	441	tcp_tfo_init()
	442	{
	443	u_char key[TCP_FASTOPEN_KEYLEN];
	444
	445	read_random(key, sizeof(key));
	446	aes_encrypt_key128(key, &tfo_ctx);
	447	}
	448
	449	/*
	450	* Tcp initialization
	451	*/
	452	void
	453	tcp_init(struct protosw pp, struct domain dp)
	454	{
	455	#pragma unused(dp)
	456	static int tcp_initialized = 0;
	457	vm_size_t str_size;
	458	struct inpcbinfo *pcbinfo;
	459
	460	VERIFY((pp->pr_flags & (PR_INITIALIZED\|PR_ATTACHED)) == PR_ATTACHED);
	461
	462	if (tcp_initialized)
	463	return;
	464	tcp_initialized = 1;
	465
	466	tcp_ccgen = 1;
	467	tcp_cleartaocache();
	468
	469	tcp_keepinit = TCPTV_KEEP_INIT;
	470	tcp_keepidle = TCPTV_KEEP_IDLE;
	471	tcp_keepintvl = TCPTV_KEEPINTVL;
	472	tcp_keepcnt = TCPTV_KEEPCNT;
	473	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	474	tcp_msl = TCPTV_MSL;
	475
	476	microuptime(&tcp_uptime);
	477	read_random(&tcp_now, sizeof(tcp_now));
	478	tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */
	479
	480	tcp_tfo_init();
	481
	482	LIST_INIT(&tcb);
	483	tcbinfo.ipi_listhead = &tcb;
	484
	485	pcbinfo = &tcbinfo;
	486	/*
	487	* allocate lock group attribute and group for tcp pcb mutexes
	488	*/
	489	pcbinfo->ipi_lock_grp_attr = lck_grp_attr_alloc_init();
	490	pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb", pcbinfo->ipi_lock_grp_attr);
	491
	492	/*
	493	* allocate the lock attribute for tcp pcb mutexes
	494	*/
	495	pcbinfo->ipi_lock_attr = lck_attr_alloc_init();
	496
	497	if ((pcbinfo->ipi_lock = lck_rw_alloc_init(pcbinfo->ipi_lock_grp,
	498	pcbinfo->ipi_lock_attr)) == NULL) {
	499	panic("%s: unable to allocate PCB lock\n", __func__);
	500	/* NOTREACHED */
	501	}
	502
	503	if (tcp_tcbhashsize == 0) {
	504	/* Set to default */
	505	tcp_tcbhashsize = 512;
	506	}
	507
	508	if (!powerof2(tcp_tcbhashsize)) {
	509	int old_hash_size = tcp_tcbhashsize;
	510	tcp_tcbhashsize = scale_to_powerof2(tcp_tcbhashsize);
	511	/* Lower limit of 16 */
	512	if (tcp_tcbhashsize < 16) {
	513	tcp_tcbhashsize = 16;
	514	}
	515	printf("WARNING: TCB hash size not a power of 2, "
	516	"scaled from %d to %d.\n",
	517	old_hash_size,
	518	tcp_tcbhashsize);
	519	}
	520
	521	tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.ipi_hashmask);
	522	tcbinfo.ipi_porthashbase = hashinit(tcp_tcbhashsize, M_PCB,
	523	&tcbinfo.ipi_porthashmask);
	524	str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t));
	525	tcbinfo.ipi_zone = zinit(str_size, 120000*str_size, 8192, "tcpcb");
	526	zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE);
	527	zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE);
	528
	529	tcbinfo.ipi_gc = tcp_gc;
	530	tcbinfo.ipi_timer = tcp_itimer;
	531	in_pcbinfo_attach(&tcbinfo);
	532
	533	str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t));
	534	sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone");
	535	zone_change(sack_hole_zone, Z_CALLERACCT, FALSE);
	536	zone_change(sack_hole_zone, Z_EXPAND, TRUE);
	537
	538	str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t));
	539	tcp_reass_zone = zinit(str_size, (nmbclusters >> 4) * str_size,
	540	0, "tcp_reass_zone");
	541	if (tcp_reass_zone == NULL) {
	542	panic("%s: failed allocating tcp_reass_zone", __func__);
	543	/* NOTREACHED */
	544	}
	545	zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE);
	546	zone_change(tcp_reass_zone, Z_EXPAND, TRUE);
	547
	548	bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t));
	549	tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, "tcp_bwmeas_zone");
	550	if (tcp_bwmeas_zone == NULL) {
	551	panic("%s: failed allocating tcp_bwmeas_zone", __func__);
	552	/* NOTREACHED */
	553	}
	554	zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE);
	555	zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE);
	556
	557	str_size = P2ROUNDUP(sizeof(struct tcp_ccstate), sizeof(u_int64_t));
	558	tcp_cc_zone = zinit(str_size, 20000 * str_size, 0, "tcp_cc_zone");
	559	zone_change(tcp_cc_zone, Z_CALLERACCT, FALSE);
	560	zone_change(tcp_cc_zone, Z_EXPAND, TRUE);
	561
	562	str_size = P2ROUNDUP(sizeof(struct tcp_rxt_seg), sizeof(u_int64_t));
	563	tcp_rxt_seg_zone = zinit(str_size, 10000 * str_size, 0,
	564	"tcp_rxt_seg_zone");
	565	zone_change(tcp_rxt_seg_zone, Z_CALLERACCT, FALSE);
	566	zone_change(tcp_rxt_seg_zone, Z_EXPAND, TRUE);
	567
	568	#if INET6
	569	#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
	570	#else /* INET6 */
	571	#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
	572	#endif /* INET6 */
	573	if (max_protohdr < TCP_MINPROTOHDR) {
	574	_max_protohdr = TCP_MINPROTOHDR;
	575	_max_protohdr = max_protohdr; /* round it up */
	576	}
	577	if (max_linkhdr + max_protohdr > MCLBYTES)
	578	panic("tcp_init");
	579	#undef TCP_MINPROTOHDR
	580
	581	/* Initialize time wait and timer lists */
	582	TAILQ_INIT(&tcp_tw_tailq);
	583
	584	bzero(&tcp_timer_list, sizeof(tcp_timer_list));
	585	LIST_INIT(&tcp_timer_list.lhead);
	586	/*
	587	* allocate lock group attribute, group and attribute for the tcp timer list
	588	*/
	589	tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init();
	590	tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr);
	591	tcp_timer_list.mtx_attr = lck_attr_alloc_init();
	592	if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) {
	593	panic("failed to allocate memory for tcp_timer_list.mtx\n");
	594	};
	595	if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) {
	596	panic("failed to allocate call entry 1 in tcp_init\n");
	597	}
	598
	599	/*
	600	* allocate lock group attribute, group and attribute for tcp_uptime_lock
	601	*/
	602	tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init();
	603	tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr);
	604	tcp_uptime_mtx_attr = lck_attr_alloc_init();
	605	tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr);
	606
	607	/* Initialize TCP LRO data structures */
	608	tcp_lro_init();
	609
	610	/* Initialize TCP Cache */
	611	tcp_cache_init();
	612
	613	/*
	614	* If more than 60 MB of mbuf pool is available, increase the
	615	* maximum allowed receive and send socket buffer size.
	616	*/
	617	if (nmbclusters > 30720) {
	618	tcp_autorcvbuf_max = 1024 * 1024;
	619	tcp_autosndbuf_max = 1024 * 1024;
	620	}
	621	}
	622
	623	/*
	624	* Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
	625	* tcp_template used to store this data in mbufs, but we now recopy it out
	626	* of the tcpcb each time to conserve mbufs.
	627	*/
	628	void
	629	tcp_fillheaders(tp, ip_ptr, tcp_ptr)
	630	struct tcpcb *tp;
	631	void *ip_ptr;
	632	void *tcp_ptr;
	633	{
	634	struct inpcb *inp = tp->t_inpcb;
	635	struct tcphdr tcp_hdr = (struct tcphdr )tcp_ptr;
	636
	637	#if INET6
	638	if ((inp->inp_vflag & INP_IPV6) != 0) {
	639	struct ip6_hdr *ip6;
	640
	641	ip6 = (struct ip6_hdr *)ip_ptr;
	642	ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) \|
	643	(inp->inp_flow & IPV6_FLOWINFO_MASK);
	644	ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) \|
	645	(IPV6_VERSION & IPV6_VERSION_MASK);
	646	ip6->ip6_nxt = IPPROTO_TCP;
	647	ip6->ip6_plen = sizeof(struct tcphdr);
	648	ip6->ip6_src = inp->in6p_laddr;
	649	ip6->ip6_dst = inp->in6p_faddr;
	650	tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
	651	htonl(sizeof (struct tcphdr) + IPPROTO_TCP));
	652	} else
	653	#endif
	654	{
	655	struct ip ip = (struct ip ) ip_ptr;
	656
	657	ip->ip_vhl = IP_VHL_BORING;
	658	ip->ip_tos = 0;
	659	ip->ip_len = 0;
	660	ip->ip_id = 0;
	661	ip->ip_off = 0;
	662	ip->ip_ttl = 0;
	663	ip->ip_sum = 0;
	664	ip->ip_p = IPPROTO_TCP;
	665	ip->ip_src = inp->inp_laddr;
	666	ip->ip_dst = inp->inp_faddr;
	667	tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	668	htons(sizeof(struct tcphdr) + IPPROTO_TCP));
	669	}
	670
	671	tcp_hdr->th_sport = inp->inp_lport;
	672	tcp_hdr->th_dport = inp->inp_fport;
	673	tcp_hdr->th_seq = 0;
	674	tcp_hdr->th_ack = 0;
	675	tcp_hdr->th_x2 = 0;
	676	tcp_hdr->th_off = 5;
	677	tcp_hdr->th_flags = 0;
	678	tcp_hdr->th_win = 0;
	679	tcp_hdr->th_urp = 0;
	680	}
	681
	682	/*
	683	* Create template to be used to send tcp packets on a connection.
	684	* Allocates an mbuf and fills in a skeletal tcp/ip header. The only
	685	* use for this function is in keepalives, which use tcp_respond.
	686	*/
	687	struct tcptemp *
	688	tcp_maketemplate(tp)
	689	struct tcpcb *tp;
	690	{
	691	struct mbuf *m;
	692	struct tcptemp *n;
	693
	694	m = m_get(M_DONTWAIT, MT_HEADER);
	695	if (m == NULL)
	696	return (0);
	697	m->m_len = sizeof(struct tcptemp);
	698	n = mtod(m, struct tcptemp *);
	699
	700	tcp_fillheaders(tp, (void )&n->tt_ipgen, (void )&n->tt_t);
	701	return (n);
	702	}
	703
	704	/*
	705	* Send a single message to the TCP at address specified by
	706	* the given TCP/IP header. If m == 0, then we make a copy
	707	* of the tcpiphdr at ti and send directly to the addressed host.
	708	* This is used to force keep alive messages out using the TCP
	709	* template for a connection. If flags are given then we send
	710	* a message back to the TCP which originated the * segment ti,
	711	* and discard the mbuf containing it and any other attached mbufs.
	712	*
	713	* In any case the ack and sequence number of the transmitted
	714	* segment are as specified by the parameters.
	715	*
	716	* NOTE: If m != NULL, then ti must point to inside the mbuf.
	717	*/
	718	void
	719	tcp_respond(struct tcpcb tp, void ipgen, struct tcphdr th, struct mbuf m,
	720	tcp_seq ack, tcp_seq seq, int flags, struct tcp_respond_args *tra)
	721	{
	722	int tlen;
	723	int win = 0;
	724	struct route *ro = 0;
	725	struct route sro;
	726	struct ip *ip;
	727	struct tcphdr *nth;
	728	#if INET6
	729	struct route_in6 *ro6 = 0;
	730	struct route_in6 sro6;
	731	struct ip6_hdr *ip6;
	732	int isipv6;
	733	#endif /* INET6 */
	734	struct ifnet *outif;
	735
	736	#if INET6
	737	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
	738	ip6 = ipgen;
	739	#endif /* INET6 */
	740	ip = ipgen;
	741
	742	if (tp) {
	743	if (!(flags & TH_RST)) {
	744	win = tcp_sbspace(tp);
	745	if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale)
	746	win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
	747	}
	748	#if INET6
	749	if (isipv6)
	750	ro6 = &tp->t_inpcb->in6p_route;
	751	else
	752	#endif /* INET6 */
	753	ro = &tp->t_inpcb->inp_route;
	754	} else {
	755	#if INET6
	756	if (isipv6) {
	757	ro6 = &sro6;
	758	bzero(ro6, sizeof *ro6);
	759	} else
	760	#endif /* INET6 */
	761	{
	762	ro = &sro;
	763	bzero(ro, sizeof *ro);
	764	}
	765	}
	766	if (m == 0) {
	767	m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */
	768	if (m == NULL)
	769	return;
	770	tlen = 0;
	771	m->m_data += max_linkhdr;
	772	#if INET6
	773	if (isipv6) {
	774	VERIFY((MHLEN - max_linkhdr) >=
	775	(sizeof (ip6) + sizeof (nth)));
	776	bcopy((caddr_t)ip6, mtod(m, caddr_t),
	777	sizeof(struct ip6_hdr));
	778	ip6 = mtod(m, struct ip6_hdr *);
	779	nth = (struct tcphdr )(void )(ip6 + 1);
	780	} else
	781	#endif /* INET6 */
	782	{
	783	VERIFY((MHLEN - max_linkhdr) >=
	784	(sizeof (ip) + sizeof (nth)));
	785	bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
	786	ip = mtod(m, struct ip *);
	787	nth = (struct tcphdr )(void )(ip + 1);
	788	}
	789	bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
	790	#if MPTCP
	791	if ((tp) && (tp->t_mpflags & TMPF_RESET))
	792	flags = (TH_RST \| TH_ACK);
	793	else
	794	#endif
	795	flags = TH_ACK;
	796	} else {
	797	m_freem(m->m_next);
	798	m->m_next = 0;
	799	m->m_data = (caddr_t)ipgen;
	800	/* m_len is set later */
	801	tlen = 0;
	802	#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
	803	#if INET6
	804	if (isipv6) {
	805	/* Expect 32-bit aligned IP on strict-align platforms */
	806	IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
	807	xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
	808	nth = (struct tcphdr )(void )(ip6 + 1);
	809	} else
	810	#endif /* INET6 */
	811	{
	812	/* Expect 32-bit aligned IP on strict-align platforms */
	813	IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
	814	xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
	815	nth = (struct tcphdr )(void )(ip + 1);
	816	}
	817	if (th != nth) {
	818	/*
	819	* this is usually a case when an extension header
	820	* exists between the IPv6 header and the
	821	* TCP header.
	822	*/
	823	nth->th_sport = th->th_sport;
	824	nth->th_dport = th->th_dport;
	825	}
	826	xchg(nth->th_dport, nth->th_sport, n_short);
	827	#undef xchg
	828	}
	829	#if INET6
	830	if (isipv6) {
	831	ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
	832	tlen));
	833	tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	834	} else
	835	#endif
	836	{
	837	tlen += sizeof (struct tcpiphdr);
	838	ip->ip_len = tlen;
	839	ip->ip_ttl = ip_defttl;
	840	}
	841	m->m_len = tlen;
	842	m->m_pkthdr.len = tlen;
	843	m->m_pkthdr.rcvif = 0;
	844	#if CONFIG_MACF_NET
	845	if (tp != NULL && tp->t_inpcb != NULL) {
	846	/*
	847	* Packet is associated with a socket, so allow the
	848	* label of the response to reflect the socket label.
	849	*/
	850	mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
	851	} else {
	852	/*
	853	* Packet is not associated with a socket, so possibly
	854	* update the label in place.
	855	*/
	856	mac_netinet_tcp_reply(m);
	857	}
	858	#endif
	859
	860	nth->th_seq = htonl(seq);
	861	nth->th_ack = htonl(ack);
	862	nth->th_x2 = 0;
	863	nth->th_off = sizeof (struct tcphdr) >> 2;
	864	nth->th_flags = flags;
	865	if (tp)
	866	nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	867	else
	868	nth->th_win = htons((u_short)win);
	869	nth->th_urp = 0;
	870	#if INET6
	871	if (isipv6) {
	872	nth->th_sum = 0;
	873	nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
	874	htonl((tlen - sizeof (struct ip6_hdr)) + IPPROTO_TCP));
	875	m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
	876	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	877	ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
	878	ro6 && ro6->ro_rt ?
	879	ro6->ro_rt->rt_ifp :
	880	NULL);
	881	} else
	882	#endif /* INET6 */
	883	{
	884	nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	885	htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
	886	m->m_pkthdr.csum_flags = CSUM_TCP;
	887	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	888	}
	889	#if TCPDEBUG
	890	if (tp == NULL \|\| (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
	891	tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
	892	#endif
	893
	894	#if NECP
	895	necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0);
	896	#endif /* NECP */
	897
	898	#if IPSEC
	899	if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
	900	ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
	901	m_freem(m);
	902	return;
	903	}
	904	#endif
	905
	906	if (tp != NULL) {
	907	u_int32_t svc_flags = 0;
	908	if (isipv6) {
	909	svc_flags \|= PKT_SCF_IPV6;
	910	}
	911	set_packet_service_class(m, tp->t_inpcb->inp_socket,
	912	MBUF_SC_UNSPEC, svc_flags);
	913
	914	/* Embed flowhash and flow control flags */
	915	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
	916	m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
	917	m->m_pkthdr.pkt_flags \|= PKTF_FLOW_ID \| PKTF_FLOW_LOCALSRC;
	918	#if MPTCP
	919	/* Disable flow advisory when using MPTCP. */
	920	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
	921	#endif /* MPTCP */
	922	m->m_pkthdr.pkt_flags \|= PKTF_FLOW_ADV;
	923	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
	924	}
	925
	926	#if INET6
	927	if (isipv6) {
	928	struct ip6_out_args ip6oa = { tra->ifscope, { 0 },
	929	IP6OAF_SELECT_SRCIF \| IP6OAF_BOUND_SRCADDR, 0 };
	930
	931	if (tra->ifscope != IFSCOPE_NONE)
	932	ip6oa.ip6oa_flags \|= IP6OAF_BOUND_IF;
	933	if (tra->nocell)
	934	ip6oa.ip6oa_flags \|= IP6OAF_NO_CELLULAR;
	935	if (tra->noexpensive)
	936	ip6oa.ip6oa_flags \|= IP6OAF_NO_EXPENSIVE;
	937	if (tra->awdl_unrestricted)
	938	ip6oa.ip6oa_flags \|= IP6OAF_AWDL_UNRESTRICTED;
	939
	940	(void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
	941	NULL, &ip6oa);
	942
	943	if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
	944	(outif = ro6->ro_rt->rt_ifp) !=
	945	tp->t_inpcb->in6p_last_outifp)
	946	tp->t_inpcb->in6p_last_outifp = outif;
	947
	948	if (ro6 == &sro6)
	949	ROUTE_RELEASE(ro6);
	950	} else
	951	#endif /* INET6 */
	952	{
	953	struct ip_out_args ipoa = { tra->ifscope, { 0 },
	954	IPOAF_SELECT_SRCIF \| IPOAF_BOUND_SRCADDR, 0 };
	955
	956	if (tra->ifscope != IFSCOPE_NONE)
	957	ipoa.ipoa_flags \|= IPOAF_BOUND_IF;
	958	if (tra->nocell)
	959	ipoa.ipoa_flags \|= IPOAF_NO_CELLULAR;
	960	if (tra->noexpensive)
	961	ipoa.ipoa_flags \|= IPOAF_NO_EXPENSIVE;
	962	if (tra->awdl_unrestricted)
	963	ipoa.ipoa_flags \|= IPOAF_AWDL_UNRESTRICTED;
	964
	965	if (ro != &sro) {
	966	/* Copy the cached route and take an extra reference */
	967	inp_route_copyout(tp->t_inpcb, &sro);
	968	}
	969	/*
	970	* For consistency, pass a local route copy.
	971	*/
	972	(void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
	973
	974	if (tp != NULL && sro.ro_rt != NULL &&
	975	(outif = sro.ro_rt->rt_ifp) !=
	976	tp->t_inpcb->inp_last_outifp)
	977	tp->t_inpcb->inp_last_outifp = outif;
	978
	979	if (ro != &sro) {
	980	/* Synchronize cached PCB route */
	981	inp_route_copyin(tp->t_inpcb, &sro);
	982	} else {
	983	ROUTE_RELEASE(&sro);
	984	}
	985	}
	986	}
	987
	988	/*
	989	* Create a new TCP control block, making an
	990	* empty reassembly queue and hooking it to the argument
	991	* protocol control block. The `inp' parameter must have
	992	* come from the zone allocator set up in tcp_init().
	993	*/
	994	struct tcpcb *
	995	tcp_newtcpcb(inp)
	996	struct inpcb *inp;
	997	{
	998	struct inp_tp *it;
	999	register struct tcpcb *tp;
	1000	register struct socket *so = inp->inp_socket;
	1001	#if INET6
	1002	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
	1003	#endif /* INET6 */
	1004
	1005	calculate_tcp_clock();
	1006
	1007	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
	1008	it = (struct inp_tp )(void )inp;
	1009	tp = &it->tcb;
	1010	} else {
	1011	tp = (struct tcpcb )(void )inp->inp_saved_ppcb;
	1012	}
	1013
	1014	bzero((char *) tp, sizeof(struct tcpcb));
	1015	LIST_INIT(&tp->t_segq);
	1016	tp->t_maxseg = tp->t_maxopd =
	1017	#if INET6
	1018	isipv6 ? tcp_v6mssdflt :
	1019	#endif /* INET6 */
	1020	tcp_mssdflt;
	1021
	1022	if (tcp_do_rfc1323)
	1023	tp->t_flags = (TF_REQ_SCALE\|TF_REQ_TSTMP);
	1024	if (tcp_do_sack)
	1025	tp->t_flagsext \|= TF_SACK_ENABLE;
	1026
	1027	TAILQ_INIT(&tp->snd_holes);
	1028	SLIST_INIT(&tp->t_rxt_segments);
	1029	tp->t_inpcb = inp; /* XXX */
	1030	/*
	1031	* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	1032	* rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
	1033	* reasonable initial retransmit time.
	1034	*/
	1035	tp->t_srtt = TCPTV_SRTTBASE;
	1036	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	1037	tp->t_rttmin = tcp_TCPTV_MIN;
	1038	tp->t_rxtcur = TCPTV_RTOBASE;
	1039
	1040	if (tcp_use_newreno)
	1041	/* use newreno by default */
	1042	tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
	1043	else
	1044	tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
	1045
	1046	tcp_cc_allocate_state(tp);
	1047
	1048	if (CC_ALGO(tp)->init != NULL)
	1049	CC_ALGO(tp)->init(tp);
	1050
	1051	tp->snd_cwnd = TCP_CC_CWND_INIT_BYTES;
	1052	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	1053	tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	1054	tp->t_rcvtime = tcp_now;
	1055	tp->tentry.timer_start = tcp_now;
	1056	tp->t_persist_timeout = tcp_max_persist_timeout;
	1057	tp->t_persist_stop = 0;
	1058	tp->t_flagsext \|= TF_RCVUNACK_WAITSS;
	1059	tp->t_rexmtthresh = tcprexmtthresh;
	1060
	1061	/* Clear time wait tailq entry */
	1062	tp->t_twentry.tqe_next = NULL;
	1063	tp->t_twentry.tqe_prev = NULL;
	1064
	1065	/*
	1066	* IPv4 TTL initialization is necessary for an IPv6 socket as well,
	1067	* because the socket may be bound to an IPv6 wildcard address,
	1068	* which may match an IPv4-mapped IPv6 address.
	1069	*/
	1070	inp->inp_ip_ttl = ip_defttl;
	1071	inp->inp_ppcb = (caddr_t)tp;
	1072	return (tp); /* XXX */
	1073	}
	1074
	1075	/*
	1076	* Drop a TCP connection, reporting
	1077	* the specified error. If connection is synchronized,
	1078	* then send a RST to peer.
	1079	*/
	1080	struct tcpcb *
	1081	tcp_drop(tp, errno)
	1082	register struct tcpcb *tp;
	1083	int errno;
	1084	{
	1085	struct socket *so = tp->t_inpcb->inp_socket;
	1086	#if CONFIG_DTRACE
	1087	struct inpcb *inp = tp->t_inpcb;
	1088	#endif
	1089
	1090	if (TCPS_HAVERCVDSYN(tp->t_state)) {
	1091	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	1092	struct tcpcb *, tp, int32_t, TCPS_CLOSED);
	1093	tp->t_state = TCPS_CLOSED;
	1094	(void) tcp_output(tp);
	1095	tcpstat.tcps_drops++;
	1096	} else
	1097	tcpstat.tcps_conndrops++;
	1098	if (errno == ETIMEDOUT && tp->t_softerror)
	1099	errno = tp->t_softerror;
	1100	so->so_error = errno;
	1101	return (tcp_close(tp));
	1102	}
	1103
	1104	void
	1105	tcp_getrt_rtt(struct tcpcb tp, struct rtentry rt)
	1106	{
	1107	u_int32_t rtt = rt->rt_rmx.rmx_rtt;
	1108	int isnetlocal = (tp->t_flags & TF_LOCAL);
	1109
	1110	if (rtt != 0) {
	1111	/*
	1112	* XXX the lock bit for RTT indicates that the value
	1113	* is also a minimum value; this is subject to time.
	1114	*/
	1115	if (rt->rt_rmx.rmx_locks & RTV_RTT)
	1116	tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
	1117	else
	1118	tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
	1119	tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
	1120	tcpstat.tcps_usedrtt++;
	1121	if (rt->rt_rmx.rmx_rttvar) {
	1122	tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
	1123	(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
	1124	tcpstat.tcps_usedrttvar++;
	1125	} else {
	1126	/* default variation is +- 1 rtt */
	1127	tp->t_rttvar =
	1128	tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
	1129	}
	1130	TCPT_RANGESET(tp->t_rxtcur,
	1131	((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
	1132	tp->t_rttmin, TCPTV_REXMTMAX,
	1133	TCP_ADD_REXMTSLOP(tp));
	1134	}
	1135	}
	1136
	1137	static inline void
	1138	tcp_update_ecn_perf_stats(struct tcpcb *tp,
	1139	struct if_tcp_ecn_perf_stat *stat)
	1140	{
	1141	u_int64_t curval, oldval;
	1142	struct inpcb *inp = tp->t_inpcb;
	1143	stat->total_txpkts += inp->inp_stat->txpackets;
	1144	stat->total_rxpkts += inp->inp_stat->rxpackets;
	1145	stat->total_rxmitpkts += tp->t_stat.rxmitpkts;
	1146	stat->total_oopkts += tp->t_rcvoopack;
	1147	stat->total_reorderpkts += (tp->t_reordered_pkts + tp->t_pawsdrop +
	1148	tp->t_dsack_sent + tp->t_dsack_recvd);
	1149
	1150	/* Average RTT */
	1151	curval = (tp->t_srtt >> TCP_RTT_SHIFT);
	1152	if (curval > 0 && tp->t_rttupdated >= 16) {
	1153	if (stat->rtt_avg == 0) {
	1154	stat->rtt_avg = curval;
	1155	} else {
	1156	oldval = stat->rtt_avg;
	1157	stat->rtt_avg =
	1158	((oldval << 4) - oldval + curval) >> 4;
	1159	}
	1160	}
	1161
	1162	/* RTT variance */
	1163	curval = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
	1164	if (curval > 0 && tp->t_rttupdated >= 16) {
	1165	if (stat->rtt_var == 0) {
	1166	stat->rtt_var = curval;
	1167	} else {
	1168	oldval = stat->rtt_var;
	1169	stat->rtt_var =
	1170	((oldval << 4) - oldval + curval) >> 4;
	1171	}
	1172	}
	1173
	1174	/* Total number of SACK recovery episodes */
	1175	stat->sack_episodes += tp->t_sack_recovery_episode;
	1176
	1177	if (inp->inp_socket->so_error == ECONNRESET)
	1178	stat->rst_drop++;
	1179	return;
	1180	}
	1181
	1182	/*
	1183	* Close a TCP control block:
	1184	* discard all space held by the tcp
	1185	* discard internet protocol block
	1186	* wake up any sleepers
	1187	*/
	1188	struct tcpcb *
	1189	tcp_close(tp)
	1190	register struct tcpcb *tp;
	1191	{
	1192	struct inpcb *inp = tp->t_inpcb;
	1193	struct socket *so = inp->inp_socket;
	1194	#if INET6
	1195	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
	1196	#endif /* INET6 */
	1197	struct route *ro;
	1198	struct rtentry *rt;
	1199	int dosavessthresh;
	1200
	1201	/* tcp_close was called previously, bail */
	1202	if (inp->inp_ppcb == NULL)
	1203	return(NULL);
	1204
	1205	tcp_canceltimers(tp);
	1206	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE \| DBG_FUNC_START, tp,0,0,0,0);
	1207
	1208	/*
	1209	* If another thread for this tcp is currently in ip (indicated by
	1210	* the TF_SENDINPROG flag), defer the cleanup until after it returns
	1211	* back to tcp. This is done to serialize the close until after all
	1212	* pending output is finished, in order to avoid having the PCB be
	1213	* detached and the cached route cleaned, only for ip to cache the
	1214	* route back into the PCB again. Note that we've cleared all the
	1215	* timers at this point. Set TF_CLOSING to indicate to tcp_output()
	1216	* that is should call us again once it returns from ip; at that
	1217	* point both flags should be cleared and we can proceed further
	1218	* with the cleanup.
	1219	*/
	1220	if ((tp->t_flags & TF_CLOSING) \|\|
	1221	inp->inp_sndinprog_cnt > 0) {
	1222	tp->t_flags \|= TF_CLOSING;
	1223	return (NULL);
	1224	}
	1225
	1226	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	1227	struct tcpcb *, tp, int32_t, TCPS_CLOSED);
	1228
	1229	#if INET6
	1230	ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
	1231	#else
	1232	ro = &inp->inp_route;
	1233	#endif
	1234	rt = ro->ro_rt;
	1235	if (rt != NULL)
	1236	RT_LOCK_SPIN(rt);
	1237
	1238	/*
	1239	* If we got enough samples through the srtt filter,
	1240	* save the rtt and rttvar in the routing entry.
	1241	* 'Enough' is arbitrarily defined as the 16 samples.
	1242	* 16 samples is enough for the srtt filter to converge
	1243	* to within 5% of the correct value; fewer samples and
	1244	* we could save a very bogus rtt.
	1245	*
	1246	* Don't update the default route's characteristics and don't
	1247	* update anything that the user "locked".
	1248	*/
	1249	if (tp->t_rttupdated >= 16) {
	1250	register u_int32_t i = 0;
	1251
	1252	#if INET6
	1253	if (isipv6) {
	1254	struct sockaddr_in6 *sin6;
	1255
	1256	if (rt == NULL)
	1257	goto no_valid_rt;
	1258	sin6 = (struct sockaddr_in6 )(void )rt_key(rt);
	1259	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
	1260	goto no_valid_rt;
	1261	}
	1262	else
	1263	#endif /* INET6 */
	1264	if (ROUTE_UNUSABLE(ro) \|\|
	1265	SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
	1266	DTRACE_TCP4(state__change, void, NULL,
	1267	struct inpcb , inp, struct tcpcb , tp,
	1268	int32_t, TCPS_CLOSED);
	1269	tp->t_state = TCPS_CLOSED;
	1270	goto no_valid_rt;
	1271	}
	1272
	1273	RT_LOCK_ASSERT_HELD(rt);
	1274	if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
	1275	i = tp->t_srtt *
	1276	(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
	1277	if (rt->rt_rmx.rmx_rtt && i)
	1278	/*
	1279	* filter this update to half the old & half
	1280	* the new values, converting scale.
	1281	* See route.h and tcp_var.h for a
	1282	* description of the scaling constants.
	1283	*/
	1284	rt->rt_rmx.rmx_rtt =
	1285	(rt->rt_rmx.rmx_rtt + i) / 2;
	1286	else
	1287	rt->rt_rmx.rmx_rtt = i;
	1288	tcpstat.tcps_cachedrtt++;
	1289	}
	1290	if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
	1291	i = tp->t_rttvar *
	1292	(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
	1293	if (rt->rt_rmx.rmx_rttvar && i)
	1294	rt->rt_rmx.rmx_rttvar =
	1295	(rt->rt_rmx.rmx_rttvar + i) / 2;
	1296	else
	1297	rt->rt_rmx.rmx_rttvar = i;
	1298	tcpstat.tcps_cachedrttvar++;
	1299	}
	1300	/*
	1301	* The old comment here said:
	1302	* update the pipelimit (ssthresh) if it has been updated
	1303	* already or if a pipesize was specified & the threshhold
	1304	* got below half the pipesize. I.e., wait for bad news
	1305	* before we start updating, then update on both good
	1306	* and bad news.
	1307	*
	1308	* But we want to save the ssthresh even if no pipesize is
	1309	* specified explicitly in the route, because such
	1310	* connections still have an implicit pipesize specified
	1311	* by the global tcp_sendspace. In the absence of a reliable
	1312	* way to calculate the pipesize, it will have to do.
	1313	*/
	1314	i = tp->snd_ssthresh;
	1315	if (rt->rt_rmx.rmx_sendpipe != 0)
	1316	dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
	1317	else
	1318	dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
	1319	if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
	1320	i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
	1321	\|\| dosavessthresh) {
	1322	/*
	1323	* convert the limit from user data bytes to
	1324	* packets then to packet data bytes.
	1325	*/
	1326	i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
	1327	if (i < 2)
	1328	i = 2;
	1329	i *= (u_int32_t)(tp->t_maxseg +
	1330	#if INET6
	1331	(isipv6 ? sizeof (struct ip6_hdr) +
	1332	sizeof (struct tcphdr) :
	1333	#endif
	1334	sizeof (struct tcpiphdr)
	1335	#if INET6
	1336	)
	1337	#endif
	1338	);
	1339	if (rt->rt_rmx.rmx_ssthresh)
	1340	rt->rt_rmx.rmx_ssthresh =
	1341	(rt->rt_rmx.rmx_ssthresh + i) / 2;
	1342	else
	1343	rt->rt_rmx.rmx_ssthresh = i;
	1344	tcpstat.tcps_cachedssthresh++;
	1345	}
	1346	}
	1347
	1348	/*
	1349	* Mark route for deletion if no information is cached.
	1350	*/
	1351	if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) {
	1352	if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
	1353	rt->rt_rmx.rmx_rtt == 0) {
	1354	rt->rt_flags \|= RTF_DELCLONE;
	1355	}
	1356	}
	1357
	1358	no_valid_rt:
	1359	if (rt != NULL)
	1360	RT_UNLOCK(rt);
	1361
	1362	/* free the reassembly queue, if any */
	1363	(void) tcp_freeq(tp);
	1364
	1365	/* Collect ECN related statistics */
	1366	if (tp->ecn_flags & TE_SETUPSENT) {
	1367	if (tp->ecn_flags & TE_CLIENT_SETUP) {
	1368	INP_INC_IFNET_STAT(inp, ecn_client_setup);
	1369	if (TCP_ECN_ENABLED(tp)) {
	1370	INP_INC_IFNET_STAT(inp,
	1371	ecn_client_success);
	1372	} else if (tp->ecn_flags & TE_LOST_SYN) {
	1373	INP_INC_IFNET_STAT(inp, ecn_syn_lost);
	1374	} else {
	1375	INP_INC_IFNET_STAT(inp,
	1376	ecn_peer_nosupport);
	1377	}
	1378	} else {
	1379	INP_INC_IFNET_STAT(inp, ecn_server_setup);
	1380	if (TCP_ECN_ENABLED(tp)) {
	1381	INP_INC_IFNET_STAT(inp,
	1382	ecn_server_success);
	1383	} else if (tp->ecn_flags & TE_LOST_SYNACK) {
	1384	INP_INC_IFNET_STAT(inp,
	1385	ecn_synack_lost);
	1386	} else {
	1387	INP_INC_IFNET_STAT(inp,
	1388	ecn_peer_nosupport);
	1389	}
	1390	}
	1391	} else {
	1392	INP_INC_IFNET_STAT(inp, ecn_off_conn);
	1393	}
	1394	if (TCP_ECN_ENABLED(tp)) {
	1395	if (tp->ecn_flags & TE_RECV_ECN_CE) {
	1396	tcpstat.tcps_ecn_conn_recv_ce++;
	1397	INP_INC_IFNET_STAT(inp, ecn_conn_recv_ce);
	1398	}
	1399	if (tp->ecn_flags & TE_RECV_ECN_ECE) {
	1400	tcpstat.tcps_ecn_conn_recv_ece++;
	1401	INP_INC_IFNET_STAT(inp, ecn_conn_recv_ece);
	1402	}
	1403	if (tp->ecn_flags & (TE_RECV_ECN_CE \| TE_RECV_ECN_ECE)) {
	1404	if (tp->t_stat.txretransmitbytes > 0 \|\|
	1405	tp->t_stat.rxoutoforderbytes > 0) {
	1406	tcpstat.tcps_ecn_conn_pl_ce++;
	1407	INP_INC_IFNET_STAT(inp, ecn_conn_plce);
	1408	} else {
	1409	tcpstat.tcps_ecn_conn_nopl_ce++;
	1410	INP_INC_IFNET_STAT(inp, ecn_conn_noplce);
	1411	}
	1412	} else {
	1413	if (tp->t_stat.txretransmitbytes > 0 \|\|
	1414	tp->t_stat.rxoutoforderbytes > 0) {
	1415	tcpstat.tcps_ecn_conn_plnoce++;
	1416	INP_INC_IFNET_STAT(inp, ecn_conn_plnoce);
	1417	}
	1418	}
	1419	}
	1420
	1421	/* Aggregate performance stats */
	1422	if (inp->inp_last_outifp != NULL && !(tp->t_flags & TF_LOCAL)) {
	1423	struct ifnet *ifp = inp->inp_last_outifp;
	1424	ifnet_lock_shared(ifp);
	1425	if ((ifp->if_refflags & (IFRF_ATTACHED \| IFRF_DETACHING)) ==
	1426	IFRF_ATTACHED) {
	1427	if (inp->inp_vflag & INP_IPV6) {
	1428	ifp->if_ipv6_stat->timestamp = net_uptime();
	1429	if (TCP_ECN_ENABLED(tp)) {
	1430	tcp_update_ecn_perf_stats(tp,
	1431	&ifp->if_ipv6_stat->ecn_on);
	1432	} else {
	1433	tcp_update_ecn_perf_stats(tp,
	1434	&ifp->if_ipv6_stat->ecn_off);
	1435	}
	1436	} else {
	1437	ifp->if_ipv4_stat->timestamp = net_uptime();
	1438	if (TCP_ECN_ENABLED(tp)) {
	1439	tcp_update_ecn_perf_stats(tp,
	1440	&ifp->if_ipv4_stat->ecn_on);
	1441	} else {
	1442	tcp_update_ecn_perf_stats(tp,
	1443	&ifp->if_ipv4_stat->ecn_off);
	1444	}
	1445	}
	1446	}
	1447	ifnet_lock_done(ifp);
	1448	}
	1449
	1450	tcp_free_sackholes(tp);
	1451	if (tp->t_bwmeas != NULL) {
	1452	tcp_bwmeas_free(tp);
	1453	}
	1454	tcp_rxtseg_clean(tp);
	1455	/* Free the packet list */
	1456	if (tp->t_pktlist_head != NULL)
	1457	m_freem_list(tp->t_pktlist_head);
	1458	TCP_PKTLIST_CLEAR(tp);
	1459
	1460	#if MPTCP
	1461	/* Clear MPTCP state */
	1462	if ((so->so_flags & SOF_MPTCP_TRUE) \|\|
	1463	(so->so_flags & SOF_MP_SUBFLOW)) {
	1464	soevent(so, (SO_FILT_HINT_LOCKED \| SO_FILT_HINT_DELETEOK));
	1465	}
	1466	tp->t_mpflags = 0;
	1467	tp->t_mptcb = NULL;
	1468	#endif /* MPTCP */
	1469
	1470	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER)
	1471	inp->inp_saved_ppcb = (caddr_t) tp;
	1472
	1473	tp->t_state = TCPS_CLOSED;
	1474
	1475	/* Issue a wakeup before detach so that we don't miss
	1476	* a wakeup
	1477	*/
	1478	sodisconnectwakeup(so);
	1479
	1480	/*
	1481	* Clean up any LRO state
	1482	*/
	1483	if (tp->t_flagsext & TF_LRO_OFFLOADED) {
	1484	tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
	1485	inp->inp_lport, inp->inp_fport);
	1486	tp->t_flagsext &= ~TF_LRO_OFFLOADED;
	1487	}
	1488
	1489	/*
	1490	* If this is a socket that does not want to wakeup the device
	1491	* for it's traffic, the application might need to know that the
	1492	* socket is closed, send a notification.
	1493	*/
	1494	if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
	1495	inp->inp_state != INPCB_STATE_DEAD &&
	1496	!(inp->inp_flags2 & INP2_TIMEWAIT))
	1497	socket_post_kev_msg_closed(so);
	1498
	1499	if (CC_ALGO(tp)->cleanup != NULL) {
	1500	CC_ALGO(tp)->cleanup(tp);
	1501	}
	1502
	1503	if (tp->t_ccstate != NULL) {
	1504	zfree(tcp_cc_zone, tp->t_ccstate);
	1505	tp->t_ccstate = NULL;
	1506	}
	1507	tp->tcp_cc_index = TCP_CC_ALGO_NONE;
	1508
	1509	/* Can happen if we close the socket before receiving the third ACK */
	1510	if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
	1511	OSDecrementAtomic(&tcp_tfo_halfcnt);
	1512
	1513	/* Panic if something has gone terribly wrong. */
	1514	VERIFY(tcp_tfo_halfcnt >= 0);
	1515
	1516	tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
	1517	}
	1518
	1519	#if INET6
	1520	if (SOCK_CHECK_DOM(so, PF_INET6))
	1521	in6_pcbdetach(inp);
	1522	else
	1523	#endif /* INET6 */
	1524	in_pcbdetach(inp);
	1525
	1526	/* Call soisdisconnected after detach because it might unlock the socket */
	1527	soisdisconnected(so);
	1528	tcpstat.tcps_closed++;
	1529	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE \| DBG_FUNC_END,
	1530	tcpstat.tcps_closed, 0, 0, 0, 0);
	1531	return(NULL);
	1532	}
	1533
	1534	int
	1535	tcp_freeq(tp)
	1536	struct tcpcb *tp;
	1537	{
	1538
	1539	register struct tseg_qent *q;
	1540	int rv = 0;
	1541
	1542	while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
	1543	LIST_REMOVE(q, tqe_q);
	1544	m_freem(q->tqe_m);
	1545	zfree(tcp_reass_zone, q);
	1546	rv = 1;
	1547	}
	1548	tp->t_reassqlen = 0;
	1549	return (rv);
	1550	}
	1551
	1552
	1553	/*
	1554	* Walk the tcpbs, if existing, and flush the reassembly queue,
	1555	* if there is one when do_tcpdrain is enabled
	1556	* Also defunct the extended background idle socket
	1557	* Do it next time if the pcbinfo lock is in use
	1558	*/
	1559	void
	1560	tcp_drain()
	1561	{
	1562	struct inpcb *inp;
	1563	struct tcpcb *tp;
	1564
	1565	if (!lck_rw_try_lock_exclusive(tcbinfo.ipi_lock))
	1566	return;
	1567
	1568	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
	1569	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
	1570	WNT_STOPUSING) {
	1571	tcp_lock(inp->inp_socket, 1, 0);
	1572	if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
	1573	== WNT_STOPUSING) {
	1574	/* lost a race, try the next one */
	1575	tcp_unlock(inp->inp_socket, 1, 0);
	1576	continue;
	1577	}
	1578	tp = intotcpcb(inp);
	1579
	1580	if (do_tcpdrain)
	1581	tcp_freeq(tp);
	1582
	1583	so_drain_extended_bk_idle(inp->inp_socket);
	1584
	1585	tcp_unlock(inp->inp_socket, 1, 0);
	1586	}
	1587	}
	1588	lck_rw_done(tcbinfo.ipi_lock);
	1589
	1590	}
	1591
	1592	/*
	1593	* Notify a tcp user of an asynchronous error;
	1594	* store error as soft error, but wake up user
	1595	* (for now, won't do anything until can select for soft error).
	1596	*
	1597	* Do not wake up user since there currently is no mechanism for
	1598	* reporting soft errors (yet - a kqueue filter may be added).
	1599	*/
	1600	static void
	1601	tcp_notify(inp, error)
	1602	struct inpcb *inp;
	1603	int error;
	1604	{
	1605	struct tcpcb *tp;
	1606
	1607	if (inp == NULL \|\| (inp->inp_state == INPCB_STATE_DEAD))
	1608	return; /* pcb is gone already */
	1609
	1610	tp = (struct tcpcb *)inp->inp_ppcb;
	1611
	1612	/*
	1613	* Ignore some errors if we are hooked up.
	1614	* If connection hasn't completed, has retransmitted several times,
	1615	* and receives a second error, give up now. This is better
	1616	* than waiting a long time to establish a connection that
	1617	* can never complete.
	1618	*/
	1619	if (tp->t_state == TCPS_ESTABLISHED &&
	1620	(error == EHOSTUNREACH \|\| error == ENETUNREACH \|\|
	1621	error == EHOSTDOWN)) {
	1622	return;
	1623	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	1624	tp->t_softerror)
	1625	tcp_drop(tp, error);
	1626	else
	1627	tp->t_softerror = error;
	1628	#if 0
	1629	wakeup((caddr_t) &so->so_timeo);
	1630	sorwakeup(so);
	1631	sowwakeup(so);
	1632	#endif
	1633	}
	1634
	1635	struct bwmeas*
	1636	tcp_bwmeas_alloc(struct tcpcb *tp)
	1637	{
	1638	struct bwmeas *elm;
	1639	elm = zalloc(tcp_bwmeas_zone);
	1640	if (elm == NULL)
	1641	return(elm);
	1642
	1643	bzero(elm, bwmeas_elm_size);
	1644	elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
	1645	elm->bw_maxsizepkts = TCP_BWMEAS_BURST_MAXSIZE;
	1646	elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
	1647	elm->bw_maxsize = elm->bw_maxsizepkts * tp->t_maxseg;
	1648	return(elm);
	1649	}
	1650
	1651	void
	1652	tcp_bwmeas_free(struct tcpcb* tp)
	1653	{
	1654	zfree(tcp_bwmeas_zone, tp->t_bwmeas);
	1655	tp->t_bwmeas = NULL;
	1656	tp->t_flagsext &= ~(TF_MEASURESNDBW);
	1657	}
	1658
	1659	/*
	1660	* tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
	1661	* The otcpcb data structure is passed to user space and must not change.
	1662	*/
	1663	static void
	1664	tcpcb_to_otcpcb(struct tcpcb tp, struct otcpcb otp)
	1665	{
	1666	otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
	1667	otp->t_dupacks = tp->t_dupacks;
	1668	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
	1669	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
	1670	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
	1671	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
	1672	otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRPERM(tp->t_inpcb);
	1673	otp->t_state = tp->t_state;
	1674	otp->t_flags = tp->t_flags;
	1675	otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
	1676	otp->snd_una = tp->snd_una;
	1677	otp->snd_max = tp->snd_max;
	1678	otp->snd_nxt = tp->snd_nxt;
	1679	otp->snd_up = tp->snd_up;
	1680	otp->snd_wl1 = tp->snd_wl1;
	1681	otp->snd_wl2 = tp->snd_wl2;
	1682	otp->iss = tp->iss;
	1683	otp->irs = tp->irs;
	1684	otp->rcv_nxt = tp->rcv_nxt;
	1685	otp->rcv_adv = tp->rcv_adv;
	1686	otp->rcv_wnd = tp->rcv_wnd;
	1687	otp->rcv_up = tp->rcv_up;
	1688	otp->snd_wnd = tp->snd_wnd;
	1689	otp->snd_cwnd = tp->snd_cwnd;
	1690	otp->snd_ssthresh = tp->snd_ssthresh;
	1691	otp->t_maxopd = tp->t_maxopd;
	1692	otp->t_rcvtime = tp->t_rcvtime;
	1693	otp->t_starttime = tp->t_starttime;
	1694	otp->t_rtttime = tp->t_rtttime;
	1695	otp->t_rtseq = tp->t_rtseq;
	1696	otp->t_rxtcur = tp->t_rxtcur;
	1697	otp->t_maxseg = tp->t_maxseg;
	1698	otp->t_srtt = tp->t_srtt;
	1699	otp->t_rttvar = tp->t_rttvar;
	1700	otp->t_rxtshift = tp->t_rxtshift;
	1701	otp->t_rttmin = tp->t_rttmin;
	1702	otp->t_rttupdated = tp->t_rttupdated;
	1703	otp->max_sndwnd = tp->max_sndwnd;
	1704	otp->t_softerror = tp->t_softerror;
	1705	otp->t_oobflags = tp->t_oobflags;
	1706	otp->t_iobc = tp->t_iobc;
	1707	otp->snd_scale = tp->snd_scale;
	1708	otp->rcv_scale = tp->rcv_scale;
	1709	otp->request_r_scale = tp->request_r_scale;
	1710	otp->requested_s_scale = tp->requested_s_scale;
	1711	otp->ts_recent = tp->ts_recent;
	1712	otp->ts_recent_age = tp->ts_recent_age;
	1713	otp->last_ack_sent = tp->last_ack_sent;
	1714	otp->cc_send = 0;
	1715	otp->cc_recv = 0;
	1716	otp->snd_recover = tp->snd_recover;
	1717	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
	1718	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
	1719	otp->t_badrxtwin = 0;
	1720	}
	1721
	1722	static int
	1723	tcp_pcblist SYSCTL_HANDLER_ARGS
	1724	{
	1725	#pragma unused(oidp, arg1, arg2)
	1726	int error, i = 0, n;
	1727	struct inpcb inp, *inp_list;
	1728	struct tcpcb *tp;
	1729	inp_gen_t gencnt;
	1730	struct xinpgen xig;
	1731
	1732	/*
	1733	* The process of preparing the TCB list is too time-consuming and
	1734	* resource-intensive to repeat twice on every request.
	1735	*/
	1736	lck_rw_lock_shared(tcbinfo.ipi_lock);
	1737	if (req->oldptr == USER_ADDR_NULL) {
	1738	n = tcbinfo.ipi_count;
	1739	req->oldidx = 2 * (sizeof xig)
	1740	+ (n + n/8) * sizeof(struct xtcpcb);
	1741	lck_rw_done(tcbinfo.ipi_lock);
	1742	return 0;
	1743	}
	1744
	1745	if (req->newptr != USER_ADDR_NULL) {
	1746	lck_rw_done(tcbinfo.ipi_lock);
	1747	return EPERM;
	1748	}
	1749
	1750	/*
	1751	* OK, now we're committed to doing something.
	1752	*/
	1753	gencnt = tcbinfo.ipi_gencnt;
	1754	n = tcbinfo.ipi_count;
	1755
	1756	bzero(&xig, sizeof(xig));
	1757	xig.xig_len = sizeof xig;
	1758	xig.xig_count = n;
	1759	xig.xig_gen = gencnt;
	1760	xig.xig_sogen = so_gencnt;
	1761	error = SYSCTL_OUT(req, &xig, sizeof xig);
	1762	if (error) {
	1763	lck_rw_done(tcbinfo.ipi_lock);
	1764	return error;
	1765	}
	1766	/*
	1767	* We are done if there is no pcb
	1768	*/
	1769	if (n == 0) {
	1770	lck_rw_done(tcbinfo.ipi_lock);
	1771	return 0;
	1772	}
	1773
	1774	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	1775	if (inp_list == 0) {
	1776	lck_rw_done(tcbinfo.ipi_lock);
	1777	return ENOMEM;
	1778	}
	1779
	1780	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
	1781	if (inp->inp_gencnt <= gencnt &&
	1782	inp->inp_state != INPCB_STATE_DEAD)
	1783	inp_list[i++] = inp;
	1784	if (i >= n) break;
	1785	}
	1786
	1787	TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
	1788	inp = tp->t_inpcb;
	1789	if (inp->inp_gencnt <= gencnt &&
	1790	inp->inp_state != INPCB_STATE_DEAD)
	1791	inp_list[i++] = inp;
	1792	if (i >= n) break;
	1793	}
	1794
	1795	n = i;
	1796
	1797	error = 0;
	1798	for (i = 0; i < n; i++) {
	1799	inp = inp_list[i];
	1800	if (inp->inp_gencnt <= gencnt &&
	1801	inp->inp_state != INPCB_STATE_DEAD) {
	1802	struct xtcpcb xt;
	1803	caddr_t inp_ppcb;
	1804
	1805	bzero(&xt, sizeof(xt));
	1806	xt.xt_len = sizeof xt;
	1807	/* XXX should avoid extra copy */
	1808	inpcb_to_compat(inp, &xt.xt_inp);
	1809	inp_ppcb = inp->inp_ppcb;
	1810	if (inp_ppcb != NULL) {
	1811	tcpcb_to_otcpcb(
	1812	(struct tcpcb )(void )inp_ppcb,
	1813	&xt.xt_tp);
	1814	} else {
	1815	bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
	1816	}
	1817	if (inp->inp_socket)
	1818	sotoxsocket(inp->inp_socket, &xt.xt_socket);
	1819	error = SYSCTL_OUT(req, &xt, sizeof xt);
	1820	}
	1821	}
	1822	if (!error) {
	1823	/*
	1824	* Give the user an updated idea of our state.
	1825	* If the generation differs from what we told
	1826	* her before, she knows that something happened
	1827	* while we were processing this request, and it
	1828	* might be necessary to retry.
	1829	*/
	1830	bzero(&xig, sizeof(xig));
	1831	xig.xig_len = sizeof xig;
	1832	xig.xig_gen = tcbinfo.ipi_gencnt;
	1833	xig.xig_sogen = so_gencnt;
	1834	xig.xig_count = tcbinfo.ipi_count;
	1835	error = SYSCTL_OUT(req, &xig, sizeof xig);
	1836	}
	1837	FREE(inp_list, M_TEMP);
	1838	lck_rw_done(tcbinfo.ipi_lock);
	1839	return error;
	1840	}
	1841
	1842	SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
	1843	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, 0, 0,
	1844	tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
	1845
	1846
	1847	static void
	1848	tcpcb_to_xtcpcb64(struct tcpcb tp, struct xtcpcb64 otp)
	1849	{
	1850	otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
	1851	otp->t_dupacks = tp->t_dupacks;
	1852	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
	1853	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
	1854	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
	1855	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
	1856	otp->t_state = tp->t_state;
	1857	otp->t_flags = tp->t_flags;
	1858	otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
	1859	otp->snd_una = tp->snd_una;
	1860	otp->snd_max = tp->snd_max;
	1861	otp->snd_nxt = tp->snd_nxt;
	1862	otp->snd_up = tp->snd_up;
	1863	otp->snd_wl1 = tp->snd_wl1;
	1864	otp->snd_wl2 = tp->snd_wl2;
	1865	otp->iss = tp->iss;
	1866	otp->irs = tp->irs;
	1867	otp->rcv_nxt = tp->rcv_nxt;
	1868	otp->rcv_adv = tp->rcv_adv;
	1869	otp->rcv_wnd = tp->rcv_wnd;
	1870	otp->rcv_up = tp->rcv_up;
	1871	otp->snd_wnd = tp->snd_wnd;
	1872	otp->snd_cwnd = tp->snd_cwnd;
	1873	otp->snd_ssthresh = tp->snd_ssthresh;
	1874	otp->t_maxopd = tp->t_maxopd;
	1875	otp->t_rcvtime = tp->t_rcvtime;
	1876	otp->t_starttime = tp->t_starttime;
	1877	otp->t_rtttime = tp->t_rtttime;
	1878	otp->t_rtseq = tp->t_rtseq;
	1879	otp->t_rxtcur = tp->t_rxtcur;
	1880	otp->t_maxseg = tp->t_maxseg;
	1881	otp->t_srtt = tp->t_srtt;
	1882	otp->t_rttvar = tp->t_rttvar;
	1883	otp->t_rxtshift = tp->t_rxtshift;
	1884	otp->t_rttmin = tp->t_rttmin;
	1885	otp->t_rttupdated = tp->t_rttupdated;
	1886	otp->max_sndwnd = tp->max_sndwnd;
	1887	otp->t_softerror = tp->t_softerror;
	1888	otp->t_oobflags = tp->t_oobflags;
	1889	otp->t_iobc = tp->t_iobc;
	1890	otp->snd_scale = tp->snd_scale;
	1891	otp->rcv_scale = tp->rcv_scale;
	1892	otp->request_r_scale = tp->request_r_scale;
	1893	otp->requested_s_scale = tp->requested_s_scale;
	1894	otp->ts_recent = tp->ts_recent;
	1895	otp->ts_recent_age = tp->ts_recent_age;
	1896	otp->last_ack_sent = tp->last_ack_sent;
	1897	otp->cc_send = 0;
	1898	otp->cc_recv = 0;
	1899	otp->snd_recover = tp->snd_recover;
	1900	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
	1901	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
	1902	otp->t_badrxtwin = 0;
	1903	}
	1904
	1905
	1906	static int
	1907	tcp_pcblist64 SYSCTL_HANDLER_ARGS
	1908	{
	1909	#pragma unused(oidp, arg1, arg2)
	1910	int error, i = 0, n;
	1911	struct inpcb inp, *inp_list;
	1912	struct tcpcb *tp;
	1913	inp_gen_t gencnt;
	1914	struct xinpgen xig;
	1915
	1916	/*
	1917	* The process of preparing the TCB list is too time-consuming and
	1918	* resource-intensive to repeat twice on every request.
	1919	*/
	1920	lck_rw_lock_shared(tcbinfo.ipi_lock);
	1921	if (req->oldptr == USER_ADDR_NULL) {
	1922	n = tcbinfo.ipi_count;
	1923	req->oldidx = 2 * (sizeof xig)
	1924	+ (n + n/8) * sizeof(struct xtcpcb64);
	1925	lck_rw_done(tcbinfo.ipi_lock);
	1926	return 0;
	1927	}
	1928
	1929	if (req->newptr != USER_ADDR_NULL) {
	1930	lck_rw_done(tcbinfo.ipi_lock);
	1931	return EPERM;
	1932	}
	1933
	1934	/*
	1935	* OK, now we're committed to doing something.
	1936	*/
	1937	gencnt = tcbinfo.ipi_gencnt;
	1938	n = tcbinfo.ipi_count;
	1939
	1940	bzero(&xig, sizeof(xig));
	1941	xig.xig_len = sizeof xig;
	1942	xig.xig_count = n;
	1943	xig.xig_gen = gencnt;
	1944	xig.xig_sogen = so_gencnt;
	1945	error = SYSCTL_OUT(req, &xig, sizeof xig);
	1946	if (error) {
	1947	lck_rw_done(tcbinfo.ipi_lock);
	1948	return error;
	1949	}
	1950	/*
	1951	* We are done if there is no pcb
	1952	*/
	1953	if (n == 0) {
	1954	lck_rw_done(tcbinfo.ipi_lock);
	1955	return 0;
	1956	}
	1957
	1958	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	1959	if (inp_list == 0) {
	1960	lck_rw_done(tcbinfo.ipi_lock);
	1961	return ENOMEM;
	1962	}
	1963
	1964	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
	1965	if (inp->inp_gencnt <= gencnt &&
	1966	inp->inp_state != INPCB_STATE_DEAD)
	1967	inp_list[i++] = inp;
	1968	if (i >= n) break;
	1969	}
	1970
	1971	TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
	1972	inp = tp->t_inpcb;
	1973	if (inp->inp_gencnt <= gencnt &&
	1974	inp->inp_state != INPCB_STATE_DEAD)
	1975	inp_list[i++] = inp;
	1976	if (i >= n) break;
	1977	}
	1978
	1979	n = i;
	1980
	1981	error = 0;
	1982	for (i = 0; i < n; i++) {
	1983	inp = inp_list[i];
	1984	if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
	1985	struct xtcpcb64 xt;
	1986
	1987	bzero(&xt, sizeof(xt));
	1988	xt.xt_len = sizeof xt;
	1989	inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
	1990	xt.xt_inpcb.inp_ppcb = (uint64_t)VM_KERNEL_ADDRPERM(inp->inp_ppcb);
	1991	if (inp->inp_ppcb != NULL)
	1992	tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt);
	1993	if (inp->inp_socket)
	1994	sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket);
	1995	error = SYSCTL_OUT(req, &xt, sizeof xt);
	1996	}
	1997	}
	1998	if (!error) {
	1999	/*
	2000	* Give the user an updated idea of our state.
	2001	* If the generation differs from what we told
	2002	* her before, she knows that something happened
	2003	* while we were processing this request, and it
	2004	* might be necessary to retry.
	2005	*/
	2006	bzero(&xig, sizeof(xig));
	2007	xig.xig_len = sizeof xig;
	2008	xig.xig_gen = tcbinfo.ipi_gencnt;
	2009	xig.xig_sogen = so_gencnt;
	2010	xig.xig_count = tcbinfo.ipi_count;
	2011	error = SYSCTL_OUT(req, &xig, sizeof xig);
	2012	}
	2013	FREE(inp_list, M_TEMP);
	2014	lck_rw_done(tcbinfo.ipi_lock);
	2015	return error;
	2016	}
	2017
	2018	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
	2019	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, 0, 0,
	2020	tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
	2021
	2022
	2023	static int
	2024	tcp_pcblist_n SYSCTL_HANDLER_ARGS
	2025	{
	2026	#pragma unused(oidp, arg1, arg2)
	2027	int error = 0;
	2028
	2029	error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
	2030
	2031	return error;
	2032	}
	2033
	2034
	2035	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
	2036	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, 0, 0,
	2037	tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
	2038
	2039
	2040	__private_extern__ void
	2041	tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags,
	2042	bitstr_t *bitfield)
	2043	{
	2044	inpcb_get_ports_used(ifindex, protocol, flags,
	2045	bitfield, &tcbinfo);
	2046	}
	2047
	2048	__private_extern__ uint32_t
	2049	tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
	2050	{
	2051	return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
	2052	}
	2053
	2054	__private_extern__ uint32_t
	2055	tcp_find_anypcb_byaddr(struct ifaddr *ifa)
	2056	{
	2057	return inpcb_find_anypcb_byaddr(ifa, &tcbinfo);
	2058	}
	2059
	2060	void
	2061	tcp_ctlinput(cmd, sa, vip)
	2062	int cmd;
	2063	struct sockaddr *sa;
	2064	void *vip;
	2065	{
	2066	tcp_seq icmp_tcp_seq;
	2067	struct ip *ip = vip;
	2068	struct in_addr faddr;
	2069	struct inpcb *inp;
	2070	struct tcpcb *tp;
	2071
	2072	void (notify)(struct inpcb , int) = tcp_notify;
	2073
	2074	faddr = ((struct sockaddr_in )(void )sa)->sin_addr;
	2075	if (sa->sa_family != AF_INET \|\| faddr.s_addr == INADDR_ANY)
	2076	return;
	2077
	2078	if ((unsigned)cmd >= PRC_NCMDS)
	2079	return;
	2080
	2081	if (cmd == PRC_MSGSIZE)
	2082	notify = tcp_mtudisc;
	2083	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB \|\|
	2084	cmd == PRC_UNREACH_PORT) && ip)
	2085	notify = tcp_drop_syn_sent;
	2086	else if (PRC_IS_REDIRECT(cmd)) {
	2087	ip = 0;
	2088	notify = in_rtchange;
	2089	} else if (cmd == PRC_HOSTDEAD)
	2090	ip = 0;
	2091	/* Source quench is deprecated */
	2092	else if (cmd == PRC_QUENCH)
	2093	return;
	2094	else if (inetctlerrmap[cmd] == 0)
	2095	return;
	2096	if (ip) {
	2097	struct tcphdr th;
	2098	struct icmp *icp;
	2099
	2100	icp = (struct icmp )(void )
	2101	((caddr_t)ip - offsetof(struct icmp, icmp_ip));
	2102	bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)),
	2103	&th, sizeof (th));
	2104	inp = in_pcblookup_hash(&tcbinfo, faddr, th.th_dport,
	2105	ip->ip_src, th.th_sport, 0, NULL);
	2106	if (inp != NULL && inp->inp_socket != NULL) {
	2107	tcp_lock(inp->inp_socket, 1, 0);
	2108	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
	2109	tcp_unlock(inp->inp_socket, 1, 0);
	2110	return;
	2111	}
	2112	icmp_tcp_seq = htonl(th.th_seq);
	2113	tp = intotcpcb(inp);
	2114	if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
	2115	SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
	2116	if (cmd == PRC_MSGSIZE) {
	2117
	2118	/*
	2119	* MTU discovery:
	2120	* If we got a needfrag and there is a host route to the
	2121	* original destination, and the MTU is not locked, then
	2122	* set the MTU in the route to the suggested new value
	2123	* (if given) and then notify as usual. The ULPs will
	2124	* notice that the MTU has changed and adapt accordingly.
	2125	* If no new MTU was suggested, then we guess a new one
	2126	* less than the current value. If the new MTU is
	2127	* unreasonably small (defined by sysctl tcp_minmss), then
	2128	* we reset the MTU to the interface value and enable the
	2129	* lock bit, indicating that we are no longer doing MTU
	2130	* discovery.
	2131	*/
	2132	struct rtentry *rt;
	2133	int mtu;
	2134	struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET,
	2135	0 , { 0 }, { 0,0,0,0,0,0,0,0 } };
	2136	icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
	2137
	2138	rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
	2139	RTF_CLONING \| RTF_PRCLONING);
	2140	if (rt != NULL) {
	2141	RT_LOCK(rt);
	2142	if ((rt->rt_flags & RTF_HOST) &&
	2143	!(rt->rt_rmx.rmx_locks & RTV_MTU)) {
	2144	mtu = ntohs(icp->icmp_nextmtu);
	2145	if (!mtu)
	2146	mtu = ip_next_mtu(rt->rt_rmx.
	2147	rmx_mtu, 1);
	2148	#if DEBUG_MTUDISC
	2149	printf("MTU for %s reduced to %d\n",
	2150	inet_ntop(AF_INET,
	2151	&icmpsrc.sin_addr, ipv4str,
	2152	sizeof (ipv4str)), mtu);
	2153	#endif
	2154	if (mtu < max(296, (tcp_minmss +
	2155	sizeof (struct tcpiphdr)))) {
	2156	/* rt->rt_rmx.rmx_mtu =
	2157	rt->rt_ifp->if_mtu; */
	2158	rt->rt_rmx.rmx_locks \|= RTV_MTU;
	2159	} else if (rt->rt_rmx.rmx_mtu > mtu) {
	2160	rt->rt_rmx.rmx_mtu = mtu;
	2161	}
	2162	}
	2163	RT_UNLOCK(rt);
	2164	rtfree(rt);
	2165	}
	2166	}
	2167
	2168	(*notify)(inp, inetctlerrmap[cmd]);
	2169	}
	2170	tcp_unlock(inp->inp_socket, 1, 0);
	2171	}
	2172	} else
	2173	in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
	2174	}
	2175
	#if INET6
	/*
	 * IPv6 control-input entry point for TCP.
	 *
	 *   cmd  PRC_* command; values >= PRC_NCMDS are ignored.
	 *   sa   destination address; must be an AF_INET6 sockaddr_in6.
	 *   d    optional struct ip6ctlparam supplied by icmp6, or NULL.
	 *
	 * When the parameter block carries the offending IPv6 header, the TCP
	 * ports are read from the quoted packet and the notification is
	 * targeted at that 4-tuple; otherwise it is sent with wildcard ports.
	 */
	void
	tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
	{
		struct tcphdr th;
		void (*notify)(struct inpcb *, int) = tcp_notify;
		struct ip6_hdr *ip6;
		struct mbuf *m;
		struct ip6ctlparam *ip6cp = NULL;
		const struct sockaddr_in6 *sa6_src = NULL;
		int off;
		/* Only the two port fields are needed from the quoted header. */
		struct tcp_portonly {
			u_int16_t th_sport;
			u_int16_t th_dport;
		} *thp;

		if (sa->sa_family != AF_INET6 ||
		    sa->sa_len != sizeof(struct sockaddr_in6))
			return;

		if ((unsigned)cmd >= PRC_NCMDS)
			return;

		if (cmd == PRC_MSGSIZE)
			notify = tcp_mtudisc;
		else if (!PRC_IS_REDIRECT(cmd) && (inet6ctlerrmap[cmd] == 0))
			return;
		/* Source quench is deprecated */
		else if (cmd == PRC_QUENCH)
			return;

		/* if the parameter is from icmp6, decode it. */
		if (d != NULL) {
			ip6cp = (struct ip6ctlparam *)d;
			m = ip6cp->ip6c_m;
			ip6 = ip6cp->ip6c_ip6;
			off = ip6cp->ip6c_off;
			sa6_src = ip6cp->ip6c_src;
		} else {
			m = NULL;
			ip6 = NULL;
			off = 0;	/* fool gcc */
			sa6_src = &sa6_any;
		}

		if (ip6) {
			/*
			 * XXX: We assume that when IPV6 is non NULL,
			 * M and OFF are valid.
			 */

			/* check if we can safely examine src and dst ports */
			if (m->m_pkthdr.len < off + sizeof(*thp))
				return;

			/* Zero first: only the leading port fields get copied in. */
			bzero(&th, sizeof(th));
			m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

			in6_pcbnotify(&tcbinfo, sa, th.th_dport,
			    (struct sockaddr *)ip6cp->ip6c_src,
			    th.th_sport, cmd, NULL, notify);
		} else {
			in6_pcbnotify(&tcbinfo, sa, 0,
			    (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify);
		}
	}
	#endif /* INET6 */
	2246
	2247
	2248	/*
	2249	* Following is where TCP initial sequence number generation occurs.
	2250	*
	2251	* There are two places where we must use initial sequence numbers:
	2252	* 1. In SYN-ACK packets.
	2253	* 2. In SYN packets.
	2254	*
	2255	* The ISNs in SYN-ACK packets have no monotonicity requirement,
	2256	* and should be as unpredictable as possible to avoid the possibility
	2257	* of spoofing and/or connection hijacking. To satisfy this
	2258	* requirement, SYN-ACK ISNs are generated via the arc4random()
	2259	* function. If exact RFC 1948 compliance is requested via sysctl,
	2260	* these ISNs will be generated just like those in SYN packets.
	2261	*
	2262	* The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
	2263	* depends on this property. In addition, these ISNs should be
	2264	* unguessable so as to prevent connection hijacking. To satisfy
	2265	* the requirements of this situation, the algorithm outlined in
	2266	* RFC 1948 is used to generate sequence numbers.
	2267	*
	2268	* For more information on the theory of operation, please see
	2269	* RFC 1948.
	2270	*
	2271	* Implementation details:
	2272	*
	2273	* Time is based off the system timer, and is corrected so that it
	2274	* increases by one megabyte per second. This allows for proper
	2275	* recycling on high speed LANs while still leaving over an hour
	2276	* before rollover.
	2277	*
	2278	* Two sysctls control the generation of ISNs:
	2279	*
	2280	* net.inet.tcp.isn_reseed_interval controls the number of seconds
	2281	* between seeding of isn_secret. This is normally set to zero,
	2282	* as reseeding should not be necessary.
	2283	*
	2284	* net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
	2285	* strictly. When strict compliance is requested, reseeding is
	2286	* disabled and SYN-ACKs will be generated in the same manner as
	2287	* SYNs. Strict mode is disabled by default.
	2288	*
	2289	*/
	2290
	2291	#define ISN_BYTES_PER_SECOND 1048576
	2292
	2293	tcp_seq
	2294	tcp_new_isn(tp)
	2295	struct tcpcb *tp;
	2296	{
	2297	u_int32_t md5_buffer[4];
	2298	tcp_seq new_isn;
	2299	struct timeval timenow;
	2300	u_char isn_secret[32];
	2301	int isn_last_reseed = 0;
	2302	MD5_CTX isn_ctx;
	2303
	2304	/* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
	2305	if (((tp->t_state == TCPS_LISTEN) \|\| (tp->t_state == TCPS_TIME_WAIT))
	2306	&& tcp_strict_rfc1948 == 0)
	2307	#ifdef __APPLE__
	2308	return RandomULong();
	2309	#else
	2310	return arc4random();
	2311	#endif
	2312	getmicrotime(&timenow);
	2313
	2314	/* Seed if this is the first use, reseed if requested. */
	2315	if ((isn_last_reseed == 0) \|\|
	2316	((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
	2317	(((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
	2318	< (u_int)timenow.tv_sec))) {
	2319	#ifdef __APPLE__
	2320	read_random(&isn_secret, sizeof(isn_secret));
	2321	#else
	2322	read_random_unlimited(&isn_secret, sizeof(isn_secret));
	2323	#endif
	2324	isn_last_reseed = timenow.tv_sec;
	2325	}
	2326
	2327	/* Compute the md5 hash and return the ISN. */
	2328	MD5Init(&isn_ctx);
	2329	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	2330	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
	2331	#if INET6
	2332	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
	2333	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
	2334	sizeof(struct in6_addr));
	2335	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
	2336	sizeof(struct in6_addr));
	2337	} else
	2338	#endif
	2339	{
	2340	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
	2341	sizeof(struct in_addr));
	2342	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
	2343	sizeof(struct in_addr));
	2344	}
	2345	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	2346	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	2347	new_isn = (tcp_seq) md5_buffer[0];
	2348	new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
	2349	return new_isn;
	2350	}
	2351
	2352
	2353	/*
	2354	* When a specific ICMP unreachable message is received and the
	2355	* connection state is SYN-SENT, drop the connection. This behavior
	2356	* is controlled by the icmp_may_rst sysctl.
	2357	*/
	2358	void
	2359	tcp_drop_syn_sent(inp, errno)
	2360	struct inpcb *inp;
	2361	int errno;
	2362	{
	2363	struct tcpcb *tp = intotcpcb(inp);
	2364
	2365	if (tp && tp->t_state == TCPS_SYN_SENT)
	2366	tcp_drop(tp, errno);
	2367	}
	2368
	2369	/*
	2370	* When `need fragmentation' ICMP is received, update our idea of the MSS
	2371	* based on the new value in the route. Also nudge TCP to send something,
	2372	* since we know the packet we just sent was dropped.
	2373	* This duplicates some code in the tcp_mss() function in tcp_input.c.
	2374	*/
	2375	void
	2376	tcp_mtudisc(
	2377	struct inpcb *inp,
	2378	__unused int errno
	2379	)
	2380	{
	2381	struct tcpcb *tp = intotcpcb(inp);
	2382	struct rtentry *rt;
	2383	struct rmxp_tao *taop;
	2384	struct socket *so = inp->inp_socket;
	2385	int offered;
	2386	int mss;
	2387	u_int32_t mtu;
	2388	#if INET6
	2389	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
	2390	#endif /* INET6 */
	2391
	2392	if (tp) {
	2393	#if INET6
	2394	if (isipv6)
	2395	rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
	2396	else
	2397	#endif /* INET6 */
	2398	rt = tcp_rtlookup(inp, IFSCOPE_NONE);
	2399	if (!rt \|\| !rt->rt_rmx.rmx_mtu) {
	2400	tp->t_maxopd = tp->t_maxseg =
	2401	#if INET6
	2402	isipv6 ? tcp_v6mssdflt :
	2403	#endif /* INET6 */
	2404	tcp_mssdflt;
	2405
	2406	/* Route locked during lookup above */
	2407	if (rt != NULL)
	2408	RT_UNLOCK(rt);
	2409	return;
	2410	}
	2411	taop = rmx_taop(rt->rt_rmx);
	2412	offered = taop->tao_mssopt;
	2413	mtu = rt->rt_rmx.rmx_mtu;
	2414
	2415	/* Route locked during lookup above */
	2416	RT_UNLOCK(rt);
	2417
	2418	#if NECP
	2419	// Adjust MTU if necessary.
	2420	mtu = necp_socket_get_effective_mtu(inp, mtu);
	2421	#endif /* NECP */
	2422
	2423	mss = mtu -
	2424	#if INET6
	2425	(isipv6 ?
	2426	sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
	2427	#endif /* INET6 */
	2428	sizeof(struct tcpiphdr)
	2429	#if INET6
	2430	)
	2431	#endif /* INET6 */
	2432	;
	2433
	2434	if (offered)
	2435	mss = min(mss, offered);
	2436	/*
	2437	* XXX - The above conditional probably violates the TCP
	2438	* spec. The problem is that, since we don't know the
	2439	* other end's MSS, we are supposed to use a conservative
	2440	* default. But, if we do that, then MTU discovery will
	2441	* never actually take place, because the conservative
	2442	* default is much less than the MTUs typically seen
	2443	* on the Internet today. For the moment, we'll sweep
	2444	* this under the carpet.
	2445	*
	2446	* The conservative default might not actually be a problem
	2447	* if the only case this occurs is when sending an initial
	2448	* SYN with options and data to a host we've never talked
	2449	* to before. Then, they will reply with an MSS value which
	2450	* will get recorded and the new parameters should get
	2451	* recomputed. For Further Study.
	2452	*/
	2453	if (tp->t_maxopd <= mss)
	2454	return;
	2455	tp->t_maxopd = mss;
	2456
	2457	if ((tp->t_flags & (TF_REQ_TSTMP\|TF_NOOPT)) == TF_REQ_TSTMP &&
	2458	(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
	2459	mss -= TCPOLEN_TSTAMP_APPA;
	2460
	2461	#if MPTCP
	2462	mss -= mptcp_adj_mss(tp, TRUE);
	2463	#endif
	2464	if (so->so_snd.sb_hiwat < mss)
	2465	mss = so->so_snd.sb_hiwat;
	2466
	2467	tp->t_maxseg = mss;
	2468
	2469	/*
	2470	* Reset the slow-start flight size as it may depends on the new MSS
	2471	*/
	2472	if (CC_ALGO(tp)->cwnd_init != NULL)
	2473	CC_ALGO(tp)->cwnd_init(tp);
	2474	tcpstat.tcps_mturesent++;
	2475	tp->t_rtttime = 0;
	2476	tp->snd_nxt = tp->snd_una;
	2477	tcp_output(tp);
	2478	}
	2479	}
	2480
	2481	/*
	2482	* Look-up the routing entry to the peer of this inpcb. If no route
	2483	* is found and it cannot be allocated the return NULL. This routine
	2484	* is called by TCP routines that access the rmx structure and by tcp_mss
	2485	* to get the interface MTU. If a route is found, this routine will
	2486	* hold the rtentry lock; the caller is responsible for unlocking.
	2487	*/
	2488	struct rtentry *
	2489	tcp_rtlookup(inp, input_ifscope)
	2490	struct inpcb *inp;
	2491	unsigned int input_ifscope;
	2492	{
	2493	struct route *ro;
	2494	struct rtentry *rt;
	2495	struct tcpcb *tp;
	2496
	2497	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
	2498
	2499	ro = &inp->inp_route;
	2500	if ((rt = ro->ro_rt) != NULL)
	2501	RT_LOCK(rt);
	2502
	2503	if (ROUTE_UNUSABLE(ro)) {
	2504	if (rt != NULL) {
	2505	RT_UNLOCK(rt);
	2506	rt = NULL;
	2507	}
	2508	ROUTE_RELEASE(ro);
	2509	/* No route yet, so try to acquire one */
	2510	if (inp->inp_faddr.s_addr != INADDR_ANY) {
	2511	unsigned int ifscope;
	2512
	2513	ro->ro_dst.sa_family = AF_INET;
	2514	ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
	2515	((struct sockaddr_in )(void )&ro->ro_dst)->sin_addr =
	2516	inp->inp_faddr;
	2517
	2518	/*
	2519	* If the socket was bound to an interface, then
	2520	* the bound-to-interface takes precedence over
	2521	* the inbound interface passed in by the caller
	2522	* (if we get here as part of the output path then
	2523	* input_ifscope is IFSCOPE_NONE).
	2524	*/
	2525	ifscope = (inp->inp_flags & INP_BOUND_IF) ?
	2526	inp->inp_boundifp->if_index : input_ifscope;
	2527
	2528	rtalloc_scoped(ro, ifscope);
	2529	if ((rt = ro->ro_rt) != NULL)
	2530	RT_LOCK(rt);
	2531	}
	2532	}
	2533	if (rt != NULL)
	2534	RT_LOCK_ASSERT_HELD(rt);
	2535
	2536	/*
	2537	* Update MTU discovery determination. Don't do it if:
	2538	* 1) it is disabled via the sysctl
	2539	* 2) the route isn't up
	2540	* 3) the MTU is locked (if it is, then discovery has been
	2541	* disabled)
	2542	*/
	2543
	2544	tp = intotcpcb(inp);
	2545
	2546	if (!path_mtu_discovery \|\| ((rt != NULL) &&
	2547	(!(rt->rt_flags & RTF_UP) \|\| (rt->rt_rmx.rmx_locks & RTV_MTU))))
	2548	tp->t_flags &= ~TF_PMTUD;
	2549	else
	2550	tp->t_flags \|= TF_PMTUD;
	2551
	2552	#if CONFIG_IFEF_NOWINDOWSCALE
	2553	if (tcp_obey_ifef_nowindowscale &&
	2554	tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL &&
	2555	(rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) {
	2556	/* Window scaling is enabled on this interface */
	2557	tp->t_flags &= ~TF_REQ_SCALE;
	2558	}
	2559	#endif
	2560
	2561	if (rt != NULL && rt->rt_ifp != NULL) {
	2562	somultipages(inp->inp_socket,
	2563	(rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
	2564	tcp_set_tso(tp, rt->rt_ifp);
	2565	soif2kcl(inp->inp_socket,
	2566	(rt->rt_ifp->if_eflags & IFEF_2KCL));
	2567	tcp_set_ecn(tp, rt->rt_ifp);
	2568	}
	2569
	2570	/* Note if the peer is local */
	2571	if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
	2572	(rt->rt_gateway->sa_family == AF_LINK \|\|
	2573	rt->rt_ifp->if_flags & IFF_LOOPBACK \|\|
	2574	in_localaddr(inp->inp_faddr))) {
	2575	tp->t_flags \|= TF_LOCAL;
	2576	}
	2577
	2578	/*
	2579	* Caller needs to call RT_UNLOCK(rt).
	2580	*/
	2581	return rt;
	2582	}
	2583
	#if INET6
	/*
	 * IPv6 counterpart of tcp_rtlookup(): return the route to the peer of
	 * this inpcb (from in6p_route, allocating one if the cached route is
	 * unusable), or NULL.  A non-NULL route is returned with its rtentry
	 * lock held; the caller must RT_UNLOCK() it.
	 *
	 *   inp            pcb whose cached IPv6 route is consulted.
	 *   input_ifscope  interface scope used for a fresh allocation when the
	 *                  socket is not bound to an interface.
	 *
	 * Side effects on the tcpcb: TF_PMTUD is set or cleared, TF_REQ_SCALE
	 * may be cleared, TF_LOCAL may be set, and TSO/ECN/socket-buffer hints
	 * are refreshed from the route's interface.
	 */
	struct rtentry *
	tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
	{
		struct route_in6 *ro6;
		struct rtentry *rt;
		struct tcpcb *tp;

		lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);

		ro6 = &inp->in6p_route;
		if ((rt = ro6->ro_rt) != NULL)
			RT_LOCK(rt);

		if (ROUTE_UNUSABLE(ro6)) {
			if (rt != NULL) {
				RT_UNLOCK(rt);
				rt = NULL;
			}
			ROUTE_RELEASE(ro6);
			/* No route yet, so try to acquire one */
			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
				struct sockaddr_in6 *dst6;
				unsigned int ifscope;

				dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
				dst6->sin6_family = AF_INET6;
				dst6->sin6_len = sizeof(*dst6);
				dst6->sin6_addr = inp->in6p_faddr;

				/*
				 * If the socket was bound to an interface, then
				 * the bound-to-interface takes precedence over
				 * the inbound interface passed in by the caller
				 * (if we get here as part of the output path then
				 * input_ifscope is IFSCOPE_NONE).
				 */
				ifscope = (inp->inp_flags & INP_BOUND_IF) ?
				    inp->inp_boundifp->if_index : input_ifscope;

				rtalloc_scoped((struct route *)ro6, ifscope);
				if ((rt = ro6->ro_rt) != NULL)
					RT_LOCK(rt);
			}
		}
		if (rt != NULL)
			RT_LOCK_ASSERT_HELD(rt);

		tp = intotcpcb(inp);

		/*
		 * Update MTU discovery determination. Don't do it if:
		 *	1) it is disabled via the sysctl
		 *	2) the route isn't up
		 *	3) the MTU is locked (if it is, then discovery has been
		 *	   disabled)
		 */

		if (!path_mtu_discovery || ((rt != NULL) &&
		    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
			tp->t_flags &= ~TF_PMTUD;
		else
			tp->t_flags |= TF_PMTUD;

	#if CONFIG_IFEF_NOWINDOWSCALE
		if (tcp_obey_ifef_nowindowscale &&
		    tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL &&
		    (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) {
			/* Window scaling is not enabled on this interface */
			tp->t_flags &= ~TF_REQ_SCALE;
		}
	#endif

		if (rt != NULL && rt->rt_ifp != NULL) {
			somultipages(inp->inp_socket,
			    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
			tcp_set_tso(tp, rt->rt_ifp);
			soif2kcl(inp->inp_socket,
			    (rt->rt_ifp->if_eflags & IFEF_2KCL));
			tcp_set_ecn(tp, rt->rt_ifp);
		}

		/*
		 * Note if the peer is local.  rt_ifp is checked for NULL here as
		 * it is everywhere else in this function before being used.
		 */
		if (rt != NULL && rt->rt_ifp != NULL &&
		    !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
		    (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
		    IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
		    rt->rt_gateway->sa_family == AF_LINK ||
		    in6_localaddr(&inp->in6p_faddr))) {
			tp->t_flags |= TF_LOCAL;
		}

		/*
		 * Caller needs to call RT_UNLOCK(rt).
		 */
		return rt;
	}
	#endif /* INET6 */
	2692
	#if IPSEC
	/*
	 * Compute ESP/AH header size for TCP, including outer IP header.
	 *
	 * A scratch mbuf is filled with template IP(v6) + TCP headers for the
	 * connection and handed to ipsec4_hdrsiz()/ipsec6_hdrsiz() for the
	 * outbound direction.  Returns 0 when tp or its inpcb is NULL, or when
	 * no mbuf can be obtained without waiting.
	 */
	size_t
	ipsec_hdrsiz_tcp(struct tcpcb *tp)
	{
		struct inpcb *inp;
		struct mbuf *m;
		size_t hdrsiz;
		struct ip *ip;
	#if INET6
		struct ip6_hdr *ip6 = NULL;
	#endif /* INET6 */
		struct tcphdr *th;

		if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
			return 0;
		MGETHDR(m, M_DONTWAIT, MT_DATA);	/* MAC-OK */
		if (!m)
			return 0;

	#if INET6
		if ((inp->inp_vflag & INP_IPV6) != 0) {
			ip6 = mtod(m, struct ip6_hdr *);
			/* TCP header sits immediately after the IPv6 header. */
			th = (struct tcphdr *)(void *)(ip6 + 1);
			m->m_pkthdr.len = m->m_len =
			    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
			tcp_fillheaders(tp, ip6, th);
			hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
		} else
	#endif /* INET6 */
		{
			ip = mtod(m, struct ip *);
			th = (struct tcphdr *)(ip + 1);
			m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
			tcp_fillheaders(tp, ip, th);
			hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
		}
		m_free(m);
		return hdrsiz;
	}
	#endif /* IPSEC */
	2735
	2736	/*
	2737	* Return a pointer to the cached information about the remote host.
	2738	* The cached information is stored in the protocol specific part of
	2739	* the route metrics.
	2740	*/
	2741	struct rmxp_tao *
	2742	tcp_gettaocache(inp)
	2743	struct inpcb *inp;
	2744	{
	2745	struct rtentry *rt;
	2746	struct rmxp_tao *taop;
	2747
	2748	#if INET6
	2749	if ((inp->inp_vflag & INP_IPV6) != 0)
	2750	rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
	2751	else
	2752	#endif /* INET6 */
	2753	rt = tcp_rtlookup(inp, IFSCOPE_NONE);
	2754
	2755	/* Make sure this is a host route and is up. */
	2756	if (rt == NULL \|\|
	2757	(rt->rt_flags & (RTF_UP\|RTF_HOST)) != (RTF_UP\|RTF_HOST)) {
	2758	/* Route locked during lookup above */
	2759	if (rt != NULL)
	2760	RT_UNLOCK(rt);
	2761	return NULL;
	2762	}
	2763
	2764	taop = rmx_taop(rt->rt_rmx);
	2765	/* Route locked during lookup above */
	2766	RT_UNLOCK(rt);
	2767	return (taop);
	2768	}
	2769
	/*
	 * Clear all the TAO cache entries, called from tcp_init.
	 *
	 * XXX
	 * Intentionally a no-op: the routing tables are assumed to be
	 * initialized at the same time as TCP, so there is nothing left over
	 * in the cache to clear.
	 */
	static void
	tcp_cleartaocache()
	{
		return;
	}
	2782
	2783	int
	2784	tcp_lock(struct socket so, int refcount, void lr)
	2785	{
	2786	void *lr_saved;
	2787
	2788	if (lr == NULL)
	2789	lr_saved = __builtin_return_address(0);
	2790	else
	2791	lr_saved = lr;
	2792
	2793	if (so->so_pcb != NULL) {
	2794	lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
	2795	} else {
	2796	panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n",
	2797	so, lr_saved, solockhistory_nr(so));
	2798	/* NOTREACHED */
	2799	}
	2800
	2801	if (so->so_usecount < 0) {
	2802	panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n",
	2803	so, so->so_pcb, lr_saved, so->so_usecount, solockhistory_nr(so));
	2804	/* NOTREACHED */
	2805	}
	2806	if (refcount)
	2807	so->so_usecount++;
	2808	so->lock_lr[so->next_lock_lr] = lr_saved;
	2809	so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
	2810	return (0);
	2811	}
	2812
	2813	int
	2814	tcp_unlock(struct socket so, int refcount, void lr)
	2815	{
	2816	void *lr_saved;
	2817
	2818	if (lr == NULL)
	2819	lr_saved = __builtin_return_address(0);
	2820	else
	2821	lr_saved = lr;
	2822
	2823	#ifdef MORE_TCPLOCK_DEBUG
	2824	printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
	2825	"lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
	2826	(uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
	2827	(uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
	2828	so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
	2829	#endif
	2830	if (refcount)
	2831	so->so_usecount--;
	2832
	2833	if (so->so_usecount < 0) {
	2834	panic("tcp_unlock: so=%p usecount=%x lrh= %s\n",
	2835	so, so->so_usecount, solockhistory_nr(so));
	2836	/* NOTREACHED */
	2837	}
	2838	if (so->so_pcb == NULL) {
	2839	panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n",
	2840	so, so->so_usecount, lr_saved, solockhistory_nr(so));
	2841	/* NOTREACHED */
	2842	} else {
	2843	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
	2844	LCK_MTX_ASSERT_OWNED);
	2845	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	2846	so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
	2847	lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
	2848	}
	2849	return (0);
	2850	}
	2851
	2852	lck_mtx_t *
	2853	tcp_getlock(
	2854	struct socket *so,
	2855	__unused int locktype)
	2856	{
	2857	struct inpcb *inp = sotoinpcb(so);
	2858
	2859	if (so->so_pcb) {
	2860	if (so->so_usecount < 0)
	2861	panic("tcp_getlock: so=%p usecount=%x lrh= %s\n",
	2862	so, so->so_usecount, solockhistory_nr(so));
	2863	return(&inp->inpcb_mtx);
	2864	}
	2865	else {
	2866	panic("tcp_getlock: so=%p NULL so_pcb %s\n",
	2867	so, solockhistory_nr(so));
	2868	return (so->so_proto->pr_domain->dom_mtx);
	2869	}
	2870	}
	2871
	2872	/*
	2873	* Determine if we can grow the recieve socket buffer to avoid sending
	2874	* a zero window update to the peer. We allow even socket buffers that
	2875	* have fixed size (set by the application) to grow if the resource
	2876	* constraints are met. They will also be trimmed after the application
	2877	* reads data.
	2878	*/
	2879	static void
	2880	tcp_sbrcv_grow_rwin(struct tcpcb tp, struct sockbuf sb)
	2881	{
	2882	u_int32_t rcvbufinc = tp->t_maxseg << 4;
	2883	u_int32_t rcvbuf = sb->sb_hiwat;
	2884	struct socket *so = tp->t_inpcb->inp_socket;
	2885
	2886	/*
	2887	* If message delivery is enabled, do not count
	2888	* unordered bytes in receive buffer towards hiwat
	2889	*/
	2890	if (so->so_flags & SOF_ENABLE_MSGS)
	2891	rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes;
	2892
	2893	if (tcp_do_autorcvbuf == 1 &&
	2894	tcp_cansbgrow(sb) &&
	2895	(tp->t_flags & TF_SLOWLINK) == 0 &&
	2896	(so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	2897	(rcvbuf - sb->sb_cc) < rcvbufinc &&
	2898	rcvbuf < tcp_autorcvbuf_max &&
	2899	(sb->sb_idealsize > 0 &&
	2900	sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
	2901	sbreserve(sb,
	2902	min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	2903	}
	2904	}
	2905
	2906	int32_t
	2907	tcp_sbspace(struct tcpcb *tp)
	2908	{
	2909	struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv;
	2910	u_int32_t rcvbuf = sb->sb_hiwat;
	2911	int32_t space;
	2912	struct socket *so = tp->t_inpcb->inp_socket;
	2913	int32_t pending = 0;
	2914
	2915	/*
	2916	* If message delivery is enabled, do not count
	2917	* unordered bytes in receive buffer towards hiwat mark.
	2918	* This value is used to return correct rwnd that does
	2919	* not reflect the extra unordered bytes added to the
	2920	* receive socket buffer.
	2921	*/
	2922	if (so->so_flags & SOF_ENABLE_MSGS)
	2923	rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes;
	2924
	2925	tcp_sbrcv_grow_rwin(tp, sb);
	2926
	2927	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
	2928	(sb->sb_mbmax - sb->sb_mbcnt)));
	2929	if (space < 0)
	2930	space = 0;
	2931
	2932	#if CONTENT_FILTER
	2933	/* Compensate for data being processed by content filters */
	2934	pending = cfil_sock_data_space(sb);
	2935	#endif /* CONTENT_FILTER */
	2936	if (pending > space)
	2937	space = 0;
	2938	else
	2939	space -= pending;
	2940
	2941	/* Avoid increasing window size if the current window
	2942	* is already very low, we could be in "persist" mode and
	2943	* we could break some apps (see rdar://5409343)
	2944	*/
	2945
	2946	if (space < tp->t_maxseg)
	2947	return space;
	2948
	2949	/* Clip window size for slower link */
	2950
	2951	if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 )
	2952	return imin(space, slowlink_wsize);
	2953
	2954	return space;
	2955	}
	2956	/*
	2957	* Checks TCP Segment Offloading capability for a given connection
	2958	* and interface pair.
	2959	*/
	2960	void
	2961	tcp_set_tso(struct tcpcb tp, struct ifnet ifp)
	2962	{
	2963	#if INET6
	2964	struct inpcb *inp;
	2965	int isipv6;
	2966	#endif /* INET6 */
	2967	#if MPTCP
	2968	/*
	2969	* We can't use TSO if this tcpcb belongs to an MPTCP session.
	2970	*/
	2971	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
	2972	tp->t_flags &= ~TF_TSO;
	2973	return;
	2974	}
	2975	#endif
	2976	#if INET6
	2977	inp = tp->t_inpcb;
	2978	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
	2979
	2980	if (isipv6) {
	2981	if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV6)) {
	2982	tp->t_flags \|= TF_TSO;
	2983	if (ifp->if_tso_v6_mtu != 0)
	2984	tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
	2985	else
	2986	tp->tso_max_segment_size = TCP_MAXWIN;
	2987	} else
	2988	tp->t_flags &= ~TF_TSO;
	2989
	2990	} else
	2991	#endif /* INET6 */
	2992
	2993	{
	2994	if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV4)) {
	2995	tp->t_flags \|= TF_TSO;
	2996	if (ifp->if_tso_v4_mtu != 0)
	2997	tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
	2998	else
	2999	tp->tso_max_segment_size = TCP_MAXWIN;
	3000	} else
	3001	tp->t_flags &= ~TF_TSO;
	3002	}
	3003	}
	3004
	3005	#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC)
	3006
	3007	/* Function to calculate the tcp clock. The tcp clock will get updated
	3008	* at the boundaries of the tcp layer. This is done at 3 places:
	3009	* 1. Right before processing an input tcp packet
	3010	* 2. Whenever a connection wants to access the network using tcp_usrreqs
	3011	* 3. When a tcp timer fires or before tcp slow timeout
	3012	*
	3013	*/
	3014
	3015	void
	3016	calculate_tcp_clock()
	3017	{
	3018	struct timeval tv = tcp_uptime;
	3019	struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC};
	3020	struct timeval now, hold_now;
	3021	uint32_t incr = 0;
	3022
	3023	microuptime(&now);
	3024
	3025	/*
	3026	* Update coarse-grained networking timestamp (in sec.); the idea
	3027	* is to update the counter returnable via net_uptime() when
	3028	* we read time.
	3029	*/
	3030	net_update_uptime_secs(now.tv_sec);
	3031
	3032	timevaladd(&tv, &interval);
	3033	if (timevalcmp(&now, &tv, >)) {
	3034	/* time to update the clock */
	3035	lck_spin_lock(tcp_uptime_lock);
	3036	if (timevalcmp(&tcp_uptime, &now, >=)) {
	3037	/* clock got updated while waiting for the lock */
	3038	lck_spin_unlock(tcp_uptime_lock);
	3039	return;
	3040	}
	3041
	3042	microuptime(&now);
	3043	hold_now = now;
	3044	tv = tcp_uptime;
	3045	timevalsub(&now, &tv);
	3046
	3047	incr = TIMEVAL_TO_TCPHZ(now);
	3048	if (incr > 0) {
	3049	tcp_uptime = hold_now;
	3050	tcp_now += incr;
	3051	}
	3052
	3053	lck_spin_unlock(tcp_uptime_lock);
	3054	}
	3055	return;
	3056	}
	3057
	3058	/* Compute receive window scaling that we are going to request
	3059	* for this connection based on sb_hiwat. Try to leave some
	3060	* room to potentially increase the window size upto a maximum
	3061	* defined by the constant tcp_autorcvbuf_max.
	3062	*/
	3063	void
	3064	tcp_set_max_rwinscale(struct tcpcb tp, struct socket so) {
	3065	u_int32_t maxsockbufsize;
	3066	if (!tcp_do_rfc1323) {
	3067	tp->request_r_scale = 0;
	3068	return;
	3069	}
	3070
	3071	tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
	3072	maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
	3073	so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
	3074
	3075	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	3076	(TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize)
	3077	tp->request_r_scale++;
	3078	tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT);
	3079
	3080	}
	3081
	3082	int
	3083	tcp_notsent_lowat_check(struct socket *so) {
	3084	struct inpcb *inp = sotoinpcb(so);
	3085	struct tcpcb *tp = NULL;
	3086	int notsent = 0;
	3087	if (inp != NULL) {
	3088	tp = intotcpcb(inp);
	3089	}
	3090
	3091	notsent = so->so_snd.sb_cc -
	3092	(tp->snd_nxt - tp->snd_una);
	3093
	3094	/* When we send a FIN or SYN, not_sent can be negative.
	3095	* In that case also we need to send a write event to the
	3096	* process if it is waiting. In the FIN case, it will
	3097	* get an error from send because cantsendmore will be set.
	3098	*/
	3099	if (notsent <= tp->t_notsent_lowat) {
	3100	return(1);
	3101	}
	3102
	3103	/* When Nagle's algorithm is not disabled, it is better
	3104	* to wakeup the client until there is atleast one
	3105	* maxseg of data to write.
	3106	*/
	3107	if ((tp->t_flags & TF_NODELAY) == 0 &&
	3108	notsent > 0 && notsent < tp->t_maxseg) {
	3109	return(1);
	3110	}
	3111	return(0);
	3112	}
	3113
	3114	void
	3115	tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end) {
	3116	struct tcp_rxt_seg rxseg = NULL, prev = NULL, *next = NULL;
	3117	u_int32_t rxcount = 0;
	3118
	3119	if (SLIST_EMPTY(&tp->t_rxt_segments))
	3120	tp->t_dsack_lastuna = tp->snd_una;
	3121	/*
	3122	* First check if there is a segment already existing for this
	3123	* sequence space.
	3124	*/
	3125
	3126	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
	3127	if (SEQ_GT(rxseg->rx_start, start))
	3128	break;
	3129	prev = rxseg;
	3130	}
	3131	next = rxseg;
	3132
	3133	/* check if prev seg is for this sequence */
	3134	if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
	3135	SEQ_GEQ(prev->rx_end, end)) {
	3136	prev->rx_count++;
	3137	return;
	3138	}
	3139
	3140	/*
	3141	* There are a couple of possibilities at this point.
	3142	* 1. prev overlaps with the beginning of this sequence
	3143	* 2. next overlaps with the end of this sequence
	3144	* 3. there is no overlap.
	3145	*/
	3146
	3147	if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
	3148	if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
	3149	start = prev->rx_end + 1;
	3150	prev->rx_count++;
	3151	} else {
	3152	prev->rx_end = (start - 1);
	3153	rxcount = prev->rx_count;
	3154	}
	3155	}
	3156
	3157	if (next != NULL && SEQ_LT(next->rx_start, end)) {
	3158	if (SEQ_LEQ(next->rx_end, end)) {
	3159	end = next->rx_start - 1;
	3160	next->rx_count++;
	3161	} else {
	3162	next->rx_start = end + 1;
	3163	rxcount = next->rx_count;
	3164	}
	3165	}
	3166	if (!SEQ_LT(start, end))
	3167	return;
	3168
	3169	rxseg = (struct tcp_rxt_seg *) zalloc(tcp_rxt_seg_zone);
	3170	if (rxseg == NULL) {
	3171	return;
	3172	}
	3173	bzero(rxseg, sizeof(*rxseg));
	3174	rxseg->rx_start = start;
	3175	rxseg->rx_end = end;
	3176	rxseg->rx_count = rxcount + 1;
	3177
	3178	if (prev != NULL) {
	3179	SLIST_INSERT_AFTER(prev, rxseg, rx_link);
	3180	} else {
	3181	SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
	3182	}
	3183	return;
	3184	}
	3185
	3186	struct tcp_rxt_seg *
	3187	tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
	3188	{
	3189	struct tcp_rxt_seg *rxseg;
	3190	if (SLIST_EMPTY(&tp->t_rxt_segments))
	3191	return (NULL);
	3192
	3193	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
	3194	if (SEQ_LEQ(rxseg->rx_start, start) &&
	3195	SEQ_GEQ(rxseg->rx_end, end))
	3196	return (rxseg);
	3197	if (SEQ_GT(rxseg->rx_start, start))
	3198	break;
	3199	}
	3200	return (NULL);
	3201	}
	3202
	3203	void
	3204	tcp_rxtseg_clean(struct tcpcb *tp)
	3205	{
	3206	struct tcp_rxt_seg rxseg, next;
	3207
	3208	SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
	3209	SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
	3210	tcp_rxt_seg, rx_link);
	3211	zfree(tcp_rxt_seg_zone, rxseg);
	3212	}
	3213	tp->t_dsack_lastuna = tp->snd_max;
	3214	}
	3215
	3216	boolean_t
	3217	tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
	3218	{
	3219	boolean_t bad_rexmt;
	3220	struct tcp_rxt_seg *rxseg;
	3221
	3222	if (SLIST_EMPTY(&tp->t_rxt_segments))
	3223	return (FALSE);
	3224
	3225	/*
	3226	* If all of the segments in this window are not cumulatively
	3227	* acknowledged, then there can still be undetected packet loss.
	3228	* Do not restore congestion window in that case.
	3229	*/
	3230	if (SEQ_LT(th_ack, tp->snd_recover))
	3231	return (FALSE);
	3232
	3233	bad_rexmt = TRUE;
	3234	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
	3235	if (rxseg->rx_count > 1 \|\|
	3236	!(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
	3237	bad_rexmt = FALSE;
	3238	break;
	3239	}
	3240	}
	3241	return (bad_rexmt);
	3242	}
	3243
	3244	boolean_t
	3245	tcp_rxtseg_dsack_for_tlp(struct tcpcb *tp)
	3246	{
	3247	boolean_t dsack_for_tlp = FALSE;
	3248	struct tcp_rxt_seg *rxseg;
	3249	if (SLIST_EMPTY(&tp->t_rxt_segments))
	3250	return (FALSE);
	3251
	3252	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
	3253	if (rxseg->rx_count == 1 &&
	3254	SLIST_NEXT(rxseg,rx_link) == NULL &&
	3255	(rxseg->rx_flags & TCP_RXT_DSACK_FOR_TLP)) {
	3256	dsack_for_tlp = TRUE;
	3257	break;
	3258	}
	3259	}
	3260	return (dsack_for_tlp);
	3261	}
	3262
	3263	u_int32_t
	3264	tcp_rxtseg_total_size(struct tcpcb *tp) {
	3265	struct tcp_rxt_seg *rxseg;
	3266	u_int32_t total_size = 0;
	3267
	3268	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
	3269	total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
	3270	}
	3271	return (total_size);
	3272	}
	3273
	3274	void
	3275	tcp_get_connectivity_status(struct tcpcb *tp,
	3276	struct tcp_conn_status *connstatus)
	3277	{
	3278	if (tp == NULL \|\| connstatus == NULL)
	3279	return;
	3280	bzero(connstatus, sizeof(*connstatus));
	3281	if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
	3282	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
	3283	connstatus->write_probe_failed = 1;
	3284	} else {
	3285	connstatus->conn_probe_failed = 1;
	3286	}
	3287	}
	3288	if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX)
	3289	connstatus->read_probe_failed = 1;
	3290	if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL
	3291	&& (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY))
	3292	connstatus->probe_activated = 1;
	3293	return;
	3294	}
	3295
	3296	boolean_t
	3297	tfo_enabled(const struct tcpcb *tp)
	3298	{
	3299	return !!(tp->t_flagsext & TF_FASTOPEN);
	3300	}
	3301
	3302	void
	3303	tcp_disable_tfo(struct tcpcb *tp)
	3304	{
	3305	tp->t_flagsext &= ~TF_FASTOPEN;
	3306	}
	3307