git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
	30	* The Regents of the University of California. All rights reserved.
	31	*
	32	* Redistribution and use in source and binary forms, with or without
	33	* modification, are permitted provided that the following conditions
	34	* are met:
	35	* 1. Redistributions of source code must retain the above copyright
	36	* notice, this list of conditions and the following disclaimer.
	37	* 2. Redistributions in binary form must reproduce the above copyright
	38	* notice, this list of conditions and the following disclaimer in the
	39	* documentation and/or other materials provided with the distribution.
	40	* 3. All advertising materials mentioning features or use of this software
	41	* must display the following acknowledgement:
	42	* This product includes software developed by the University of
	43	* California, Berkeley and its contributors.
	44	* 4. Neither the name of the University nor the names of its contributors
	45	* may be used to endorse or promote products derived from this software
	46	* without specific prior written permission.
	47	*
	48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	58	* SUCH DAMAGE.
	59	*
	60	* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
	61	* $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
	62	*/
	63	/*
	64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
	65	* support for mandatory and extensible security protections. This notice
	66	* is included in support of clause 2.2 (b) of the Apple Public License,
	67	* Version 2.0.
	68	*/
	69
	70	#include <sys/param.h>
	71	#include <sys/systm.h>
	72	#include <sys/kernel.h>
	73	#include <sys/sysctl.h>
	74	#include <sys/malloc.h>
	75	#include <sys/mbuf.h>
	76	#include <sys/proc.h> /* for proc0 declaration */
	77	#include <sys/protosw.h>
	78	#include <sys/socket.h>
	79	#include <sys/socketvar.h>
	80	#include <sys/syslog.h>
	81	#include <sys/mcache.h>
	82	#if XNU_TARGET_OS_OSX
	83	#include <sys/kasl.h>
	84	#endif /* XNU_TARGET_OS_OSX */
	85	#include <sys/kauth.h>
	86	#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
	87
	88	#include <machine/endian.h>
	89
	90	#include <net/if.h>
	91	#include <net/if_types.h>
	92	#include <net/route.h>
	93	#include <net/ntstat.h>
	94	#include <net/content_filter.h>
	95	#include <net/dlil.h>
	96	#include <net/multi_layer_pkt_log.h>
	97
	98	#include <netinet/in.h>
	99	#include <netinet/in_systm.h>
	100	#include <netinet/ip.h>
	101	#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
	102	#include <netinet/in_var.h>
	103	#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
	104	#include <netinet/in_pcb.h>
	105	#include <netinet/ip_var.h>
	106	#include <mach/sdt.h>
	107	#include <netinet/ip6.h>
	108	#include <netinet/icmp6.h>
	109	#include <netinet6/nd6.h>
	110	#include <netinet6/ip6_var.h>
	111	#include <netinet6/in6_pcb.h>
	112	#include <netinet/tcp.h>
	113	#include <netinet/tcp_cache.h>
	114	#include <netinet/tcp_fsm.h>
	115	#include <netinet/tcp_seq.h>
	116	#include <netinet/tcp_timer.h>
	117	#include <netinet/tcp_var.h>
	118	#include <netinet/tcp_cc.h>
	119	#include <dev/random/randomdev.h>
	120	#include <kern/zalloc.h>
	121	#include <netinet6/tcp6_var.h>
	122	#include <netinet/tcpip.h>
	123	#if TCPDEBUG
	124	#include <netinet/tcp_debug.h>
	125	u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
	126	struct tcphdr tcp_savetcp;
	127	#endif /* TCPDEBUG */
	128	#include <netinet/tcp_log.h>
	129
	130	#if IPSEC
	131	#include <netinet6/ipsec.h>
	132	#include <netinet6/ipsec6.h>
	133	#include <netkey/key.h>
	134	#endif /* IPSEC */
	135
	136	#include <sys/kdebug.h>
	137	#if MPTCP
	138	#include <netinet/mptcp_var.h>
	139	#include <netinet/mptcp.h>
	140	#include <netinet/mptcp_opt.h>
	141	#endif /* MPTCP */
	142
	143	#include <corecrypto/ccaes.h>
	144
	145	#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
	146	#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
	147	#define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
	148	#define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
	149
	150	#define TCP_RTT_HISTORY_EXPIRE_TIME (60 * TCP_RETRANSHZ)
	151	#define TCP_RECV_THROTTLE_WIN (5 * TCP_RETRANSHZ)
	152	#define TCP_STRETCHACK_ENABLE_PKTCNT 2000
	153
	154	struct tcpstat tcpstat;
	155
	156	SYSCTL_SKMEM_TCP_INT(OID_AUTO, flow_control_response,
	157	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_flow_control_response, 1,
	158	"Improved response to Flow-control events");
	159
	160	static int log_in_vain = 0;
	161	SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
	162	CTLFLAG_RW \| CTLFLAG_LOCKED, &log_in_vain, 0,
	163	"Log all incoming TCP connections");
	164
	165	SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_strategy,
	166	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_ack_strategy, TCP_ACK_STRATEGY_MODERN,
	167	"Revised TCP ACK-strategy, avoiding stretch-ACK implementation");
	168
	169	static int blackhole = 0;
	170	SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
	171	CTLFLAG_RW \| CTLFLAG_LOCKED, &blackhole, 0,
	172	"Do not send RST when dropping refused connections");
	173
	174	SYSCTL_SKMEM_TCP_INT(OID_AUTO, aggressive_rcvwnd_inc,
	175	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_aggressive_rcvwnd_inc, 1,
	176	"Be more aggressive about increasing the receive-window.");
	177
	178	SYSCTL_SKMEM_TCP_INT(OID_AUTO, delayed_ack,
	179	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_delack_enabled, 3,
	180	"Delay ACK to try and piggyback it onto a data packet");
	181
	182	SYSCTL_SKMEM_TCP_INT(OID_AUTO, recvbg, CTLFLAG_RW \| CTLFLAG_LOCKED,
	183	int, tcp_recv_bg, 0, "Receive background");
	184
	185	SYSCTL_SKMEM_TCP_INT(OID_AUTO, drop_synfin,
	186	CTLFLAG_RW \| CTLFLAG_LOCKED, static int, drop_synfin, 1,
	187	"Drop TCP packets with SYN+FIN set");
	188
	189	SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW \| CTLFLAG_LOCKED, 0,
	190	"TCP Segment Reassembly Queue");
	191
	192	static int tcp_reass_overflows = 0;
	193	SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
	194	CTLFLAG_RD \| CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
	195	"Global number of TCP Segment Reassembly Queue Overflows");
	196
	197
	198	SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowlink_wsize, CTLFLAG_RW \| CTLFLAG_LOCKED,
	199	__private_extern__ int, slowlink_wsize, 8192,
	200	"Maximum advertised window size for slowlink");
	201
	202	SYSCTL_SKMEM_TCP_INT(OID_AUTO, maxseg_unacked,
	203	CTLFLAG_RW \| CTLFLAG_LOCKED, int, maxseg_unacked, 8,
	204	"Maximum number of outstanding segments left unacked");
	205
	206	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465, CTLFLAG_RW \| CTLFLAG_LOCKED,
	207	int, tcp_do_rfc3465, 1, "");
	208
	209	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465_lim2,
	210	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_do_rfc3465_lim2, 1,
	211	"Appropriate bytes counting w/ L=2*SMSS");
	212
	213	int rtt_samples_per_slot = 20;
	214
	215	int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
	216	u_int32_t tcp_autorcvbuf_inc_shift = 3;
	217	SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_allowed_iaj,
	218	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_allowed_iaj, ALLOWED_IAJ,
	219	"Allowed inter-packet arrival jiter");
	220
	221	SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautorcvbuf,
	222	CTLFLAG_RW \| CTLFLAG_LOCKED, u_int32_t, tcp_do_autorcvbuf, 1,
	223	"Enable automatic socket buffer tuning");
	224
	225	SYSCTL_SKMEM_TCP_INT(OID_AUTO, autotunereorder,
	226	CTLFLAG_RW \| CTLFLAG_LOCKED, u_int32_t, tcp_autotune_reorder, 1,
	227	"Enable automatic socket buffer tuning even when reordering is present");
	228
	229	SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax,
	230	CTLFLAG_RW \| CTLFLAG_LOCKED, u_int32_t, tcp_autorcvbuf_max, 2 * 1024 * 1024,
	231	"Maximum receive socket buffer size");
	232
	233	int tcp_disable_access_to_stats = 1;
	234	SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_access_to_stats,
	235	CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_disable_access_to_stats, 0,
	236	"Disable access to tcpstat");
	237
	238	SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit,
	239	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10,
	240	"Maximum number of challenge ACKs per connection per second");
	241
	242	/* TO BE REMOVED */
	243	SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961,
	244	CTLFLAG_RW \| CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1,
	245	"Enable/Disable full RFC 5961 compliance");
	246
	247	SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_better_lr,
	248	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_do_better_lr, 1,
	249	"Improved TCP Loss Recovery");
	250
	251	extern int tcp_acc_iaj_high;
	252	extern int tcp_acc_iaj_react_limit;
	253
	254	int tcprexmtthresh = 3;
	255
	256	u_int32_t tcp_now;
	257	struct timeval tcp_uptime; /* uptime when tcp_now was last updated */
	258	lck_spin_t tcp_uptime_lock; / Used to sychronize updates to tcp_now */
	259
	260	struct inpcbhead tcb;
	261	#define tcb6 tcb /* for KAME src sync over BSD's /
	262	struct inpcbinfo tcbinfo;
	263
	264	static void tcp_dooptions(struct tcpcb , u_char , int, struct tcphdr *,
	265	struct tcpopt *);
	266	static void tcp_finalize_options(struct tcpcb , struct tcpopt , unsigned int);
	267	static void tcp_pulloutofband(struct socket *,
	268	struct tcphdr , struct mbuf , int);
	269	static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
	270	static inline unsigned int tcp_maxmtu(struct rtentry *);
	271	static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags);
	272	static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
	273
	274	#if TRAFFIC_MGT
	275	static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
	276	int reset_size);
	277	static inline void compute_iaj(struct tcpcb *tp);
	278	static inline void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
	279	#endif /* TRAFFIC_MGT */
	280
	281	static inline unsigned int tcp_maxmtu6(struct rtentry *);
	282	unsigned int get_maxmtu(struct rtentry *);
	283
	284	static void tcp_sbrcv_grow(struct tcpcb tp, struct sockbuf sb,
	285	struct tcpopt *to, uint32_t tlen);
	286	void tcp_sbrcv_trim(struct tcpcb tp, struct sockbuf sb);
	287	static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
	288	static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
	289	static inline void tcp_sbrcv_reserve(struct tcpcb tp, struct sockbuf sb,
	290	u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max);
	291	static void tcp_bad_rexmt_restore_state(struct tcpcb tp, struct tcphdr th);
	292	static void tcp_compute_rtt(struct tcpcb tp, struct tcpopt to,
	293	struct tcphdr *th);
	294	static void tcp_early_rexmt_check(struct tcpcb tp, struct tcphdr th);
	295	static void tcp_bad_rexmt_check(struct tcpcb tp, struct tcphdr th,
	296	struct tcpopt *to);
	297	/*
	298	* Constants used for resizing receive socket buffer
	299	* when timestamps are not supported
	300	*/
	301	#define TCPTV_RCVNOTS_QUANTUM 100
	302	#define TCP_RCVNOTS_BYTELEVEL 204800
	303
	304	/*
	305	* Constants used for limiting early retransmits
	306	* to 10 per minute.
	307	*/
	308	#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
	309	#define TCP_EARLY_REXMT_LIMIT 10
	310
	311	#define log_in_vain_log( a ) { log a; }
	312
	313	int tcp_rcvunackwin = TCPTV_UNACKWIN;
	314	int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
	315	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rcvsspktcnt, CTLFLAG_RW \| CTLFLAG_LOCKED,
	316	int, tcp_rcvsspktcnt, TCP_RCV_SS_PKTCOUNT, "packets to be seen before receiver stretches acks");
	317
	318	#define DELAY_ACK(tp, th) \
	319	(CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
	320
	321	static int tcp_dropdropablreq(struct socket *head);
	322	static void tcp_newreno_partial_ack(struct tcpcb tp, struct tcphdr th);
	323	static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
	324	void tcp_set_background_cc(struct socket *so);
	325	void tcp_set_foreground_cc(struct socket *so);
	326	static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
	327	static void tcp_bwmeas_check(struct tcpcb *tp);
	328
	329	#if TRAFFIC_MGT
	330	void
	331	reset_acc_iaj(struct tcpcb *tp)
	332	{
	333	tp->acc_iaj = 0;
	334	CLEAR_IAJ_STATE(tp);
	335	}
	336
	337	static inline void
	338	update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
	339	{
	340	if (rst_size > 0) {
	341	tp->iaj_size = 0;
	342	}
	343	if (tp->iaj_size == 0 \|\| size >= tp->iaj_size) {
	344	tp->iaj_size = size;
	345	tp->iaj_rcv_ts = tcp_now;
	346	tp->iaj_small_pkt = 0;
	347	}
	348	}
	349
	/*
	 * Integer square root: return the largest n such that n * n <= val.
	 *
	 * Values up to 100 are resolved with a table of perfect squares.
	 * Larger values use the bit-by-bit method: starting from bit 15, each
	 * candidate bit b is accepted when the remainder still covers
	 * (g + b)^2 - g^2 == ((2 * g) + b) * b, which is formed with shifts only.
	 * The loop runs at most 16 times and never multiplies.
	 */
	static inline int
	isqrt(unsigned int val)
	{
		/* Perfect squares 0^2 .. 10^2; static so it is not rebuilt per call. */
		static const unsigned int sqrt_cache[11] =
		{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
		unsigned int temp, g = 0, b = 0x8000, bshft = 15;

		if (val <= 100) {
			/* Advance while the next perfect square still fits. */
			while (g < 10 && sqrt_cache[g + 1] <= val) {
				g++;
			}
			return (int)g;
		}

		do {
			temp = (((g << 1) + b) << (bshft--));
			if (val >= temp) {
				g += b;
				val -= temp;
			}
			b >>= 1;
		} while (b > 0 && val > 0);  /* val == 0: the root is exact, stop early */

		return (int)g;
	}
	380
	381	static inline void
	382	compute_iaj(struct tcpcb *tp)
	383	{
	384	compute_iaj_meat(tp, (tcp_now - tp->iaj_rcv_ts));
	385	}
	386
	387	static inline void
	388	compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
	389	{
	390	/* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
	391	* throttle the receive window to a minimum of MIN_IAJ_WIN packets
	392	*/
	393	#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
	394	#define IAJ_DIV_SHIFT 4
	395	#define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
	396
	397	uint32_t allowed_iaj, acc_iaj = 0;
	398
	399	uint32_t mean, temp;
	400	int32_t cur_iaj_dev;
	401
	402	cur_iaj_dev = (cur_iaj - tp->avg_iaj);
	403
	404	/* Allow a jitter of "allowed_iaj" milliseconds. Some connections
	405	* may have a constant jitter more than that. We detect this by
	406	* using standard deviation.
	407	*/
	408	allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
	409	if (allowed_iaj < tcp_allowed_iaj) {
	410	allowed_iaj = tcp_allowed_iaj;
	411	}
	412
	413	/* Initially when the connection starts, the senders congestion
	414	* window is small. During this period we avoid throttling a
	415	* connection because we do not have a good starting point for
	416	* allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
	417	* the first few packets.
	418	*/
	419	if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
	420	if (cur_iaj <= allowed_iaj) {
	421	if (tp->acc_iaj >= 2) {
	422	acc_iaj = tp->acc_iaj - 2;
	423	} else {
	424	acc_iaj = 0;
	425	}
	426	} else {
	427	acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
	428	}
	429
	430	if (acc_iaj > MAX_ACC_IAJ) {
	431	acc_iaj = MAX_ACC_IAJ;
	432	}
	433	tp->acc_iaj = acc_iaj;
	434	}
	435
	436	/* Compute weighted average where the history has a weight of
	437	* 15 out of 16 and the current value has a weight of 1 out of 16.
	438	* This will make the short-term measurements have more weight.
	439	*
	440	* The addition of 8 will help to round-up the value
	441	* instead of round-down
	442	*/
	443	tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
	444	+ cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
	445
	446	/* Compute Root-mean-square of deviation where mean is a weighted
	447	* average as described above.
	448	*/
	449	temp = tp->std_dev_iaj * tp->std_dev_iaj;
	450	mean = (((temp << IAJ_DIV_SHIFT) - temp)
	451	+ (cur_iaj_dev * cur_iaj_dev)
	452	+ IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
	453
	454	tp->std_dev_iaj = isqrt(mean);
	455
	456	DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
	457	uint32_t, allowed_iaj);
	458
	459	return;
	460	}
	461	#endif /* TRAFFIC_MGT */
	462
	463	/*
	464	* Perform rate limit check per connection per second
	465	* tp->t_challengeack_last is the last_time diff was greater than 1sec
	466	* tp->t_challengeack_count is the number of ACKs sent (within 1sec)
	467	* Return TRUE if we shouldn't send the ACK due to rate limitation
	468	* Return FALSE if it is still ok to send challenge ACK
	469	*/
	470	static boolean_t
	471	tcp_is_ack_ratelimited(struct tcpcb *tp)
	472	{
	473	boolean_t ret = TRUE;
	474	uint32_t now = tcp_now;
	475	int32_t diff = 0;
	476
	477	diff = timer_diff(now, 0, tp->t_challengeack_last, 0);
	478	/* If it is first time or diff > 1000ms,
	479	* update the challengeack_last and reset the
	480	* current count of ACKs
	481	*/
	482	if (tp->t_challengeack_last == 0 \|\| diff >= 1000) {
	483	tp->t_challengeack_last = now;
	484	tp->t_challengeack_count = 0;
	485	ret = FALSE;
	486	} else if (tp->t_challengeack_count < tcp_challengeack_limit) {
	487	ret = FALSE;
	488	}
	489
	490	/* Careful about wrap-around */
	491	if (ret == FALSE && (tp->t_challengeack_count + 1 > 0)) {
	492	tp->t_challengeack_count++;
	493	}
	494
	495	return ret;
	496	}
	497
	498	/* Check if enough amount of data has been acknowledged since
	499	* bw measurement was started
	500	*/
	501	static void
	502	tcp_bwmeas_check(struct tcpcb *tp)
	503	{
	504	int32_t bw_meas_bytes;
	505	uint32_t bw, bytes, elapsed_time;
	506
	507	if (SEQ_LEQ(tp->snd_una, tp->t_bwmeas->bw_start)) {
	508	return;
	509	}
	510
	511	bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
	512	if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) &&
	513	bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
	514	bytes = bw_meas_bytes;
	515	elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
	516	if (elapsed_time > 0) {
	517	bw = bytes / elapsed_time;
	518	if (bw > 0) {
	519	if (tp->t_bwmeas->bw_sndbw > 0) {
	520	tp->t_bwmeas->bw_sndbw =
	521	(((tp->t_bwmeas->bw_sndbw << 3)
	522	- tp->t_bwmeas->bw_sndbw)
	523	+ bw) >> 3;
	524	} else {
	525	tp->t_bwmeas->bw_sndbw = bw;
	526	}
	527
	528	/* Store the maximum value */
	529	if (tp->t_bwmeas->bw_sndbw_max == 0) {
	530	tp->t_bwmeas->bw_sndbw_max =
	531	tp->t_bwmeas->bw_sndbw;
	532	} else {
	533	tp->t_bwmeas->bw_sndbw_max =
	534	max(tp->t_bwmeas->bw_sndbw,
	535	tp->t_bwmeas->bw_sndbw_max);
	536	}
	537	}
	538	}
	539	tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
	540	}
	541	}
	542
	543	static int
	544	tcp_reass(struct tcpcb tp, struct tcphdr th, int tlenp, struct mbuf m,
	545	struct ifnet ifp, int dowakeup)
	546	{
	547	struct tseg_qent *q;
	548	struct tseg_qent *p = NULL;
	549	struct tseg_qent *nq;
	550	struct tseg_qent *te = NULL;
	551	struct inpcb *inp = tp->t_inpcb;
	552	struct socket *so = inp->inp_socket;
	553	int flags = 0;
	554	u_int16_t qlimit;
	555	boolean_t cell = IFNET_IS_CELLULAR(ifp);
	556	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
	557	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
	558	boolean_t dsack_set = FALSE;
	559
	560	/*
	561	* Call with th==0 after become established to
	562	* force pre-ESTABLISHED data up to user socket.
	563	*/
	564	if (th == NULL) {
	565	goto present;
	566	}
	567
	568	/*
	569	* If the reassembly queue already has entries or if we are going
	570	* to add a new one, then the connection has reached a loss state.
	571	* Reset the stretch-ack algorithm at this point.
	572	*/
	573	tcp_reset_stretch_ack(tp);
	574	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
	575
	576	#if TRAFFIC_MGT
	577	if (tp->acc_iaj > 0) {
	578	reset_acc_iaj(tp);
	579	}
	580	#endif /* TRAFFIC_MGT */
	581
	582	if (th->th_seq != tp->rcv_nxt) {
	583	struct mbuf *tmp = m;
	584	while (tmp != NULL) {
	585	if (mbuf_class_under_pressure(tmp)) {
	586	m_freem(m);
	587	tcp_reass_overflows++;
	588	tcpstat.tcps_rcvmemdrop++;
	589	*tlenp = 0;
	590	return 0;
	591	}
	592
	593	tmp = tmp->m_next;
	594	}
	595	}
	596
	597	/*
	598	* Limit the number of segments in the reassembly queue to prevent
	599	* holding on to too many segments (and thus running out of mbufs).
	600	* Make sure to let the missing segment through which caused this
	601	* queue. Always keep one global queue entry spare to be able to
	602	* process the missing segment.
	603	*/
	604	qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
	605	(tcp_autorcvbuf_max >> 10));
	606	if (th->th_seq != tp->rcv_nxt &&
	607	(tp->t_reassqlen + 1) >= qlimit) {
	608	tcp_reass_overflows++;
	609	tcpstat.tcps_rcvmemdrop++;
	610	m_freem(m);
	611	*tlenp = 0;
	612	return 0;
	613	}
	614
	615	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	616	te = (struct tseg_qent *) zalloc(tcp_reass_zone);
	617	if (te == NULL) {
	618	tcpstat.tcps_rcvmemdrop++;
	619	m_freem(m);
	620	return 0;
	621	}
	622	tp->t_reassqlen++;
	623
	624	/*
	625	* Find a segment which begins after this one does.
	626	*/
	627	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
	628	if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) {
	629	break;
	630	}
	631	p = q;
	632	}
	633
	634	/*
	635	* If there is a preceding segment, it may provide some of
	636	* our data already. If so, drop the data from the incoming
	637	* segment. If it provides all of our data, drop us.
	638	*/
	639	if (p != NULL) {
	640	int i;
	641	/* conversion to int (in i) handles seq wraparound */
	642	i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
	643	if (i > 0) {
	644	if (i > 1) {
	645	/*
	646	* Note duplicate data sequnce numbers
	647	* to report in DSACK option
	648	*/
	649	tp->t_dsack_lseq = th->th_seq;
	650	tp->t_dsack_rseq = th->th_seq +
	651	min(i, *tlenp);
	652
	653	/*
	654	* Report only the first part of partial/
	655	* non-contiguous duplicate sequence space
	656	*/
	657	dsack_set = TRUE;
	658	}
	659	if (i >= *tlenp) {
	660	tcpstat.tcps_rcvduppack++;
	661	tcpstat.tcps_rcvdupbyte += *tlenp;
	662	if (nstat_collect) {
	663	nstat_route_rx(inp->inp_route.ro_rt,
	664	1, *tlenp,
	665	NSTAT_RX_FLAG_DUPLICATE);
	666	INP_ADD_STAT(inp, cell, wifi, wired,
	667	rxpackets, 1);
	668	INP_ADD_STAT(inp, cell, wifi, wired,
	669	rxbytes, *tlenp);
	670	tp->t_stat.rxduplicatebytes += *tlenp;
	671	inp_set_activity_bitmap(inp);
	672	}
	673	m_freem(m);
	674	zfree(tcp_reass_zone, te);
	675	te = NULL;
	676	tp->t_reassqlen--;
	677	/*
	678	* Try to present any queued data
	679	* at the left window edge to the user.
	680	* This is needed after the 3-WHS
	681	* completes.
	682	*/
	683	goto present;
	684	}
	685	m_adj(m, i);
	686	*tlenp -= i;
	687	th->th_seq += i;
	688	}
	689	}
	690	tp->t_rcvoopack++;
	691	tcpstat.tcps_rcvoopack++;
	692	tcpstat.tcps_rcvoobyte += *tlenp;
	693	if (nstat_collect) {
	694	nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
	695	NSTAT_RX_FLAG_OUT_OF_ORDER);
	696	INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
	697	INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
	698	tp->t_stat.rxoutoforderbytes += *tlenp;
	699	inp_set_activity_bitmap(inp);
	700	}
	701
	702	/*
	703	* While we overlap succeeding segments trim them or,
	704	* if they are completely covered, dequeue them.
	705	*/
	706	while (q) {
	707	int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
	708	if (i <= 0) {
	709	break;
	710	}
	711
	712	/*
	713	* Report only the first part of partial/non-contiguous
	714	* duplicate segment in dsack option. The variable
	715	* dsack_set will be true if a previous entry has some of
	716	* the duplicate sequence space.
	717	*/
	718	if (i > 1 && !dsack_set) {
	719	if (tp->t_dsack_lseq == 0) {
	720	tp->t_dsack_lseq = q->tqe_th->th_seq;
	721	tp->t_dsack_rseq =
	722	tp->t_dsack_lseq + min(i, q->tqe_len);
	723	} else {
	724	/*
	725	* this segment overlaps data in multple
	726	* entries in the reassembly queue, move
	727	* the right sequence number further.
	728	*/
	729	tp->t_dsack_rseq =
	730	tp->t_dsack_rseq + min(i, q->tqe_len);
	731	}
	732	}
	733	if (i < q->tqe_len) {
	734	q->tqe_th->th_seq += i;
	735	q->tqe_len -= i;
	736	m_adj(q->tqe_m, i);
	737	break;
	738	}
	739
	740	nq = LIST_NEXT(q, tqe_q);
	741	LIST_REMOVE(q, tqe_q);
	742	m_freem(q->tqe_m);
	743	zfree(tcp_reass_zone, q);
	744	tp->t_reassqlen--;
	745	q = nq;
	746	}
	747
	748	/* Insert the new segment queue entry into place. */
	749	te->tqe_m = m;
	750	te->tqe_th = th;
	751	te->tqe_len = *tlenp;
	752
	753	if (p == NULL) {
	754	LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	755	} else {
	756	LIST_INSERT_AFTER(p, te, tqe_q);
	757	}
	758
	759	present:
	760	/*
	761	* Present data to user, advancing rcv_nxt through
	762	* completed sequence space.
	763	*/
	764	if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
	765	return 0;
	766	}
	767	q = LIST_FIRST(&tp->t_segq);
	768	if (!q \|\| q->tqe_th->th_seq != tp->rcv_nxt) {
	769	return 0;
	770	}
	771
	772	/*
	773	* If there is already another thread doing reassembly for this
	774	* connection, it is better to let it finish the job --
	775	* (radar 16316196)
	776	*/
	777	if (tp->t_flagsext & TF_REASS_INPROG) {
	778	return 0;
	779	}
	780
	781	tp->t_flagsext \|= TF_REASS_INPROG;
	782	/* lost packet was recovered, so ooo data can be returned */
	783	tcpstat.tcps_recovered_pkts++;
	784
	785	do {
	786	tp->rcv_nxt += q->tqe_len;
	787	flags = q->tqe_th->th_flags & TH_FIN;
	788	LIST_REMOVE(q, tqe_q);
	789	if (so->so_state & SS_CANTRCVMORE) {
	790	m_freem(q->tqe_m);
	791	} else {
	792	/*
	793	* The mbuf may be freed after it has been added to the
	794	* receive socket buffer so we reinitialize th to point
	795	* to a safe copy of the TCP header
	796	*/
	797	struct tcphdr saved_tcphdr = {};
	798
	799	so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
	800	memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
	801
	802	if (q->tqe_th->th_flags & TH_PUSH) {
	803	tp->t_flagsext \|= TF_LAST_IS_PSH;
	804	} else {
	805	tp->t_flagsext &= ~TF_LAST_IS_PSH;
	806	}
	807
	808	if (sbappendstream_rcvdemux(so, q->tqe_m)) {
	809	*dowakeup = 1;
	810	}
	811	th = &saved_tcphdr;
	812	}
	813	zfree(tcp_reass_zone, q);
	814	tp->t_reassqlen--;
	815	q = LIST_FIRST(&tp->t_segq);
	816	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	817	tp->t_flagsext &= ~TF_REASS_INPROG;
	818
	819	if ((inp->inp_vflag & INP_IPV6) != 0) {
	820	KERNEL_DEBUG(DBG_LAYER_BEG,
	821	((inp->inp_fport << 16) \| inp->inp_lport),
	822	(((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) \|
	823	(inp->in6p_faddr.s6_addr16[0] & 0xffff)),
	824	0, 0, 0);
	825	} else {
	826	KERNEL_DEBUG(DBG_LAYER_BEG,
	827	((inp->inp_fport << 16) \| inp->inp_lport),
	828	(((inp->inp_laddr.s_addr & 0xffff) << 16) \|
	829	(inp->inp_faddr.s_addr & 0xffff)),
	830	0, 0, 0);
	831	}
	832
	833	return flags;
	834	}
	835
	836	/*
	837	* Reduce congestion window -- used when ECN is seen or when a tail loss
	838	* probe recovers the last packet.
	839	*/
	840	static void
	841	tcp_reduce_congestion_window(struct tcpcb *tp)
	842	{
	843	/*
	844	* If the current tcp cc module has
	845	* defined a hook for tasks to run
	846	* before entering FR, call it
	847	*/
	848	if (CC_ALGO(tp)->pre_fr != NULL) {
	849	CC_ALGO(tp)->pre_fr(tp);
	850	}
	851	ENTER_FASTRECOVERY(tp);
	852	if (tp->t_flags & TF_SENTFIN) {
	853	tp->snd_recover = tp->snd_max - 1;
	854	} else {
	855	tp->snd_recover = tp->snd_max;
	856	}
	857	tp->t_timer[TCPT_REXMT] = 0;
	858	tp->t_timer[TCPT_PTO] = 0;
	859	tp->t_rtttime = 0;
	860	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
	861	tcp_cc_adjust_nonvalidated_cwnd(tp);
	862	} else {
	863	tp->snd_cwnd = tp->snd_ssthresh +
	864	tp->t_maxseg * tcprexmtthresh;
	865	}
	866	}
	867
	868	/*
	869	* This function is called upon reception of data on a socket. It's purpose is
	870	* to handle the adaptive keepalive timers that monitor whether the connection
	871	* is making progress. First the adaptive read-timer, second the TFO probe-timer.
	872	*
	873	* The application wants to get an event if there is a stall during read.
	874	* Set the initial keepalive timeout to be equal to twice RTO.
	875	*
	876	* If the outgoing interface is in marginal conditions, we need to
	877	* enable read probes for that too.
	878	*/
	879	static inline void
	880	tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
	881	{
	882	struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
	883
	884	if ((tp->t_adaptive_rtimo > 0 \|\|
	885	(outifp != NULL &&
	886	(outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
	887	&& tlen > 0 &&
	888	tp->t_state == TCPS_ESTABLISHED) {
	889	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	890	(TCP_REXMTVAL(tp) << 1));
	891	tp->t_flagsext \|= TF_DETECT_READSTALL;
	892	tp->t_rtimo_probes = 0;
	893	}
	894	}
	895
	896	inline void
	897	tcp_keepalive_reset(struct tcpcb *tp)
	898	{
	899	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	900	TCP_CONN_KEEPIDLE(tp));
	901	tp->t_flagsext &= ~(TF_DETECT_READSTALL);
	902	tp->t_rtimo_probes = 0;
	903	}
	904
	905	/*
	906	* TCP input routine, follows pages 65-76 of the
	907	* protocol specification dated September, 1981 very closely.
	908	*/
	909	int
	910	tcp6_input(struct mbuf *mp, int offp, int proto)
	911	{
	912	#pragma unused(proto)
	913	struct mbuf m = mp;
	914	uint32_t ia6_flags;
	915	struct ifnet *ifp = m->m_pkthdr.rcvif;
	916
	917	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
	918
	919	/* Expect 32-bit aligned data pointer on strict-align platforms */
	920	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
	921
	922	/*
	923	* draft-itojun-ipv6-tcp-to-anycast
	924	* better place to put this in?
	925	*/
	926	if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
	927	if (ia6_flags & IN6_IFF_ANYCAST) {
	928	struct ip6_hdr *ip6;
	929
	930	ip6 = mtod(m, struct ip6_hdr *);
	931	icmp6_error(m, ICMP6_DST_UNREACH,
	932	ICMP6_DST_UNREACH_ADDR,
	933	(caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
	934
	935	IF_TCP_STATINC(ifp, icmp6unreach);
	936
	937	return IPPROTO_DONE;
	938	}
	939	}
	940
	941	tcp_input(m, *offp);
	942	return IPPROTO_DONE;
	943	}
	944
	945	/* Depending on the usage of mbuf space in the system, this function
	946	* will return true or false. This is used to determine if a socket
	947	* buffer can take more memory from the system for auto-tuning or not.
	948	*/
	949	u_int8_t
	950	tcp_cansbgrow(struct sockbuf *sb)
	951	{
	952	/* Calculate the host level space limit in terms of MSIZE buffers.
	953	* We can use a maximum of half of the available mbuf space for
	954	* socket buffers.
	955	*/
	956	u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
	957
	958	/* Calculate per sb limit in terms of bytes. We optimize this limit
	959	* for upto 16 socket buffers.
	960	*/
	961
	962	u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
	963
	964	if ((total_sbmb_cnt < mblim) &&
	965	(sb->sb_hiwat < sbspacelim)) {
	966	return 1;
	967	} else {
	968	OSIncrementAtomic64(&sbmb_limreached);
	969	}
	970	return 0;
	971	}
	972
	973	static void
	974	tcp_sbrcv_reserve(struct tcpcb tp, struct sockbuf sbrcv,
	975	u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max)
	976	{
	977	/* newsize should not exceed max */
	978	newsize = min(newsize, rcvbuf_max);
	979
	980	/* The receive window scale negotiated at the
	981	* beginning of the connection will also set a
	982	* limit on the socket buffer size
	983	*/
	984	newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
	985
	986	/* Set new socket buffer size */
	987	if (newsize > sbrcv->sb_hiwat &&
	988	(sbreserve(sbrcv, newsize) == 1)) {
	989	sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
	990	(idealsize != 0) ? idealsize : newsize), rcvbuf_max);
	991
	992	/* Again check the limit set by the advertised
	993	* window scale
	994	*/
	995	sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
	996	TCP_MAXWIN << tp->rcv_scale);
	997	}
	998	}
	999
	1000	/*
	1001	* This function is used to grow a receive socket buffer. It
	1002	* will take into account system-level memory usage and the
	1003	* bandwidth available on the link to make a decision.
	1004	*/
	1005	static void
	1006	tcp_sbrcv_grow(struct tcpcb tp, struct sockbuf sbrcv,
	1007	struct tcpopt *to, uint32_t pktlen)
	1008	{
	1009	struct socket *so = sbrcv->sb_so;
	1010
	1011	/*
	1012	* Do not grow the receive socket buffer if
	1013	* - auto resizing is disabled, globally or on this socket
	1014	* - the high water mark already reached the maximum
	1015	* - the stream is in background and receive side is being
	1016	* throttled
	1017	*/
	1018	if (tcp_do_autorcvbuf == 0 \|\|
	1019	(sbrcv->sb_flags & SB_AUTOSIZE) == 0 \|\|
	1020	tcp_cansbgrow(sbrcv) == 0 \|\|
	1021	sbrcv->sb_hiwat >= tcp_autorcvbuf_max \|\|
	1022	(tp->t_flagsext & TF_RECV_THROTTLE) \|\|
	1023	(so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) \|\|
	1024	(!tcp_autotune_reorder && !LIST_EMPTY(&tp->t_segq))) {
	1025	/* Can not resize the socket buffer, just return */
	1026	goto out;
	1027	}
	1028
	1029	if (!TSTMP_SUPPORTED(tp)) {
	1030	/*
	1031	* Timestamp option is not supported on this connection.
	1032	* If the connection reached a state to indicate that
	1033	* the receive socket buffer needs to grow, increase
	1034	* the high water mark.
	1035	*/
	1036	if (TSTMP_GEQ(tcp_now,
	1037	tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
	1038	if (tp->rfbuf_cnt + pktlen >= TCP_RCVNOTS_BYTELEVEL) {
	1039	tcp_sbrcv_reserve(tp, sbrcv,
	1040	tcp_autorcvbuf_max, 0,
	1041	tcp_autorcvbuf_max);
	1042	}
	1043	goto out;
	1044	} else {
	1045	tp->rfbuf_cnt += pktlen;
	1046	return;
	1047	}
	1048	} else if (to->to_tsecr != 0) {
	1049	/*
	1050	* If the timestamp shows that one RTT has
	1051	* completed, we can stop counting the
	1052	* bytes. Here we consider increasing
	1053	* the socket buffer if the bandwidth measured in
	1054	* last rtt, is more than half of sb_hiwat, this will
	1055	* help to scale the buffer according to the bandwidth
	1056	* on the link.
	1057	*/
	1058	if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
	1059	if (tcp_aggressive_rcvwnd_inc) {
	1060	tp->rfbuf_cnt += pktlen;
	1061	}
	1062
	1063	if ((tcp_aggressive_rcvwnd_inc == 0 &&
	1064	tp->rfbuf_cnt + pktlen > (sbrcv->sb_hiwat -
	1065	(sbrcv->sb_hiwat >> 1))) \|\|
	1066	(tcp_aggressive_rcvwnd_inc &&
	1067	tp->rfbuf_cnt > tp->rfbuf_space)) {
	1068	int32_t rcvbuf_inc;
	1069	uint32_t idealsize;
	1070
	1071	if (tcp_aggressive_rcvwnd_inc == 0) {
	1072	int32_t min_incr;
	1073
	1074	tp->rfbuf_cnt += pktlen;
	1075	/*
	1076	* Increment the receive window by a
	1077	* multiple of maximum sized segments.
	1078	* This will prevent a connection from
	1079	* sending smaller segments on wire if it
	1080	* is limited by the receive window.
	1081	*
	1082	* Set the ideal size based on current
	1083	* bandwidth measurements. We set the
	1084	* ideal size on receive socket buffer to
	1085	* be twice the bandwidth delay product.
	1086	*/
	1087	rcvbuf_inc = (tp->rfbuf_cnt << 1)
	1088	- sbrcv->sb_hiwat;
	1089
	1090	/*
	1091	* Make the increment equal to 8 segments
	1092	* at least
	1093	*/
	1094	min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
	1095	if (rcvbuf_inc < min_incr) {
	1096	rcvbuf_inc = min_incr;
	1097	}
	1098
	1099	idealsize = (tp->rfbuf_cnt << 1);
	1100	} else {
	1101	if (tp->rfbuf_cnt > tp->rfbuf_space + (tp->rfbuf_space >> 1)) {
	1102	rcvbuf_inc = (tp->rfbuf_cnt << 2) - sbrcv->sb_hiwat;
	1103	idealsize = (tp->rfbuf_cnt << 2);
	1104	} else {
	1105	rcvbuf_inc = (tp->rfbuf_cnt << 1) - sbrcv->sb_hiwat;
	1106	idealsize = (tp->rfbuf_cnt << 1);
	1107	}
	1108	}
	1109
	1110	tp->rfbuf_space = tp->rfbuf_cnt;
	1111
	1112	if (rcvbuf_inc > 0) {
	1113	rcvbuf_inc =
	1114	(rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
	1115
	1116	tcp_sbrcv_reserve(tp, sbrcv,
	1117	sbrcv->sb_hiwat + rcvbuf_inc,
	1118	idealsize, tcp_autorcvbuf_max);
	1119	}
	1120	}
	1121	/* Measure instantaneous receive bandwidth */
	1122	if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 &&
	1123	TSTMP_GT(tcp_now, tp->rfbuf_ts)) {
	1124	u_int32_t rcv_bw;
	1125	rcv_bw = tp->rfbuf_cnt /
	1126	(int)(tcp_now - tp->rfbuf_ts);
	1127	if (tp->t_bwmeas->bw_rcvbw_max == 0) {
	1128	tp->t_bwmeas->bw_rcvbw_max = rcv_bw;
	1129	} else {
	1130	tp->t_bwmeas->bw_rcvbw_max = max(
	1131	tp->t_bwmeas->bw_rcvbw_max, rcv_bw);
	1132	}
	1133	}
	1134	goto out;
	1135	} else {
	1136	tp->rfbuf_cnt += pktlen;
	1137	return;
	1138	}
	1139	}
	1140	out:
	1141	/* Restart the measurement */
	1142	tp->rfbuf_ts = tcp_now;
	1143	tp->rfbuf_cnt = 0;
	1144	return;
	1145	}
	1146
	1147	/* This function will trim the excess space added to the socket buffer
	1148	* to help a slow-reading app. The ideal-size of a socket buffer depends
	1149	* on the link bandwidth or it is set by an application and we aim to
	1150	* reach that size.
	1151	*/
	1152	void
	1153	tcp_sbrcv_trim(struct tcpcb tp, struct sockbuf sbrcv)
	1154	{
	1155	if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
	1156	sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
	1157	int32_t trim;
	1158	/* compute the difference between ideal and current sizes */
	1159	u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
	1160
	1161	/* Compute the maximum advertised window for
	1162	* this connection.
	1163	*/
	1164	u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
	1165
	1166	/* How much can we trim the receive socket buffer?
	1167	* 1. it can not be trimmed beyond the max rcv win advertised
	1168	* 2. if possible, leave 1/16 of bandwidth*delay to
	1169	* avoid closing the win completely
	1170	*/
	1171	u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
	1172
	1173	/* Sometimes leave can be zero, in that case leave at least
	1174	* a few segments worth of space.
	1175	*/
	1176	if (leave == 0) {
	1177	leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
	1178	}
	1179
	1180	trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
	1181	trim = imin(trim, (int32_t)diff);
	1182
	1183	if (trim > 0) {
	1184	sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
	1185	}
	1186	}
	1187	}
	1188
	1189	/* We may need to trim the send socket buffer size for two reasons:
	1190	* 1. if the rtt seen on the connection is climbing up, we do not
	1191	* want to fill the buffers any more.
	1192	* 2. if the congestion win on the socket backed off, there is no need
	1193	* to hold more mbufs for that connection than what the cwnd will allow.
	1194	*/
	1195	void
	1196	tcp_sbsnd_trim(struct sockbuf *sbsnd)
	1197	{
	1198	if (((sbsnd->sb_flags & (SB_AUTOSIZE \| SB_TRIM)) ==
	1199	(SB_AUTOSIZE \| SB_TRIM)) &&
	1200	(sbsnd->sb_idealsize > 0) &&
	1201	(sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
	1202	u_int32_t trim = 0;
	1203	if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
	1204	trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
	1205	} else {
	1206	trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
	1207	}
	1208	sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
	1209	}
	1210	if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) {
	1211	sbsnd->sb_flags &= ~(SB_TRIM);
	1212	}
	1213	}
	1214
	1215	/*
	1216	* If timestamp option was not negotiated on this connection
	1217	* and this connection is on the receiving side of a stream
	1218	* then we can not measure the delay on the link accurately.
	1219	* Instead of enabling automatic receive socket buffer
	1220	* resizing, just give more space to the receive socket buffer.
	1221	*/
	1222	static inline void
	1223	tcp_sbrcv_tstmp_check(struct tcpcb *tp)
	1224	{
	1225	struct socket *so = tp->t_inpcb->inp_socket;
	1226	u_int32_t newsize = 2 * tcp_recvspace;
	1227	struct sockbuf *sbrcv = &so->so_rcv;
	1228
	1229	if ((tp->t_flags & (TF_REQ_TSTMP \| TF_RCVD_TSTMP)) !=
	1230	(TF_REQ_TSTMP \| TF_RCVD_TSTMP) &&
	1231	(sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
	1232	tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize);
	1233	}
	1234	}
	1235
	1236	/* A receiver will evaluate the flow of packets on a connection
	1237	* to see if it can reduce ack traffic. The receiver will start
	1238	* stretching acks if all of the following conditions are met:
	1239	* 1. tcp_delack_enabled is set to 3
	1240	* 2. If the bytes received in the last 100ms is greater than a threshold
	1241	* defined by maxseg_unacked
	1242	* 3. If the connection has not been idle for tcp_maxrcvidle period.
	1243	* 4. If the connection has seen enough packets to let the slow-start
	1244	* finish after connection establishment or after some packet loss.
	1245	*
	1246	* The receiver will stop stretching acks if there is congestion/reordering
	1247	* as indicated by packets on reassembly queue or an ECN. If the delayed-ack
	1248	* timer fires while stretching acks, it means that the packet flow has gone
	1249	* below the threshold defined by maxseg_unacked and the receiver will stop
	1250	* stretching acks. The receiver gets no indication when slow-start is completed
	1251	* or when the connection reaches an idle state. That is why we use
	1252	* tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
	1253	* state.
	1254	*/
	1255	static inline int
	1256	tcp_stretch_ack_enable(struct tcpcb *tp, int thflags)
	1257	{
	1258	if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
	1259	TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) {
	1260	tp->t_flags \|= TF_STREAMING_ON;
	1261	} else {
	1262	tp->t_flags &= ~TF_STREAMING_ON;
	1263	}
	1264
	1265	/* If there has been an idle time, reset streaming detection */
	1266	if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) {
	1267	tp->t_flags &= ~TF_STREAMING_ON;
	1268	}
	1269
	1270	/*
	1271	* If there are flags other than TH_ACK set, reset streaming
	1272	* detection
	1273	*/
	1274	if (thflags & ~TH_ACK) {
	1275	tp->t_flags &= ~TF_STREAMING_ON;
	1276	}
	1277
	1278	if (tp->t_flagsext & TF_DISABLE_STRETCHACK) {
	1279	if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) {
	1280	tp->t_flagsext &= ~TF_DISABLE_STRETCHACK;
	1281	tp->rcv_nostrack_pkts = 0;
	1282	tp->rcv_nostrack_ts = 0;
	1283	} else {
	1284	tp->rcv_nostrack_pkts++;
	1285	}
	1286	}
	1287
	1288	if (!(tp->t_flagsext & (TF_NOSTRETCHACK \| TF_DISABLE_STRETCHACK)) &&
	1289	(tp->t_flags & TF_STREAMING_ON) &&
	1290	(!(tp->t_flagsext & TF_RCVUNACK_WAITSS) \|\|
	1291	(tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
	1292	return 1;
	1293	}
	1294
	1295	return 0;
	1296	}
	1297
	1298	/*
	1299	* Reset the state related to stretch-ack algorithm. This will make
	1300	* the receiver generate an ack every other packet. The receiver
	1301	* will start re-evaluating the rate at which packets come to decide
	1302	* if it can benefit by lowering the ack traffic.
	1303	*/
	1304	void
	1305	tcp_reset_stretch_ack(struct tcpcb *tp)
	1306	{
	1307	tp->t_flags &= ~(TF_STRETCHACK \| TF_STREAMING_ON);
	1308	tp->rcv_by_unackwin = 0;
	1309	tp->rcv_by_unackhalfwin = 0;
	1310	tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
	1311
	1312	/*
	1313	* When there is packet loss or packet re-ordering or CWR due to
	1314	* ECN, the sender's congestion window is reduced. In these states,
	1315	* generate an ack for every other packet for some time to allow
	1316	* the sender's congestion window to grow.
	1317	*/
	1318	tp->t_flagsext \|= TF_RCVUNACK_WAITSS;
	1319	tp->rcv_waitforss = 0;
	1320	}
	1321
	1322	/*
	1323	* The last packet was a retransmission, check if this ack
	1324	* indicates that the retransmission was spurious.
	1325	*
	1326	* If the connection supports timestamps, we could use it to
	1327	* detect if the last retransmit was not needed. Otherwise,
	1328	* we check if the ACK arrived within RTT/2 window, then it
	1329	* was a mistake to do the retransmit in the first place.
	1330	*
	1331	* This function will return 1 if it is a spurious retransmit,
	1332	* 0 otherwise.
	1333	*/
	1334	int
	1335	tcp_detect_bad_rexmt(struct tcpcb tp, struct tcphdr th,
	1336	struct tcpopt *to, u_int32_t rxtime)
	1337	{
	1338	int32_t tdiff, bad_rexmt_win;
	1339	bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
	1340
	1341	/* If the ack has ECN CE bit, then cwnd has to be adjusted */
	1342	if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE)) {
	1343	return 0;
	1344	}
	1345	if (TSTMP_SUPPORTED(tp)) {
	1346	if (rxtime > 0 && (to->to_flags & TOF_TS) && to->to_tsecr != 0 &&
	1347	TSTMP_LT(to->to_tsecr, rxtime)) {
	1348	return 1;
	1349	}
	1350	} else {
	1351	if ((tp->t_rxtshift == 1 \|\| (tp->t_flagsext & TF_SENT_TLPROBE)) &&
	1352	rxtime > 0) {
	1353	tdiff = (int32_t)(tcp_now - rxtime);
	1354	if (tdiff < bad_rexmt_win) {
	1355	return 1;
	1356	}
	1357	}
	1358	}
	1359	return 0;
	1360	}
	1361
	1362
	1363	/*
	1364	* Restore congestion window state if a spurious timeout
	1365	* was detected.
	1366	*/
	1367	static void
	1368	tcp_bad_rexmt_restore_state(struct tcpcb tp, struct tcphdr th)
	1369	{
	1370	if (TSTMP_SUPPORTED(tp)) {
	1371	u_int32_t fsize, acked;
	1372	fsize = tp->snd_max - th->th_ack;
	1373	acked = BYTES_ACKED(th, tp);
	1374
	1375	/*
	1376	* Implement bad retransmit recovery as
	1377	* described in RFC 4015.
	1378	*/
	1379	tp->snd_ssthresh = tp->snd_ssthresh_prev;
	1380
	1381	/* Initialize cwnd to the initial window */
	1382	if (CC_ALGO(tp)->cwnd_init != NULL) {
	1383	CC_ALGO(tp)->cwnd_init(tp);
	1384	}
	1385
	1386	tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
	1387	} else {
	1388	tp->snd_cwnd = tp->snd_cwnd_prev;
	1389	tp->snd_ssthresh = tp->snd_ssthresh_prev;
	1390	if (tp->t_flags & TF_WASFRECOVERY) {
	1391	ENTER_FASTRECOVERY(tp);
	1392	}
	1393
	1394	/* Do not use the loss flight size in this case */
	1395	tp->t_lossflightsize = 0;
	1396	}
	1397	tp->snd_cwnd = max(tp->snd_cwnd, tcp_initial_cwnd(tp));
	1398	tp->snd_recover = tp->snd_recover_prev;
	1399	tp->snd_nxt = tp->snd_max;
	1400
	1401	/* Fix send socket buffer to reflect the change in cwnd */
	1402	tcp_bad_rexmt_fix_sndbuf(tp);
	1403
	1404	/*
	1405	* This RTT might reflect the extra delay induced
	1406	* by the network. Skip using this sample for RTO
	1407	* calculation and mark the connection so we can
	1408	* recompute RTT when the next eligible sample is
	1409	* found.
	1410	*/
	1411	tp->t_flagsext \|= TF_RECOMPUTE_RTT;
	1412	tp->t_badrexmt_time = tcp_now;
	1413	tp->t_rtttime = 0;
	1414	}
	1415
	1416	/*
	1417	* If the previous packet was sent in retransmission timer, and it was
	1418	* not needed, then restore the congestion window to the state before that
	1419	* transmission.
	1420	*
	1421	* If the last packet was sent in tail loss probe timeout, check if that
	1422	* recovered the last packet. If so, that will indicate a real loss and
	1423	* the congestion window needs to be lowered.
	1424	*/
	1425	static void
	1426	tcp_bad_rexmt_check(struct tcpcb tp, struct tcphdr th, struct tcpopt *to)
	1427	{
	1428	if (tp->t_rxtshift > 0 &&
	1429	tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
	1430	++tcpstat.tcps_sndrexmitbad;
	1431	tcp_bad_rexmt_restore_state(tp, th);
	1432	tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
	1433	} else if ((tp->t_flagsext & TF_SENT_TLPROBE) && tp->t_tlphighrxt > 0 &&
	1434	SEQ_GEQ(th->th_ack, tp->t_tlphighrxt) &&
	1435	!tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
	1436	/*
	1437	* check DSACK information also to make sure that
	1438	* the TLP was indeed needed
	1439	*/
	1440	if (tcp_rxtseg_dsack_for_tlp(tp)) {
	1441	/*
	1442	* received a DSACK to indicate that TLP was
	1443	* not needed
	1444	*/
	1445	tcp_rxtseg_clean(tp);
	1446	goto out;
	1447	}
	1448
	1449	/*
	1450	* The tail loss probe recovered the last packet and
	1451	* we need to adjust the congestion window to take
	1452	* this loss into account.
	1453	*/
	1454	++tcpstat.tcps_tlp_recoverlastpkt;
	1455	if (!IN_FASTRECOVERY(tp)) {
	1456	tcp_reduce_congestion_window(tp);
	1457	EXIT_FASTRECOVERY(tp);
	1458	}
	1459	tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
	1460	} else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
	1461	/*
	1462	* All of the retransmitted segments were duplicated, this
	1463	* can be an indication of bad fast retransmit.
	1464	*/
	1465	tcpstat.tcps_dsack_badrexmt++;
	1466	tcp_bad_rexmt_restore_state(tp, th);
	1467	tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
	1468	tcp_rxtseg_clean(tp);
	1469	}
	1470	out:
	1471	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
	1472	tp->t_tlphighrxt = 0;
	1473	tp->t_tlpstart = 0;
	1474
	1475	/*
	1476	* check if the latest ack was for a segment sent during PMTU
	1477	* blackhole detection. If the timestamp on the ack is before
	1478	* PMTU blackhole detection, then revert the size of the max
	1479	* segment to previous size.
	1480	*/
	1481	if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
	1482	tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
	1483	if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
	1484	&& TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
	1485	tcp_pmtud_revert_segment_size(tp);
	1486	}
	1487	}
	1488	if (tp->t_pmtud_start_ts > 0) {
	1489	tp->t_pmtud_start_ts = 0;
	1490	}
	1491
	1492	tp->t_pmtud_lastseg_size = 0;
	1493	}
	1494
	1495	/*
	1496	* Check if early retransmit can be attempted according to RFC 5827.
	1497	*
	1498	* If packet reordering is detected on a connection, fast recovery will
	1499	* be delayed until it is clear that the packet was lost and not reordered.
	1500	* But reordering detection is done only when SACK is enabled.
	1501	*
	1502	* On connections that do not support SACK, there is a limit on the number
	1503	* of early retransmits that can be done per minute. This limit is needed
	1504	* to make sure that too many packets are not retransmitted when there is
	1505	* packet reordering.
	1506	*/
	1507	static void
	1508	tcp_early_rexmt_check(struct tcpcb tp, struct tcphdr th)
	1509	{
	1510	u_int32_t obytes, snd_off;
	1511	int32_t snd_len;
	1512	struct socket *so = tp->t_inpcb->inp_socket;
	1513
	1514	if ((SACK_ENABLED(tp) \|\| tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
	1515	SEQ_GT(tp->snd_max, tp->snd_una) &&
	1516	(tp->t_dupacks == 1 \|\| (SACK_ENABLED(tp) && !TAILQ_EMPTY(&tp->snd_holes)))) {
	1517	/*
	1518	* If there are only a few outstanding
	1519	* segments on the connection, we might need
	1520	* to lower the retransmit threshold. This
	1521	* will allow us to do Early Retransmit as
	1522	* described in RFC 5827.
	1523	*/
	1524	if (SACK_ENABLED(tp) &&
	1525	!TAILQ_EMPTY(&tp->snd_holes)) {
	1526	obytes = (tp->snd_max - tp->snd_fack) +
	1527	tp->sackhint.sack_bytes_rexmit;
	1528	} else {
	1529	obytes = (tp->snd_max - tp->snd_una);
	1530	}
	1531
	1532	/*
	1533	* In order to lower retransmit threshold the
	1534	* following two conditions must be met.
	1535	* 1. the amount of outstanding data is less
	1536	* than 4*SMSS bytes
	1537	* 2. there is no unsent data ready for
	1538	* transmission or the advertised window
	1539	* will limit sending new segments.
	1540	*/
	1541	snd_off = tp->snd_max - tp->snd_una;
	1542	snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
	1543	if (obytes < (tp->t_maxseg << 2) &&
	1544	snd_len <= 0) {
	1545	u_int32_t osegs;
	1546
	1547	osegs = obytes / tp->t_maxseg;
	1548	if ((osegs * tp->t_maxseg) < obytes) {
	1549	osegs++;
	1550	}
	1551
	1552	/*
	1553	* Since the connection might have already
	1554	* received some dupacks, we add them to
	1555	* to the outstanding segments count to get
	1556	* the correct retransmit threshold.
	1557	*
	1558	* By checking for early retransmit after
	1559	* receiving some duplicate acks when SACK
	1560	* is supported, the connection will
	1561	* enter fast recovery even if multiple
	1562	* segments are lost in the same window.
	1563	*/
	1564	osegs += tp->t_dupacks;
	1565	if (osegs < 4) {
	1566	tp->t_rexmtthresh =
	1567	((osegs - 1) > 1) ? (osegs - 1) : 1;
	1568	tp->t_rexmtthresh =
	1569	min(tp->t_rexmtthresh, tcprexmtthresh);
	1570	tp->t_rexmtthresh =
	1571	max(tp->t_rexmtthresh, tp->t_dupacks);
	1572
	1573	if (tp->t_early_rexmt_count == 0) {
	1574	tp->t_early_rexmt_win = tcp_now;
	1575	}
	1576
	1577	if (tp->t_flagsext & TF_SENT_TLPROBE) {
	1578	tcpstat.tcps_tlp_recovery++;
	1579	tcp_ccdbg_trace(tp, th,
	1580	TCP_CC_TLP_RECOVERY);
	1581	} else {
	1582	tcpstat.tcps_early_rexmt++;
	1583	tp->t_early_rexmt_count++;
	1584	tcp_ccdbg_trace(tp, th,
	1585	TCP_CC_EARLY_RETRANSMIT);
	1586	}
	1587	}
	1588	}
	1589	}
	1590
	1591	/*
	1592	* If we ever sent a TLP probe, the acknowledgement will trigger
	1593	* early retransmit because the value of snd_fack will be close
	1594	* to snd_max. This will take care of adjustments to the
	1595	* congestion window. So we can reset TF_SENT_PROBE flag.
	1596	*/
	1597	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
	1598	tp->t_tlphighrxt = 0;
	1599	tp->t_tlpstart = 0;
	1600	}
	1601
	1602	static boolean_t
	1603	tcp_tfo_syn(struct tcpcb tp, struct tcpopt to)
	1604	{
	1605	u_char out[CCAES_BLOCK_SIZE];
	1606	unsigned char len;
	1607
	1608	if (!(to->to_flags & (TOF_TFO \| TOF_TFOREQ)) \|\|
	1609	!(tcp_fastopen & TCP_FASTOPEN_SERVER)) {
	1610	return FALSE;
	1611	}
	1612
	1613	if ((to->to_flags & TOF_TFOREQ)) {
	1614	tp->t_tfo_flags \|= TFO_F_OFFER_COOKIE;
	1615
	1616	tp->t_tfo_stats \|= TFO_S_COOKIEREQ_RECV;
	1617	tcpstat.tcps_tfo_cookie_req_rcv++;
	1618	return FALSE;
	1619	}
	1620
	1621	/* Ok, then it must be an offered cookie. We need to check that ... */
	1622	tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
	1623
	1624	len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
	1625	to->to_tfo++;
	1626	if (memcmp(out, to->to_tfo, len)) {
	1627	/* Cookies are different! Let's return and offer a new cookie */
	1628	tp->t_tfo_flags \|= TFO_F_OFFER_COOKIE;
	1629
	1630	tp->t_tfo_stats \|= TFO_S_COOKIE_INVALID;
	1631	tcpstat.tcps_tfo_cookie_invalid++;
	1632	return FALSE;
	1633	}
	1634
	1635	if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
	1636	/* Need to decrement again as we just increased it... */
	1637	OSDecrementAtomic(&tcp_tfo_halfcnt);
	1638	return FALSE;
	1639	}
	1640
	1641	tp->t_tfo_flags \|= TFO_F_COOKIE_VALID;
	1642
	1643	tp->t_tfo_stats \|= TFO_S_SYNDATA_RCV;
	1644	tcpstat.tcps_tfo_syn_data_rcv++;
	1645
	1646	return TRUE;
	1647	}
	1648
	1649	static void
	1650	tcp_tfo_synack(struct tcpcb tp, struct tcpopt to)
	1651	{
	1652	if (to->to_flags & TOF_TFO) {
	1653	unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
	1654
	1655	/*
	1656	* If this happens, things have gone terribly wrong. len should
	1657	* have been checked in tcp_dooptions.
	1658	*/
	1659	VERIFY(len <= TFO_COOKIE_LEN_MAX);
	1660
	1661	to->to_tfo++;
	1662
	1663	tcp_cache_set_cookie(tp, to->to_tfo, len);
	1664	tcp_heuristic_tfo_success(tp);
	1665
	1666	tp->t_tfo_stats \|= TFO_S_COOKIE_RCV;
	1667	tcpstat.tcps_tfo_cookie_rcv++;
	1668	if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) {
	1669	tcpstat.tcps_tfo_cookie_wrong++;
	1670	tp->t_tfo_stats \|= TFO_S_COOKIE_WRONG;
	1671	}
	1672	} else {
	1673	/*
	1674	* Thus, no cookie in the response, but we either asked for one
	1675	* or sent SYN+DATA. Now, we need to check whether we had to
	1676	* rexmit the SYN. If that's the case, it's better to start
	1677	* backing of TFO-cookie requests.
	1678	*/
	1679	if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
	1680	tp->t_tfo_flags & TFO_F_SYN_LOSS) {
	1681	tp->t_tfo_stats \|= TFO_S_SYN_LOSS;
	1682	tcpstat.tcps_tfo_syn_loss++;
	1683
	1684	tcp_heuristic_tfo_loss(tp);
	1685	} else {
	1686	if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) {
	1687	tp->t_tfo_stats \|= TFO_S_NO_COOKIE_RCV;
	1688	tcpstat.tcps_tfo_no_cookie_rcv++;
	1689	}
	1690
	1691	tcp_heuristic_tfo_success(tp);
	1692	}
	1693	}
	1694	}
	1695
	1696	static void
	1697	tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
	1698	{
	1699	if (tlen != 0) {
	1700	return;
	1701	}
	1702
	1703	tp->t_tfo_probe_state = TFO_PROBE_PROBING;
	1704
	1705	/*
	1706	* We send the probe out rather quickly (after one RTO). It does not
	1707	* really hurt that much, it's only one additional segment on the wire.
	1708	*/
	1709	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
	1710	}
	1711
	1712	static void
	1713	tcp_tfo_rcv_data(struct tcpcb *tp)
	1714	{
	1715	/* Transition from PROBING to NONE as data has been received */
	1716	if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
	1717	tp->t_tfo_probe_state = TFO_PROBE_NONE;
	1718	}
	1719	}
	1720
	1721	static void
	1722	tcp_tfo_rcv_ack(struct tcpcb tp, struct tcphdr th)
	1723	{
	1724	if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
	1725	tp->t_tfo_probes > 0) {
	1726	if (th->th_seq == tp->rcv_nxt) {
	1727	/* No hole, so stop probing */
	1728	tp->t_tfo_probe_state = TFO_PROBE_NONE;
	1729	} else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
	1730	/* There is a hole! Wait a bit for data... */
	1731	tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
	1732	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	1733	TCP_REXMTVAL(tp));
	1734	}
	1735	}
	1736	}
	1737
	1738	/*
	1739	* Update snd_wnd information.
	1740	*/
	1741	static inline bool
	1742	tcp_update_window(struct tcpcb tp, int thflags, struct tcphdr th,
	1743	u_int32_t tiwin, int tlen)
	1744	{
	1745	/* Don't look at the window if there is no ACK flag */
	1746	if ((thflags & TH_ACK) &&
	1747	(SEQ_LT(tp->snd_wl1, th->th_seq) \|\|
	1748	(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) \|\|
	1749	(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
	1750	/* keep track of pure window updates */
	1751	if (tlen == 0 &&
	1752	tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
	1753	tcpstat.tcps_rcvwinupd++;
	1754	}
	1755	tp->snd_wnd = tiwin;
	1756	tp->snd_wl1 = th->th_seq;
	1757	tp->snd_wl2 = th->th_ack;
	1758	if (tp->snd_wnd > tp->max_sndwnd) {
	1759	tp->max_sndwnd = tp->snd_wnd;
	1760	}
	1761
	1762	if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) {
	1763	mptcp_update_window_wakeup(tp);
	1764	}
	1765	return true;
	1766	}
	1767	return false;
	1768	}
	1769
	1770	static void
	1771	tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup)
	1772	{
	1773	if (read_wakeup != 0) {
	1774	sorwakeup(so);
	1775	}
	1776	if (write_wakeup != 0) {
	1777	sowwakeup(so);
	1778	}
	1779	}
	1780
	1781	static void
	1782	tcp_update_snd_una(struct tcpcb *tp, uint32_t ack)
	1783	{
	1784	tp->snd_una = ack;
	1785	if (SACK_ENABLED(tp) && SEQ_LT(tp->send_highest_sack, tp->snd_una)) {
	1786	tp->send_highest_sack = tp->snd_una;
	1787
	1788	/* If we move our marker, we need to start fresh */
	1789	tp->t_new_dupacks = 0;
	1790	}
	1791	}
	1792
	1793	static bool
	1794	tcp_syn_data_valid(struct tcpcb tp, struct tcphdr tcp_hdr, int tlen)
	1795	{
	1796	/* No data? */
	1797	if (tlen <= 0) {
	1798	return false;
	1799	}
	1800
	1801	/* Not the right sequence-number? */
	1802	if (tcp_hdr->th_seq != tp->irs) {
	1803	return false;
	1804	}
	1805
	1806	/* We could have wrapped around, check that */
	1807	if (tp->t_inpcb->inp_stat->rxbytes > INT32_MAX) {
	1808	return false;
	1809	}
	1810
	1811	return true;
	1812	}
	1813
	1814	void
	1815	tcp_input(struct mbuf *m, int off0)
	1816	{
	1817	int exiting_fr = 0;
	1818	struct tcphdr *th;
	1819	struct ip *ip = NULL;
	1820	struct inpcb *inp;
	1821	u_char *optp = NULL;
	1822	int optlen = 0;
	1823	int tlen, off;
	1824	int drop_hdrlen;
	1825	struct tcpcb *tp = 0;
	1826	int thflags;
	1827	struct socket *so = 0;
	1828	int todrop, acked, ourfinisacked, needoutput = 0;
	1829	int read_wakeup = 0;
	1830	int write_wakeup = 0;
	1831	struct in_addr laddr;
	1832	struct in6_addr laddr6;
	1833	int dropsocket = 0;
	1834	int iss = 0, nosock = 0;
	1835	u_int32_t tiwin, sack_bytes_acked = 0, sack_bytes_newly_acked = 0;
	1836	struct tcpopt to; /* options in this segment */
	1837	#if TCPDEBUG
	1838	short ostate = 0;
	1839	#endif
	1840	u_char ip_ecn = IPTOS_ECN_NOTECT;
	1841	unsigned int ifscope;
	1842	uint8_t isconnected, isdisconnected;
	1843	struct ifnet *ifp = m->m_pkthdr.rcvif;
	1844	int segment_count = m->m_pkthdr.seg_cnt ? : 1;
	1845	int win;
	1846	u_int16_t pf_tag = 0;
	1847	#if MPTCP
	1848	struct mptcb *mp_tp = NULL;
	1849	#endif /* MPTCP */
	1850	boolean_t cell = IFNET_IS_CELLULAR(ifp);
	1851	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
	1852	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
	1853	boolean_t recvd_dsack = FALSE;
	1854	struct tcp_respond_args tra;
	1855	int prev_t_state;
	1856	boolean_t check_cfil = cfil_filter_present();
	1857	bool findpcb_iterated = false;
	1858	/*
	1859	* The mbuf may be freed after it has been added to the receive socket
	1860	* buffer or the reassembly queue, so we reinitialize th to point to a
	1861	* safe copy of the TCP header
	1862	*/
	1863	struct tcphdr saved_tcphdr = {};
	1864	/*
	1865	* Save copy of the IPv4/IPv6 header.
	1866	* Note: use array of uint32_t to silence compiler warning when casting
	1867	* to a struct ip6_hdr pointer.
	1868	*/
	1869	#define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t))
	1870	uint32_t saved_hdr[MAX_IPWORDS];
	1871
	1872	#define TCP_INC_VAR(stat, npkts) do { \
	1873	stat += npkts; \
	1874	} while (0)
	1875
	1876	if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
	1877	segment_count = 1;
	1878	}
	1879	TCP_INC_VAR(tcpstat.tcps_rcvtotal, segment_count);
	1880
	1881	struct ip6_hdr *ip6 = NULL;
	1882	int isipv6;
	1883	struct proc *kernel_proc = current_proc();
	1884
	1885	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_START, 0, 0, 0, 0, 0);
	1886
	1887	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
	1888	bzero((char *)&to, sizeof(to));
	1889
	1890	if (m->m_flags & M_PKTHDR) {
	1891	pf_tag = m_pftag(m)->pftag_tag;
	1892	}
	1893
	1894	if (isipv6) {
	1895	/*
	1896	* Expect 32-bit aligned data pointer on
	1897	* strict-align platforms
	1898	*/
	1899	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
	1900
	1901	/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
	1902	ip6 = mtod(m, struct ip6_hdr *);
	1903	tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
	1904	th = (struct tcphdr )(void )((caddr_t)ip6 + off0);
	1905
	1906	if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) {
	1907	TCP_LOG_DROP_PKT(ip6, th, ifp, "IPv6 bad tcp checksum");
	1908	goto dropnosock;
	1909	}
	1910
	1911	KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) \| th->th_sport),
	1912	(((ip6->ip6_src.s6_addr16[0]) << 16) \| (ip6->ip6_dst.s6_addr16[0])),
	1913	th->th_seq, th->th_ack, th->th_win);
	1914	/*
	1915	* Be proactive about unspecified IPv6 address in source.
	1916	* As we use all-zero to indicate unbounded/unconnected pcb,
	1917	* unspecified IPv6 address can be used to confuse us.
	1918	*
	1919	* Note that packets with unspecified IPv6 destination is
	1920	* already dropped in ip6_input.
	1921	*/
	1922	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
	1923	/* XXX stat */
	1924	IF_TCP_STATINC(ifp, unspecv6);
	1925	TCP_LOG_DROP_PKT(ip6, th, ifp, "src IPv6 address unspecified");
	1926	goto dropnosock;
	1927	}
	1928	DTRACE_TCP5(receive, struct mbuf , m, struct inpcb , NULL,
	1929	struct ip6_hdr , ip6, struct tcpcb , NULL,
	1930	struct tcphdr *, th);
	1931
	1932	ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
	1933	} else {
	1934	/*
	1935	* Get IP and TCP header together in first mbuf.
	1936	* Note: IP leaves IP header in first mbuf.
	1937	*/
	1938	if (off0 > sizeof(struct ip)) {
	1939	ip_stripoptions(m);
	1940	off0 = sizeof(struct ip);
	1941	}
	1942	if (m->m_len < sizeof(struct tcpiphdr)) {
	1943	if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == 0) {
	1944	tcpstat.tcps_rcvshort++;
	1945	return;
	1946	}
	1947	}
	1948
	1949	/* Expect 32-bit aligned data pointer on strict-align platforms */
	1950	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
	1951
	1952	ip = mtod(m, struct ip *);
	1953	th = (struct tcphdr )(void )((caddr_t)ip + off0);
	1954	tlen = ip->ip_len;
	1955
	1956	if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) {
	1957	TCP_LOG_DROP_PKT(ip, th, ifp, "IPv4 bad tcp checksum");
	1958	goto dropnosock;
	1959	}
	1960
	1961	/* Re-initialization for later version check */
	1962	ip->ip_v = IPVERSION;
	1963	ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
	1964
	1965	DTRACE_TCP5(receive, struct mbuf , m, struct inpcb , NULL,
	1966	struct ip , ip, struct tcpcb , NULL, struct tcphdr *, th);
	1967
	1968	KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) \| th->th_sport),
	1969	(((ip->ip_src.s_addr & 0xffff) << 16) \| (ip->ip_dst.s_addr & 0xffff)),
	1970	th->th_seq, th->th_ack, th->th_win);
	1971	}
	1972
	1973	#define TCP_LOG_HDR (isipv6 ? (void )ip6 : (void )ip)
	1974
	1975	/*
	1976	* Check that TCP offset makes sense,
	1977	* pull out TCP options and adjust length.
	1978	*/
	1979	off = th->th_off << 2;
	1980	if (off < sizeof(struct tcphdr) \|\| off > tlen) {
	1981	tcpstat.tcps_rcvbadoff++;
	1982	IF_TCP_STATINC(ifp, badformat);
	1983	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "bad tcp offset");
	1984	goto dropnosock;
	1985	}
	1986	tlen -= off; /* tlen is used instead of ti->ti_len */
	1987	if (off > sizeof(struct tcphdr)) {
	1988	if (isipv6) {
	1989	IP6_EXTHDR_CHECK(m, off0, off, return );
	1990	ip6 = mtod(m, struct ip6_hdr *);
	1991	th = (struct tcphdr )(void )((caddr_t)ip6 + off0);
	1992	} else {
	1993	if (m->m_len < sizeof(struct ip) + off) {
	1994	if ((m = m_pullup(m, sizeof(struct ip) + off)) == 0) {
	1995	tcpstat.tcps_rcvshort++;
	1996	return;
	1997	}
	1998	ip = mtod(m, struct ip *);
	1999	th = (struct tcphdr )(void )((caddr_t)ip + off0);
	2000	}
	2001	}
	2002	optlen = off - sizeof(struct tcphdr);
	2003	optp = (u_char *)(th + 1);
	2004	/*
	2005	* Do quick retrieval of timestamp options ("options
	2006	* prediction?"). If timestamp is the only option and it's
	2007	* formatted as recommended in RFC 1323 appendix A, we
	2008	* quickly get the values now and not bother calling
	2009	* tcp_dooptions(), etc.
	2010	*/
	2011	if ((optlen == TCPOLEN_TSTAMP_APPA \|\|
	2012	(optlen > TCPOLEN_TSTAMP_APPA &&
	2013	optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
	2014	(u_int32_t )(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
	2015	(th->th_flags & TH_SYN) == 0) {
	2016	to.to_flags \|= TOF_TS;
	2017	to.to_tsval = ntohl((u_int32_t )(void *)(optp + 4));
	2018	to.to_tsecr = ntohl((u_int32_t )(void *)(optp + 8));
	2019	optp = NULL; /* we've parsed the options */
	2020	}
	2021	}
	2022	thflags = th->th_flags;
	2023
	2024	/*
	2025	* Drop all packets with both the SYN and FIN bits set.
	2026	* This prevents e.g. nmap from identifying the TCP/IP stack.
	2027	*
	2028	* This is a violation of the TCP specification.
	2029	*/
	2030	if ((thflags & (TH_SYN \| TH_FIN)) == (TH_SYN \| TH_FIN)) {
	2031	IF_TCP_STATINC(ifp, synfin);
	2032	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "drop SYN FIN");
	2033	goto dropnosock;
	2034	}
	2035
	2036	/*
	2037	* Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
	2038	* until after ip6_savecontrol() is called and before other functions
	2039	* which don't want those proto headers.
	2040	* Because ip6_savecontrol() is going to parse the mbuf to
	2041	* search for data to be passed up to user-land, it wants mbuf
	2042	* parameters to be unchanged.
	2043	*/
	2044	drop_hdrlen = off0 + off;
	2045
	2046	/* Since this is an entry point for input processing of tcp packets, we
	2047	* can update the tcp clock here.
	2048	*/
	2049	calculate_tcp_clock();
	2050
	2051	/*
	2052	* Record the interface where this segment arrived on; this does not
	2053	* affect normal data output (for non-detached TCP) as it provides a
	2054	* hint about which route and interface to use for sending in the
	2055	* absence of a PCB, when scoped routing (and thus source interface
	2056	* selection) are enabled.
	2057	*/
	2058	if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) \|\| m->m_pkthdr.rcvif == NULL) {
	2059	ifscope = IFSCOPE_NONE;
	2060	} else {
	2061	ifscope = m->m_pkthdr.rcvif->if_index;
	2062	}
	2063
	2064	/*
	2065	* Convert TCP protocol specific fields to host format.
	2066	*/
	2067
	2068	#if BYTE_ORDER != BIG_ENDIAN
	2069	NTOHL(th->th_seq);
	2070	NTOHL(th->th_ack);
	2071	NTOHS(th->th_win);
	2072	NTOHS(th->th_urp);
	2073	#endif
	2074
	2075	/*
	2076	* Locate pcb for segment.
	2077	*/
	2078	findpcb:
	2079
	2080	isconnected = FALSE;
	2081	isdisconnected = FALSE;
	2082
	2083	if (isipv6) {
	2084	inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
	2085	&ip6->ip6_dst, th->th_dport, 1,
	2086	m->m_pkthdr.rcvif);
	2087	} else {
	2088	inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
	2089	ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
	2090	}
	2091
	2092	/*
	2093	* Use the interface scope information from the PCB for outbound
	2094	* segments. If the PCB isn't present and if scoped routing is
	2095	* enabled, tcp_respond will use the scope of the interface where
	2096	* the segment arrived on.
	2097	*/
	2098	if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) {
	2099	ifscope = inp->inp_boundifp->if_index;
	2100	}
	2101
	2102	/*
	2103	* If the state is CLOSED (i.e., TCB does not exist) then
	2104	* all data in the incoming segment is discarded.
	2105	* If the TCB exists but is in CLOSED state, it is embryonic,
	2106	* but should either do a listen or a connect soon.
	2107	*/
	2108	if (inp == NULL) {
	2109	if (log_in_vain) {
	2110	char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
	2111
	2112	if (isipv6) {
	2113	inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
	2114	inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
	2115	} else {
	2116	inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
	2117	inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
	2118	}
	2119	switch (log_in_vain) {
	2120	case 1:
	2121	if (thflags & TH_SYN) {
	2122	log(LOG_INFO,
	2123	"Connection attempt to TCP %s:%d from %s:%d\n",
	2124	dbuf, ntohs(th->th_dport),
	2125	sbuf,
	2126	ntohs(th->th_sport));
	2127	}
	2128	break;
	2129	case 2:
	2130	log(LOG_INFO,
	2131	"Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
	2132	dbuf, ntohs(th->th_dport), sbuf,
	2133	ntohs(th->th_sport), thflags);
	2134	break;
	2135	case 3:
	2136	case 4:
	2137	if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
	2138	!(m->m_flags & (M_BCAST \| M_MCAST)) &&
	2139	((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) \|\|
	2140	(!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))) {
	2141	log_in_vain_log((LOG_INFO,
	2142	"Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
	2143	dbuf, ntohs(th->th_dport),
	2144	sbuf,
	2145	ntohs(th->th_sport)));
	2146	}
	2147	break;
	2148	default:
	2149	break;
	2150	}
	2151	}
	2152	if (blackhole) {
	2153	if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) {
	2154	switch (blackhole) {
	2155	case 1:
	2156	if (thflags & TH_SYN) {
	2157	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port");
	2158	goto dropnosock;
	2159	}
	2160	break;
	2161	case 2:
	2162	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port");
	2163	goto dropnosock;
	2164	default:
	2165	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port");
	2166	goto dropnosock;
	2167	}
	2168	}
	2169	}
	2170	IF_TCP_STATINC(ifp, noconnnolist);
	2171	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port");
	2172	goto dropwithresetnosock;
	2173	}
	2174	so = inp->inp_socket;
	2175	if (so == NULL) {
	2176	/* This case shouldn't happen, as the socket shouldn't be NULL
	2177	* if inp_state isn't set to INPCB_STATE_DEAD.
	2178	* But just in case, we pretend we didn't find the socket if we hit this case,
	2179	* as this isn't cause for a panic (the socket might be leaked, however)...
	2180	*/
	2181	inp = NULL;
	2182	#if TEMPDEBUG
	2183	printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
	2184	#endif
	2185	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL");
	2186	goto dropnosock;
	2187	}
	2188
	2189	socket_lock(so, 1);
	2190	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
	2191	socket_unlock(so, 1);
	2192	inp = NULL; // pretend we didn't find it
	2193	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING");
	2194	goto dropnosock;
	2195	}
	2196
	2197	if (!isipv6 && inp->inp_faddr.s_addr != INADDR_ANY) {
	2198	if (inp->inp_faddr.s_addr != ip->ip_src.s_addr \|\|
	2199	inp->inp_laddr.s_addr != ip->ip_dst.s_addr \|\|
	2200	inp->inp_fport != th->th_sport \|\|
	2201	inp->inp_lport != th->th_dport) {
	2202	os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
	2203	__func__,
	2204	ntohs(inp->inp_fport), ntohs(th->th_sport),
	2205	ntohs(inp->inp_lport), ntohs(th->th_dport));
	2206	if (findpcb_iterated) {
	2207	goto drop;
	2208	}
	2209	findpcb_iterated = true;
	2210	socket_unlock(so, 1);
	2211	inp = NULL;
	2212	goto findpcb;
	2213	}
	2214	} else if (isipv6 && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
	2215	if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) \|\|
	2216	!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst) \|\|
	2217	inp->inp_fport != th->th_sport \|\|
	2218	inp->inp_lport != th->th_dport) {
	2219	os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
	2220	__func__,
	2221	ntohs(inp->inp_fport), ntohs(th->th_sport),
	2222	ntohs(inp->inp_lport), ntohs(th->th_dport));
	2223	if (findpcb_iterated) {
	2224	goto drop;
	2225	}
	2226	findpcb_iterated = true;
	2227	socket_unlock(so, 1);
	2228	inp = NULL;
	2229	goto findpcb;
	2230	}
	2231	}
	2232
	2233	tp = intotcpcb(inp);
	2234	if (tp == NULL) {
	2235	IF_TCP_STATINC(ifp, noconnlist);
	2236	TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL");
	2237	goto dropwithreset;
	2238	}
	2239
	2240	TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp);
	2241
	2242	if (tp->t_state == TCPS_CLOSED) {
	2243	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED");
	2244	goto drop;
	2245	}
	2246
	2247	#if NECP
	2248	if (so->so_state & SS_ISCONNECTED) {
	2249	// Connected TCP sockets have a fully-bound local and remote,
	2250	// so the policy check doesn't need to override addresses
	2251	if (!necp_socket_is_allowed_to_send_recv(inp, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
	2252	TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
	2253	IF_TCP_STATINC(ifp, badformat);
	2254	goto drop;
	2255	}
	2256	} else {
	2257	/*
	2258	* If the proc_uuid_policy table has been updated since the last use
	2259	* of the listening socket (i.e., the proc_uuid_policy_table_gencount
	2260	* has been updated), the flags in the socket may be out of date.
	2261	* If INP2_WANT_APP_POLICY is stale, inbound packets may
	2262	* be dropped by NECP if the socket should now match a per-app
	2263	* exception policy.
	2264	* In order to avoid this refresh the proc_uuid_policy state to
	2265	* potentially recalculate the socket's flags before checking
	2266	* with NECP.
	2267	*/
	2268	(void) inp_update_policy(inp);
	2269
	2270	if (isipv6) {
	2271	if (!necp_socket_is_allowed_to_send_recv_v6(inp,
	2272	th->th_dport, th->th_sport, &ip6->ip6_dst,
	2273	&ip6->ip6_src, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
	2274	TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
	2275	IF_TCP_STATINC(ifp, badformat);
	2276	goto drop;
	2277	}
	2278	} else {
	2279	if (!necp_socket_is_allowed_to_send_recv_v4(inp,
	2280	th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
	2281	ifp, pf_tag, NULL, NULL, NULL, NULL)) {
	2282	TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
	2283	IF_TCP_STATINC(ifp, badformat);
	2284	goto drop;
	2285	}
	2286	}
	2287	}
	2288	#endif /* NECP */
	2289
	2290	prev_t_state = tp->t_state;
	2291
	2292	/* If none of the FIN\|SYN\|RST\|ACK flags is set, drop */
	2293	if ((thflags & TH_ACCEPT) == 0) {
	2294	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0");
	2295	goto drop;
	2296	}
	2297
	2298	/* Unscale the window into a 32-bit value. */
	2299	if ((thflags & TH_SYN) == 0) {
	2300	tiwin = th->th_win << tp->snd_scale;
	2301	} else {
	2302	tiwin = th->th_win;
	2303	}
	2304
	2305	/* Avoid processing packets while closing a listen socket */
	2306	if (tp->t_state == TCPS_LISTEN &&
	2307	(so->so_options & SO_ACCEPTCONN) == 0) {
	2308	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket");
	2309	goto drop;
	2310	}
	2311
	2312	if (so->so_options & (SO_DEBUG \| SO_ACCEPTCONN)) {
	2313	#if TCPDEBUG
	2314	if (so->so_options & SO_DEBUG) {
	2315	ostate = tp->t_state;
	2316	if (isipv6) {
	2317	bcopy((char )ip6, (char )tcp_saveipgen,
	2318	sizeof(*ip6));
	2319	} else {
	2320	bcopy((char )ip, (char )tcp_saveipgen, sizeof(*ip));
	2321	}
	2322	tcp_savetcp = *th;
	2323	}
	2324	#endif
	2325	if (so->so_options & SO_ACCEPTCONN) {
	2326	struct tcpcb *tp0 = tp;
	2327	struct socket *so2;
	2328	struct socket *oso;
	2329	struct sockaddr_storage from;
	2330	struct sockaddr_storage to2;
	2331	struct inpcb *oinp = sotoinpcb(so);
	2332	struct ifnet *head_ifscope;
	2333	unsigned int head_nocell, head_recvanyif,
	2334	head_noexpensive, head_awdl_unrestricted,
	2335	head_intcoproc_allowed, head_external_port,
	2336	head_noconstrained;
	2337
	2338	/* Get listener's bound-to-interface, if any */
	2339	head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
	2340	inp->inp_boundifp : NULL;
	2341	/* Get listener's no-cellular information, if any */
	2342	head_nocell = INP_NO_CELLULAR(inp);
	2343	/* Get listener's recv-any-interface, if any */
	2344	head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
	2345	/* Get listener's no-expensive information, if any */
	2346	head_noexpensive = INP_NO_EXPENSIVE(inp);
	2347	head_noconstrained = INP_NO_CONSTRAINED(inp);
	2348	head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
	2349	head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
	2350	head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT);
	2351
	2352	/*
	2353	* If the state is LISTEN then ignore segment if it contains an RST.
	2354	* If the segment contains an ACK then it is bad and send a RST.
	2355	* If it does not contain a SYN then it is not interesting; drop it.
	2356	* If it is from this socket, drop it, it must be forged.
	2357	*/
	2358	if ((thflags & (TH_RST \| TH_ACK \| TH_SYN)) != TH_SYN) {
	2359	IF_TCP_STATINC(ifp, listbadsyn);
	2360
	2361	if (thflags & TH_RST) {
	2362	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with RST");
	2363	goto drop;
	2364	}
	2365	if (thflags & TH_ACK) {
	2366	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with ACK");
	2367	tp = NULL;
	2368	tcpstat.tcps_badsyn++;
	2369	goto dropwithreset;
	2370	}
	2371
	2372	/* We come here if there is no SYN set */
	2373	tcpstat.tcps_badsyn++;
	2374	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN");
	2375	goto drop;
	2376	}
	2377	KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN \| DBG_FUNC_START, 0, 0, 0, 0, 0);
	2378	if (th->th_dport == th->th_sport) {
	2379	if (isipv6) {
	2380	if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
	2381	&ip6->ip6_src)) {
	2382	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port");
	2383	goto drop;
	2384	}
	2385	} else if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
	2386	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address");
	2387	goto drop;
	2388	}
	2389	}
	2390	/*
	2391	* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	2392	* in_broadcast() should never return true on a received
	2393	* packet with M_BCAST not set.
	2394	*
	2395	* Packets with a multicast source address should also
	2396	* be discarded.
	2397	*/
	2398	if (m->m_flags & (M_BCAST \| M_MCAST)) {
	2399	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST \| M_MCAST");
	2400	goto drop;
	2401	}
	2402	if (isipv6) {
	2403	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) \|\|
	2404	IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
	2405	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST");
	2406	goto drop;
	2407	}
	2408	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) \|\|
	2409	IN_MULTICAST(ntohl(ip->ip_src.s_addr)) \|\|
	2410	ip->ip_src.s_addr == htonl(INADDR_BROADCAST) \|\|
	2411	in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
	2412	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address");
	2413	goto drop;
	2414	}
	2415
	2416
	2417	/*
	2418	* If deprecated address is forbidden,
	2419	* we do not accept SYN to deprecated interface
	2420	* address to prevent any new inbound connection from
	2421	* getting established.
	2422	* When we do not accept SYN, we send a TCP RST,
	2423	* with deprecated source address (instead of dropping
	2424	* it). We compromise it as it is much better for peer
	2425	* to send a RST, and RST will be the final packet
	2426	* for the exchange.
	2427	*
	2428	* If we do not forbid deprecated addresses, we accept
	2429	* the SYN packet. RFC 4862 forbids dropping SYN in
	2430	* this case.
	2431	*/
	2432	if (isipv6 && !ip6_use_deprecated) {
	2433	uint32_t ia6_flags;
	2434
	2435	if (ip6_getdstifaddr_info(m, NULL,
	2436	&ia6_flags) == 0) {
	2437	if (ia6_flags & IN6_IFF_DEPRECATED) {
	2438	tp = NULL;
	2439	IF_TCP_STATINC(ifp, deprecate6);
	2440	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address");
	2441	goto dropwithreset;
	2442	}
	2443	}
	2444	}
	2445	if (so->so_filt \|\| check_cfil) {
	2446	if (isipv6) {
	2447	struct sockaddr_in6 sin6 = (struct sockaddr_in6)&from;
	2448
	2449	sin6->sin6_len = sizeof(*sin6);
	2450	sin6->sin6_family = AF_INET6;
	2451	sin6->sin6_port = th->th_sport;
	2452	sin6->sin6_flowinfo = 0;
	2453	sin6->sin6_addr = ip6->ip6_src;
	2454	sin6->sin6_scope_id = 0;
	2455
	2456	sin6 = (struct sockaddr_in6*)&to2;
	2457
	2458	sin6->sin6_len = sizeof(struct sockaddr_in6);
	2459	sin6->sin6_family = AF_INET6;
	2460	sin6->sin6_port = th->th_dport;
	2461	sin6->sin6_flowinfo = 0;
	2462	sin6->sin6_addr = ip6->ip6_dst;
	2463	sin6->sin6_scope_id = 0;
	2464	} else {
	2465	struct sockaddr_in sin = (struct sockaddr_in)&from;
	2466
	2467	sin->sin_len = sizeof(*sin);
	2468	sin->sin_family = AF_INET;
	2469	sin->sin_port = th->th_sport;
	2470	sin->sin_addr = ip->ip_src;
	2471
	2472	sin = (struct sockaddr_in*)&to2;
	2473
	2474	sin->sin_len = sizeof(struct sockaddr_in);
	2475	sin->sin_family = AF_INET;
	2476	sin->sin_port = th->th_dport;
	2477	sin->sin_addr = ip->ip_dst;
	2478	}
	2479	}
	2480
	2481	if (so->so_filt) {
	2482	so2 = sonewconn(so, 0, (struct sockaddr*)&from);
	2483	} else {
	2484	so2 = sonewconn(so, 0, NULL);
	2485	}
	2486	if (so2 == 0) {
	2487	tcpstat.tcps_listendrop++;
	2488	if (tcp_dropdropablreq(so)) {
	2489	if (so->so_filt) {
	2490	so2 = sonewconn(so, 0, (struct sockaddr*)&from);
	2491	} else {
	2492	so2 = sonewconn(so, 0, NULL);
	2493	}
	2494	}
	2495	if (!so2) {
	2496	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop");
	2497	goto drop;
	2498	}
	2499	}
	2500
	2501	/* Point "inp" and "tp" in tandem to new socket */
	2502	inp = (struct inpcb *)so2->so_pcb;
	2503	tp = intotcpcb(inp);
	2504
	2505	oso = so;
	2506	socket_unlock(so, 0); /* Unlock but keep a reference on listener for now */
	2507
	2508	so = so2;
	2509	socket_lock(so, 1);
	2510	/*
	2511	* Mark socket as temporary until we're
	2512	* committed to keeping it. The code at
	2513	* ``drop'' and ``dropwithreset'' check the
	2514	* flag dropsocket to see if the temporary
	2515	* socket created here should be discarded.
	2516	* We mark the socket as discardable until
	2517	* we're committed to it below in TCPS_LISTEN.
	2518	* There are some error conditions in which we
	2519	* have to drop the temporary socket.
	2520	*/
	2521	dropsocket++;
	2522	/*
	2523	* Inherit INP_BOUND_IF from listener; testing if
	2524	* head_ifscope is non-NULL is sufficient, since it
	2525	* can only be set to a non-zero value earlier if
	2526	* the listener has such a flag set.
	2527	*/
	2528	if (head_ifscope != NULL) {
	2529	inp->inp_flags \|= INP_BOUND_IF;
	2530	inp->inp_boundifp = head_ifscope;
	2531	} else {
	2532	inp->inp_flags &= ~INP_BOUND_IF;
	2533	}
	2534	/*
	2535	* Inherit restrictions from listener.
	2536	*/
	2537	if (head_nocell) {
	2538	inp_set_nocellular(inp);
	2539	}
	2540	if (head_noexpensive) {
	2541	inp_set_noexpensive(inp);
	2542	}
	2543	if (head_noconstrained) {
	2544	inp_set_noconstrained(inp);
	2545	}
	2546	if (head_awdl_unrestricted) {
	2547	inp_set_awdl_unrestricted(inp);
	2548	}
	2549	if (head_intcoproc_allowed) {
	2550	inp_set_intcoproc_allowed(inp);
	2551	}
	2552	/*
	2553	* Inherit {IN,IN6}_RECV_ANYIF from listener.
	2554	*/
	2555	if (head_recvanyif) {
	2556	inp->inp_flags \|= INP_RECV_ANYIF;
	2557	} else {
	2558	inp->inp_flags &= ~INP_RECV_ANYIF;
	2559	}
	2560
	2561	if (head_external_port) {
	2562	inp->inp_flags2 \|= INP2_EXTERNAL_PORT;
	2563	}
	2564	if (isipv6) {
	2565	inp->in6p_laddr = ip6->ip6_dst;
	2566	} else {
	2567	inp->inp_vflag &= ~INP_IPV6;
	2568	inp->inp_vflag \|= INP_IPV4;
	2569	inp->inp_laddr = ip->ip_dst;
	2570	}
	2571	inp->inp_lport = th->th_dport;
	2572	if (in_pcbinshash(inp, 0) != 0) {
	2573	/*
	2574	* Undo the assignments above if we failed to
	2575	* put the PCB on the hash lists.
	2576	*/
	2577	if (isipv6) {
	2578	inp->in6p_laddr = in6addr_any;
	2579	} else {
	2580	inp->inp_laddr.s_addr = INADDR_ANY;
	2581	}
	2582	inp->inp_lport = 0;
	2583	socket_lock(oso, 0); /* release ref on parent */
	2584	socket_unlock(oso, 1);
	2585	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed");
	2586	goto drop;
	2587	}
	2588	socket_lock(oso, 0);
	2589	if (isipv6) {
	2590	/*
	2591	* Inherit socket options from the listening
	2592	* socket.
	2593	* Note that in6p_inputopts are not (even
	2594	* should not be) copied, since it stores
	2595	* previously received options and is used to
	2596	* detect if each new option is different than
	2597	* the previous one and hence should be passed
	2598	* to a user.
	2599	* If we copied in6p_inputopts, a user would
	2600	* not be able to receive options just after
	2601	* calling the accept system call.
	2602	*/
	2603	inp->inp_flags \|=
	2604	oinp->inp_flags & INP_CONTROLOPTS;
	2605	if (oinp->in6p_outputopts) {
	2606	inp->in6p_outputopts =
	2607	ip6_copypktopts(oinp->in6p_outputopts,
	2608	M_NOWAIT);
	2609	}
	2610	} else {
	2611	inp->inp_options = ip_srcroute();
	2612	inp->inp_ip_tos = oinp->inp_ip_tos;
	2613	}
	2614	#if IPSEC
	2615	/* copy old policy into new socket's */
	2616	if (sotoinpcb(oso)->inp_sp) {
	2617	int error = 0;
	2618	/* Is it a security hole here to silently fail to copy the policy? */
	2619	if (inp->inp_sp != NULL) {
	2620	error = ipsec_init_policy(so, &inp->inp_sp);
	2621	}
	2622	if (error != 0 \|\| ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) {
	2623	printf("tcp_input: could not copy policy\n");
	2624	}
	2625	}
	2626	#endif
	2627	/* inherit states from the listener */
	2628	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	2629	struct tcpcb *, tp, int32_t, TCPS_LISTEN);
	2630	tp->t_state = TCPS_LISTEN;
	2631	tp->t_flags \|= tp0->t_flags & (TF_NOPUSH \| TF_NOOPT \| TF_NODELAY);
	2632	tp->t_flagsext \|= (tp0->t_flagsext & (TF_RXTFINDROP \| TF_NOTIMEWAIT \| TF_FASTOPEN));
	2633	tp->t_keepinit = tp0->t_keepinit;
	2634	tp->t_keepcnt = tp0->t_keepcnt;
	2635	tp->t_keepintvl = tp0->t_keepintvl;
	2636	tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
	2637	tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
	2638	tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
	2639	if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
	2640	tp->t_notsent_lowat = tp0->t_notsent_lowat;
	2641	}
	2642	tp->t_inpcb->inp_flags2 \|=
	2643	tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD;
	2644
	2645	/* now drop the reference on the listener */
	2646	socket_unlock(oso, 1);
	2647
	2648	tcp_set_max_rwinscale(tp, so);
	2649
	2650	#if CONTENT_FILTER
	2651	if (check_cfil) {
	2652	int error = cfil_sock_attach(so2, (struct sockaddr)&to2, (struct sockaddr)&from,
	2653	CFS_CONNECTION_DIR_IN);
	2654	if (error != 0) {
	2655	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed");
	2656	goto drop;
	2657	}
	2658	}
	2659	#endif /* CONTENT_FILTER */
	2660
	2661	KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	2662	}
	2663	}
	2664	socket_lock_assert_owned(so);
	2665
	2666	if (net_mpklog_enabled && (m->m_pkthdr.rcvif->if_xflags & IFXF_MPK_LOG)) {
	2667	MPKL_TCP_INPUT(tcp_mpkl_log_object,
	2668	ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
	2669	th->th_seq, th->th_ack, tlen, thflags,
	2670	so->last_pid, so->so_log_seqn++);
	2671	}
	2672
	2673	if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
	2674	/*
	2675	* Evaluate the rate of arrival of packets to see if the
	2676	* receiver can reduce the ack traffic. The algorithm to
	2677	* stretch acks will be enabled if the connection meets
	2678	* certain criteria defined in tcp_stretch_ack_enable function.
	2679	*/
	2680	if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
	2681	TCP_INC_VAR(tp->rcv_waitforss, segment_count);
	2682	}
	2683	if (tcp_stretch_ack_enable(tp, thflags)) {
	2684	tp->t_flags \|= TF_STRETCHACK;
	2685	tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
	2686	tp->rcv_waitforss = 0;
	2687	} else {
	2688	tp->t_flags &= ~(TF_STRETCHACK);
	2689	}
	2690	if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) {
	2691	tp->rcv_by_unackhalfwin += (tlen + off);
	2692	tp->rcv_by_unackwin += (tlen + off);
	2693	} else {
	2694	tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
	2695	tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off;
	2696	tp->rcv_by_unackhalfwin = tlen + off;
	2697	}
	2698	}
	2699
	2700	/*
	2701	* Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
	2702	* bother doing extensive checks for state and whatnot.
	2703	*/
	2704	if (thflags & TH_CWR) {
	2705	tp->ecn_flags &= ~TE_SENDECE;
	2706	tp->t_ecn_recv_cwr++;
	2707	}
	2708
	2709	/*
	2710	* Explicit Congestion Notification - Flag that we need to send ECE if
	2711	* + The IP Congestion experienced flag was set.
	2712	* + Socket is in established state
	2713	* + We negotiated ECN in the TCP setup
	2714	* + This isn't a pure ack (tlen > 0)
	2715	* + The data is in the valid window
	2716	*
	2717	* TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
	2718	*/
	2719	if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
	2720	TCP_ECN_ENABLED(tp) && tlen > 0 &&
	2721	SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
	2722	SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
	2723	tp->t_ecn_recv_ce++;
	2724	tcpstat.tcps_ecn_recv_ce++;
	2725	INP_INC_IFNET_STAT(inp, ecn_recv_ce);
	2726	/* Mark this connection as it received CE from network */
	2727	tp->ecn_flags \|= TE_RECV_ECN_CE;
	2728	tp->ecn_flags \|= TE_SENDECE;
	2729	}
	2730
	2731	/*
	2732	* If we received an explicit notification of congestion in
	2733	* ip tos ecn bits or by the CWR bit in TCP header flags, reset
	2734	* the ack-stretching state. We need to handle ECN notification if
	2735	* an ECN setup SYN was sent even once.
	2736	*/
	2737	if (tp->t_state == TCPS_ESTABLISHED &&
	2738	(tp->ecn_flags & TE_SETUPSENT) &&
	2739	(ip_ecn == IPTOS_ECN_CE \|\| (thflags & TH_CWR))) {
	2740	tcp_reset_stretch_ack(tp);
	2741	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
	2742	CLEAR_IAJ_STATE(tp);
	2743	}
	2744
	2745	if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
	2746	!TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) {
	2747	tcpstat.tcps_ecn_fallback_ce++;
	2748	tcp_heuristic_ecn_aggressive(tp);
	2749	tp->ecn_flags \|= TE_CEHEURI_SET;
	2750	}
	2751
	2752	if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) &&
	2753	ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) {
	2754	if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) {
	2755	tp->t_ecn_recv_ce_pkt++;
	2756	} else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) {
	2757	tcpstat.tcps_ecn_fallback_ce++;
	2758	tcp_heuristic_ecn_aggressive(tp);
	2759	tp->ecn_flags \|= TE_CEHEURI_SET;
	2760	INP_INC_IFNET_STAT(inp, ecn_fallback_ce);
	2761	} else {
	2762	/* We tracked the first ECN_MIN_CE_PROBES segments, we
	2763	* now know that the path is good.
	2764	*/
	2765	tp->ecn_flags \|= TE_CEHEURI_SET;
	2766	}
	2767	}
	2768
	2769	/* Update rcvtime as a new segment was received on the connection */
	2770	tp->t_rcvtime = tcp_now;
	2771
	2772	/*
	2773	* Segment received on connection.
	2774	* Reset idle time and keep-alive timer.
	2775	*/
	2776	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
	2777	tcp_keepalive_reset(tp);
	2778
	2779	if (tp->t_mpsub) {
	2780	mptcp_reset_keepalive(tp);
	2781	}
	2782	}
	2783
	2784	/*
	2785	* Process options if not in LISTEN state,
	2786	* else do it below (after getting remote address).
	2787	*/
	2788	if (tp->t_state != TCPS_LISTEN && optp) {
	2789	tcp_dooptions(tp, optp, optlen, th, &to);
	2790	}
	2791	#if MPTCP
	2792	if (tp->t_state != TCPS_LISTEN && (so->so_flags & SOF_MP_SUBFLOW) &&
	2793	mptcp_input_preproc(tp, m, th, drop_hdrlen) != 0) {
	2794	tp->t_flags \|= TF_ACKNOW;
	2795	(void) tcp_output(tp);
	2796	tcp_check_timer_state(tp);
	2797	socket_unlock(so, 1);
	2798	return;
	2799	}
	2800	#endif /* MPTCP */
	2801	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
	2802	if (!(thflags & TH_ACK) \|\|
	2803	(SEQ_GT(th->th_ack, tp->iss) &&
	2804	SEQ_LEQ(th->th_ack, tp->snd_max))) {
	2805	tcp_finalize_options(tp, &to, ifscope);
	2806	}
	2807	}
	2808
	2809	#if TRAFFIC_MGT
	2810	/*
	2811	* Compute inter-packet arrival jitter. According to RFC 3550,
	2812	* inter-packet arrival jitter is defined as the difference in
	2813	* packet spacing at the receiver compared to the sender for a
	2814	* pair of packets. When two packets of maximum segment size come
	2815	* one after the other with consecutive sequence numbers, we
	2816	* consider them as packets sent together at the sender and use
	2817	* them as a pair to compute inter-packet arrival jitter. This
	2818	* metric indicates the delay induced by the network components due
	2819	* to queuing in edge/access routers.
	2820	*/
	2821	if (tp->t_state == TCPS_ESTABLISHED &&
	2822	(thflags & (TH_SYN \| TH_FIN \| TH_RST \| TH_URG \| TH_ACK \| TH_ECE \| TH_PUSH)) == TH_ACK &&
	2823	((tp->t_flags & TF_NEEDFIN) == 0) &&
	2824	((to.to_flags & TOF_TS) == 0 \|\|
	2825	TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
	2826	th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
	2827	int seg_size = tlen;
	2828	if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
	2829	TCP_INC_VAR(tp->iaj_pktcnt, segment_count);
	2830	}
	2831
	2832	if (tp->iaj_size == 0 \|\| seg_size > tp->iaj_size \|\|
	2833	(seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
	2834	/*
	2835	* State related to inter-arrival jitter is
	2836	* uninitialized or we are trying to find a good
	2837	* first packet to start computing the metric
	2838	*/
	2839	update_iaj_state(tp, seg_size, 0);
	2840	} else {
	2841	if (seg_size == tp->iaj_size) {
	2842	/*
	2843	* Compute inter-arrival jitter taking
	2844	* this packet as the second packet
	2845	*/
	2846	compute_iaj(tp);
	2847	}
	2848	if (seg_size < tp->iaj_size) {
	2849	/*
	2850	* There is a smaller packet in the stream.
	2851	* Sometimes the maximum size supported
	2852	* on a path can change if there is a new
	2853	* link with smaller MTU. The receiver will
	2854	* not know about this change. If there
	2855	* are too many packets smaller than
	2856	* iaj_size, we try to learn the iaj_size
	2857	* again.
	2858	*/
	2859	TCP_INC_VAR(tp->iaj_small_pkt, segment_count);
	2860	if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
	2861	update_iaj_state(tp, seg_size, 1);
	2862	} else {
	2863	CLEAR_IAJ_STATE(tp);
	2864	}
	2865	} else {
	2866	update_iaj_state(tp, seg_size, 0);
	2867	}
	2868	}
	2869	} else {
	2870	CLEAR_IAJ_STATE(tp);
	2871	}
	2872	#endif /* TRAFFIC_MGT */
	2873
	2874	/*
	2875	* Header prediction: check for the two common cases
	2876	* of a uni-directional data xfer. If the packet has
	2877	* no control flags, is in-sequence, the window didn't
	2878	* change and we're not retransmitting, it's a
	2879	* candidate. If the length is zero and the ack moved
	2880	* forward, we're the sender side of the xfer. Just
	2881	* free the data acked & wake any higher level process
	2882	* that was blocked waiting for space. If the length
	2883	* is non-zero and the ack didn't move, we're the
	2884	* receiver side. If we're getting packets in-order
	2885	* (the reassembly queue is empty), add the data to
	2886	* the socket buffer and note that we need a delayed ack.
	2887	* Make sure that the hidden state-flags are also off.
	2888	* Since we check for TCPS_ESTABLISHED above, it can only
	2889	* be TH_NEEDSYN.
	2890	*/
	2891	if (tp->t_state == TCPS_ESTABLISHED &&
	2892	(thflags & (TH_SYN \| TH_FIN \| TH_RST \| TH_URG \| TH_ACK \| TH_ECE \| TH_CWR)) == TH_ACK &&
	2893	((tp->t_flags & TF_NEEDFIN) == 0) &&
	2894	((to.to_flags & TOF_TS) == 0 \|\|
	2895	TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
	2896	th->th_seq == tp->rcv_nxt &&
	2897	tiwin && tiwin == tp->snd_wnd &&
	2898	tp->snd_nxt == tp->snd_max) {
	2899	/*
	2900	* If last ACK falls within this segment's sequence numbers,
	2901	* record the timestamp.
	2902	* NOTE that the test is modified according to the latest
	2903	* proposal of the tcplw@cray.com list (Braden 1993/04/26).
	2904	*/
	2905	if ((to.to_flags & TOF_TS) != 0 &&
	2906	SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
	2907	tp->ts_recent_age = tcp_now;
	2908	tp->ts_recent = to.to_tsval;
	2909	}
	2910
	2911	if (tlen == 0) {
	2912	if (SEQ_GT(th->th_ack, tp->snd_una) &&
	2913	SEQ_LEQ(th->th_ack, tp->snd_max) &&
	2914	tp->snd_cwnd >= tp->snd_ssthresh &&
	2915	(!IN_FASTRECOVERY(tp) &&
	2916	((!(SACK_ENABLED(tp)) &&
	2917	tp->t_dupacks < tp->t_rexmtthresh) \|\|
	2918	(SACK_ENABLED(tp) && to.to_nsacks == 0 &&
	2919	TAILQ_EMPTY(&tp->snd_holes))))) {
	2920	/*
	2921	* this is a pure ack for outstanding data.
	2922	*/
	2923	++tcpstat.tcps_predack;
	2924
	2925	tcp_bad_rexmt_check(tp, th, &to);
	2926
	2927	/* Recalculate the RTT */
	2928	tcp_compute_rtt(tp, &to, th);
	2929
	2930	VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
	2931	acked = BYTES_ACKED(th, tp);
	2932	tcpstat.tcps_rcvackpack++;
	2933	tcpstat.tcps_rcvackbyte += acked;
	2934
	2935	/*
	2936	* Handle an ack that is in sequence during
	2937	* congestion avoidance phase. The
	2938	* calculations in this function
	2939	* assume that snd_una is not updated yet.
	2940	*/
	2941	if (CC_ALGO(tp)->congestion_avd != NULL) {
	2942	CC_ALGO(tp)->congestion_avd(tp, th);
	2943	}
	2944	tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
	2945	sbdrop(&so->so_snd, acked);
	2946	tcp_sbsnd_trim(&so->so_snd);
	2947
	2948	if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
	2949	SEQ_LEQ(th->th_ack, tp->snd_recover)) {
	2950	tp->snd_recover = th->th_ack - 1;
	2951	}
	2952
	2953	tcp_update_snd_una(tp, th->th_ack);
	2954
	2955	TCP_RESET_REXMT_STATE(tp);
	2956
	2957	/*
	2958	* pull snd_wl2 up to prevent seq wrap relative
	2959	* to th_ack.
	2960	*/
	2961	tp->snd_wl2 = th->th_ack;
	2962
	2963	if (tp->t_dupacks > 0) {
	2964	tp->t_dupacks = 0;
	2965	tp->t_rexmtthresh = tcprexmtthresh;
	2966	tp->t_new_dupacks = 0;
	2967	}
	2968
	2969	tp->sackhint.sack_bytes_acked = 0;
	2970
	2971	/*
	2972	* If all outstanding data are acked, stop
	2973	* retransmit timer, otherwise restart timer
	2974	* using current (possibly backed-off) value.
	2975	* If process is waiting for space,
	2976	* wakeup/selwakeup/signal. If data
	2977	* are ready to send, let tcp_output
	2978	* decide between more output or persist.
	2979	*/
	2980	if (tp->snd_una == tp->snd_max) {
	2981	tp->t_timer[TCPT_REXMT] = 0;
	2982	tp->t_timer[TCPT_PTO] = 0;
	2983	} else if (tp->t_timer[TCPT_PERSIST] == 0) {
	2984	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
	2985	}
	2986	if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
	2987	!TCP_DSACK_SEQ_IN_WINDOW(tp,
	2988	tp->t_dsack_lastuna, tp->snd_una)) {
	2989	tcp_rxtseg_clean(tp);
	2990	}
	2991
	2992	if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
	2993	tp->t_bwmeas != NULL) {
	2994	tcp_bwmeas_check(tp);
	2995	}
	2996
	2997	write_wakeup = 1;
	2998	if (!SLIST_EMPTY(&tp->t_notify_ack)) {
	2999	tcp_notify_acknowledgement(tp, so);
	3000	}
	3001
	3002	if ((so->so_snd.sb_cc) \|\| (tp->t_flags & TF_ACKNOW)) {
	3003	(void) tcp_output(tp);
	3004	}
	3005
	3006	tcp_tfo_rcv_ack(tp, th);
	3007
	3008	m_freem(m);
	3009
	3010	tcp_check_timer_state(tp);
	3011
	3012	tcp_handle_wakeup(so, read_wakeup, write_wakeup);
	3013
	3014	socket_unlock(so, 1);
	3015	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	3016	return;
	3017	}
	3018	} else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) &&
	3019	tlen <= tcp_sbspace(tp)) {
	3020	/*
	3021	* this is a pure, in-sequence data packet
	3022	* with nothing on the reassembly queue and
	3023	* we have enough buffer space to take it.
	3024	*/
	3025
	3026	/* Clean receiver SACK report if present */
	3027	if (SACK_ENABLED(tp) && tp->rcv_numsacks) {
	3028	tcp_clean_sackreport(tp);
	3029	}
	3030	++tcpstat.tcps_preddat;
	3031	tp->rcv_nxt += tlen;
	3032	/*
	3033	* Pull snd_wl1 up to prevent seq wrap relative to
	3034	* th_seq.
	3035	*/
	3036	tp->snd_wl1 = th->th_seq;
	3037	/*
	3038	* Pull rcv_up up to prevent seq wrap relative to
	3039	* rcv_nxt.
	3040	*/
	3041	tp->rcv_up = tp->rcv_nxt;
	3042	TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count);
	3043	tcpstat.tcps_rcvbyte += tlen;
	3044	if (nstat_collect) {
	3045	INP_ADD_STAT(inp, cell, wifi, wired,
	3046	rxpackets, 1);
	3047	INP_ADD_STAT(inp, cell, wifi, wired, rxbytes,
	3048	tlen);
	3049	inp_set_activity_bitmap(inp);
	3050	}
	3051
	3052	/*
	3053	* Calculate the RTT on the receiver only if the
	3054	* connection is in streaming mode and the last
	3055	* packet was not an end-of-write
	3056	*/
	3057	if (tp->t_flags & TF_STREAMING_ON) {
	3058	tcp_compute_rtt(tp, &to, th);
	3059	}
	3060
	3061	tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
	3062
	3063	/*
	3064	* Add data to socket buffer.
	3065	*/
	3066	so_recv_data_stat(so, m, 0);
	3067	m_adj(m, drop_hdrlen); /* delayed header drop */
	3068
	3069	/*
	3070	* If message delivery (SOF_ENABLE_MSGS) is enabled on
	3071	* this socket, deliver the packet received as an
	3072	* in-order message with sequence number attached to it.
	3073	*/
	3074	if (isipv6) {
	3075	memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
	3076	ip6 = (struct ip6_hdr *)&saved_hdr[0];
	3077	} else {
	3078	memcpy(&saved_hdr, ip, ip->ip_hl << 2);
	3079	ip = (struct ip *)&saved_hdr[0];
	3080	}
	3081	memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
	3082
	3083	if (th->th_flags & TH_PUSH) {
	3084	tp->t_flagsext \|= TF_LAST_IS_PSH;
	3085	} else {
	3086	tp->t_flagsext &= ~TF_LAST_IS_PSH;
	3087	}
	3088
	3089	if (sbappendstream_rcvdemux(so, m)) {
	3090	mptcp_handle_input(so);
	3091	read_wakeup = 1;
	3092	}
	3093	th = &saved_tcphdr;
	3094
	3095	if (isipv6) {
	3096	KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) \| th->th_sport),
	3097	(((ip6->ip6_src.s6_addr16[0]) << 16) \| (ip6->ip6_dst.s6_addr16[0])),
	3098	th->th_seq, th->th_ack, th->th_win);
	3099	} else {
	3100	KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) \| th->th_sport),
	3101	(((ip->ip_src.s_addr & 0xffff) << 16) \| (ip->ip_dst.s_addr & 0xffff)),
	3102	th->th_seq, th->th_ack, th->th_win);
	3103	}
	3104	TCP_INC_VAR(tp->t_unacksegs, segment_count);
	3105	if (DELAY_ACK(tp, th)) {
	3106	if ((tp->t_flags & TF_DELACK) == 0) {
	3107	tp->t_flags \|= TF_DELACK;
	3108	tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
	3109	}
	3110	} else {
	3111	tp->t_flags \|= TF_ACKNOW;
	3112	tcp_output(tp);
	3113	}
	3114
	3115	tcp_adaptive_rwtimo_check(tp, tlen);
	3116
	3117	if (tlen > 0) {
	3118	tcp_tfo_rcv_data(tp);
	3119	}
	3120
	3121	tcp_check_timer_state(tp);
	3122
	3123	tcp_handle_wakeup(so, read_wakeup, write_wakeup);
	3124
	3125	socket_unlock(so, 1);
	3126	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	3127	return;
	3128	}
	3129	}
	3130
	3131	/*
	3132	* Calculate amount of space in receive window,
	3133	* and then do TCP input processing.
	3134	* Receive window is amount of space in rcv queue,
	3135	* but not less than advertised window.
	3136	*/
	3137	socket_lock_assert_owned(so);
	3138	win = tcp_sbspace(tp);
	3139	if (win < 0) {
	3140	win = 0;
	3141	} else { /* clip rcv window to 4K for modems */
	3142	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
	3143	win = min(win, slowlink_wsize);
	3144	}
	3145	}
	3146	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	3147	#if MPTCP
	3148	/*
	3149	* Ensure that the subflow receive window isn't greater
	3150	* than the connection level receive window.
	3151	*/
	3152	if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && (mp_tp = tptomptp(tp))) {
	3153	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
	3154	int64_t recwin_conn = (int64_t)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt);
	3155
	3156	VERIFY(recwin_conn < INT32_MAX && recwin_conn > INT32_MIN);
	3157	if (recwin_conn > 0 && tp->rcv_wnd > (uint32_t)recwin_conn) {
	3158	tp->rcv_wnd = (uint32_t)recwin_conn;
	3159	tcpstat.tcps_mp_reducedwin++;
	3160	}
	3161	}
	3162	#endif /* MPTCP */
	3163
	3164	switch (tp->t_state) {
	3165	/*
	3166	* Initialize tp->rcv_nxt, and tp->irs, select an initial
	3167	* tp->iss, and send a segment:
	3168	* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
	3169	* Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
	3170	* Fill in remote peer address fields if not previously specified.
	3171	* Enter SYN_RECEIVED state, and process any other fields of this
	3172	* segment in this state.
	3173	*/
	3174	case TCPS_LISTEN: {
	3175	struct sockaddr_in *sin;
	3176	struct sockaddr_in6 *sin6;
	3177
	3178	socket_lock_assert_owned(so);
	3179
	3180	/* Clear the logging flags inherited from the listening socket */
	3181	tp->t_log_flags = 0;
	3182	tp->t_flagsext &= ~TF_LOGGED_CONN_SUMMARY;
	3183
	3184	if (isipv6) {
	3185	MALLOC(sin6, struct sockaddr_in6 , sizeof sin6,
	3186	M_SONAME, M_NOWAIT);
	3187	if (sin6 == NULL) {
	3188	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed");
	3189	goto drop;
	3190	}
	3191	bzero(sin6, sizeof(*sin6));
	3192	sin6->sin6_family = AF_INET6;
	3193	sin6->sin6_len = sizeof(*sin6);
	3194	sin6->sin6_addr = ip6->ip6_src;
	3195	sin6->sin6_port = th->th_sport;
	3196	laddr6 = inp->in6p_laddr;
	3197	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
	3198	inp->in6p_laddr = ip6->ip6_dst;
	3199	}
	3200	if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
	3201	kernel_proc)) {
	3202	inp->in6p_laddr = laddr6;
	3203	FREE(sin6, M_SONAME);
	3204	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed");
	3205	goto drop;
	3206	}
	3207	FREE(sin6, M_SONAME);
	3208	} else {
	3209	socket_lock_assert_owned(so);
	3210	MALLOC(sin, struct sockaddr_in , sizeof sin, M_SONAME,
	3211	M_NOWAIT);
	3212	if (sin == NULL) {
	3213	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed");
	3214	goto drop;
	3215	}
	3216	sin->sin_family = AF_INET;
	3217	sin->sin_len = sizeof(*sin);
	3218	sin->sin_addr = ip->ip_src;
	3219	sin->sin_port = th->th_sport;
	3220	bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
	3221	laddr = inp->inp_laddr;
	3222	if (inp->inp_laddr.s_addr == INADDR_ANY) {
	3223	inp->inp_laddr = ip->ip_dst;
	3224	}
	3225	if (in_pcbconnect(inp, (struct sockaddr *)sin, kernel_proc,
	3226	IFSCOPE_NONE, NULL)) {
	3227	inp->inp_laddr = laddr;
	3228	FREE(sin, M_SONAME);
	3229	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed");
	3230	goto drop;
	3231	}
	3232	FREE(sin, M_SONAME);
	3233	}
	3234
	3235	tcp_dooptions(tp, optp, optlen, th, &to);
	3236	tcp_finalize_options(tp, &to, ifscope);
	3237
	3238	if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to)) {
	3239	isconnected = TRUE;
	3240	}
	3241
	3242	if (iss) {
	3243	tp->iss = iss;
	3244	} else {
	3245	tp->iss = tcp_new_isn(tp);
	3246	}
	3247	tp->irs = th->th_seq;
	3248	tcp_sendseqinit(tp);
	3249	tcp_rcvseqinit(tp);
	3250	tp->snd_recover = tp->snd_una;
	3251	/*
	3252	* Initialization of the tcpcb for transaction;
	3253	* set SND.WND = SEG.WND,
	3254	* initialize CCsend and CCrecv.
	3255	*/
	3256	tp->snd_wnd = tiwin; /* initial send-window */
	3257	tp->max_sndwnd = tp->snd_wnd;
	3258	tp->t_flags \|= TF_ACKNOW;
	3259	tp->t_unacksegs = 0;
	3260	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	3261	struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
	3262	tp->t_state = TCPS_SYN_RECEIVED;
	3263	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	3264	TCP_CONN_KEEPINIT(tp));
	3265	tp->t_connect_time = tcp_now;
	3266	dropsocket = 0; /* committed to socket */
	3267
	3268	if (inp->inp_flowhash == 0) {
	3269	inp->inp_flowhash = inp_calc_flowhash(inp);
	3270	}
	3271	/* update flowinfo - RFC 6437 */
	3272	if (inp->inp_flow == 0 &&
	3273	inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
	3274	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
	3275	inp->inp_flow \|=
	3276	(htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
	3277	}
	3278
	3279	/* reset the incomp processing flag */
	3280	so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
	3281	tcpstat.tcps_accepts++;
	3282	if ((thflags & (TH_ECE \| TH_CWR)) == (TH_ECE \| TH_CWR)) {
	3283	/* ECN-setup SYN */
	3284	tp->ecn_flags \|= (TE_SETUPRECEIVED \| TE_SENDIPECT);
	3285	}
	3286
	3287	/*
	3288	* The address and connection state are finalized
	3289	*/
	3290	TCP_LOG_CONNECT(tp, false, 0);
	3291
	3292	tcp_add_fsw_flow(tp, ifp);
	3293
	3294	goto trimthenstep6;
	3295	}
	3296
	3297	/*
	3298	* If the state is SYN_RECEIVED and the seg contains an ACK,
	3299	* but not for our SYN/ACK, send a RST.
	3300	*/
	3301	case TCPS_SYN_RECEIVED:
	3302	if ((thflags & TH_ACK) &&
	3303	(SEQ_LEQ(th->th_ack, tp->snd_una) \|\|
	3304	SEQ_GT(th->th_ack, tp->snd_max))) {
	3305	IF_TCP_STATINC(ifp, ooopacket);
	3306	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad ACK");
	3307	goto dropwithreset;
	3308	}
	3309
	3310	/*
	3311	* In SYN_RECEIVED state, if we recv some SYNS with
	3312	* window scale and others without, window scaling should
	3313	* be disabled. Otherwise the window advertised will be
	3314	* lower if we assume scaling and the other end does not.
	3315	*/
	3316	if ((thflags & TH_SYN) &&
	3317	(tp->irs == th->th_seq) &&
	3318	!(to.to_flags & TOF_SCALE)) {
	3319	tp->t_flags &= ~TF_RCVD_SCALE;
	3320	}
	3321	break;
	3322
	3323	/*
	3324	* If the state is SYN_SENT:
	3325	* if seg contains an ACK, but not for our SYN, drop the input.
	3326	* if seg contains a RST, then drop the connection.
	3327	* if seg does not contain SYN, then drop it.
	3328	* Otherwise this is an acceptable SYN segment
	3329	* initialize tp->rcv_nxt and tp->irs
	3330	* if seg contains ack then advance tp->snd_una
	3331	* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	3332	* arrange for segment to be acked (eventually)
	3333	* continue processing rest of data/controls, beginning with URG
	3334	*/
	3335	case TCPS_SYN_SENT:
	3336	if ((thflags & TH_ACK) &&
	3337	(SEQ_LEQ(th->th_ack, tp->iss) \|\|
	3338	SEQ_GT(th->th_ack, tp->snd_max))) {
	3339	IF_TCP_STATINC(ifp, ooopacket);
	3340	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT bad ACK");
	3341	goto dropwithreset;
	3342	}
	3343	if (thflags & TH_RST) {
	3344	if ((thflags & TH_ACK) != 0) {
	3345	if (tfo_enabled(tp) &&
	3346	!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) {
	3347	tcp_heuristic_tfo_rst(tp);
	3348	}
	3349	if ((tp->ecn_flags & (TE_SETUPSENT \| TE_RCVD_SYN_RST)) == TE_SETUPSENT) {
	3350	/*
	3351	* On local connections, send
	3352	* non-ECN syn one time before
	3353	* dropping the connection
	3354	*/
	3355	if (tp->t_flags & TF_LOCAL) {
	3356	tp->ecn_flags \|= TE_RCVD_SYN_RST;
	3357	goto drop;
	3358	} else {
	3359	tcp_heuristic_ecn_synrst(tp);
	3360	}
	3361	}
	3362	soevent(so,
	3363	(SO_FILT_HINT_LOCKED \|
	3364	SO_FILT_HINT_CONNRESET));
	3365	tp = tcp_drop(tp, ECONNREFUSED);
	3366	}
	3367	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT got RST");
	3368	goto drop;
	3369	}
	3370	if ((thflags & TH_SYN) == 0) {
	3371	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT no SYN");
	3372	goto drop;
	3373	}
	3374	tp->snd_wnd = th->th_win; /* initial send window */
	3375	tp->max_sndwnd = tp->snd_wnd;
	3376
	3377	tp->irs = th->th_seq;
	3378	tcp_rcvseqinit(tp);
	3379	if (thflags & TH_ACK) {
	3380	tcpstat.tcps_connects++;
	3381
	3382	if ((thflags & (TH_ECE \| TH_CWR)) == (TH_ECE)) {
	3383	/* ECN-setup SYN-ACK */
	3384	tp->ecn_flags \|= TE_SETUPRECEIVED;
	3385	if (TCP_ECN_ENABLED(tp)) {
	3386	tcp_heuristic_ecn_success(tp);
	3387	tcpstat.tcps_ecn_client_success++;
	3388	}
	3389	} else {
	3390	if (tp->ecn_flags & TE_SETUPSENT &&
	3391	tp->t_rxtshift == 0) {
	3392	tcp_heuristic_ecn_success(tp);
	3393	tcpstat.tcps_ecn_not_supported++;
	3394	}
	3395	if (tp->ecn_flags & TE_SETUPSENT &&
	3396	tp->t_rxtshift > 0) {
	3397	tcp_heuristic_ecn_loss(tp);
	3398	}
	3399
	3400	/* non-ECN-setup SYN-ACK */
	3401	tp->ecn_flags &= ~TE_SENDIPECT;
	3402	}
	3403
	3404	/* Do window scaling on this connection? */
	3405	if (TCP_WINDOW_SCALE_ENABLED(tp)) {
	3406	tp->snd_scale = tp->requested_s_scale;
	3407	tp->rcv_scale = tp->request_r_scale;
	3408	}
	3409
	3410	tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
	3411	tp->snd_una++; /* SYN is acked */
	3412	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
	3413	tp->snd_nxt = tp->snd_una;
	3414	}
	3415
	3416	/*
	3417	* We have sent more in the SYN than what is being
	3418	* acked. (e.g., TFO)
	3419	* We should restart the sending from what the receiver
	3420	* has acknowledged immediately.
	3421	*/
	3422	if (SEQ_GT(tp->snd_nxt, th->th_ack)) {
	3423	/*
	3424	* rdar://problem/33214601
	3425	* There is a middlebox that acks all but one
	3426	* byte and still drops the data.
	3427	*/
	3428	if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
	3429	(tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
	3430	tp->snd_max == th->th_ack + 1 &&
	3431	tp->snd_max > tp->snd_una + 1) {
	3432	tcp_heuristic_tfo_middlebox(tp);
	3433
	3434	so->so_error = ENODATA;
	3435	soevent(so,
	3436	(SO_FILT_HINT_LOCKED \| SO_FILT_HINT_MP_SUB_ERROR));
	3437
	3438	tp->t_tfo_stats \|= TFO_S_ONE_BYTE_PROXY;
	3439	}
	3440
	3441	tp->snd_max = tp->snd_nxt = th->th_ack;
	3442	}
	3443
	3444	/*
	3445	* If there's data, delay ACK; if there's also a FIN
	3446	* ACKNOW will be turned on later.
	3447	*/
	3448	TCP_INC_VAR(tp->t_unacksegs, segment_count);
	3449	if (DELAY_ACK(tp, th) && tlen != 0) {
	3450	if ((tp->t_flags & TF_DELACK) == 0) {
	3451	tp->t_flags \|= TF_DELACK;
	3452	tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
	3453	}
	3454	} else {
	3455	tp->t_flags \|= TF_ACKNOW;
	3456	}
	3457	/*
	3458	* Received <SYN,ACK> in SYN_SENT[*] state.
	3459	* Transitions:
	3460	* SYN_SENT --> ESTABLISHED
	3461	* SYN_SENT* --> FIN_WAIT_1
	3462	*/
	3463	tp->t_starttime = tcp_now;
	3464	tcp_sbrcv_tstmp_check(tp);
	3465	if (tp->t_flags & TF_NEEDFIN) {
	3466	DTRACE_TCP4(state__change, void, NULL,
	3467	struct inpcb *, inp,
	3468	struct tcpcb *, tp, int32_t,
	3469	TCPS_FIN_WAIT_1);
	3470	tp->t_state = TCPS_FIN_WAIT_1;
	3471	tp->t_flags &= ~TF_NEEDFIN;
	3472	thflags &= ~TH_SYN;
	3473
	3474	TCP_LOG_CONNECTION_SUMMARY(tp);
	3475	} else {
	3476	DTRACE_TCP4(state__change, void, NULL,
	3477	struct inpcb , inp, struct tcpcb ,
	3478	tp, int32_t, TCPS_ESTABLISHED);
	3479	tp->t_state = TCPS_ESTABLISHED;
	3480	tp->t_timer[TCPT_KEEP] =
	3481	OFFSET_FROM_START(tp,
	3482	TCP_CONN_KEEPIDLE(tp));
	3483	if (nstat_collect) {
	3484	nstat_route_connect_success(
	3485	inp->inp_route.ro_rt);
	3486	}
	3487	/*
	3488	* The SYN is acknowledged but una is not
	3489	* updated yet. So pass the value of
	3490	* ack to compute sndbytes correctly
	3491	*/
	3492	inp_count_sndbytes(inp, th->th_ack);
	3493	}
	3494	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
	3495	#if MPTCP
	3496	/*
	3497	* Do not send the connect notification for additional
	3498	* subflows until ACK for 3-way handshake arrives.
	3499	*/
	3500	if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
	3501	(tp->t_mpflags & TMPF_SENT_JOIN)) {
	3502	isconnected = FALSE;
	3503	} else
	3504	#endif /* MPTCP */
	3505	isconnected = TRUE;
	3506
	3507	if ((tp->t_tfo_flags & (TFO_F_COOKIE_REQ \| TFO_F_COOKIE_SENT)) \|\|
	3508	(tp->t_tfo_stats & TFO_S_SYN_DATA_SENT)) {
	3509	tcp_tfo_synack(tp, &to);
	3510
	3511	if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
	3512	SEQ_LT(tp->snd_una, th->th_ack)) {
	3513	tp->t_tfo_stats \|= TFO_S_SYN_DATA_ACKED;
	3514	tcpstat.tcps_tfo_syn_data_acked++;
	3515	#if MPTCP
	3516	if (so->so_flags & SOF_MP_SUBFLOW) {
	3517	so->so_flags1 \|= SOF1_TFO_REWIND;
	3518	}
	3519	#endif
	3520	tcp_tfo_rcv_probe(tp, tlen);
	3521	}
	3522	}
	3523	} else {
	3524	/*
	3525	* Received initial SYN in SYN-SENT[*] state => simul-
	3526	* taneous open.
	3527	* Do 3-way handshake:
	3528	* SYN-SENT -> SYN-RECEIVED
	3529	* SYN-SENT* -> SYN-RECEIVED*
	3530	*/
	3531	tp->t_flags \|= TF_ACKNOW;
	3532	tp->t_timer[TCPT_REXMT] = 0;
	3533	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	3534	struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
	3535	tp->t_state = TCPS_SYN_RECEIVED;
	3536
	3537	/*
	3538	* During simultaneous open, TFO should not be used.
	3539	* So we disable it here to prevent data from being
	3540	* sent on the SYN/ACK.
	3541	*/
	3542	tcp_disable_tfo(tp);
	3543	}
	3544
	3545	trimthenstep6:
	3546	/*
	3547	* Advance th->th_seq to correspond to first data byte.
	3548	* If data, trim to stay within window,
	3549	* dropping FIN if necessary.
	3550	*/
	3551	th->th_seq++;
	3552	if (tlen > tp->rcv_wnd) {
	3553	todrop = tlen - tp->rcv_wnd;
	3554	m_adj(m, -todrop);
	3555	tlen = tp->rcv_wnd;
	3556	thflags &= ~TH_FIN;
	3557	tcpstat.tcps_rcvpackafterwin++;
	3558	tcpstat.tcps_rcvbyteafterwin += todrop;
	3559	}
	3560	tp->snd_wl1 = th->th_seq - 1;
	3561	tp->rcv_up = th->th_seq;
	3562	/*
	3563	* Client side of transaction: already sent SYN and data.
	3564	* If the remote host used T/TCP to validate the SYN,
	3565	* our data will be ACK'd; if so, enter normal data segment
	3566	* processing in the middle of step 5, ack processing.
	3567	* Otherwise, goto step 6.
	3568	*/
	3569	if (thflags & TH_ACK) {
	3570	goto process_ACK;
	3571	}
	3572	goto step6;
	3573	/*
	3574	* If the state is LAST_ACK or CLOSING or TIME_WAIT:
	3575	* do normal processing.
	3576	*
	3577	* NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
	3578	*/
	3579	case TCPS_LAST_ACK:
	3580	case TCPS_CLOSING:
	3581	case TCPS_TIME_WAIT:
	3582	break; /* continue normal processing */
	3583
	3584	/* Received a SYN while connection is already established.
	3585	* This is a "half open connection and other anomalies" described
	3586	* in RFC793 page 34, send an ACK so the remote resets the connection
	3587	* or recovers by adjusting its sequence numbering. Sending an ACK is
	3588	* in accordance with RFC 5961 Section 4.2
	3589	*/
	3590	case TCPS_ESTABLISHED:
	3591	if (thflags & TH_SYN && tlen <= 0) {
	3592	/* Drop the packet silently if we have reached the limit */
	3593	if (tcp_is_ack_ratelimited(tp)) {
	3594	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
	3595	goto drop;
	3596	} else {
	3597	/* Send challenge ACK */
	3598	tcpstat.tcps_synchallenge++;
	3599	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
	3600	goto dropafterack;
	3601	}
	3602	}
	3603	break;
	3604	}
	3605
	3606	/*
	3607	* States other than LISTEN or SYN_SENT.
	3608	* First check the RST flag and sequence number since reset segments
	3609	* are exempt from the timestamp and connection count tests. This
	3610	* fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
	3611	* below which allowed reset segments in half the sequence space
	3612	* to fall through and be processed (which gives forged reset
	3613	* segments with a random sequence number a 50 percent chance of
	3614	* killing a connection).
	3615	* Then check timestamp, if present.
	3616	* Then check the connection count, if present.
	3617	* Then check that at least some bytes of segment are within
	3618	* receive window. If segment begins before rcv_nxt,
	3619	* drop leading data (and SYN); if nothing left, just ack.
	3620	*
	3621	*
	3622	* If the RST bit is set, check the sequence number to see
	3623	* if this is a valid reset segment.
	3624	* RFC 793 page 37:
	3625	* In all states except SYN-SENT, all reset (RST) segments
	3626	* are validated by checking their SEQ-fields. A reset is
	3627	* valid if its sequence number is in the window.
	3628	* Note: this does not take into account delayed ACKs, so
	3629	* we should test against last_ack_sent instead of rcv_nxt.
	3630	* The sequence number in the reset segment is normally an
	3631	* echo of our outgoing acknowledgement numbers, but some hosts
	3632	* send a reset with the sequence number at the rightmost edge
	3633	* of our receive window, and we have to handle this case.
	3634	* Note 2: Paul Watson's paper "Slipping in the Window" has shown
	3635	* that brute force RST attacks are possible. To combat this,
	3636	* we use a much stricter check while in the ESTABLISHED state,
	3637	* only accepting RSTs where the sequence number is equal to
	3638	* last_ack_sent. In all other states (the states in which a
	3639	* RST is more likely), the more permissive check is used.
	3640	* RFC 5961 Section 3.2: if the RST bit is set, sequence # is
	3641	* within the receive window and last_ack_sent == seq,
	3642	* then reset the connection. Otherwise if the seq doesn't
	3643	* match last_ack_sent, TCP must send challenge ACK. Perform
	3644	* rate limitation when sending the challenge ACK.
	3645	* If we have multiple segments in flight, the initial reset
	3646	* segment sequence numbers will be to the left of last_ack_sent,
	3647	* but they will eventually catch up.
	3648	* In any case, it never made sense to trim reset segments to
	3649	* fit the receive window since RFC 1122 says:
	3650	* 4.2.2.12 RST Segment: RFC-793 Section 3.4
	3651	*
	3652	* A TCP SHOULD allow a received RST segment to include data.
	3653	*
	3654	* DISCUSSION
	3655	* It has been suggested that a RST segment could contain
	3656	* ASCII text that encoded and explained the cause of the
	3657	* RST. No standard has yet been established for such
	3658	* data.
	3659	*
	3660	* If the reset segment passes the sequence number test examine
	3661	* the state:
	3662	* SYN_RECEIVED STATE:
	3663	* If passive open, return to LISTEN state.
	3664	* If active open, inform user that connection was refused.
	3665	* ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	3666	* Inform user that connection was reset, and close tcb.
	3667	* CLOSING, LAST_ACK STATES:
	3668	* Close the tcb.
	3669	* TIME_WAIT STATE:
	3670	* Drop the segment - see Stevens, vol. 2, p. 964 and
	3671	* RFC 1337.
	3672	*
	3673	* Radar 4803931: Allows for the case where we ACKed the FIN but
	3674	* there is already a RST in flight from the peer.
	3675	* In that case, accept the RST for non-established
	3676	* state if it's one off from last_ack_sent.
	3677	*
	3678	*/
	3679	if (thflags & TH_RST) {
	3680	if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
	3681	SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) \|\|
	3682	(tp->rcv_wnd == 0 &&
	3683	((tp->last_ack_sent == th->th_seq) \|\|
	3684	((tp->last_ack_sent - 1) == th->th_seq)))) {
	3685	if (tp->last_ack_sent == th->th_seq) {
	3686	switch (tp->t_state) {
	3687	case TCPS_SYN_RECEIVED:
	3688	IF_TCP_STATINC(ifp, rstinsynrcv);
	3689	so->so_error = ECONNREFUSED;
	3690	goto close;
	3691
	3692	case TCPS_ESTABLISHED:
	3693	if (TCP_ECN_ENABLED(tp) &&
	3694	tp->snd_una == tp->iss + 1 &&
	3695	SEQ_GT(tp->snd_max, tp->snd_una)) {
	3696	/*
	3697	* If the first data packet on an
	3698	* ECN connection receives a RST,
	3699	* increment the heuristic
	3700	*/
	3701	tcp_heuristic_ecn_droprst(tp);
	3702	}
	3703	OS_FALLTHROUGH;
	3704	case TCPS_FIN_WAIT_1:
	3705	case TCPS_CLOSE_WAIT:
	3706	case TCPS_FIN_WAIT_2:
	3707	so->so_error = ECONNRESET;
	3708	close:
	3709	soevent(so,
	3710	(SO_FILT_HINT_LOCKED \|
	3711	SO_FILT_HINT_CONNRESET));
	3712
	3713	tcpstat.tcps_drops++;
	3714	tp = tcp_close(tp);
	3715	break;
	3716
	3717	case TCPS_CLOSING:
	3718	case TCPS_LAST_ACK:
	3719	tp = tcp_close(tp);
	3720	break;
	3721
	3722	case TCPS_TIME_WAIT:
	3723	break;
	3724	}
	3725	} else {
	3726	tcpstat.tcps_badrst++;
	3727	/* Drop if we have reached the ACK limit */
	3728	if (tcp_is_ack_ratelimited(tp)) {
	3729	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
	3730	goto drop;
	3731	} else {
	3732	/* Send challenge ACK */
	3733	tcpstat.tcps_rstchallenge++;
	3734	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
	3735	goto dropafterack;
	3736	}
	3737	}
	3738	}
	3739	goto drop;
	3740	}
	3741
	3742	/*
	3743	* RFC 1323 PAWS: If we have a timestamp reply on this segment
	3744	* and it's less than ts_recent, drop it.
	3745	*/
	3746	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
	3747	TSTMP_LT(to.to_tsval, tp->ts_recent)) {
	3748	/* Check to see if ts_recent is over 24 days old. */
	3749	if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
	3750	/*
	3751	* Invalidate ts_recent. If this segment updates
	3752	* ts_recent, the age will be reset later and ts_recent
	3753	* will get a valid value. If it does not, setting
	3754	* ts_recent to zero will at least satisfy the
	3755	* requirement that zero be placed in the timestamp
	3756	* echo reply when ts_recent isn't valid. The
	3757	* age isn't reset until we get a valid ts_recent
	3758	* because we don't want out-of-order segments to be
	3759	* dropped when ts_recent is old.
	3760	*/
	3761	tp->ts_recent = 0;
	3762	} else {
	3763	tcpstat.tcps_rcvduppack++;
	3764	tcpstat.tcps_rcvdupbyte += tlen;
	3765	tp->t_pawsdrop++;
	3766	tcpstat.tcps_pawsdrop++;
	3767
	3768	/*
	3769	* PAWS-drop when ECN is being used? That indicates
	3770	* that ECT-marked packets take a different path, with
	3771	* different congestion-characteristics.
	3772	*
	3773	* Only fall back when we have received less than 2GB, as PAWS
	3774	* really has no reason to kick in earlier.
	3775	*/
	3776	if (TCP_ECN_ENABLED(tp) &&
	3777	inp->inp_stat->rxbytes < 2147483648) {
	3778	INP_INC_IFNET_STAT(inp, ecn_fallback_reorder);
	3779	tcpstat.tcps_ecn_fallback_reorder++;
	3780	tcp_heuristic_ecn_aggressive(tp);
	3781	}
	3782
	3783	if (nstat_collect) {
	3784	nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
	3785	1, tlen, NSTAT_RX_FLAG_DUPLICATE);
	3786	INP_ADD_STAT(inp, cell, wifi, wired,
	3787	rxpackets, 1);
	3788	INP_ADD_STAT(inp, cell, wifi, wired,
	3789	rxbytes, tlen);
	3790	tp->t_stat.rxduplicatebytes += tlen;
	3791	inp_set_activity_bitmap(inp);
	3792	}
	3793	if (tlen > 0) {
	3794	goto dropafterack;
	3795	}
	3796	goto drop;
	3797	}
	3798	}
	3799
	3800	/*
	3801	* In the SYN-RECEIVED state, validate that the packet belongs to
	3802	* this connection before trimming the data to fit the receive
	3803	* window. Check the sequence number versus IRS since we know
	3804	* the sequence numbers haven't wrapped. This is a partial fix
	3805	* for the "LAND" DoS attack.
	3806	*/
	3807	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
	3808	IF_TCP_STATINC(ifp, dospacket);
	3809	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad SEQ");
	3810	goto dropwithreset;
	3811	}
	3812
	3813	/*
	3814	* Check if there is old data at the beginning of the window
	3815	* i.e. the sequence number is before rcv_nxt
	3816	*/
	3817	todrop = tp->rcv_nxt - th->th_seq;
	3818	if (todrop > 0) {
	3819	boolean_t is_syn_set = FALSE;
	3820
	3821	if (thflags & TH_SYN) {
	3822	is_syn_set = TRUE;
	3823	thflags &= ~TH_SYN;
	3824	th->th_seq++;
	3825	if (th->th_urp > 1) {
	3826	th->th_urp--;
	3827	} else {
	3828	thflags &= ~TH_URG;
	3829	}
	3830	todrop--;
	3831	}
	3832	/*
	3833	* Following if statement from Stevens, vol. 2, p. 960.
	3834	* The amount of duplicate data is greater than or equal
	3835	* to the size of the segment - entire segment is duplicate
	3836	*/
	3837	if (todrop > tlen
	3838	\|\| (todrop == tlen && (thflags & TH_FIN) == 0)) {
	3839	/*
	3840	* Any valid FIN must be to the left of the window.
	3841	* At this point the FIN must be a duplicate or out
	3842	* of sequence; drop it.
	3843	*/
	3844	thflags &= ~TH_FIN;
	3845
	3846	/*
	3847	* Send an ACK to resynchronize and drop any data.
	3848	* But keep on processing for RST or ACK.
	3849	*
	3850	* If the SYN bit was originally set, then only send
	3851	* an ACK if we are not rate-limiting this connection.
	3852	*/
	3853	if (is_syn_set) {
	3854	if (!tcp_is_ack_ratelimited(tp)) {
	3855	tcpstat.tcps_synchallenge++;
	3856	tp->t_flags \|= TF_ACKNOW;
	3857	}
	3858	} else {
	3859	tp->t_flags \|= TF_ACKNOW;
	3860	}
	3861
	3862	if (todrop == 1) {
	3863	/* This could be a keepalive */
	3864	soevent(so, SO_FILT_HINT_LOCKED \|
	3865	SO_FILT_HINT_KEEPALIVE);
	3866	}
	3867	todrop = tlen;
	3868	tcpstat.tcps_rcvduppack++;
	3869	tcpstat.tcps_rcvdupbyte += todrop;
	3870	} else {
	3871	tcpstat.tcps_rcvpartduppack++;
	3872	tcpstat.tcps_rcvpartdupbyte += todrop;
	3873	}
	3874
	3875	if (todrop > 1) {
	3876	/*
	3877	* Note the duplicate data sequence space so that
	3878	* it can be reported in DSACK option.
	3879	*/
	3880	tp->t_dsack_lseq = th->th_seq;
	3881	tp->t_dsack_rseq = th->th_seq + todrop;
	3882	tp->t_flags \|= TF_ACKNOW;
	3883	}
	3884	if (nstat_collect) {
	3885	nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
	3886	todrop, NSTAT_RX_FLAG_DUPLICATE);
	3887	INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
	3888	INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
	3889	tp->t_stat.rxduplicatebytes += todrop;
	3890	inp_set_activity_bitmap(inp);
	3891	}
	3892	drop_hdrlen += todrop; /* drop from the top afterwards */
	3893	th->th_seq += todrop;
	3894	tlen -= todrop;
	3895	if (th->th_urp > todrop) {
	3896	th->th_urp -= todrop;
	3897	} else {
	3898	thflags &= ~TH_URG;
	3899	th->th_urp = 0;
	3900	}
	3901	}
	3902
	3903	/*
	3904	* If new data are received on a connection after the user
	3905	* processes are gone, then RST the other end.
	3906	* Send also a RST when we received a data segment after we've
	3907	* sent our FIN when the socket is defunct.
	3908	* Note that an MPTCP subflow socket would have SS_NOFDREF set
	3909	* by default. So, if it's an MPTCP-subflow we rather check the
	3910	* MPTCP-level's socket state for SS_NOFDREF.
	3911	*/
	3912	if (tlen) {
	3913	boolean_t close_it = FALSE;
	3914
	3915	if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF) &&
	3916	tp->t_state > TCPS_CLOSE_WAIT) {
	3917	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_NOFDREF");
	3918	close_it = TRUE;
	3919	}
	3920
	3921	if ((so->so_flags & SOF_MP_SUBFLOW) && (mptetoso(tptomptp(tp)->mpt_mpte)->so_state & SS_NOFDREF) &&
	3922	tp->t_state > TCPS_CLOSE_WAIT) {
	3923	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_MP_SUBFLOW SS_NOFDREF");
	3924	close_it = TRUE;
	3925	}
	3926
	3927	if ((so->so_flags & SOF_DEFUNCT) && tp->t_state > TCPS_FIN_WAIT_1) {
	3928	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_DEFUNCT");
	3929	close_it = TRUE;
	3930	}
	3931
	3932	if (close_it) {
	3933	tp = tcp_close(tp);
	3934	tcpstat.tcps_rcvafterclose++;
	3935	IF_TCP_STATINC(ifp, cleanup);
	3936	goto dropwithreset;
	3937	}
	3938	}
	3939
	3940	/*
	3941	* If segment ends after window, drop trailing data
	3942	* (and PUSH and FIN); if nothing left, just ACK.
	3943	*/
	3944	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	3945	if (todrop > 0) {
	3946	tcpstat.tcps_rcvpackafterwin++;
	3947	if (todrop >= tlen) {
	3948	tcpstat.tcps_rcvbyteafterwin += tlen;
	3949	/*
	3950	* If a new connection request is received
	3951	* while in TIME_WAIT, drop the old connection
	3952	* and start over if the sequence numbers
	3953	* are above the previous ones.
	3954	*/
	3955	if (thflags & TH_SYN &&
	3956	tp->t_state == TCPS_TIME_WAIT &&
	3957	SEQ_GT(th->th_seq, tp->rcv_nxt)) {
	3958	iss = tcp_new_isn(tp);
	3959	tp = tcp_close(tp);
	3960	socket_unlock(so, 1);
	3961	goto findpcb;
	3962	}
	3963	/*
	3964	* If window is closed can only take segments at
	3965	* window edge, and have to drop data and PUSH from
	3966	* incoming segments. Continue processing, but
	3967	* remember to ack. Otherwise, drop segment
	3968	* and ack.
	3969	*/
	3970	if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
	3971	tp->t_flags \|= TF_ACKNOW;
	3972	tcpstat.tcps_rcvwinprobe++;
	3973	} else {
	3974	goto dropafterack;
	3975	}
	3976	} else {
	3977	tcpstat.tcps_rcvbyteafterwin += todrop;
	3978	}
	3979	m_adj(m, -todrop);
	3980	tlen -= todrop;
	3981	thflags &= ~(TH_PUSH \| TH_FIN);
	3982	}
	3983
	3984	/*
	3985	* If last ACK falls within this segment's sequence numbers,
	3986	* record its timestamp.
	3987	* NOTE:
	3988	* 1) That the test incorporates suggestions from the latest
	3989	* proposal of the tcplw@cray.com list (Braden 1993/04/26).
	3990	* 2) That updating only on newer timestamps interferes with
	3991	* our earlier PAWS tests, so this check should be solely
	3992	* predicated on the sequence space of this segment.
	3993	* 3) That we modify the segment boundary check to be
	3994	* Last.ACK.Sent <= SEG.SEQ + SEG.Len
	3995	* instead of RFC1323's
	3996	* Last.ACK.Sent < SEG.SEQ + SEG.Len,
	3997	* This modified check allows us to overcome RFC1323's
	3998	* limitations as described in Stevens TCP/IP Illustrated
	3999	* Vol. 2 p.869. In such cases, we can still calculate the
	4000	* RTT correctly when RCV.NXT == Last.ACK.Sent.
	4001	*/
	4002	if ((to.to_flags & TOF_TS) != 0 &&
	4003	SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	4004	SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
	4005	((thflags & (TH_SYN \| TH_FIN)) != 0))) {
	4006	tp->ts_recent_age = tcp_now;
	4007	tp->ts_recent = to.to_tsval;
	4008	}
	4009
	4010	/*
	4011	* Stevens: If a SYN is in the window, then this is an
	4012	* error and we send an RST and drop the connection.
	4013	*
	4014	* RFC 5961 Section 4.2
	4015	* Send challenge ACK for any SYN in synchronized state
	4016	* Perform rate limitation in doing so.
	4017	*/
	4018	if (thflags & TH_SYN) {
	4019	if (!tcp_syn_data_valid(tp, th, tlen)) {
	4020	tcpstat.tcps_badsyn++;
	4021	/* Drop if we have reached ACK limit */
	4022	if (tcp_is_ack_ratelimited(tp)) {
	4023	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN rate limited");
	4024	goto drop;
	4025	} else {
	4026	/* Send challenge ACK */
	4027	tcpstat.tcps_synchallenge++;
	4028	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN challenge ack");
	4029	goto dropafterack;
	4030	}
	4031	} else {
	4032	/*
	4033	* Received SYN (/ACK) with data.
	4034	* Move sequence number along to process the data.
	4035	*/
	4036	th->th_seq++;
	4037	thflags &= ~TH_SYN;
	4038	}
	4039	}
	4040
	4041	/*
	4042	* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
	4043	* flag is on (half-synchronized state), then queue data for
	4044	* later processing; else drop segment and return.
	4045	*/
	4046	if ((thflags & TH_ACK) == 0) {
	4047	if (tp->t_state == TCPS_SYN_RECEIVED) {
	4048	if ((tfo_enabled(tp))) {
	4049	/*
	4050	* So, we received a valid segment while in
	4051	* SYN-RECEIVED.
	4052	* As this cannot be an RST (see that if a bit
	4053	* higher), and it does not have the ACK-flag
	4054	* set, we want to retransmit the SYN/ACK.
	4055	* Thus, we have to reset snd_nxt to snd_una to
	4056	* trigger the going back to sending of the
	4057	* SYN/ACK. This is more consistent with the
	4058	* behavior of tcp_output(), which expects
	4059	* to send the segment that is pointed to by
	4060	* snd_nxt.
	4061	*/
	4062	tp->snd_nxt = tp->snd_una;
	4063
	4064	/*
	4065	* We need to make absolutely sure that we are
	4066	* going to reply upon a duplicate SYN-segment.
	4067	*/
	4068	if (th->th_flags & TH_SYN) {
	4069	needoutput = 1;
	4070	}
	4071	}
	4072
	4073	goto step6;
	4074	} else if (tp->t_flags & TF_ACKNOW) {
	4075	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
	4076	goto dropafterack;
	4077	} else {
	4078	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
	4079	goto drop;
	4080	}
	4081	}
	4082
	4083	/*
	4084	* Ack processing.
	4085	*/
	4086
	4087	switch (tp->t_state) {
	4088	/*
	4089	* In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	4090	* ESTABLISHED state and continue processing.
	4091	* The ACK was checked above.
	4092	*/
	4093	case TCPS_SYN_RECEIVED:
	4094
	4095	tcpstat.tcps_connects++;
	4096
	4097	/* Do window scaling? */
	4098	if (TCP_WINDOW_SCALE_ENABLED(tp)) {
	4099	tp->snd_scale = tp->requested_s_scale;
	4100	tp->rcv_scale = tp->request_r_scale;
	4101	tp->snd_wnd = th->th_win << tp->snd_scale;
	4102	tp->max_sndwnd = tp->snd_wnd;
	4103	tiwin = tp->snd_wnd;
	4104	}
	4105	/*
	4106	* Make transitions:
	4107	* SYN-RECEIVED -> ESTABLISHED
	4108	* SYN-RECEIVED* -> FIN-WAIT-1
	4109	*/
	4110	tp->t_starttime = tcp_now;
	4111	tcp_sbrcv_tstmp_check(tp);
	4112	if (tp->t_flags & TF_NEEDFIN) {
	4113	DTRACE_TCP4(state__change, void, NULL,
	4114	struct inpcb *, inp,
	4115	struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
	4116	tp->t_state = TCPS_FIN_WAIT_1;
	4117	tp->t_flags &= ~TF_NEEDFIN;
	4118
	4119	TCP_LOG_CONNECTION_SUMMARY(tp);
	4120	} else {
	4121	DTRACE_TCP4(state__change, void, NULL,
	4122	struct inpcb *, inp,
	4123	struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
	4124	tp->t_state = TCPS_ESTABLISHED;
	4125	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	4126	TCP_CONN_KEEPIDLE(tp));
	4127	if (nstat_collect) {
	4128	nstat_route_connect_success(
	4129	tp->t_inpcb->inp_route.ro_rt);
	4130	}
	4131
	4132	/*
	4133	* The SYN is acknowledged but una is not updated
	4134	* yet. So pass the value of ack to compute
	4135	* sndbytes correctly
	4136	*/
	4137	inp_count_sndbytes(inp, th->th_ack);
	4138	}
	4139	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
	4140	/*
	4141	* If segment contains data or ACK, will call tcp_reass()
	4142	* later; if not, do so now to pass queued data to user.
	4143	*/
	4144	if (tlen == 0 && (thflags & TH_FIN) == 0) {
	4145	if (isipv6) {
	4146	memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
	4147	ip6 = (struct ip6_hdr *)&saved_hdr[0];
	4148	} else {
	4149	memcpy(&saved_hdr, ip, ip->ip_hl << 2);
	4150	ip = (struct ip *)&saved_hdr[0];
	4151	}
	4152	memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
	4153	(void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
	4154	NULL, ifp, &read_wakeup);
	4155	th = &saved_tcphdr;
	4156	}
	4157	tp->snd_wl1 = th->th_seq - 1;
	4158
	4159	#if MPTCP
	4160	/*
	4161	* Do not send the connect notification for additional subflows
	4162	* until ACK for 3-way handshake arrives.
	4163	*/
	4164	if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
	4165	(tp->t_mpflags & TMPF_SENT_JOIN)) {
	4166	isconnected = FALSE;
	4167	} else
	4168	#endif /* MPTCP */
	4169	isconnected = TRUE;
	4170	if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
	4171	/* Done this when receiving the SYN */
	4172	isconnected = FALSE;
	4173
	4174	OSDecrementAtomic(&tcp_tfo_halfcnt);
	4175
	4176	/* Panic if something has gone terribly wrong. */
	4177	VERIFY(tcp_tfo_halfcnt >= 0);
	4178
	4179	tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
	4180	}
	4181
	4182	/*
	4183	* In case there is data in the send-queue (e.g., TFO is being
	4184	* used, or connectx+data has been done), then if we would
	4185	* "FALLTHROUGH", we would handle this ACK as if data has been
	4186	* acknowledged. But, we have to prevent this. And this
	4187	* can be prevented by increasing snd_una by 1, so that the
	4188	* SYN is not considered as data (snd_una++ is actually also
	4189	* done in SYN_SENT-state as part of the regular TCP stack).
	4190	*
	4191	* In case there is data on this ack as well, the data will be
	4192	* handled by the label "dodata" right after step6.
	4193	*/
	4194	if (so->so_snd.sb_cc) {
	4195	tp->snd_una++; /* SYN is acked */
	4196	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
	4197	tp->snd_nxt = tp->snd_una;
	4198	}
	4199
	4200	/*
	4201	* No duplicate-ACK handling is needed. So, we
	4202	* directly advance to processing the ACK (aka,
	4203	* updating the RTT estimation,...)
	4204	*
	4205	* But, we first need to handle eventual SACKs,
	4206	* because TFO will start sending data with the
	4207	* SYN/ACK, so it might be that the client
	4208	* includes a SACK with its ACK.
	4209	*/
	4210	if (SACK_ENABLED(tp) &&
	4211	(to.to_nsacks > 0 \|\| !TAILQ_EMPTY(&tp->snd_holes))) {
	4212	tcp_sack_doack(tp, &to, th, &sack_bytes_acked, &sack_bytes_newly_acked);
	4213	}
	4214
	4215	goto process_ACK;
	4216	}
	4217
	4218	OS_FALLTHROUGH;
	4219
	4220	/*
	4221	* In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	4222	* ACKs. If the ack is in the range
	4223	* tp->snd_una < th->th_ack <= tp->snd_max
	4224	* then advance tp->snd_una to th->th_ack and drop
	4225	* data from the retransmission queue. If this ACK reflects
	4226	* more up to date window information we update our window information.
	4227	*/
	4228	case TCPS_ESTABLISHED:
	4229	case TCPS_FIN_WAIT_1:
	4230	case TCPS_FIN_WAIT_2:
	4231	case TCPS_CLOSE_WAIT:
	4232	case TCPS_CLOSING:
	4233	case TCPS_LAST_ACK:
	4234	case TCPS_TIME_WAIT:
	4235	if (SEQ_GT(th->th_ack, tp->snd_max)) {
	4236	tcpstat.tcps_rcvacktoomuch++;
	4237	if (tcp_is_ack_ratelimited(tp)) {
	4238	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 rcvacktoomuch");
	4239	goto drop;
	4240	} else {
	4241	goto dropafterack;
	4242	}
	4243	}
	4244	if (SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) {
	4245	if (tcp_is_ack_ratelimited(tp)) {
	4246	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad ACK");
	4247	goto drop;
	4248	} else {
	4249	goto dropafterack;
	4250	}
	4251	}
	4252	if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
	4253	recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
	4254	/*
	4255	* If DSACK is received and this packet has no
	4256	* other SACK information, it can be dropped.
	4257	* We do not want to treat it as a duplicate ack.
	4258	*/
	4259	if (recvd_dsack &&
	4260	SEQ_LEQ(th->th_ack, tp->snd_una) &&
	4261	to.to_nsacks == 0) {
	4262	tcp_bad_rexmt_check(tp, th, &to);
	4263	goto drop;
	4264	}
	4265	}
	4266
	4267	if (SACK_ENABLED(tp) &&
	4268	(to.to_nsacks > 0 \|\| !TAILQ_EMPTY(&tp->snd_holes))) {
	4269	tcp_sack_doack(tp, &to, th, &sack_bytes_acked, &sack_bytes_newly_acked);
	4270	}
	4271
	4272	#if MPTCP
	4273	if (tp->t_mpuna && SEQ_GEQ(th->th_ack, tp->t_mpuna)) {
	4274	if (tp->t_mpflags & TMPF_PREESTABLISHED) {
	4275	/* MP TCP establishment succeeded */
	4276	tp->t_mpuna = 0;
	4277	if (tp->t_mpflags & TMPF_JOINED_FLOW) {
	4278	if (tp->t_mpflags & TMPF_SENT_JOIN) {
	4279	tp->t_mpflags &=
	4280	~TMPF_PREESTABLISHED;
	4281	tp->t_mpflags \|=
	4282	TMPF_MPTCP_TRUE;
	4283
	4284	tp->t_timer[TCPT_JACK_RXMT] = 0;
	4285	tp->t_mprxtshift = 0;
	4286	isconnected = TRUE;
	4287	} else {
	4288	isconnected = FALSE;
	4289	}
	4290	} else {
	4291	isconnected = TRUE;
	4292	}
	4293	}
	4294	}
	4295	#endif /* MPTCP */
	4296
	4297	tcp_tfo_rcv_ack(tp, th);
	4298
	4299	/*
	4300	* If we have outstanding data (other than
	4301	* a window probe), this is a completely
	4302	* duplicate ack and the ack is the biggest we've seen.
	4303	*
	4304	* Need to accommodate a change in window on duplicate acks
	4305	* to allow operating systems that update window during
	4306	* recovery with SACK
	4307	*/
	4308	if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
	4309	if (tlen == 0 && (tiwin == tp->snd_wnd \|\|
	4310	(to.to_nsacks > 0 && sack_bytes_acked > 0))) {
	4311	uint32_t old_dupacks;
	4312	/*
	4313	* If both ends send FIN at the same time,
	4314	* then the ack will be a duplicate ack
	4315	* but we have to process the FIN. Check
	4316	* for this condition and process the FIN
	4317	* instead of the dupack
	4318	*/
	4319	if ((thflags & TH_FIN) &&
	4320	!TCPS_HAVERCVDFIN(tp->t_state)) {
	4321	break;
	4322	}
	4323	process_dupack:
	4324	old_dupacks = tp->t_dupacks;
	4325	#if MPTCP
	4326	/*
	4327	* MPTCP options that are ignored must
	4328	* not be treated as duplicate ACKs.
	4329	*/
	4330	if (to.to_flags & TOF_MPTCP) {
	4331	goto drop;
	4332	}
	4333
	4334	if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
	4335	mptcplog((LOG_DEBUG, "MPTCP "
	4336	"Sockets: bypass ack recovery\n"),
	4337	MPTCP_SOCKET_DBG,
	4338	MPTCP_LOGLVL_VERBOSE);
	4339	break;
	4340	}
	4341	#endif /* MPTCP */
	4342	/*
	4343	* If a duplicate acknowledgement was seen
	4344	* after ECN, it indicates packet loss in
	4345	* addition to ECN. Reset INRECOVERY flag
	4346	* so that we can process partial acks
	4347	* correctly
	4348	*/
	4349	if (tp->ecn_flags & TE_INRECOVERY) {
	4350	tp->ecn_flags &= ~TE_INRECOVERY;
	4351	}
	4352
	4353	tcpstat.tcps_rcvdupack++;
	4354	if (SACK_ENABLED(tp) && tcp_do_better_lr) {
	4355	tp->t_dupacks += max(1, sack_bytes_acked / tp->t_maxseg);
	4356	} else {
	4357	++tp->t_dupacks;
	4358	}
	4359
	4360	tp->sackhint.sack_bytes_acked += sack_bytes_acked;
	4361
	4362	if (SACK_ENABLED(tp) && tcp_do_better_lr) {
	4363	tp->t_new_dupacks += (sack_bytes_newly_acked / tp->t_maxseg);
	4364
	4365	if (tp->t_new_dupacks >= tp->t_rexmtthresh && IN_FASTRECOVERY(tp)) {
	4366	/* Let's restart the retransmission */
	4367	tcp_sack_lost_rexmit(tp);
	4368
	4369	/*
	4370	* If the current tcp cc module has
	4371	* defined a hook for tasks to run
	4372	* before entering FR, call it
	4373	*/
	4374	if (CC_ALGO(tp)->pre_fr != NULL) {
	4375	CC_ALGO(tp)->pre_fr(tp);
	4376	}
	4377
	4378	ENTER_FASTRECOVERY(tp);
	4379
	4380	if (tp->t_flags & TF_SENTFIN) {
	4381	tp->snd_recover = tp->snd_max - 1;
	4382	} else {
	4383	tp->snd_recover = tp->snd_max;
	4384	}
	4385	tp->t_rtttime = 0;
	4386
	4387	if (TCP_ECN_ENABLED(tp)) {
	4388	tp->ecn_flags \|= TE_SENDCWR;
	4389	}
	4390
	4391	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
	4392	tcp_cc_adjust_nonvalidated_cwnd(tp);
	4393	} else {
	4394	tp->snd_cwnd = tp->snd_ssthresh;
	4395	}
	4396	}
	4397	}
	4398
	4399	/*
	4400	* Check if we need to reset the limit on
	4401	* early retransmit
	4402	*/
	4403	if (tp->t_early_rexmt_count > 0 &&
	4404	TSTMP_GEQ(tcp_now,
	4405	(tp->t_early_rexmt_win +
	4406	TCP_EARLY_REXMT_WIN))) {
	4407	tp->t_early_rexmt_count = 0;
	4408	}
	4409
	4410	/*
	4411	* Is early retransmit needed? We check for
	4412	* this when the connection is waiting for
	4413	* duplicate acks to enter fast recovery.
	4414	*/
	4415	if (!IN_FASTRECOVERY(tp)) {
	4416	tcp_early_rexmt_check(tp, th);
	4417	}
	4418
	4419	/*
	4420	* If we've seen exactly rexmt threshold
	4421	* of duplicate acks, assume a packet
	4422	* has been dropped and retransmit it.
	4423	* Kludge snd_nxt & the congestion
	4424	* window so we send only this one
	4425	* packet.
	4426	*
	4427	* We know we're losing at the current
	4428	* window size so do congestion avoidance
	4429	* (set ssthresh to half the current window
	4430	* and pull our congestion window back to
	4431	* the new ssthresh).
	4432	*
	4433	* Dup acks mean that packets have left the
	4434	* network (they're now cached at the receiver)
	4435	* so bump cwnd by the amount in the receiver
	4436	* to keep a constant cwnd packets in the
	4437	* network.
	4438	*/
	4439	if (tp->t_timer[TCPT_REXMT] == 0 \|\|
	4440	(th->th_ack != tp->snd_una && sack_bytes_acked == 0)) {
	4441	tp->t_dupacks = 0;
	4442	tp->t_rexmtthresh = tcprexmtthresh;
	4443	tp->t_new_dupacks = 0;
	4444	} else if ((tp->t_dupacks > tp->t_rexmtthresh && (!tcp_do_better_lr \|\| old_dupacks >= tp->t_rexmtthresh)) \|\|
	4445	IN_FASTRECOVERY(tp)) {
	4446	/*
	4447	* If this connection was seeing packet
	4448	* reordering, then recovery might be
	4449	* delayed to disambiguate between
	4450	* reordering and loss
	4451	*/
	4452	if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
	4453	(tp->t_flagsext &
	4454	(TF_PKTS_REORDERED \| TF_DELAY_RECOVERY)) ==
	4455	(TF_PKTS_REORDERED \| TF_DELAY_RECOVERY)) {
	4456	/*
	4457	* Since the SACK information is already
	4458	* updated, this ACK will be dropped
	4459	*/
	4460	break;
	4461	}
	4462
	4463	/*
	4464	* Dup acks mean that packets have left the
	4465	* network (they're now cached at the receiver)
	4466	* so bump cwnd by the amount in the receiver
	4467	* to keep a constant cwnd packets in the
	4468	* network.
	4469	*/
	4470	if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp)) {
	4471	int awnd;
	4472
	4473	/*
	4474	* Compute the amount of data in flight first.
	4475	* We can inject new data into the pipe iff
	4476	* we have less than snd_ssthresh worth of data in
	4477	* flight.
	4478	*/
	4479	awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit;
	4480	if (awnd < tp->snd_ssthresh) {
	4481	tp->snd_cwnd += tp->t_maxseg;
	4482	if (tp->snd_cwnd > tp->snd_ssthresh) {
	4483	tp->snd_cwnd = tp->snd_ssthresh;
	4484	}
	4485	}
	4486	} else {
	4487	tp->snd_cwnd += tp->t_maxseg;
	4488	}
	4489
	4490	/* Process any window updates */
	4491	if (tiwin > tp->snd_wnd) {
	4492	tcp_update_window(tp, thflags,
	4493	th, tiwin, tlen);
	4494	}
	4495	tcp_ccdbg_trace(tp, th,
	4496	TCP_CC_IN_FASTRECOVERY);
	4497
	4498	(void) tcp_output(tp);
	4499
	4500	goto drop;
	4501	} else if ((!tcp_do_better_lr && tp->t_dupacks == tp->t_rexmtthresh) \|\|
	4502	(tcp_do_better_lr && tp->t_dupacks >= tp->t_rexmtthresh)) {
	4503	tcp_seq onxt = tp->snd_nxt;
	4504
	4505	/*
	4506	* If we're doing sack, check to
	4507	* see if we're already in sack
	4508	* recovery. If we're not doing sack,
	4509	* check to see if we're in newreno
	4510	* recovery.
	4511	*/
	4512	if (SACK_ENABLED(tp)) {
	4513	if (IN_FASTRECOVERY(tp)) {
	4514	tp->t_dupacks = 0;
	4515	break;
	4516	} else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
	4517	break;
	4518	}
	4519	} else {
	4520	if (SEQ_LEQ(th->th_ack, tp->snd_recover)) {
	4521	tp->t_dupacks = 0;
	4522	break;
	4523	}
	4524	}
	4525	if (tp->t_flags & TF_SENTFIN) {
	4526	tp->snd_recover = tp->snd_max - 1;
	4527	} else {
	4528	tp->snd_recover = tp->snd_max;
	4529	}
	4530	tp->t_timer[TCPT_PTO] = 0;
	4531	tp->t_rtttime = 0;
	4532
	4533	/*
	4534	* If the connection has seen pkt
	4535	* reordering, delay recovery until
	4536	* it is clear that the packet
	4537	* was lost.
	4538	*/
	4539	if (SACK_ENABLED(tp) &&
	4540	(tp->t_flagsext &
	4541	(TF_PKTS_REORDERED \| TF_DELAY_RECOVERY))
	4542	== TF_PKTS_REORDERED &&
	4543	!IN_FASTRECOVERY(tp) &&
	4544	tp->t_reorderwin > 0 &&
	4545	(tp->t_state == TCPS_ESTABLISHED \|\|
	4546	tp->t_state == TCPS_FIN_WAIT_1)) {
	4547	tp->t_timer[TCPT_DELAYFR] =
	4548	OFFSET_FROM_START(tp,
	4549	tp->t_reorderwin);
	4550	tp->t_flagsext \|= TF_DELAY_RECOVERY;
	4551	tcpstat.tcps_delay_recovery++;
	4552	tcp_ccdbg_trace(tp, th,
	4553	TCP_CC_DELAY_FASTRECOVERY);
	4554	break;
	4555	}
	4556
	4557	tcp_rexmt_save_state(tp);
	4558	/*
	4559	* If the current tcp cc module has
	4560	* defined a hook for tasks to run
	4561	* before entering FR, call it
	4562	*/
	4563	if (CC_ALGO(tp)->pre_fr != NULL) {
	4564	CC_ALGO(tp)->pre_fr(tp);
	4565	}
	4566	ENTER_FASTRECOVERY(tp);
	4567	tp->t_timer[TCPT_REXMT] = 0;
	4568	if (TCP_ECN_ENABLED(tp)) {
	4569	tp->ecn_flags \|= TE_SENDCWR;
	4570	}
	4571
	4572	if (SACK_ENABLED(tp)) {
	4573	tcpstat.tcps_sack_recovery_episode++;
	4574	tp->t_sack_recovery_episode++;
	4575	tp->sack_newdata = tp->snd_nxt;
	4576	if (tcp_do_better_lr) {
	4577	tp->snd_cwnd = tp->snd_ssthresh;
	4578	} else {
	4579	tp->snd_cwnd = tp->t_maxseg;
	4580	}
	4581	tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
	4582
	4583	/* Process any window updates */
	4584	if (tiwin > tp->snd_wnd) {
	4585	tcp_update_window(tp, thflags, th, tiwin, tlen);
	4586	}
	4587
	4588	tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY);
	4589	(void) tcp_output(tp);
	4590	goto drop;
	4591	}
	4592	tp->snd_nxt = th->th_ack;
	4593	tp->snd_cwnd = tp->t_maxseg;
	4594
	4595	/* Process any window updates */
	4596	if (tiwin > tp->snd_wnd) {
	4597	tcp_update_window(tp, thflags, th, tiwin, tlen);
	4598	}
	4599
	4600	(void) tcp_output(tp);
	4601	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
	4602	tcp_cc_adjust_nonvalidated_cwnd(tp);
	4603	} else {
	4604	tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks;
	4605	}
	4606	if (SEQ_GT(onxt, tp->snd_nxt)) {
	4607	tp->snd_nxt = onxt;
	4608	}
	4609
	4610	tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY);
	4611	goto drop;
	4612	} else if (ALLOW_LIMITED_TRANSMIT(tp) &&
	4613	(!(SACK_ENABLED(tp)) \|\| sack_bytes_acked > 0) &&
	4614	(so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
	4615	u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
	4616
	4617	/* Use Limited Transmit algorithm on the first two
	4618	* duplicate acks when there is new data to transmit
	4619	*/
	4620	tp->snd_cwnd += incr;
	4621	tcpstat.tcps_limited_txt++;
	4622	(void) tcp_output(tp);
	4623
	4624	tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
	4625
	4626	/* Reset snd_cwnd back to normal */
	4627	tp->snd_cwnd -= incr;
	4628	}
	4629	}
	4630	break;
	4631	}
	4632	/*
	4633	* If the congestion window was inflated to account
	4634	* for the other side's cached packets, retract it.
	4635	*/
	4636	if (IN_FASTRECOVERY(tp)) {
	4637	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
	4638	/*
	4639	* If we received an ECE and entered
	4640	* recovery, the subsequent ACKs should
	4641	* not be treated as partial acks.
	4642	*/
	4643	if (tp->ecn_flags & TE_INRECOVERY) {
	4644	goto process_ACK;
	4645	}
	4646
	4647	if (SACK_ENABLED(tp)) {
	4648	tcp_sack_partialack(tp, th);
	4649	} else {
	4650	tcp_newreno_partial_ack(tp, th);
	4651	}
	4652	tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
	4653	} else {
	4654	if (tcp_cubic_minor_fixes) {
	4655	exiting_fr = 1;
	4656	}
	4657	EXIT_FASTRECOVERY(tp);
	4658	if (CC_ALGO(tp)->post_fr != NULL) {
	4659	CC_ALGO(tp)->post_fr(tp, th);
	4660	}
	4661	tp->t_pipeack = 0;
	4662	tcp_clear_pipeack_state(tp);
	4663	tcp_ccdbg_trace(tp, th,
	4664	TCP_CC_EXIT_FASTRECOVERY);
	4665	}
	4666	} else if ((tp->t_flagsext &
	4667	(TF_PKTS_REORDERED \| TF_DELAY_RECOVERY))
	4668	== (TF_PKTS_REORDERED \| TF_DELAY_RECOVERY)) {
	4669	/*
	4670	* If the ack acknowledges up to snd_recover or if
	4671	* it acknowledges all the snd holes, exit
	4672	* recovery and cancel the timer. Otherwise,
	4673	* this is a partial ack. Wait for recovery timer
	4674	* to enter recovery. The snd_holes have already
	4675	* been updated.
	4676	*/
	4677	if (SEQ_GEQ(th->th_ack, tp->snd_recover) \|\|
	4678	TAILQ_EMPTY(&tp->snd_holes)) {
	4679	tp->t_timer[TCPT_DELAYFR] = 0;
	4680	tp->t_flagsext &= ~TF_DELAY_RECOVERY;
	4681	EXIT_FASTRECOVERY(tp);
	4682	tcp_ccdbg_trace(tp, th,
	4683	TCP_CC_EXIT_FASTRECOVERY);
	4684	}
	4685	} else {
	4686	/*
	4687	* We were not in fast recovery. Reset the
	4688	* duplicate ack counter.
	4689	*/
	4690	tp->t_dupacks = 0;
	4691	tp->t_rexmtthresh = tcprexmtthresh;
	4692	tp->t_new_dupacks = 0;
	4693	}
	4694
	4695	process_ACK:
	4696	VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
	4697	acked = BYTES_ACKED(th, tp);
	4698	tcpstat.tcps_rcvackpack++;
	4699	tcpstat.tcps_rcvackbyte += acked;
	4700
	4701	/*
	4702	* If the last packet was a retransmit, make sure
	4703	* it was not spurious.
	4704	*
	4705	* This will also take care of congestion window
	4706	* adjustment if a last packet was recovered due to a
	4707	* tail loss probe.
	4708	*/
	4709	tcp_bad_rexmt_check(tp, th, &to);
	4710
	4711	/* Recalculate the RTT */
	4712	tcp_compute_rtt(tp, &to, th);
	4713
	4714	/*
	4715	* If all outstanding data is acked, stop retransmit
	4716	* timer and remember to restart (more output or persist).
	4717	* If there is more data to be acked, restart retransmit
	4718	* timer, using current (possibly backed-off) value.
	4719	*/
	4720	TCP_RESET_REXMT_STATE(tp);
	4721	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	4722	tp->t_rttmin, TCPTV_REXMTMAX,
	4723	TCP_ADD_REXMTSLOP(tp));
	4724	if (th->th_ack == tp->snd_max) {
	4725	tp->t_timer[TCPT_REXMT] = 0;
	4726	tp->t_timer[TCPT_PTO] = 0;
	4727	needoutput = 1;
	4728	} else if (tp->t_timer[TCPT_PERSIST] == 0) {
	4729	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
	4730	tp->t_rxtcur);
	4731	}
	4732
	4733	if ((prev_t_state == TCPS_SYN_SENT \|\|
	4734	prev_t_state == TCPS_SYN_RECEIVED) &&
	4735	tp->t_state == TCPS_ESTABLISHED) {
	4736	TCP_LOG_RTT_INFO(tp);
	4737	}
	4738
	4739	/*
	4740	* If no data (only SYN) was ACK'd, skip rest of ACK
	4741	* processing.
	4742	*/
	4743	if (acked == 0) {
	4744	goto step6;
	4745	}
	4746
	4747	/*
	4748	* When outgoing data has been acked (except the SYN+data), we
	4749	* mark this connection as "sending good" for TFO.
	4750	*/
	4751	if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
	4752	!(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
	4753	!(th->th_flags & TH_SYN)) {
	4754	tp->t_tfo_flags \|= TFO_F_NO_SNDPROBING;
	4755	}
	4756
	4757	/*
	4758	* If TH_ECE is received, make sure that ECN is enabled
	4759	* on that connection and we have sent ECT on data packets.
	4760	*/
	4761	if ((thflags & TH_ECE) != 0 && TCP_ECN_ENABLED(tp) &&
	4762	(tp->ecn_flags & TE_SENDIPECT)) {
	4763	/*
	4764	* Reduce the congestion window if we haven't
	4765	* done so.
	4766	*/
	4767	if (!IN_FASTRECOVERY(tp)) {
	4768	tcp_reduce_congestion_window(tp);
	4769	tp->ecn_flags \|= (TE_INRECOVERY \| TE_SENDCWR);
	4770	/*
	4771	* Also note that the connection received
	4772	* ECE at least once
	4773	*/
	4774	tp->ecn_flags \|= TE_RECV_ECN_ECE;
	4775	INP_INC_IFNET_STAT(inp, ecn_recv_ece);
	4776	tcpstat.tcps_ecn_recv_ece++;
	4777	tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
	4778	}
	4779	}
	4780
	4781	/*
	4782	* When new data is acked, open the congestion window.
	4783	* The specifics of how this is achieved are up to the
	4784	* congestion control algorithm in use for this connection.
	4785	*
	4786	* The calculations in this function assume that snd_una is
	4787	* not updated yet.
	4788	*/
	4789	if (!IN_FASTRECOVERY(tp) && !exiting_fr) {
	4790	if (CC_ALGO(tp)->ack_rcvd != NULL) {
	4791	CC_ALGO(tp)->ack_rcvd(tp, th);
	4792	}
	4793	tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
	4794	}
	4795	if (acked > so->so_snd.sb_cc) {
	4796	tp->snd_wnd -= so->so_snd.sb_cc;
	4797	sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
	4798	ourfinisacked = 1;
	4799	} else {
	4800	sbdrop(&so->so_snd, acked);
	4801	tcp_sbsnd_trim(&so->so_snd);
	4802	tp->snd_wnd -= acked;
	4803	ourfinisacked = 0;
	4804	}
	4805	/* detect una wraparound */
	4806	if (!IN_FASTRECOVERY(tp) &&
	4807	SEQ_GT(tp->snd_una, tp->snd_recover) &&
	4808	SEQ_LEQ(th->th_ack, tp->snd_recover)) {
	4809	tp->snd_recover = th->th_ack - 1;
	4810	}
	4811
	4812	if (IN_FASTRECOVERY(tp) &&
	4813	SEQ_GEQ(th->th_ack, tp->snd_recover)) {
	4814	EXIT_FASTRECOVERY(tp);
	4815	}
	4816
	4817	tcp_update_snd_una(tp, th->th_ack);
	4818
	4819	if (SACK_ENABLED(tp)) {
	4820	if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
	4821	tp->snd_recover = tp->snd_una;
	4822	}
	4823	}
	4824	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
	4825	tp->snd_nxt = tp->snd_una;
	4826	}
	4827	if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
	4828	!TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna,
	4829	tp->snd_una)) {
	4830	tcp_rxtseg_clean(tp);
	4831	}
	4832	if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
	4833	tp->t_bwmeas != NULL) {
	4834	tcp_bwmeas_check(tp);
	4835	}
	4836
	4837	write_wakeup = 1;
	4838
	4839	if (!SLIST_EMPTY(&tp->t_notify_ack)) {
	4840	tcp_notify_acknowledgement(tp, so);
	4841	}
	4842
	4843	switch (tp->t_state) {
	4844	/*
	4845	* In FIN_WAIT_1 STATE in addition to the processing
	4846	* for the ESTABLISHED state if our FIN is now acknowledged
	4847	* then enter FIN_WAIT_2.
	4848	*/
	4849	case TCPS_FIN_WAIT_1:
	4850	if (ourfinisacked) {
	4851	/*
	4852	* If we can't receive any more
	4853	* data, then closing user can proceed.
	4854	* Starting the TCPT_2MSL timer is contrary to the
	4855	* specification, but if we don't get a FIN
	4856	* we'll hang forever.
	4857	*/
	4858	if (so->so_state & SS_CANTRCVMORE) {
	4859	tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
	4860	TCP_CONN_MAXIDLE(tp));
	4861	isconnected = FALSE;
	4862	isdisconnected = TRUE;
	4863	}
	4864	DTRACE_TCP4(state__change, void, NULL,
	4865	struct inpcb *, inp,
	4866	struct tcpcb *, tp,
	4867	int32_t, TCPS_FIN_WAIT_2);
	4868	tp->t_state = TCPS_FIN_WAIT_2;
	4869	/* fall through and make sure we also recognize
	4870	* data ACKed with the FIN
	4871	*/
	4872	}
	4873	break;
	4874
	4875	/*
	4876	* In CLOSING STATE in addition to the processing for
	4877	* the ESTABLISHED state if the ACK acknowledges our FIN
	4878	* then enter the TIME-WAIT state, otherwise ignore
	4879	* the segment.
	4880	*/
	4881	case TCPS_CLOSING:
	4882	if (ourfinisacked) {
	4883	DTRACE_TCP4(state__change, void, NULL,
	4884	struct inpcb *, inp,
	4885	struct tcpcb *, tp,
	4886	int32_t, TCPS_TIME_WAIT);
	4887	tp->t_state = TCPS_TIME_WAIT;
	4888	tcp_canceltimers(tp);
	4889	if (tp->t_flagsext & TF_NOTIMEWAIT) {
	4890	tp->t_flags \|= TF_CLOSING;
	4891	} else {
	4892	add_to_time_wait(tp, 2 * tcp_msl);
	4893	}
	4894	isconnected = FALSE;
	4895	isdisconnected = TRUE;
	4896	}
	4897	break;
	4898
	4899	/*
	4900	* In LAST_ACK, we may still be waiting for data to drain
	4901	* and/or to be acked, as well as for the ack of our FIN.
	4902	* If our FIN is now acknowledged, delete the TCB,
	4903	* enter the closed state and return.
	4904	*/
	4905	case TCPS_LAST_ACK:
	4906	if (ourfinisacked) {
	4907	tp = tcp_close(tp);
	4908	goto drop;
	4909	}
	4910	break;
	4911
	4912	/*
	4913	* In TIME_WAIT state the only thing that should arrive
	4914	* is a retransmission of the remote FIN. Acknowledge
	4915	* it and restart the finack timer.
	4916	*/
	4917	case TCPS_TIME_WAIT:
	4918	add_to_time_wait(tp, 2 * tcp_msl);
	4919	goto dropafterack;
	4920	}
	4921
	4922	/*
	4923	* If there is a SACK option on the ACK and we
	4924	* haven't seen any duplicate acks before, count
	4925	* it as a duplicate ack even if the cumulative
	4926	* ack is advanced. If the receiver delayed an
	4927	* ack and detected loss afterwards, then the ack
	4928	* will advance cumulative ack and will also have
	4929	* a SACK option. So counting it as one duplicate
	4930	* ack is ok.
	4931	*/
	4932	if (tp->t_state == TCPS_ESTABLISHED &&
	4933	SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
	4934	to.to_nsacks > 0 && tp->t_dupacks == 0 &&
	4935	SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
	4936	!(tp->t_flagsext & TF_PKTS_REORDERED)) {
	4937	tcpstat.tcps_sack_ackadv++;
	4938	goto process_dupack;
	4939	}
	4940	}
	4941
	4942	step6:
	4943	/*
	4944	* Update window information.
	4945	*/
	4946	if (tcp_update_window(tp, thflags, th, tiwin, tlen)) {
	4947	needoutput = 1;
	4948	}
	4949
	4950	/*
	4951	* Process segments with URG.
	4952	*/
	4953	if ((thflags & TH_URG) && th->th_urp &&
	4954	TCPS_HAVERCVDFIN(tp->t_state) == 0) {
	4955	/*
	4956	* This is a kludge, but if we receive and accept
	4957	* random urgent pointers, we'll crash in
	4958	* soreceive. It's hard to imagine someone
	4959	* actually wanting to send this much urgent data.
	4960	*/
	4961	if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
	4962	th->th_urp = 0; /* XXX */
	4963	thflags &= ~TH_URG; /* XXX */
	4964	goto dodata; /* XXX */
	4965	}
	4966	/*
	4967	* If this segment advances the known urgent pointer,
	4968	* then mark the data stream. This should not happen
	4969	* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
	4970	* a FIN has been received from the remote side.
	4971	* In these states we ignore the URG.
	4972	*
	4973	* According to RFC961 (Assigned Protocols),
	4974	* the urgent pointer points to the last octet
	4975	* of urgent data. We continue, however,
	4976	* to consider it to indicate the first octet
	4977	* of data past the urgent section as the original
	4978	* spec states (in one of two places).
	4979	*/
	4980	if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
	4981	tp->rcv_up = th->th_seq + th->th_urp;
	4982	so->so_oobmark = so->so_rcv.sb_cc +
	4983	(tp->rcv_up - tp->rcv_nxt) - 1;
	4984	if (so->so_oobmark == 0) {
	4985	so->so_state \|= SS_RCVATMARK;
	4986	}
	4987	sohasoutofband(so);
	4988	tp->t_oobflags &= ~(TCPOOB_HAVEDATA \| TCPOOB_HADDATA);
	4989	}
	4990	/*
	4991	* Remove out of band data so doesn't get presented to user.
	4992	* This can happen independent of advancing the URG pointer,
	4993	* but if two URG's are pending at once, some out-of-band
	4994	* data may creep in... ick.
	4995	*/
	4996	if (th->th_urp <= (u_int32_t)tlen
	4997	#if SO_OOBINLINE
	4998	&& (so->so_options & SO_OOBINLINE) == 0
	4999	#endif
	5000	) {
	5001	tcp_pulloutofband(so, th, m,
	5002	drop_hdrlen); /* hdr drop is delayed */
	5003	}
	5004	} else {
	5005	/*
	5006	* If no out of band data is expected,
	5007	* pull receive urgent pointer along
	5008	* with the receive window.
	5009	*/
	5010	if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) {
	5011	tp->rcv_up = tp->rcv_nxt;
	5012	}
	5013	}
	5014	dodata:
	5015
	5016	/* Set socket's connect or disconnect state correcly before doing data.
	5017	* The following might unlock the socket if there is an upcall or a socket
	5018	* filter.
	5019	*/
	5020	if (isconnected) {
	5021	soisconnected(so);
	5022	} else if (isdisconnected) {
	5023	soisdisconnected(so);
	5024	}
	5025
	5026	/* Let's check the state of pcb just to make sure that it did not get closed
	5027	* when we unlocked above
	5028	*/
	5029	if (inp->inp_state == INPCB_STATE_DEAD) {
	5030	/* Just drop the packet that we are processing and return */
	5031	TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "INPCB_STATE_DEAD");
	5032	goto drop;
	5033	}
	5034
	5035	/*
	5036	* Process the segment text, merging it into the TCP sequencing queue,
	5037	* and arranging for acknowledgment of receipt if necessary.
	5038	* This process logically involves adjusting tp->rcv_wnd as data
	5039	* is presented to the user (this happens in tcp_usrreq.c,
	5040	* case PRU_RCVD). If a FIN has already been received on this
	5041	* connection then we just ignore the text.
	5042	*
	5043	* If we are in SYN-received state and got a valid TFO cookie, we want
	5044	* to process the data.
	5045	*/
	5046	if ((tlen \|\| (thflags & TH_FIN)) &&
	5047	TCPS_HAVERCVDFIN(tp->t_state) == 0 &&
	5048	(TCPS_HAVEESTABLISHED(tp->t_state) \|\|
	5049	(tp->t_state == TCPS_SYN_RECEIVED &&
	5050	(tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) {
	5051	tcp_seq save_start = th->th_seq;
	5052	tcp_seq save_end = th->th_seq + tlen;
	5053	m_adj(m, drop_hdrlen); /* delayed header drop */
	5054	/*
	5055	* Insert segment which includes th into TCP reassembly queue
	5056	* with control block tp. Set thflags to whether reassembly now
	5057	* includes a segment with FIN. This handles the common case
	5058	* inline (segment is the next to be received on an established
	5059	* connection, and the queue is empty), avoiding linkage into
	5060	* and removal from the queue and repetition of various
	5061	* conversions.
	5062	* Set DELACK for segments received in order, but ack
	5063	* immediately when segments are out of order (so
	5064	* fast retransmit can work).
	5065	*/
	5066	if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
	5067	TCP_INC_VAR(tp->t_unacksegs, segment_count);
	5068	/*
	5069	* Calculate the RTT on the receiver only if the
	5070	* connection is in streaming mode and the last
	5071	* packet was not an end-of-write
	5072	*/
	5073	if (tp->t_flags & TF_STREAMING_ON) {
	5074	tcp_compute_rtt(tp, &to, th);
	5075	}
	5076
	5077	if (DELAY_ACK(tp, th) &&
	5078	((tp->t_flags & TF_ACKNOW) == 0)) {
	5079	if ((tp->t_flags & TF_DELACK) == 0) {
	5080	tp->t_flags \|= TF_DELACK;
	5081	tp->t_timer[TCPT_DELACK] =
	5082	OFFSET_FROM_START(tp, tcp_delack);
	5083	}
	5084	} else {
	5085	tp->t_flags \|= TF_ACKNOW;
	5086	}
	5087	tp->rcv_nxt += tlen;
	5088	thflags = th->th_flags & TH_FIN;
	5089	TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count);
	5090	tcpstat.tcps_rcvbyte += tlen;
	5091	if (nstat_collect) {
	5092	INP_ADD_STAT(inp, cell, wifi, wired,
	5093	rxpackets, 1);
	5094	INP_ADD_STAT(inp, cell, wifi, wired,
	5095	rxbytes, tlen);
	5096	inp_set_activity_bitmap(inp);
	5097	}
	5098	tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
	5099	so_recv_data_stat(so, m, drop_hdrlen);
	5100
	5101	if (isipv6) {
	5102	memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
	5103	ip6 = (struct ip6_hdr *)&saved_hdr[0];
	5104	} else {
	5105	memcpy(&saved_hdr, ip, ip->ip_hl << 2);
	5106	ip = (struct ip *)&saved_hdr[0];
	5107	}
	5108	memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
	5109
	5110	if (th->th_flags & TH_PUSH) {
	5111	tp->t_flagsext \|= TF_LAST_IS_PSH;
	5112	} else {
	5113	tp->t_flagsext &= ~TF_LAST_IS_PSH;
	5114	}
	5115
	5116	if (sbappendstream_rcvdemux(so, m)) {
	5117	read_wakeup = 1;
	5118	}
	5119	th = &saved_tcphdr;
	5120	} else {
	5121	if (isipv6) {
	5122	memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
	5123	ip6 = (struct ip6_hdr *)&saved_hdr[0];
	5124	} else {
	5125	memcpy(&saved_hdr, ip, ip->ip_hl << 2);
	5126	ip = (struct ip *)&saved_hdr[0];
	5127	}
	5128
	5129	if (tcp_autotune_reorder) {
	5130	tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
	5131	}
	5132
	5133	memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
	5134	thflags = tcp_reass(tp, th, &tlen, m, ifp, &read_wakeup);
	5135	th = &saved_tcphdr;
	5136	tp->t_flags \|= TF_ACKNOW;
	5137	}
	5138
	5139	if ((tlen > 0 \|\| (th->th_flags & TH_FIN)) && SACK_ENABLED(tp)) {
	5140	if (th->th_flags & TH_FIN) {
	5141	save_end++;
	5142	}
	5143	tcp_update_sack_list(tp, save_start, save_end);
	5144	}
	5145
	5146	tcp_adaptive_rwtimo_check(tp, tlen);
	5147
	5148	if (tlen > 0) {
	5149	tcp_tfo_rcv_data(tp);
	5150	}
	5151
	5152	if (tp->t_flags & TF_DELACK) {
	5153	if (isipv6) {
	5154	KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) \| th->th_sport),
	5155	(((ip6->ip6_src.s6_addr16[0]) << 16) \| (ip6->ip6_dst.s6_addr16[0])),
	5156	th->th_seq, th->th_ack, th->th_win);
	5157	} else {
	5158	KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) \| th->th_sport),
	5159	(((ip->ip_src.s_addr & 0xffff) << 16) \| (ip->ip_dst.s_addr & 0xffff)),
	5160	th->th_seq, th->th_ack, th->th_win);
	5161	}
	5162	}
	5163	} else {
	5164	if ((so->so_flags & SOF_MP_SUBFLOW) && tlen == 0 &&
	5165	(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) &&
	5166	(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
	5167	m_adj(m, drop_hdrlen); /* delayed header drop */
	5168	mptcp_input(tptomptp(tp)->mpt_mpte, m);
	5169	tp->t_flags \|= TF_ACKNOW;
	5170	} else {
	5171	m_freem(m);
	5172	}
	5173	thflags &= ~TH_FIN;
	5174	}
	5175
	5176	/*
	5177	* If FIN is received ACK the FIN and let the user know
	5178	* that the connection is closing.
	5179	*/
	5180	if (thflags & TH_FIN) {
	5181	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
	5182	socantrcvmore(so);
	5183	/*
	5184	* If connection is half-synchronized
	5185	* (ie NEEDSYN flag on) then delay ACK,
	5186	* so it may be piggybacked when SYN is sent.
	5187	* Otherwise, since we received a FIN then no
	5188	* more input can be expected, send ACK now.
	5189	*/
	5190	TCP_INC_VAR(tp->t_unacksegs, segment_count);
	5191	tp->t_flags \|= TF_ACKNOW;
	5192	tp->rcv_nxt++;
	5193	}
	5194	switch (tp->t_state) {
	5195	/*
	5196	* In SYN_RECEIVED and ESTABLISHED STATES
	5197	* enter the CLOSE_WAIT state.
	5198	*/
	5199	case TCPS_SYN_RECEIVED:
	5200	tp->t_starttime = tcp_now;
	5201	OS_FALLTHROUGH;
	5202	case TCPS_ESTABLISHED:
	5203	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	5204	struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
	5205	tp->t_state = TCPS_CLOSE_WAIT;
	5206	break;
	5207
	5208	/*
	5209	* If still in FIN_WAIT_1 STATE FIN has not been acked so
	5210	* enter the CLOSING state.
	5211	*/
	5212	case TCPS_FIN_WAIT_1:
	5213	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
	5214	struct tcpcb *, tp, int32_t, TCPS_CLOSING);
	5215	tp->t_state = TCPS_CLOSING;
	5216	break;
	5217
	5218	/*
	5219	* In FIN_WAIT_2 state enter the TIME_WAIT state,
	5220	* starting the time-wait timer, turning off the other
	5221	* standard timers.
	5222	*/
	5223	case TCPS_FIN_WAIT_2:
	5224	DTRACE_TCP4(state__change, void, NULL,
	5225	struct inpcb *, inp,
	5226	struct tcpcb *, tp,
	5227	int32_t, TCPS_TIME_WAIT);
	5228	tp->t_state = TCPS_TIME_WAIT;
	5229	tcp_canceltimers(tp);
	5230	tp->t_flags \|= TF_ACKNOW;
	5231	if (tp->t_flagsext & TF_NOTIMEWAIT) {
	5232	tp->t_flags \|= TF_CLOSING;
	5233	} else {
	5234	add_to_time_wait(tp, 2 * tcp_msl);
	5235	}
	5236	soisdisconnected(so);
	5237	break;
	5238
	5239	/*
	5240	* In TIME_WAIT state restart the 2 MSL time_wait timer.
	5241	*/
	5242	case TCPS_TIME_WAIT:
	5243	add_to_time_wait(tp, 2 * tcp_msl);
	5244	break;
	5245	}
	5246	}
	5247	#if TCPDEBUG
	5248	if (so->so_options & SO_DEBUG) {
	5249	tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
	5250	&tcp_savetcp, 0);
	5251	}
	5252	#endif
	5253
	5254	if (read_wakeup) {
	5255	mptcp_handle_input(so);
	5256	}
	5257
	5258	/*
	5259	* Return any desired output.
	5260	*/
	5261	if (needoutput \|\| (tp->t_flags & TF_ACKNOW)) {
	5262	(void) tcp_output(tp);
	5263	}
	5264
	5265	tcp_check_timer_state(tp);
	5266
	5267	tcp_handle_wakeup(so, read_wakeup, write_wakeup);
	5268
	5269	socket_unlock(so, 1);
	5270	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	5271	return;
	5272
	5273	dropafterack:
	5274	/*
	5275	* Generate an ACK dropping incoming segment if it occupies
	5276	* sequence space, where the ACK reflects our state.
	5277	*
	5278	* We can now skip the test for the RST flag since all
	5279	* paths to this code happen after packets containing
	5280	* RST have been dropped.
	5281	*
	5282	* In the SYN-RECEIVED state, don't send an ACK unless the
	5283	* segment we received passes the SYN-RECEIVED ACK test.
	5284	* If it fails send a RST. This breaks the loop in the
	5285	* "LAND" DoS attack, and also prevents an ACK storm
	5286	* between two listening ports that have been sent forged
	5287	* SYN segments, each with the source address of the other.
	5288	*/
	5289	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
	5290	(SEQ_GT(tp->snd_una, th->th_ack) \|\|
	5291	SEQ_GT(th->th_ack, tp->snd_max))) {
	5292	IF_TCP_STATINC(ifp, dospacket);
	5293	goto dropwithreset;
	5294	}
	5295	#if TCPDEBUG
	5296	if (so->so_options & SO_DEBUG) {
	5297	tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
	5298	&tcp_savetcp, 0);
	5299	}
	5300	#endif
	5301	m_freem(m);
	5302	tp->t_flags \|= TF_ACKNOW;
	5303
	5304	(void) tcp_output(tp);
	5305
	5306	tcp_handle_wakeup(so, read_wakeup, write_wakeup);
	5307
	5308	/* Don't need to check timer state as we should have done it during tcp_output */
	5309	socket_unlock(so, 1);
	5310	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	5311	return;
	5312	dropwithresetnosock:
	5313	nosock = 1;
	5314	dropwithreset:
	5315	/*
	5316	* Generate a RST, dropping incoming segment.
	5317	* Make ACK acceptable to originator of segment.
	5318	* Don't bother to respond if destination was broadcast/multicast.
	5319	*/
	5320	if ((thflags & TH_RST) \|\| m->m_flags & (M_BCAST \| M_MCAST)) {
	5321	goto drop;
	5322	}
	5323	if (isipv6) {
	5324	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) \|\|
	5325	IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
	5326	goto drop;
	5327	}
	5328	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) \|\|
	5329	IN_MULTICAST(ntohl(ip->ip_src.s_addr)) \|\|
	5330	ip->ip_src.s_addr == htonl(INADDR_BROADCAST) \|\|
	5331	in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
	5332	goto drop;
	5333	}
	5334	/* IPv6 anycast check is done at tcp6_input() */
	5335
	5336	#if TCPDEBUG
	5337	if (tp == 0 \|\| (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
	5338	tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
	5339	&tcp_savetcp, 0);
	5340	}
	5341	#endif
	5342	bzero(&tra, sizeof(tra));
	5343	tra.ifscope = ifscope;
	5344	tra.awdl_unrestricted = 1;
	5345	tra.intcoproc_allowed = 1;
	5346	if (thflags & TH_ACK) {
	5347	/* mtod() below is safe as long as hdr dropping is delayed */
	5348	tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
	5349	TH_RST, &tra);
	5350	} else {
	5351	if (thflags & TH_SYN) {
	5352	tlen++;
	5353	}
	5354	/* mtod() below is safe as long as hdr dropping is delayed */
	5355	tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen,
	5356	(tcp_seq)0, TH_RST \| TH_ACK, &tra);
	5357	}
	5358	/* destroy temporarily created socket */
	5359	if (dropsocket) {
	5360	(void) soabort(so);
	5361	socket_unlock(so, 1);
	5362	} else if ((inp != NULL) && (nosock == 0)) {
	5363	tcp_handle_wakeup(so, read_wakeup, write_wakeup);
	5364
	5365	socket_unlock(so, 1);
	5366	}
	5367	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	5368	return;
	5369	dropnosock:
	5370	nosock = 1;
	5371	drop:
	5372	/*
	5373	* Drop space held by incoming segment and return.
	5374	*/
	5375	#if TCPDEBUG
	5376	if (tp == 0 \|\| (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
	5377	tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
	5378	&tcp_savetcp, 0);
	5379	}
	5380	#endif
	5381	m_freem(m);
	5382	/* destroy temporarily created socket */
	5383	if (dropsocket) {
	5384	(void) soabort(so);
	5385	socket_unlock(so, 1);
	5386	} else if (nosock == 0) {
	5387	tcp_handle_wakeup(so, read_wakeup, write_wakeup);
	5388
	5389	socket_unlock(so, 1);
	5390	}
	5391	KERNEL_DEBUG(DBG_FNC_TCP_INPUT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	5392	return;
	5393	}
	5394
	5395	/*
	5396	* Parse TCP options and place in tcpopt.
	5397	*/
	5398	static void
	5399	tcp_dooptions(struct tcpcb tp, u_char cp, int cnt, struct tcphdr *th,
	5400	struct tcpopt *to)
	5401	{
	5402	u_short mss = 0;
	5403	int opt, optlen;
	5404
	5405	for (; cnt > 0; cnt -= optlen, cp += optlen) {
	5406	opt = cp[0];
	5407	if (opt == TCPOPT_EOL) {
	5408	break;
	5409	}
	5410	if (opt == TCPOPT_NOP) {
	5411	optlen = 1;
	5412	} else {
	5413	if (cnt < 2) {
	5414	break;
	5415	}
	5416	optlen = cp[1];
	5417	if (optlen < 2 \|\| optlen > cnt) {
	5418	break;
	5419	}
	5420	}
	5421	switch (opt) {
	5422	default:
	5423	continue;
	5424
	5425	case TCPOPT_MAXSEG:
	5426	if (optlen != TCPOLEN_MAXSEG) {
	5427	continue;
	5428	}
	5429	if (!(th->th_flags & TH_SYN)) {
	5430	continue;
	5431	}
	5432	bcopy((char ) cp + 2, (char ) &mss, sizeof(mss));
	5433	NTOHS(mss);
	5434	to->to_mss = mss;
	5435	to->to_flags \|= TOF_MSS;
	5436	break;
	5437
	5438	case TCPOPT_WINDOW:
	5439	if (optlen != TCPOLEN_WINDOW) {
	5440	continue;
	5441	}
	5442	if (!(th->th_flags & TH_SYN)) {
	5443	continue;
	5444	}
	5445	to->to_flags \|= TOF_SCALE;
	5446	to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
	5447	break;
	5448
	5449	case TCPOPT_TIMESTAMP:
	5450	if (optlen != TCPOLEN_TIMESTAMP) {
	5451	continue;
	5452	}
	5453	to->to_flags \|= TOF_TS;
	5454	bcopy((char *)cp + 2,
	5455	(char *)&to->to_tsval, sizeof(to->to_tsval));
	5456	NTOHL(to->to_tsval);
	5457	bcopy((char *)cp + 6,
	5458	(char *)&to->to_tsecr, sizeof(to->to_tsecr));
	5459	NTOHL(to->to_tsecr);
	5460	/* Re-enable sending Timestamps if we received them */
	5461	if (!(tp->t_flags & TF_REQ_TSTMP)) {
	5462	tp->t_flags \|= TF_REQ_TSTMP;
	5463	}
	5464	break;
	5465	case TCPOPT_SACK_PERMITTED:
	5466	if (optlen != TCPOLEN_SACK_PERMITTED) {
	5467	continue;
	5468	}
	5469	if (th->th_flags & TH_SYN) {
	5470	to->to_flags \|= TOF_SACK;
	5471	}
	5472	break;
	5473	case TCPOPT_SACK:
	5474	if (optlen <= 2 \|\| (optlen - 2) % TCPOLEN_SACK != 0) {
	5475	continue;
	5476	}
	5477	to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
	5478	to->to_sacks = cp + 2;
	5479	tcpstat.tcps_sack_rcv_blocks++;
	5480
	5481	break;
	5482	case TCPOPT_FASTOPEN:
	5483	if (optlen == TCPOLEN_FASTOPEN_REQ) {
	5484	if (tp->t_state != TCPS_LISTEN) {
	5485	continue;
	5486	}
	5487
	5488	to->to_flags \|= TOF_TFOREQ;
	5489	} else {
	5490	if (optlen < TCPOLEN_FASTOPEN_REQ \|\|
	5491	(optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX \|\|
	5492	(optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN) {
	5493	continue;
	5494	}
	5495	if (tp->t_state != TCPS_LISTEN &&
	5496	tp->t_state != TCPS_SYN_SENT) {
	5497	continue;
	5498	}
	5499
	5500	to->to_flags \|= TOF_TFO;
	5501	to->to_tfo = cp + 1;
	5502	}
	5503
	5504	break;
	5505	#if MPTCP
	5506	case TCPOPT_MULTIPATH:
	5507	tcp_do_mptcp_options(tp, cp, th, to, optlen);
	5508	break;
	5509	#endif /* MPTCP */
	5510	}
	5511	}
	5512	}
	5513
	5514	static void
	5515	tcp_finalize_options(struct tcpcb tp, struct tcpopt to, unsigned int ifscope)
	5516	{
	5517	if (to->to_flags & TOF_TS) {
	5518	tp->t_flags \|= TF_RCVD_TSTMP;
	5519	tp->ts_recent = to->to_tsval;
	5520	tp->ts_recent_age = tcp_now;
	5521	}
	5522	if (to->to_flags & TOF_MSS) {
	5523	tcp_mss(tp, to->to_mss, ifscope);
	5524	}
	5525	if (SACK_ENABLED(tp)) {
	5526	if (!(to->to_flags & TOF_SACK)) {
	5527	tp->t_flagsext &= ~(TF_SACK_ENABLE);
	5528	} else {
	5529	tp->t_flags \|= TF_SACK_PERMIT;
	5530	}
	5531	}
	5532	if (to->to_flags & TOF_SCALE) {
	5533	tp->t_flags \|= TF_RCVD_SCALE;
	5534	tp->requested_s_scale = to->to_requested_s_scale;
	5535
	5536	/* Re-enable window scaling, if the option is received */
	5537	if (tp->request_r_scale > 0) {
	5538	tp->t_flags \|= TF_REQ_SCALE;
	5539	}
	5540	}
	5541	}
	5542
	5543	/*
	5544	* Pull out of band byte out of a segment so
	5545	* it doesn't appear in the user's data queue.
	5546	* It is still reflected in the segment length for
	5547	* sequencing purposes.
	5548	*
	5549	* @param off delayed to be droped hdrlen
	5550	*/
	5551	static void
	5552	tcp_pulloutofband(struct socket so, struct tcphdr th, struct mbuf *m, int off)
	5553	{
	5554	int cnt = off + th->th_urp - 1;
	5555
	5556	while (cnt >= 0) {
	5557	if (m->m_len > cnt) {
	5558	char *cp = mtod(m, caddr_t) + cnt;
	5559	struct tcpcb *tp = sototcpcb(so);
	5560
	5561	tp->t_iobc = *cp;
	5562	tp->t_oobflags \|= TCPOOB_HAVEDATA;
	5563	bcopy(cp + 1, cp, (unsigned)(m->m_len - cnt - 1));
	5564	m->m_len--;
	5565	if (m->m_flags & M_PKTHDR) {
	5566	m->m_pkthdr.len--;
	5567	}
	5568	return;
	5569	}
	5570	cnt -= m->m_len;
	5571	m = m->m_next;
	5572	if (m == 0) {
	5573	break;
	5574	}
	5575	}
	5576	panic("tcp_pulloutofband");
	5577	}
	5578
	5579	uint32_t
	5580	get_base_rtt(struct tcpcb *tp)
	5581	{
	5582	struct rtentry *rt = tp->t_inpcb->inp_route.ro_rt;
	5583	return (rt == NULL) ? 0 : rt->rtt_min;
	5584	}
	5585
	5586	/* Each value of RTT base represents the minimum RTT seen in a minute.
	5587	* We keep upto N_RTT_BASE minutes worth of history.
	5588	*/
	5589	void
	5590	update_base_rtt(struct tcpcb *tp, uint32_t rtt)
	5591	{
	5592	u_int32_t base_rtt, i;
	5593	struct rtentry *rt;
	5594
	5595	if ((rt = tp->t_inpcb->inp_route.ro_rt) == NULL) {
	5596	return;
	5597	}
	5598	if (rt->rtt_expire_ts == 0) {
	5599	RT_LOCK_SPIN(rt);
	5600	if (rt->rtt_expire_ts != 0) {
	5601	RT_UNLOCK(rt);
	5602	goto update;
	5603	}
	5604	rt->rtt_expire_ts = tcp_now;
	5605	rt->rtt_index = 0;
	5606	rt->rtt_hist[0] = rtt;
	5607	rt->rtt_min = rtt;
	5608	RT_UNLOCK(rt);
	5609	return;
	5610	}
	5611	update:
	5612	#if TRAFFIC_MGT
	5613	/*
	5614	* If the recv side is being throttled, check if the
	5615	* current RTT is closer to the base RTT seen in
	5616	* first (recent) two slots. If so, unthrottle the stream.
	5617	*/
	5618	if ((tp->t_flagsext & TF_RECV_THROTTLE) &&
	5619	(int)(tcp_now - tp->t_recv_throttle_ts) >= TCP_RECV_THROTTLE_WIN) {
	5620	base_rtt = rt->rtt_min;
	5621	if (tp->t_rttcur <= (base_rtt + target_qdelay)) {
	5622	tp->t_flagsext &= ~TF_RECV_THROTTLE;
	5623	tp->t_recv_throttle_ts = 0;
	5624	}
	5625	}
	5626	#endif /* TRAFFIC_MGT */
	5627	if ((int)(tcp_now - rt->rtt_expire_ts) >=
	5628	TCP_RTT_HISTORY_EXPIRE_TIME) {
	5629	RT_LOCK_SPIN(rt);
	5630	/* check the condition again to avoid race */
	5631	if ((int)(tcp_now - rt->rtt_expire_ts) >=
	5632	TCP_RTT_HISTORY_EXPIRE_TIME) {
	5633	rt->rtt_index++;
	5634	if (rt->rtt_index >= NRTT_HIST) {
	5635	rt->rtt_index = 0;
	5636	}
	5637	rt->rtt_hist[rt->rtt_index] = rtt;
	5638	rt->rtt_expire_ts = tcp_now;
	5639	} else {
	5640	rt->rtt_hist[rt->rtt_index] =
	5641	min(rt->rtt_hist[rt->rtt_index], rtt);
	5642	}
	5643	/* forget the old value and update minimum */
	5644	rt->rtt_min = 0;
	5645	for (i = 0; i < NRTT_HIST; ++i) {
	5646	if (rt->rtt_hist[i] != 0 &&
	5647	(rt->rtt_min == 0 \|\|
	5648	rt->rtt_hist[i] < rt->rtt_min)) {
	5649	rt->rtt_min = rt->rtt_hist[i];
	5650	}
	5651	}
	5652	RT_UNLOCK(rt);
	5653	} else {
	5654	rt->rtt_hist[rt->rtt_index] =
	5655	min(rt->rtt_hist[rt->rtt_index], rtt);
	5656	if (rt->rtt_min == 0) {
	5657	rt->rtt_min = rtt;
	5658	} else {
	5659	rt->rtt_min = min(rt->rtt_min, rtt);
	5660	}
	5661	}
	5662	}
	5663
	5664	/*
	5665	* If we have a timestamp reply, update smoothed RTT. If no timestamp is
	5666	* present but transmit timer is running and timed sequence number was
	5667	* acked, update smoothed RTT.
	5668	*
	5669	* If timestamps are supported, a receiver can update RTT even if
	5670	* there is no outstanding data.
	5671	*
	5672	* Some boxes send broken timestamp replies during the SYN+ACK phase,
	5673	* ignore timestamps of 0or we could calculate a huge RTT and blow up
	5674	* the retransmit timer.
	5675	*/
	5676	static void
	5677	tcp_compute_rtt(struct tcpcb tp, struct tcpopt to, struct tcphdr *th)
	5678	{
	5679	int rtt = 0;
	5680	VERIFY(to != NULL && th != NULL);
	5681	if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
	5682	u_int32_t pipe_ack_val;
	5683	rtt = tcp_now - tp->t_rtttime;
	5684	/*
	5685	* Compute pipe ack -- the amount of data acknowledged
	5686	* in the last RTT
	5687	*/
	5688	if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
	5689	pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
	5690	/* Update the sample */
	5691	tp->t_pipeack_sample[tp->t_pipeack_ind++] =
	5692	pipe_ack_val;
	5693	tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;
	5694
	5695	/* Compute the max of the pipeack samples */
	5696	pipe_ack_val = tcp_get_max_pipeack(tp);
	5697	tp->t_pipeack = (pipe_ack_val >
	5698	tcp_initial_cwnd(tp)) ?
	5699	pipe_ack_val : 0;
	5700	}
	5701	/* start another measurement */
	5702	tp->t_rtttime = 0;
	5703	}
	5704	if (((to->to_flags & TOF_TS) != 0) &&
	5705	(to->to_tsecr != 0) &&
	5706	TSTMP_GEQ(tcp_now, to->to_tsecr)) {
	5707	tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
	5708	to->to_tsecr, th->th_ack);
	5709	} else if (rtt > 0) {
	5710	tcp_xmit_timer(tp, rtt, 0, th->th_ack);
	5711	}
	5712	}
	5713
	5714	/*
	5715	* Collect new round-trip time estimate and update averages and
	5716	* current timeout.
	5717	*/
	5718	static void
	5719	tcp_xmit_timer(struct tcpcb *tp, int rtt,
	5720	u_int32_t tsecr, tcp_seq th_ack)
	5721	{
	5722	int delta;
	5723	int old_srtt = tp->t_srtt;
	5724	int old_rttvar = tp->t_rttvar;
	5725	bool log_rtt = false;
	5726
	5727	/*
	5728	* On AWDL interface, the initial RTT measurement on SYN
	5729	* can be wrong due to peer caching. Avoid the first RTT
	5730	* measurement as it might skew up the RTO.
	5731	* <rdar://problem/28739046>
	5732	*/
	5733	if (tp->t_inpcb->inp_last_outifp != NULL &&
	5734	(tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_AWDL) &&
	5735	th_ack == tp->iss + 1) {
	5736	return;
	5737	}
	5738
	5739	if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
	5740	if (SEQ_GT(th_ack, tp->snd_una) &&
	5741	SEQ_LEQ(th_ack, tp->snd_max) &&
	5742	(tsecr == 0 \|\|
	5743	TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
	5744	/*
	5745	* We received a new ACK after a
	5746	* spurious timeout. Adapt retransmission
	5747	* timer as described in rfc 4015.
	5748	*/
	5749	tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
	5750	tp->t_badrexmt_time = 0;
	5751	tp->t_srtt = max(tp->t_srtt_prev, rtt);
	5752	tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
	5753	tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
	5754	tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
	5755
	5756	if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) {
	5757	tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	5758	}
	5759
	5760	goto compute_rto;
	5761	} else {
	5762	return;
	5763	}
	5764	}
	5765
	5766	tcpstat.tcps_rttupdated++;
	5767	tp->t_rttupdated++;
	5768
	5769	if (rtt > 0) {
	5770	tp->t_rttcur = rtt;
	5771	update_base_rtt(tp, rtt);
	5772	}
	5773
	5774	if (tp->t_srtt != 0) {
	5775	/*
	5776	* srtt is stored as fixed point with 5 bits after the
	5777	* binary point (i.e., scaled by 32). The following magic
	5778	* is equivalent to the smoothing algorithm in rfc793 with
	5779	* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
	5780	* point).
	5781	*
	5782	* Freebsd adjusts rtt to origin 0 by subtracting 1
	5783	* from the provided rtt value. This was required because
	5784	* of the way t_rtttime was initiailised to 1 before.
	5785	* Since we changed t_rtttime to be based on
	5786	* tcp_now, this extra adjustment is not needed.
	5787	*/
	5788	delta = (rtt << TCP_DELTA_SHIFT)
	5789	- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
	5790
	5791	if ((tp->t_srtt += delta) <= 0) {
	5792	tp->t_srtt = 1;
	5793	}
	5794
	5795	/*
	5796	* We accumulate a smoothed rtt variance (actually, a
	5797	* smoothed mean difference), then set the retransmit
	5798	* timer to smoothed rtt + 4 times the smoothed variance.
	5799	* rttvar is stored as fixed point with 4 bits after the
	5800	* binary point (scaled by 16). The following is
	5801	* equivalent to rfc793 smoothing with an alpha of .75
	5802	* (rttvar = rttvar*3/4 + \|delta\| / 4). This replaces
	5803	* rfc793's wired-in beta.
	5804	*/
	5805	if (delta < 0) {
	5806	delta = -delta;
	5807	}
	5808	delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
	5809	if ((tp->t_rttvar += delta) <= 0) {
	5810	tp->t_rttvar = 1;
	5811	}
	5812	if (tp->t_rttbest == 0 \|\|
	5813	tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) {
	5814	tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	5815	}
	5816	} else {
	5817	/*
	5818	* No rtt measurement yet - use the unsmoothed rtt.
	5819	* Set the variance to half the rtt (so our first
	5820	* retransmit happens at 3*rtt).
	5821	*/
	5822	tp->t_srtt = rtt << TCP_RTT_SHIFT;
	5823	tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
	5824	tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	5825	}
	5826
	5827	compute_rto:
	5828	nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
	5829	tp->t_rttvar);
	5830
	5831	/*
	5832	* the retransmit should happen at rtt + 4 * rttvar.
	5833	* Because of the way we do the smoothing, srtt and rttvar
	5834	* will each average +1/2 tick of bias. When we compute
	5835	* the retransmit timer, we want 1/2 tick of rounding and
	5836	* 1 extra tick because of +-1/2 tick uncertainty in the
	5837	* firing of the timer. The bias will give us exactly the
	5838	* 1.5 tick we need. But, because the bias is
	5839	* statistical, we have to test that we don't drop below
	5840	* the minimum feasible timer (which is 2 ticks).
	5841	*/
	5842	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	5843	max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
	5844	TCP_ADD_REXMTSLOP(tp));
	5845
	5846	/*
	5847	* We received an ack for a packet that wasn't retransmitted;
	5848	* it is probably safe to discard any error indications we've
	5849	* received recently. This isn't quite right, but close enough
	5850	* for now (a route might have failed after we sent a segment,
	5851	* and the return path might not be symmetrical).
	5852	*/
	5853	tp->t_softerror = 0;
	5854
	5855	if (log_rtt) {
	5856	TCP_LOG_RTT_INFO(tp);
	5857	}
	5858
	5859	TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar);
	5860	}
	5861
	5862	static inline unsigned int
	5863	tcp_maxmtu(struct rtentry *rt)
	5864	{
	5865	unsigned int maxmtu;
	5866	int interface_mtu = 0;
	5867
	5868	RT_LOCK_ASSERT_HELD(rt);
	5869	interface_mtu = rt->rt_ifp->if_mtu;
	5870
	5871	if (rt_key(rt)->sa_family == AF_INET &&
	5872	INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
	5873	interface_mtu = IN6_LINKMTU(rt->rt_ifp);
	5874	/* Further adjust the size for CLAT46 expansion */
	5875	interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
	5876	}
	5877
	5878	if (rt->rt_rmx.rmx_mtu == 0) {
	5879	maxmtu = interface_mtu;
	5880	} else {
	5881	maxmtu = MIN(rt->rt_rmx.rmx_mtu, interface_mtu);
	5882	}
	5883
	5884	return maxmtu;
	5885	}
	5886
	5887	static inline unsigned int
	5888	tcp_maxmtu6(struct rtentry *rt)
	5889	{
	5890	unsigned int maxmtu;
	5891	struct nd_ifinfo *ndi = NULL;
	5892
	5893	RT_LOCK_ASSERT_HELD(rt);
	5894	if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized) {
	5895	ndi = NULL;
	5896	}
	5897	if (ndi != NULL) {
	5898	lck_mtx_lock(&ndi->lock);
	5899	}
	5900	if (rt->rt_rmx.rmx_mtu == 0) {
	5901	maxmtu = IN6_LINKMTU(rt->rt_ifp);
	5902	} else {
	5903	maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
	5904	}
	5905	if (ndi != NULL) {
	5906	lck_mtx_unlock(&ndi->lock);
	5907	}
	5908
	5909	return maxmtu;
	5910	}
	5911
	5912	unsigned int
	5913	get_maxmtu(struct rtentry *rt)
	5914	{
	5915	unsigned int maxmtu = 0;
	5916
	5917	RT_LOCK_ASSERT_NOTHELD(rt);
	5918
	5919	RT_LOCK(rt);
	5920
	5921	if (rt_key(rt)->sa_family == AF_INET6) {
	5922	maxmtu = tcp_maxmtu6(rt);
	5923	} else {
	5924	maxmtu = tcp_maxmtu(rt);
	5925	}
	5926
	5927	RT_UNLOCK(rt);
	5928
	5929	return maxmtu;
	5930	}
	5931
	5932	/*
	5933	* Determine a reasonable value for maxseg size.
	5934	* If the route is known, check route for mtu.
	5935	* If none, use an mss that can be handled on the outgoing
	5936	* interface without forcing IP to fragment; if bigger than
	5937	* an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
	5938	* to utilize large mbufs. If no route is found, route has no mtu,
	5939	* or the destination isn't local, use a default, hopefully conservative
	5940	* size (usually 512 or the default IP max size, but no more than the mtu
	5941	* of the interface), as we can't discover anything about intervening
	5942	* gateways or networks. We also initialize the congestion/slow start
	5943	* window. While looking at the routing entry, we also initialize
	5944	* other path-dependent parameters from pre-set or cached values
	5945	* in the routing entry.
	5946	*
	5947	* Also take into account the space needed for options that we
	5948	* send regularly. Make maxseg shorter by that amount to assure
	5949	* that we can send maxseg amount of data even when the options
	5950	* are present. Store the upper limit of the length of options plus
	5951	* data in maxopd.
	5952	*
	5953	* NOTE that this routine is only called when we process an incoming
	5954	* segment, for outgoing segments only tcp_mssopt is called.
	5955	*
	5956	*/
	5957	void
	5958	tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope)
	5959	{
	5960	struct rtentry *rt;
	5961	struct ifnet *ifp;
	5962	int rtt, mss;
	5963	u_int32_t bufsize;
	5964	struct inpcb *inp;
	5965	struct socket *so;
	5966	int origoffer = offer;
	5967	u_int32_t sb_max_corrected;
	5968	int isnetlocal = 0;
	5969	int isipv6;
	5970	int min_protoh;
	5971
	5972	inp = tp->t_inpcb;
	5973
	5974	so = inp->inp_socket;
	5975	/*
	5976	* Nothing left to send after the socket is defunct or TCP is in the closed state
	5977	*/
	5978	if ((so->so_state & SS_DEFUNCT) \|\| tp->t_state == TCPS_CLOSED) {
	5979	return;
	5980	}
	5981
	5982	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	5983	min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr)
	5984	: sizeof(struct tcpiphdr);
	5985
	5986	if (isipv6) {
	5987	rt = tcp_rtlookup6(inp, input_ifscope);
	5988	} else {
	5989	rt = tcp_rtlookup(inp, input_ifscope);
	5990	}
	5991	isnetlocal = (tp->t_flags & TF_LOCAL);
	5992
	5993	if (rt == NULL) {
	5994	tp->t_maxopd = tp->t_maxseg = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
	5995	return;
	5996	}
	5997	ifp = rt->rt_ifp;
	5998	/*
	5999	* Slower link window correction:
	6000	* If a value is specificied for slowlink_wsize use it for
	6001	* PPP links believed to be on a serial modem (speed <128Kbps).
	6002	* Excludes 9600bps as it is the default value adversized
	6003	* by pseudo-devices over ppp.
	6004	*/
	6005	if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	6006	ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
	6007	tp->t_flags \|= TF_SLOWLINK;
	6008	}
	6009
	6010	/*
	6011	* Offer == -1 means that we didn't receive SYN yet. Use 0 then.
	6012	*/
	6013	if (offer == -1) {
	6014	offer = rt->rt_rmx.rmx_filler[0];
	6015	}
	6016	/*
	6017	* Offer == 0 means that there was no MSS on the SYN segment,
	6018	* in this case we use tcp_mssdflt.
	6019	*/
	6020	if (offer == 0) {
	6021	offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
	6022	} else {
	6023	/*
	6024	* Prevent DoS attack with too small MSS. Round up
	6025	* to at least minmss.
	6026	*/
	6027	offer = max(offer, tcp_minmss);
	6028	/*
	6029	* Sanity check: make sure that maxopd will be large
	6030	* enough to allow some data on segments even is the
	6031	* all the option space is used (40bytes). Otherwise
	6032	* funny things may happen in tcp_output.
	6033	*/
	6034	offer = max(offer, 64);
	6035	}
	6036	rt->rt_rmx.rmx_filler[0] = offer;
	6037
	6038	/*
	6039	* While we're here, check if there's an initial rtt
	6040	* or rttvar. Convert from the route-table units
	6041	* to scaled multiples of the slow timeout timer.
	6042	*/
	6043	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
	6044	tcp_getrt_rtt(tp, rt);
	6045	} else {
	6046	tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
	6047	}
	6048
	6049	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
	6050
	6051	#if NECP
	6052	// At this point, the mss is just the MTU. Adjust if necessary.
	6053	mss = necp_socket_get_effective_mtu(inp, mss);
	6054	#endif /* NECP */
	6055
	6056	mss -= min_protoh;
	6057
	6058	if (rt->rt_rmx.rmx_mtu == 0) {
	6059	if (isipv6) {
	6060	if (!isnetlocal) {
	6061	mss = min(mss, tcp_v6mssdflt);
	6062	}
	6063	} else if (!isnetlocal) {
	6064	mss = min(mss, tcp_mssdflt);
	6065	}
	6066	}
	6067
	6068	mss = min(mss, offer);
	6069	/*
	6070	* maxopd stores the maximum length of data AND options
	6071	* in a segment; maxseg is the amount of data in a normal
	6072	* segment. We need to store this value (maxopd) apart
	6073	* from maxseg, because now every segment carries options
	6074	* and thus we normally have somewhat less data in segments.
	6075	*/
	6076	tp->t_maxopd = mss;
	6077
	6078	/*
	6079	* origoffer==-1 indicates, that no segments were received yet.
	6080	* In this case we just guess.
	6081	*/
	6082	if ((tp->t_flags & (TF_REQ_TSTMP \| TF_NOOPT)) == TF_REQ_TSTMP &&
	6083	(origoffer == -1 \|\|
	6084	(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) {
	6085	mss -= TCPOLEN_TSTAMP_APPA;
	6086	}
	6087
	6088	#if MPTCP
	6089	mss -= mptcp_adj_mss(tp, FALSE);
	6090	#endif /* MPTCP */
	6091	tp->t_maxseg = mss;
	6092
	6093	/*
	6094	* Calculate corrected value for sb_max; ensure to upgrade the
	6095	* numerator for large sb_max values else it will overflow.
	6096	*/
	6097	sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
	6098
	6099	/*
	6100	* If there's a pipesize (ie loopback), change the socket
	6101	* buffer to that size only if it's bigger than the current
	6102	* sockbuf size. Make the socket buffers an integral
	6103	* number of mss units; if the mss is larger than
	6104	* the socket buffer, decrease the mss.
	6105	*/
	6106	#if RTV_SPIPE
	6107	bufsize = rt->rt_rmx.rmx_sendpipe;
	6108	if (bufsize < so->so_snd.sb_hiwat)
	6109	#endif
	6110	bufsize = so->so_snd.sb_hiwat;
	6111	if (bufsize < mss) {
	6112	mss = bufsize;
	6113	} else {
	6114	bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
	6115	if (bufsize > sb_max_corrected) {
	6116	bufsize = sb_max_corrected;
	6117	}
	6118	(void)sbreserve(&so->so_snd, bufsize);
	6119	}
	6120	tp->t_maxseg = mss;
	6121
	6122	ASSERT(tp->t_maxseg);
	6123
	6124	/*
	6125	* Update MSS using recommendation from link status report. This is
	6126	* temporary
	6127	*/
	6128	tcp_update_mss_locked(so, ifp);
	6129
	6130	#if RTV_RPIPE
	6131	bufsize = rt->rt_rmx.rmx_recvpipe;
	6132	if (bufsize < so->so_rcv.sb_hiwat)
	6133	#endif
	6134	bufsize = so->so_rcv.sb_hiwat;
	6135	if (bufsize > mss) {
	6136	bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
	6137	if (bufsize > sb_max_corrected) {
	6138	bufsize = sb_max_corrected;
	6139	}
	6140	(void)sbreserve(&so->so_rcv, bufsize);
	6141	}
	6142
	6143	set_tcp_stream_priority(so);
	6144
	6145	if (rt->rt_rmx.rmx_ssthresh) {
	6146	/*
	6147	* There's some sort of gateway or interface
	6148	* buffer limit on the path. Use this to set
	6149	* slow-start threshold, but set the threshold to
	6150	* no less than 2*mss.
	6151	*/
	6152	tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
	6153	tcpstat.tcps_usedssthresh++;
	6154	} else {
	6155	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	6156	}
	6157
	6158	/*
	6159	* Set the slow-start flight size depending on whether this
	6160	* is a local network or not.
	6161	*/
	6162	if (CC_ALGO(tp)->cwnd_init != NULL) {
	6163	CC_ALGO(tp)->cwnd_init(tp);
	6164	}
	6165
	6166	tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);
	6167
	6168	/* Route locked during lookup above */
	6169	RT_UNLOCK(rt);
	6170	}
	6171
	6172	/*
	6173	* Determine the MSS option to send on an outgoing SYN.
	6174	*/
	6175	int
	6176	tcp_mssopt(struct tcpcb *tp)
	6177	{
	6178	struct rtentry *rt;
	6179	int mss;
	6180	int isipv6;
	6181	int min_protoh;
	6182
	6183	isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	6184	min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr)
	6185	: sizeof(struct tcpiphdr);
	6186
	6187	if (isipv6) {
	6188	rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
	6189	} else {
	6190	rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
	6191	}
	6192	if (rt == NULL) {
	6193	return isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
	6194	}
	6195	/*
	6196	* Slower link window correction:
	6197	* If a value is specificied for slowlink_wsize use it for PPP links
	6198	* believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
	6199	* it is the default value adversized by pseudo-devices over ppp.
	6200	*/
	6201	if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	6202	rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
	6203	tp->t_flags \|= TF_SLOWLINK;
	6204	}
	6205
	6206	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
	6207	/* Route locked during lookup above */
	6208	RT_UNLOCK(rt);
	6209
	6210	#if NECP
	6211	// At this point, the mss is just the MTU. Adjust if necessary.
	6212	mss = necp_socket_get_effective_mtu(tp->t_inpcb, mss);
	6213	#endif /* NECP */
	6214
	6215	return mss - min_protoh;
	6216	}
	6217
	6218	/*
	6219	* On a partial ack arrives, force the retransmission of the
	6220	* next unacknowledged segment. Do not clear tp->t_dupacks.
	6221	* By setting snd_nxt to th_ack, this forces retransmission timer to
	6222	* be started again.
	6223	*/
	6224	static void
	6225	tcp_newreno_partial_ack(struct tcpcb tp, struct tcphdr th)
	6226	{
	6227	tcp_seq onxt = tp->snd_nxt;
	6228	u_int32_t ocwnd = tp->snd_cwnd;
	6229	tp->t_timer[TCPT_REXMT] = 0;
	6230	tp->t_timer[TCPT_PTO] = 0;
	6231	tp->t_rtttime = 0;
	6232	tp->snd_nxt = th->th_ack;
	6233	/*
	6234	* Set snd_cwnd to one segment beyond acknowledged offset
	6235	* (tp->snd_una has not yet been updated when this function
	6236	* is called)
	6237	*/
	6238	tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
	6239	(void) tcp_output(tp);
	6240	tp->snd_cwnd = ocwnd;
	6241	if (SEQ_GT(onxt, tp->snd_nxt)) {
	6242	tp->snd_nxt = onxt;
	6243	}
	6244	/*
	6245	* Partial window deflation. Relies on fact that tp->snd_una
	6246	* not updated yet.
	6247	*/
	6248	if (tp->snd_cwnd > BYTES_ACKED(th, tp)) {
	6249	tp->snd_cwnd -= BYTES_ACKED(th, tp);
	6250	} else {
	6251	tp->snd_cwnd = 0;
	6252	}
	6253	tp->snd_cwnd += tp->t_maxseg;
	6254	}
	6255
	6256	/*
	6257	* Drop a random TCP connection that hasn't been serviced yet and
	6258	* is eligible for discard. There is a one in qlen chance that
	6259	* we will return a null, saying that there are no dropable
	6260	* requests. In this case, the protocol specific code should drop
	6261	* the new request. This insures fairness.
	6262	*
	6263	* The listening TCP socket "head" must be locked
	6264	*/
	6265	static int
	6266	tcp_dropdropablreq(struct socket *head)
	6267	{
	6268	struct socket so, sonext;
	6269	unsigned int i, j, qlen;
	6270	static u_int32_t rnd = 0;
	6271	static u_int64_t old_runtime;
	6272	static unsigned int cur_cnt, old_cnt;
	6273	u_int64_t now_sec;
	6274	struct inpcb *inp = NULL;
	6275	struct tcpcb *tp;
	6276
	6277	if ((head->so_options & SO_ACCEPTCONN) == 0) {
	6278	return 0;
	6279	}
	6280
	6281	if (TAILQ_EMPTY(&head->so_incomp)) {
	6282	return 0;
	6283	}
	6284
	6285	so_acquire_accept_list(head, NULL);
	6286	socket_unlock(head, 0);
	6287
	6288	/*
	6289	* Check if there is any socket in the incomp queue
	6290	* that is closed because of a reset from the peer and is
	6291	* waiting to be garbage collected. If so, pick that as
	6292	* the victim
	6293	*/
	6294	TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
	6295	inp = sotoinpcb(so);
	6296	tp = intotcpcb(inp);
	6297	if (tp != NULL && tp->t_state == TCPS_CLOSED &&
	6298	so->so_head != NULL &&
	6299	(so->so_state & (SS_INCOMP \| SS_CANTSENDMORE \| SS_CANTRCVMORE)) ==
	6300	(SS_INCOMP \| SS_CANTSENDMORE \| SS_CANTRCVMORE)) {
	6301	/*
	6302	* The listen socket is already locked but we
	6303	* can lock this socket here without lock ordering
	6304	* issues because it is in the incomp queue and
	6305	* is not visible to others.
	6306	*/
	6307	if (socket_try_lock(so)) {
	6308	so->so_usecount++;
	6309	goto found_victim;
	6310	} else {
	6311	continue;
	6312	}
	6313	}
	6314	}
	6315
	6316	so = TAILQ_FIRST(&head->so_incomp);
	6317
	6318	now_sec = net_uptime();
	6319	if ((i = (now_sec - old_runtime)) != 0) {
	6320	old_runtime = now_sec;
	6321	old_cnt = cur_cnt / i;
	6322	cur_cnt = 0;
	6323	}
	6324
	6325	qlen = head->so_incqlen;
	6326	if (rnd == 0) {
	6327	rnd = RandomULong();
	6328	}
	6329
	6330	if (++cur_cnt > qlen \|\| old_cnt > qlen) {
	6331	rnd = (314159 * rnd + 66329) & 0xffff;
	6332	j = ((qlen + 1) * rnd) >> 16;
	6333
	6334	while (j-- && so) {
	6335	so = TAILQ_NEXT(so, so_list);
	6336	}
	6337	}
	6338	/* Find a connection that is not already closing (or being served) */
	6339	while (so) {
	6340	inp = (struct inpcb *)so->so_pcb;
	6341
	6342	sonext = TAILQ_NEXT(so, so_list);
	6343
	6344	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
	6345	/*
	6346	* Avoid the issue of a socket being accepted
	6347	* by one input thread and being dropped by
	6348	* another input thread. If we can't get a hold
	6349	* on this mutex, then grab the next socket in
	6350	* line.
	6351	*/
	6352	if (socket_try_lock(so)) {
	6353	so->so_usecount++;
	6354	if ((so->so_usecount == 2) &&
	6355	(so->so_state & SS_INCOMP) &&
	6356	!(so->so_flags & SOF_INCOMP_INPROGRESS)) {
	6357	break;
	6358	} else {
	6359	/*
	6360	* don't use if being accepted or
	6361	* used in any other way
	6362	*/
	6363	in_pcb_checkstate(inp, WNT_RELEASE, 1);
	6364	socket_unlock(so, 1);
	6365	}
	6366	} else {
	6367	/*
	6368	* do not try to lock the inp in
	6369	* in_pcb_checkstate because the lock
	6370	* is already held in some other thread.
	6371	* Only drop the inp_wntcnt reference.
	6372	*/
	6373	in_pcb_checkstate(inp, WNT_RELEASE, 1);
	6374	}
	6375	}
	6376	so = sonext;
	6377	}
	6378	if (so == NULL) {
	6379	socket_lock(head, 0);
	6380	so_release_accept_list(head);
	6381	return 0;
	6382	}
	6383
	6384	/* Makes sure socket is still in the right state to be discarded */
	6385
	6386	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
	6387	socket_unlock(so, 1);
	6388	socket_lock(head, 0);
	6389	so_release_accept_list(head);
	6390	return 0;
	6391	}
	6392
	6393	found_victim:
	6394	if (so->so_usecount != 2 \|\| !(so->so_state & SS_INCOMP)) {
	6395	/* do not discard: that socket is being accepted */
	6396	socket_unlock(so, 1);
	6397	socket_lock(head, 0);
	6398	so_release_accept_list(head);
	6399	return 0;
	6400	}
	6401
	6402	socket_lock(head, 0);
	6403	TAILQ_REMOVE(&head->so_incomp, so, so_list);
	6404	head->so_incqlen--;
	6405	head->so_qlen--;
	6406	so->so_state &= ~SS_INCOMP;
	6407	so->so_flags \|= SOF_OVERFLOW;
	6408	so->so_head = NULL;
	6409	so_release_accept_list(head);
	6410	socket_unlock(head, 0);
	6411
	6412	socket_lock_assert_owned(so);
	6413	tp = sototcpcb(so);
	6414
	6415	tcp_close(tp);
	6416	if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
	6417	/*
	6418	* Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
	6419	* doesn't require a lock, it could have happened while
	6420	* we are holding the lock. This pcb will have to
	6421	* be garbage collected later.
	6422	* Release the reference held for so_incomp queue
	6423	*/
	6424	VERIFY(so->so_usecount > 0);
	6425	so->so_usecount--;
	6426	socket_unlock(so, 1);
	6427	} else {
	6428	/*
	6429	* Unlock this socket and leave the reference on.
	6430	* We need to acquire the pcbinfo lock in order to
	6431	* fully dispose it off
	6432	*/
	6433	socket_unlock(so, 0);
	6434
	6435	lck_rw_lock_exclusive(tcbinfo.ipi_lock);
	6436
	6437	socket_lock(so, 0);
	6438	/* Release the reference held for so_incomp queue */
	6439	VERIFY(so->so_usecount > 0);
	6440	so->so_usecount--;
	6441
	6442	if (so->so_usecount != 1 \|\|
	6443	(inp->inp_wantcnt > 0 &&
	6444	inp->inp_wantcnt != WNT_STOPUSING)) {
	6445	/*
	6446	* There is an extra wantcount or usecount
	6447	* that must have been added when the socket
	6448	* was unlocked. This socket will have to be
	6449	* garbage collected later
	6450	*/
	6451	socket_unlock(so, 1);
	6452	} else {
	6453	/* Drop the reference held for this function */
	6454	VERIFY(so->so_usecount > 0);
	6455	so->so_usecount--;
	6456
	6457	in_pcbdispose(inp);
	6458	}
	6459	lck_rw_done(tcbinfo.ipi_lock);
	6460	}
	6461	tcpstat.tcps_drops++;
	6462
	6463	socket_lock(head, 0);
	6464	return 1;
	6465	}
	6466
	6467	/* Set background congestion control on a socket */
	6468	void
	6469	tcp_set_background_cc(struct socket *so)
	6470	{
	6471	tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
	6472	}
	6473
	6474	/* Set foreground congestion control on a socket */
	6475	void
	6476	tcp_set_foreground_cc(struct socket *so)
	6477	{
	6478	if (tcp_use_newreno) {
	6479	tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
	6480	} else {
	6481	tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
	6482	}
	6483	}
	6484
	6485	static void
	6486	tcp_set_new_cc(struct socket *so, uint16_t cc_index)
	6487	{
	6488	struct inpcb *inp = sotoinpcb(so);
	6489	struct tcpcb *tp = intotcpcb(inp);
	6490	u_char old_cc_index = 0;
	6491	if (tp->tcp_cc_index != cc_index) {
	6492	old_cc_index = tp->tcp_cc_index;
	6493
	6494	if (CC_ALGO(tp)->cleanup != NULL) {
	6495	CC_ALGO(tp)->cleanup(tp);
	6496	}
	6497	tp->tcp_cc_index = cc_index;
	6498
	6499	tcp_cc_allocate_state(tp);
	6500
	6501	if (CC_ALGO(tp)->switch_to != NULL) {
	6502	CC_ALGO(tp)->switch_to(tp, old_cc_index);
	6503	}
	6504
	6505	tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
	6506	}
	6507	}
	6508
	6509	void
	6510	tcp_set_recv_bg(struct socket *so)
	6511	{
	6512	if (!IS_TCP_RECV_BG(so)) {
	6513	so->so_flags1 \|= SOF1_TRAFFIC_MGT_TCP_RECVBG;
	6514	}
	6515	}
	6516
	6517	void
	6518	tcp_clear_recv_bg(struct socket *so)
	6519	{
	6520	if (IS_TCP_RECV_BG(so)) {
	6521	so->so_flags1 &= ~(SOF1_TRAFFIC_MGT_TCP_RECVBG);
	6522	}
	6523	}
	6524
	6525	void
	6526	inp_fc_throttle_tcp(struct inpcb *inp)
	6527	{
	6528	struct tcpcb *tp = inp->inp_ppcb;
	6529
	6530	if (!tcp_flow_control_response) {
	6531	return;
	6532	}
	6533
	6534	/*
	6535	* Back off the slow-start threshold and enter
	6536	* congestion avoidance phase
	6537	*/
	6538	if (CC_ALGO(tp)->pre_fr != NULL) {
	6539	CC_ALGO(tp)->pre_fr(tp);
	6540	}
	6541	}
	6542
	6543	void
	6544	inp_fc_unthrottle_tcp(struct inpcb *inp)
	6545	{
	6546	struct tcpcb *tp = inp->inp_ppcb;
	6547
	6548	if (tcp_flow_control_response) {
	6549	if (CC_ALGO(tp)->post_fr != NULL) {
	6550	CC_ALGO(tp)->post_fr(tp, NULL);
	6551	}
	6552
	6553	tp->t_bytes_acked = 0;
	6554
	6555	/*
	6556	* Reset retransmit shift as we know that the reason
	6557	* for delay in sending a packet is due to flow
	6558	* control on the outgoing interface. There is no need
	6559	* to backoff retransmit timer.
	6560	*/
	6561	TCP_RESET_REXMT_STATE(tp);
	6562
	6563	tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
	6564
	6565	/*
	6566	* Start the output stream again. Since we are
	6567	* not retransmitting data, do not reset the
	6568	* retransmit timer or rtt calculation.
	6569	*/
	6570	tcp_output(tp);
	6571	return;
	6572	}
	6573
	6574	/*
	6575	* Back off the slow-start threshold and enter
	6576	* congestion avoidance phase
	6577	*/
	6578	if (CC_ALGO(tp)->pre_fr != NULL) {
	6579	CC_ALGO(tp)->pre_fr(tp);
	6580	}
	6581
	6582	tp->snd_cwnd = tp->snd_ssthresh;
	6583	tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
	6584	/*
	6585	* Restart counting for ABC as we changed the
	6586	* congestion window just now.
	6587	*/
	6588	tp->t_bytes_acked = 0;
	6589
	6590	/* Reset retransmit shift as we know that the reason
	6591	* for delay in sending a packet is due to flow
	6592	* control on the outgoing interface. There is no need
	6593	* to backoff retransmit timer.
	6594	*/
	6595	TCP_RESET_REXMT_STATE(tp);
	6596
	6597	/*
	6598	* Start the output stream again. Since we are
	6599	* not retransmitting data, do not reset the
	6600	* retransmit timer or rtt calculation.
	6601	*/
	6602	tcp_output(tp);
	6603	}
	6604
	6605	static int
	6606	tcp_getstat SYSCTL_HANDLER_ARGS
	6607	{
	6608	#pragma unused(oidp, arg1, arg2)
	6609
	6610	int error;
	6611	struct tcpstat *stat;
	6612	stat = &tcpstat;
	6613	#if XNU_TARGET_OS_OSX
	6614	struct tcpstat zero_stat;
	6615
	6616	if (tcp_disable_access_to_stats &&
	6617	!kauth_cred_issuser(kauth_cred_get())) {
	6618	bzero(&zero_stat, sizeof(zero_stat));
	6619	stat = &zero_stat;
	6620	}
	6621
	6622	#endif /* XNU_TARGET_OS_OSX */
	6623
	6624	if (req->oldptr == 0) {
	6625	req->oldlen = (size_t)sizeof(struct tcpstat);
	6626	}
	6627
	6628	error = SYSCTL_OUT(req, stat, MIN(sizeof(tcpstat), req->oldlen));
	6629
	6630	return error;
	6631	}
	6632
	6633	/*
	6634	* Checksum extended TCP header and data.
	6635	*/
	6636	int
	6637	tcp_input_checksum(int af, struct mbuf m, struct tcphdr th, int off, int tlen)
	6638	{
	6639	struct ifnet *ifp = m->m_pkthdr.rcvif;
	6640
	6641	switch (af) {
	6642	case AF_INET: {
	6643	struct ip ip = mtod(m, struct ip );
	6644	struct ipovly ipov = (struct ipovly )ip;
	6645
	6646	/* ip_stripoptions() must have been called before we get here */
	6647	ASSERT((ip->ip_hl << 2) == sizeof(*ip));
	6648
	6649	if ((hwcksum_rx \|\| (ifp->if_flags & IFF_LOOPBACK) \|\|
	6650	(m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
	6651	(m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
	6652	if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
	6653	th->th_sum = m->m_pkthdr.csum_rx_val;
	6654	} else {
	6655	uint32_t sum = m->m_pkthdr.csum_rx_val;
	6656	uint32_t start = m->m_pkthdr.csum_rx_start;
	6657	int32_t trailer = (m_pktlen(m) - (off + tlen));
	6658
	6659	/*
	6660	* Perform 1's complement adjustment of octets
	6661	* that got included/excluded in the hardware-
	6662	* calculated checksum value. Ignore cases
	6663	* where the value already includes the entire
	6664	* IP header span, as the sum for those octets
	6665	* would already be 0 by the time we get here;
	6666	* IP has already performed its header checksum
	6667	* checks. If we do need to adjust, restore
	6668	* the original fields in the IP header when
	6669	* computing the adjustment value. Also take
	6670	* care of any trailing bytes and subtract out
	6671	* their partial sum.
	6672	*/
	6673	ASSERT(trailer >= 0);
	6674	if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
	6675	((start != 0 && start != off) \|\| trailer)) {
	6676	uint32_t swbytes = (uint32_t)trailer;
	6677
	6678	if (start < off) {
	6679	ip->ip_len += sizeof(*ip);
	6680	#if BYTE_ORDER != BIG_ENDIAN
	6681	HTONS(ip->ip_len);
	6682	HTONS(ip->ip_off);
	6683	#endif /* BYTE_ORDER != BIG_ENDIAN */
	6684	}
	6685	/* callee folds in sum */
	6686	sum = m_adj_sum16(m, start, off,
	6687	tlen, sum);
	6688	if (off > start) {
	6689	swbytes += (off - start);
	6690	} else {
	6691	swbytes += (start - off);
	6692	}
	6693
	6694	if (start < off) {
	6695	#if BYTE_ORDER != BIG_ENDIAN
	6696	NTOHS(ip->ip_off);
	6697	NTOHS(ip->ip_len);
	6698	#endif /* BYTE_ORDER != BIG_ENDIAN */
	6699	ip->ip_len -= sizeof(*ip);
	6700	}
	6701
	6702	if (swbytes != 0) {
	6703	tcp_in_cksum_stats(swbytes);
	6704	}
	6705	if (trailer != 0) {
	6706	m_adj(m, -trailer);
	6707	}
	6708	}
	6709
	6710	/* callee folds in sum */
	6711	th->th_sum = in_pseudo(ip->ip_src.s_addr,
	6712	ip->ip_dst.s_addr,
	6713	sum + htonl(tlen + IPPROTO_TCP));
	6714	}
	6715	th->th_sum ^= 0xffff;
	6716	} else {
	6717	uint16_t ip_sum;
	6718	int len;
	6719	char b[9];
	6720
	6721	bcopy(ipov->ih_x1, b, sizeof(ipov->ih_x1));
	6722	bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
	6723	ip_sum = ipov->ih_len;
	6724	ipov->ih_len = (u_short)tlen;
	6725	#if BYTE_ORDER != BIG_ENDIAN
	6726	HTONS(ipov->ih_len);
	6727	#endif
	6728	len = sizeof(struct ip) + tlen;
	6729	th->th_sum = in_cksum(m, len);
	6730	bcopy(b, ipov->ih_x1, sizeof(ipov->ih_x1));
	6731	ipov->ih_len = ip_sum;
	6732
	6733	tcp_in_cksum_stats(len);
	6734	}
	6735	break;
	6736	}
	6737	case AF_INET6: {
	6738	struct ip6_hdr ip6 = mtod(m, struct ip6_hdr );
	6739
	6740	if ((hwcksum_rx \|\| (ifp->if_flags & IFF_LOOPBACK) \|\|
	6741	(m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
	6742	(m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
	6743	if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
	6744	th->th_sum = m->m_pkthdr.csum_rx_val;
	6745	} else {
	6746	uint32_t sum = m->m_pkthdr.csum_rx_val;
	6747	uint32_t start = m->m_pkthdr.csum_rx_start;
	6748	int32_t trailer = (m_pktlen(m) - (off + tlen));
	6749
	6750	/*
	6751	* Perform 1's complement adjustment of octets
	6752	* that got included/excluded in the hardware-
	6753	* calculated checksum value. Also take care
	6754	* of any trailing bytes and subtract out their
	6755	* partial sum.
	6756	*/
	6757	ASSERT(trailer >= 0);
	6758	if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
	6759	(start != off \|\| trailer != 0)) {
	6760	uint16_t s = 0, d = 0;
	6761	uint32_t swbytes = (uint32_t)trailer;
	6762
	6763	if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
	6764	s = ip6->ip6_src.s6_addr16[1];
	6765	ip6->ip6_src.s6_addr16[1] = 0;
	6766	}
	6767	if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
	6768	d = ip6->ip6_dst.s6_addr16[1];
	6769	ip6->ip6_dst.s6_addr16[1] = 0;
	6770	}
	6771
	6772	/* callee folds in sum */
	6773	sum = m_adj_sum16(m, start, off,
	6774	tlen, sum);
	6775	if (off > start) {
	6776	swbytes += (off - start);
	6777	} else {
	6778	swbytes += (start - off);
	6779	}
	6780
	6781	if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
	6782	ip6->ip6_src.s6_addr16[1] = s;
	6783	}
	6784	if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
	6785	ip6->ip6_dst.s6_addr16[1] = d;
	6786	}
	6787
	6788	if (swbytes != 0) {
	6789	tcp_in6_cksum_stats(swbytes);
	6790	}
	6791	if (trailer != 0) {
	6792	m_adj(m, -trailer);
	6793	}
	6794	}
	6795
	6796	th->th_sum = in6_pseudo(
	6797	&ip6->ip6_src, &ip6->ip6_dst,
	6798	sum + htonl(tlen + IPPROTO_TCP));
	6799	}
	6800	th->th_sum ^= 0xffff;
	6801	} else {
	6802	tcp_in6_cksum_stats(tlen);
	6803	th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
	6804	}
	6805	break;
	6806	}
	6807	default:
	6808	VERIFY(0);
	6809	/* NOTREACHED */
	6810	}
	6811
	6812	if (th->th_sum != 0) {
	6813	tcpstat.tcps_rcvbadsum++;
	6814	IF_TCP_STATINC(ifp, badformat);
	6815	return -1;
	6816	}
	6817
	6818	return 0;
	6819	}
	6820
	6821
	6822	SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
	6823	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, 0, 0, tcp_getstat,
	6824	"S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
	6825
	6826	static int
	6827	sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
	6828	{
	6829	#pragma unused(arg1, arg2)
	6830
	6831	int error, val = tcprexmtthresh;
	6832
	6833	error = sysctl_handle_int(oidp, &val, 0, req);
	6834	if (error \|\| !req->newptr) {
	6835	return error;
	6836	}
	6837
	6838	/*
	6839	* Constrain the number of duplicate ACKs
	6840	* to consider for TCP fast retransmit
	6841	* to either 2 or 3
	6842	*/
	6843
	6844	if (val < 2 \|\| val > 3) {
	6845	return EINVAL;
	6846	}
	6847
	6848	tcprexmtthresh = val;
	6849
	6850	return 0;
	6851	}
	6852
	6853	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT \| CTLFLAG_RW \|
	6854	CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
	6855	"Duplicate ACK Threshold for Fast Retransmit");