git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2019 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
	30	* The Regents of the University of California. All rights reserved.
	31	*
	32	* Redistribution and use in source and binary forms, with or without
	33	* modification, are permitted provided that the following conditions
	34	* are met:
	35	* 1. Redistributions of source code must retain the above copyright
	36	* notice, this list of conditions and the following disclaimer.
	37	* 2. Redistributions in binary form must reproduce the above copyright
	38	* notice, this list of conditions and the following disclaimer in the
	39	* documentation and/or other materials provided with the distribution.
	40	* 3. All advertising materials mentioning features or use of this software
	41	* must display the following acknowledgement:
	42	* This product includes software developed by the University of
	43	* California, Berkeley and its contributors.
	44	* 4. Neither the name of the University nor the names of its contributors
	45	* may be used to endorse or promote products derived from this software
	46	* without specific prior written permission.
	47	*
	48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	58	* SUCH DAMAGE.
	59	*
	60	* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
	61	* $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
	62	*/
	63	/*
	64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
	65	* support for mandatory and extensible security protections. This notice
	66	* is included in support of clause 2.2 (b) of the Apple Public License,
	67	* Version 2.0.
	68	*/
	69
	70	#define _IP_VHL
	71
	72
	73	#include <sys/param.h>
	74	#include <sys/systm.h>
	75	#include <sys/kernel.h>
	76	#include <sys/sysctl.h>
	77	#include <sys/mbuf.h>
	78	#include <sys/domain.h>
	79	#include <sys/protosw.h>
	80	#include <sys/socket.h>
	81	#include <sys/socketvar.h>
	82
	83	#include <net/route.h>
	84	#include <net/ntstat.h>
	85	#include <net/if_var.h>
	86	#include <net/if.h>
	87	#include <net/if_types.h>
	88	#include <net/dlil.h>
	89
	90	#include <netinet/in.h>
	91	#include <netinet/in_systm.h>
	92	#include <netinet/in_var.h>
	93	#include <netinet/in_tclass.h>
	94	#include <netinet/ip.h>
	95	#include <netinet/in_pcb.h>
	96	#include <netinet/ip_var.h>
	97	#include <mach/sdt.h>
	98	#if INET6
	99	#include <netinet6/in6_pcb.h>
	100	#include <netinet/ip6.h>
	101	#include <netinet6/ip6_var.h>
	102	#endif
	103	#include <netinet/tcp.h>
	104	#define TCPOUTFLAGS
	105	#include <netinet/tcp_cache.h>
	106	#include <netinet/tcp_fsm.h>
	107	#include <netinet/tcp_seq.h>
	108	#include <netinet/tcp_timer.h>
	109	#include <netinet/tcp_var.h>
	110	#include <netinet/tcpip.h>
	111	#include <netinet/tcp_cc.h>
	112	#if TCPDEBUG
	113	#include <netinet/tcp_debug.h>
	114	#endif
	115	#include <netinet/tcp_log.h>
	116	#include <sys/kdebug.h>
	117	#include <mach/sdt.h>
	118
	119	#if IPSEC
	120	#include <netinet6/ipsec.h>
	121	#endif /*IPSEC*/
	122
	123	#if CONFIG_MACF_NET
	124	#include <security/mac_framework.h>
	125	#endif /* CONFIG_MACF_NET */
	126
	127	#include <netinet/lro_ext.h>
	128	#if MPTCP
	129	#include <netinet/mptcp_var.h>
	130	#include <netinet/mptcp.h>
	131	#include <netinet/mptcp_opt.h>
	132	#endif
	133
	134	#include <corecrypto/ccaes.h>
	135
	136	#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
	137	#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
	138	#define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) \| 1)
	139
	140	SYSCTL_SKMEM_TCP_INT(OID_AUTO, path_mtu_discovery,
	141	CTLFLAG_RW \| CTLFLAG_LOCKED, int, path_mtu_discovery, 1,
	142	"Enable Path MTU Discovery");
	143
	144	SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowstart_flightsize,
	145	CTLFLAG_RW \| CTLFLAG_LOCKED, int, ss_fltsz, 1,
	146	"Slow start flight size");
	147
	148	SYSCTL_SKMEM_TCP_INT(OID_AUTO, local_slowstart_flightsize,
	149	CTLFLAG_RW \| CTLFLAG_LOCKED, int, ss_fltsz_local, 8,
	150	"Slow start flight size for local networks");
	151
	152	int tcp_do_tso = 1;
	153	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW \| CTLFLAG_LOCKED,
	154	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
	155
	156	SYSCTL_SKMEM_TCP_INT(OID_AUTO, ecn_setup_percentage,
	157	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_ecn_setup_percentage, 100,
	158	"Max ECN setup percentage");
	159
	160	static int
	161	sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
	162	{
	163	#pragma unused(oidp, arg1, arg2)
	164	int i, err = 0, changed = 0;
	165	struct ifnet *ifp;
	166
	167	err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t),
	168	&i, &changed);
	169	if (err != 0 \|\| req->newptr == USER_ADDR_NULL)
	170	return err;
	171
	172	if (changed) {
	173	if ((tcp_ecn_outbound == 0 \|\| tcp_ecn_outbound == 1) &&
	174	(i == 0 \|\| i == 1)) {
	175	tcp_ecn_outbound = i;
	176	SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound);
	177	return err;
	178	}
	179	if (tcp_ecn_outbound == 2 && (i == 0 \|\| i == 1)) {
	180	/*
	181	* Reset ECN enable flags on non-cellular
	182	* interfaces so that the system default will take
	183	* over
	184	*/
	185	ifnet_head_lock_shared();
	186	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
	187	if (!IFNET_IS_CELLULAR(ifp)) {
	188	ifnet_lock_exclusive(ifp);
	189	ifp->if_eflags &= ~IFEF_ECN_DISABLE;
	190	ifp->if_eflags &= ~IFEF_ECN_ENABLE;
	191	ifnet_lock_done(ifp);
	192	}
	193	}
	194	ifnet_head_done();
	195	} else {
	196	/*
	197	* Set ECN enable flags on non-cellular
	198	* interfaces
	199	*/
	200	ifnet_head_lock_shared();
	201	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
	202	if (!IFNET_IS_CELLULAR(ifp)) {
	203	ifnet_lock_exclusive(ifp);
	204	ifp->if_eflags \|= IFEF_ECN_ENABLE;
	205	ifp->if_eflags &= ~IFEF_ECN_DISABLE;
	206	ifnet_lock_done(ifp);
	207	}
	208	}
	209	ifnet_head_done();
	210	}
	211	tcp_ecn_outbound = i;
	212	SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound);
	213	}
	214	/* Change the other one too as the work is done */
	215	if (i == 2 \|\| tcp_ecn_inbound == 2) {
	216	tcp_ecn_inbound = i;
	217	SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in, tcp_ecn_inbound);
	218	}
	219	return err;
	220	}
	221
	222	int tcp_ecn_outbound = 2;
	223	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
	224	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
	225	sysctl_change_ecn_setting, "IU",
	226	"Initiate ECN for outbound connections");
	227
	228	int tcp_ecn_inbound = 2;
	229	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
	230	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_ecn_inbound, 0,
	231	sysctl_change_ecn_setting, "IU",
	232	"Initiate ECN for inbound connections");
	233
	234	SYSCTL_SKMEM_TCP_INT(OID_AUTO, packetchain,
	235	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_packet_chaining, 50,
	236	"Enable TCP output packet chaining");
	237
	238	SYSCTL_SKMEM_TCP_INT(OID_AUTO, socket_unlocked_on_output,
	239	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_output_unlocked, 1,
	240	"Unlock TCP when sending packets down to IP");
	241
	242	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3390,
	243	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_do_rfc3390, 1,
	244	"Calculate intial slowstart cwnd depending on MSS");
	245
	246	SYSCTL_SKMEM_TCP_INT(OID_AUTO, min_iaj_win,
	247	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_min_iaj_win, MIN_IAJ_WIN,
	248	"Minimum recv win based on inter-packet arrival jitter");
	249
	250	SYSCTL_SKMEM_TCP_INT(OID_AUTO, acc_iaj_react_limit,
	251	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_acc_iaj_react_limit,
	252	ACC_IAJ_REACT_LIMIT, "Accumulated IAJ when receiver starts to react");
	253
	254	SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautosndbuf,
	255	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_do_autosendbuf, 1,
	256	"Enable send socket buffer auto-tuning");
	257
	258	SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufinc,
	259	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_inc,
	260	8 * 1024, "Increment in send socket bufffer size");
	261
	262	SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufmax,
	263	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_max, 512 * 1024,
	264	"Maximum send socket buffer size");
	265
	266	SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_prioritize,
	267	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_prioritize_acks, 1,
	268	"Prioritize pure acks");
	269
	270	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_recvbg,
	271	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_use_rtt_recvbg, 1,
	272	"Use RTT for bg recv algorithm");
	273
	274	SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_throttle_minwin,
	275	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_recv_throttle_minwin, 16 * 1024,
	276	"Minimum recv win for throttling");
	277
	278	SYSCTL_SKMEM_TCP_INT(OID_AUTO, enable_tlp,
	279	CTLFLAG_RW \| CTLFLAG_LOCKED,
	280	int32_t, tcp_enable_tlp, 1, "Enable Tail loss probe");
	281
	282	static int32_t packchain_newlist = 0;
	283	static int32_t packchain_looped = 0;
	284	static int32_t packchain_sent = 0;
	285
	286	/* temporary: for testing */
	287	#if IPSEC
	288	extern int ipsec_bypass;
	289	#endif
	290
	291	extern int slowlink_wsize; /* window correction for slow links */
	292	#if IPFIREWALL
	293	extern int fw_enable; /* firewall check for packet chaining */
	294	extern int fw_bypass; /* firewall check: disable packet chaining if there is rules */
	295	#endif /* IPFIREWALL */
	296
	297	extern u_int32_t dlil_filter_disable_tso_count;
	298	extern u_int32_t kipf_count;
	299
	300	static int tcp_ip_output(struct socket , struct tcpcb , struct mbuf *,
	301	int, struct mbuf *, int, int, boolean_t);
	302	static struct mbuf* tcp_send_lroacks(struct tcpcb tp, struct mbuf m, struct tcphdr *th);
	303	static int tcp_recv_throttle(struct tcpcb *tp);
	304
	305	static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len)
	306	{
	307	struct socket *so = tp->t_inpcb->inp_socket;
	308	unsigned int optlen = 0;
	309	unsigned int cookie_len;
	310
	311	if (tp->t_flags & TF_NOOPT)
	312	goto fallback;
	313
	314	if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
	315	!tcp_heuristic_do_tfo(tp)) {
	316	tp->t_tfo_stats \|= TFO_S_HEURISTICS_DISABLE;
	317	tcpstat.tcps_tfo_heuristics_disable++;
	318	goto fallback;
	319	}
	320
	321	if (so->so_flags1 & SOF1_DATA_AUTHENTICATED)
	322	return len;
	323
	324	optlen += TCPOLEN_MAXSEG;
	325
	326	if (tp->t_flags & TF_REQ_SCALE)
	327	optlen += 4;
	328
	329	#if MPTCP
	330	if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
	331	(tp->t_rxtshift <= mptcp_mpcap_retries \|\|
	332	(tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE)))
	333	optlen += sizeof(struct mptcp_mpcapable_opt_common) + sizeof(mptcp_key_t);
	334	#endif /* MPTCP */
	335
	336	if (tp->t_flags & TF_REQ_TSTMP)
	337	optlen += TCPOLEN_TSTAMP_APPA;
	338
	339	if (SACK_ENABLED(tp))
	340	optlen += TCPOLEN_SACK_PERMITTED;
	341
	342	/* Now, decide whether to use TFO or not */
	343
	344	/* Don't even bother trying if there is no space at all... */
	345	if (MAX_TCPOPTLEN - optlen < TCPOLEN_FASTOPEN_REQ)
	346	goto fallback;
	347
	348	cookie_len = tcp_cache_get_cookie_len(tp);
	349	if (cookie_len == 0)
	350	/* No cookie, so we request one */
	351	return 0;
	352
	353	/* There is not enough space for the cookie, so we cannot do TFO */
	354	if (MAX_TCPOPTLEN - optlen < cookie_len)
	355	goto fallback;
	356
	357	/* Do not send SYN+data if there is more in the queue than MSS */
	358	if (so->so_snd.sb_cc > (tp->t_maxopd - MAX_TCPOPTLEN))
	359	goto fallback;
	360
	361	/* Ok, everything looks good. We can go on and do TFO */
	362	return len;
	363
	364	fallback:
	365	tcp_disable_tfo(tp);
	366	return 0;
	367	}
	368
	369	/* Returns the number of bytes written to the TCP option-space */
	370	static unsigned
	371	tcp_tfo_write_cookie_rep(struct tcpcb tp, unsigned optlen, u_char opt)
	372	{
	373	u_char out[CCAES_BLOCK_SIZE];
	374	unsigned ret = 0;
	375	u_char *bp;
	376
	377	if ((MAX_TCPOPTLEN - optlen) <
	378	(TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT))
	379	return ret;
	380
	381	tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
	382
	383	bp = opt + optlen;
	384
	385	*bp++ = TCPOPT_FASTOPEN;
	386	*bp++ = 2 + TFO_COOKIE_LEN_DEFAULT;
	387	memcpy(bp, out, TFO_COOKIE_LEN_DEFAULT);
	388	ret += 2 + TFO_COOKIE_LEN_DEFAULT;
	389
	390	tp->t_tfo_stats \|= TFO_S_COOKIE_SENT;
	391	tcpstat.tcps_tfo_cookie_sent++;
	392
	393	return ret;
	394	}
	395
	396	static unsigned
	397	tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len,
	398	u_char *opt)
	399	{
	400	u_int8_t tfo_len = MAX_TCPOPTLEN - optlen - TCPOLEN_FASTOPEN_REQ;
	401	struct socket *so = tp->t_inpcb->inp_socket;
	402	unsigned ret = 0;
	403	int res;
	404	u_char *bp;
	405
	406	if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
	407	/* If there is some data, let's track it */
	408	if (len > 0) {
	409	tp->t_tfo_stats \|= TFO_S_SYN_DATA_SENT;
	410	tcpstat.tcps_tfo_syn_data_sent++;
	411	}
	412
	413	return 0;
	414	}
	415
	416	bp = opt + optlen;
	417
	418	/*
	419	* The cookie will be copied in the appropriate place within the
	420	* TCP-option space. That way we avoid the need for an intermediate
	421	* variable.
	422	*/
	423	res = tcp_cache_get_cookie(tp, bp + TCPOLEN_FASTOPEN_REQ, &tfo_len);
	424	if (res == 0) {
	425	*bp++ = TCPOPT_FASTOPEN;
	426	*bp++ = TCPOLEN_FASTOPEN_REQ;
	427	ret += TCPOLEN_FASTOPEN_REQ;
	428
	429	tp->t_tfo_flags \|= TFO_F_COOKIE_REQ;
	430
	431	tp->t_tfo_stats \|= TFO_S_COOKIE_REQ;
	432	tcpstat.tcps_tfo_cookie_req++;
	433	} else {
	434	*bp++ = TCPOPT_FASTOPEN;
	435	*bp++ = TCPOLEN_FASTOPEN_REQ + tfo_len;
	436
	437	ret += TCPOLEN_FASTOPEN_REQ + tfo_len;
	438
	439	tp->t_tfo_flags \|= TFO_F_COOKIE_SENT;
	440
	441	/* If there is some data, let's track it */
	442	if (len > 0) {
	443	tp->t_tfo_stats \|= TFO_S_SYN_DATA_SENT;
	444	tcpstat.tcps_tfo_syn_data_sent++;
	445	}
	446	}
	447
	448	return ret;
	449	}
	450
	451	static inline bool
	452	tcp_send_ecn_flags_on_syn(struct tcpcb tp, struct socket so)
	453	{
	454	return !((tp->ecn_flags & TE_SETUPSENT \|\|
	455	(so->so_flags & SOF_MP_SUBFLOW) \|\|
	456	(tfo_enabled(tp))));
	457	}
	458
	459	void
	460	tcp_set_ecn(struct tcpcb tp, struct ifnet ifp)
	461	{
	462	boolean_t inbound;
	463
	464	/*
	465	* Socket option has precedence
	466	*/
	467	if (tp->ecn_flags & TE_ECN_MODE_ENABLE) {
	468	tp->ecn_flags \|= TE_ENABLE_ECN;
	469	goto check_heuristic;
	470	}
	471
	472	if (tp->ecn_flags & TE_ECN_MODE_DISABLE) {
	473	tp->ecn_flags &= ~TE_ENABLE_ECN;
	474	return;
	475	}
	476	/*
	477	* Per interface setting comes next
	478	*/
	479	if (ifp != NULL) {
	480	if (ifp->if_eflags & IFEF_ECN_ENABLE) {
	481	tp->ecn_flags \|= TE_ENABLE_ECN;
	482	goto check_heuristic;
	483	}
	484
	485	if (ifp->if_eflags & IFEF_ECN_DISABLE) {
	486	tp->ecn_flags &= ~TE_ENABLE_ECN;
	487	return;
	488	}
	489	}
	490	/*
	491	* System wide settings come last
	492	*/
	493	inbound = (tp->t_inpcb->inp_socket->so_head != NULL);
	494	if ((inbound && tcp_ecn_inbound == 1) \|\|
	495	(!inbound && tcp_ecn_outbound == 1)) {
	496	tp->ecn_flags \|= TE_ENABLE_ECN;
	497	goto check_heuristic;
	498	} else {
	499	tp->ecn_flags &= ~TE_ENABLE_ECN;
	500	}
	501
	502	return;
	503
	504	check_heuristic:
	505	if (!tcp_heuristic_do_ecn(tp))
	506	tp->ecn_flags &= ~TE_ENABLE_ECN;
	507
	508	/*
	509	* If the interface setting, system-level setting and heuristics
	510	* allow to enable ECN, randomly select 5% of connections to
	511	* enable it
	512	*/
	513	if ((tp->ecn_flags & (TE_ECN_MODE_ENABLE \| TE_ECN_MODE_DISABLE
	514	\| TE_ENABLE_ECN)) == TE_ENABLE_ECN) {
	515	/*
	516	* Use the random value in iss for randomizing
	517	* this selection
	518	*/
	519	if ((tp->iss % 100) >= tcp_ecn_setup_percentage)
	520	tp->ecn_flags &= ~TE_ENABLE_ECN;
	521	}
	522	}
	523
	524	/*
	525	* Tcp output routine: figure out what should be sent and send it.
	526	*
	527	* Returns: 0 Success
	528	* EADDRNOTAVAIL
	529	* ENOBUFS
	530	* EMSGSIZE
	531	* EHOSTUNREACH
	532	* ENETDOWN
	533	* ip_output_list:ENOMEM
	534	* ip_output_list:EADDRNOTAVAIL
	535	* ip_output_list:ENETUNREACH
	536	* ip_output_list:EHOSTUNREACH
	537	* ip_output_list:EACCES
	538	* ip_output_list:EMSGSIZE
	539	* ip_output_list:ENOBUFS
	540	* ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
	541	* ip6_output_list:EINVAL
	542	* ip6_output_list:EOPNOTSUPP
	543	* ip6_output_list:EHOSTUNREACH
	544	* ip6_output_list:EADDRNOTAVAIL
	545	* ip6_output_list:ENETUNREACH
	546	* ip6_output_list:EMSGSIZE
	547	* ip6_output_list:ENOBUFS
	548	* ip6_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
	549	*/
	550	int
	551	tcp_output(struct tcpcb *tp)
	552	{
	553	struct inpcb *inp = tp->t_inpcb;
	554	struct socket *so = inp->inp_socket;
	555	int32_t len, recwin, sendwin, off;
	556	int flags, error;
	557	struct mbuf *m;
	558	struct ip *ip = NULL;
	559	struct ipovly *ipov = NULL;
	560	#if INET6
	561	struct ip6_hdr *ip6 = NULL;
	562	#endif /* INET6 */
	563	struct tcphdr *th;
	564	u_char opt[TCP_MAXOLEN];
	565	unsigned ipoptlen, optlen, hdrlen;
	566	int idle, sendalot, lost = 0;
	567	int i, sack_rxmit;
	568	int tso = 0;
	569	int sack_bytes_rxmt;
	570	tcp_seq old_snd_nxt = 0;
	571	struct sackhole *p;
	572	#if IPSEC
	573	unsigned ipsec_optlen = 0;
	574	#endif /* IPSEC */
	575	int idle_time = 0;
	576	struct mbuf *packetlist = NULL;
	577	struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
	578	#if INET6
	579	int isipv6 = inp->inp_vflag & INP_IPV6 ;
	580	#else
	581	int isipv6 = 0;
	582	#endif
	583	short packchain_listadd = 0;
	584	int so_options = so->so_options;
	585	struct rtentry *rt;
	586	u_int32_t svc_flags = 0, allocated_len;
	587	u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0;
	588	struct mbuf *mnext = NULL;
	589	int sackoptlen = 0;
	590	#if MPTCP
	591	boolean_t mptcp_acknow;
	592	#endif /* MPTCP */
	593	boolean_t cell = FALSE;
	594	boolean_t wifi = FALSE;
	595	boolean_t wired = FALSE;
	596	boolean_t sack_rescue_rxt = FALSE;
	597	int sotc = so->so_traffic_class;
	598
	599	/*
	600	* Determine length of data that should be transmitted,
	601	* and flags that will be used.
	602	* If there is some data or critical controls (SYN, RST)
	603	* to send, then transmit; otherwise, investigate further.
	604	*/
	605	idle = (tp->t_flags & TF_LASTIDLE) \|\| (tp->snd_max == tp->snd_una);
	606
	607	/* Since idle_time is signed integer, the following integer subtraction
	608	* will take care of wrap around of tcp_now
	609	*/
	610	idle_time = tcp_now - tp->t_rcvtime;
	611	if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
	612	if (CC_ALGO(tp)->after_idle != NULL &&
	613	(tp->tcp_cc_index != TCP_CC_ALGO_CUBIC_INDEX \|\|
	614	idle_time >= TCP_CC_CWND_NONVALIDATED_PERIOD)) {
	615	CC_ALGO(tp)->after_idle(tp);
	616	tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
	617	}
	618
	619	/*
	620	* Do some other tasks that need to be done after
	621	* idle time
	622	*/
	623	if (!SLIST_EMPTY(&tp->t_rxt_segments))
	624	tcp_rxtseg_clean(tp);
	625
	626	/* If stretch ack was auto-disabled, re-evaluate it */
	627	tcp_cc_after_idle_stretchack(tp);
	628	}
	629	tp->t_flags &= ~TF_LASTIDLE;
	630	if (idle) {
	631	if (tp->t_flags & TF_MORETOCOME) {
	632	tp->t_flags \|= TF_LASTIDLE;
	633	idle = 0;
	634	}
	635	}
	636	#if MPTCP
	637	if (tp->t_mpflags & TMPF_RESET) {
	638	tcp_check_timer_state(tp);
	639	/*
	640	* Once a RST has been sent for an MPTCP subflow,
	641	* the subflow socket stays around until deleted.
	642	* No packets such as FINs must be sent after RST.
	643	*/
	644	return 0;
	645	}
	646	#endif /* MPTCP */
	647
	648	again:
	649	#if MPTCP
	650	mptcp_acknow = FALSE;
	651	#endif
	652
	653	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_START, 0,0,0,0,0);
	654
	655	#if INET6
	656	if (isipv6) {
	657	KERNEL_DEBUG(DBG_LAYER_BEG,
	658	((inp->inp_fport << 16) \| inp->inp_lport),
	659	(((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) \|
	660	(inp->in6p_faddr.s6_addr16[0] & 0xffff)),
	661	sendalot,0,0);
	662	} else
	663	#endif
	664
	665	{
	666	KERNEL_DEBUG(DBG_LAYER_BEG,
	667	((inp->inp_fport << 16) \| inp->inp_lport),
	668	(((inp->inp_laddr.s_addr & 0xffff) << 16) \|
	669	(inp->inp_faddr.s_addr & 0xffff)),
	670	sendalot,0,0);
	671	}
	672	/*
	673	* If the route generation id changed, we need to check that our
	674	* local (source) IP address is still valid. If it isn't either
	675	* return error or silently do nothing (assuming the address will
	676	* come back before the TCP connection times out).
	677	*/
	678	rt = inp->inp_route.ro_rt;
	679	if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
	680	struct ifnet *ifp;
	681	struct in_ifaddr *ia = NULL;
	682	struct in6_ifaddr *ia6 = NULL;
	683	int found_srcaddr = 0;
	684
	685	/* disable multipages at the socket */
	686	somultipages(so, FALSE);
	687
	688	/* Disable TSO for the socket until we know more */
	689	tp->t_flags &= ~TF_TSO;
	690
	691	soif2kcl(so, FALSE);
	692
	693	if (isipv6) {
	694	ia6 = ifa_foraddr6(&inp->in6p_laddr);
	695	if (ia6 != NULL)
	696	found_srcaddr = 1;
	697	} else {
	698	ia = ifa_foraddr(inp->inp_laddr.s_addr);
	699	if (ia != NULL)
	700	found_srcaddr = 1;
	701	}
	702
	703	/* check that the source address is still valid */
	704	if (found_srcaddr == 0) {
	705	soevent(so,
	706	(SO_FILT_HINT_LOCKED \| SO_FILT_HINT_NOSRCADDR));
	707
	708	if (tp->t_state >= TCPS_CLOSE_WAIT) {
	709	tcp_drop(tp, EADDRNOTAVAIL);
	710	return EADDRNOTAVAIL;
	711	}
	712
	713	/*
	714	* Set retransmit timer if it wasn't set,
	715	* reset Persist timer and shift register as the
	716	* advertised peer window may not be valid anymore
	717	*/
	718	if (tp->t_timer[TCPT_REXMT] == 0) {
	719	tp->t_timer[TCPT_REXMT] =
	720	OFFSET_FROM_START(tp, tp->t_rxtcur);
	721	if (tp->t_timer[TCPT_PERSIST] != 0) {
	722	tp->t_timer[TCPT_PERSIST] = 0;
	723	tp->t_persist_stop = 0;
	724	TCP_RESET_REXMT_STATE(tp);
	725	}
	726	}
	727
	728	if (tp->t_pktlist_head != NULL)
	729	m_freem_list(tp->t_pktlist_head);
	730	TCP_PKTLIST_CLEAR(tp);
	731
	732	/* drop connection if source address isn't available */
	733	if (so->so_flags & SOF_NOADDRAVAIL) {
	734	tcp_drop(tp, EADDRNOTAVAIL);
	735	return EADDRNOTAVAIL;
	736	} else {
	737	tcp_check_timer_state(tp);
	738	return 0; /* silently ignore, keep data in socket: address may be back */
	739	}
	740	}
	741	if (ia != NULL)
	742	IFA_REMREF(&ia->ia_ifa);
	743
	744	if (ia6 != NULL)
	745	IFA_REMREF(&ia6->ia_ifa);
	746
	747	/*
	748	* Address is still valid; check for multipages capability
	749	* again in case the outgoing interface has changed.
	750	*/
	751	RT_LOCK(rt);
	752	if ((ifp = rt->rt_ifp) != NULL) {
	753	somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
	754	tcp_set_tso(tp, ifp);
	755	soif2kcl(so, (ifp->if_eflags & IFEF_2KCL));
	756	tcp_set_ecn(tp, ifp);
	757	}
	758	if (rt->rt_flags & RTF_UP)
	759	RT_GENID_SYNC(rt);
	760	/*
	761	* See if we should do MTU discovery. Don't do it if:
	762	* 1) it is disabled via the sysctl
	763	* 2) the route isn't up
	764	* 3) the MTU is locked (if it is, then discovery
	765	* has been disabled)
	766	*/
	767
	768	if (!path_mtu_discovery \|\| ((rt != NULL) &&
	769	(!(rt->rt_flags & RTF_UP) \|\|
	770	(rt->rt_rmx.rmx_locks & RTV_MTU))))
	771	tp->t_flags &= ~TF_PMTUD;
	772	else
	773	tp->t_flags \|= TF_PMTUD;
	774
	775	RT_UNLOCK(rt);
	776	}
	777
	778	if (rt != NULL) {
	779	cell = IFNET_IS_CELLULAR(rt->rt_ifp);
	780	wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
	781	wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
	782	}
	783
	784	/*
	785	* If we've recently taken a timeout, snd_max will be greater than
	786	* snd_nxt. There may be SACK information that allows us to avoid
	787	* resending already delivered data. Adjust snd_nxt accordingly.
	788	*/
	789	if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
	790	tcp_sack_adjust(tp);
	791	sendalot = 0;
	792	off = tp->snd_nxt - tp->snd_una;
	793	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
	794
	795	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
	796	sendwin = min(sendwin, slowlink_wsize);
	797
	798	flags = tcp_outflags[tp->t_state];
	799	/*
	800	* Send any SACK-generated retransmissions. If we're explicitly
	801	* trying to send out new data (when sendalot is 1), bypass this
	802	* function. If we retransmit in fast recovery mode, decrement
	803	* snd_cwnd, since we're replacing a (future) new transmission
	804	* with a retransmission now, and we previously incremented
	805	* snd_cwnd in tcp_input().
	806	*/
	807	/*
	808	* Still in sack recovery , reset rxmit flag to zero.
	809	*/
	810	sack_rxmit = 0;
	811	sack_bytes_rxmt = 0;
	812	len = 0;
	813	p = NULL;
	814	if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
	815	(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
	816	int32_t cwin;
	817
	818	cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
	819	if (cwin < 0)
	820	cwin = 0;
	821	/* Do not retransmit SACK segments beyond snd_recover */
	822	if (SEQ_GT(p->end, tp->snd_recover)) {
	823	/*
	824	* (At least) part of sack hole extends beyond
	825	* snd_recover. Check to see if we can rexmit data
	826	* for this hole.
	827	*/
	828	if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
	829	/*
	830	* Can't rexmit any more data for this hole.
	831	* That data will be rexmitted in the next
	832	* sack recovery episode, when snd_recover
	833	* moves past p->rxmit.
	834	*/
	835	p = NULL;
	836	goto after_sack_rexmit;
	837	} else
	838	/* Can rexmit part of the current hole */
	839	len = ((int32_t)min(cwin,
	840	tp->snd_recover - p->rxmit));
	841	} else {
	842	len = ((int32_t)min(cwin, p->end - p->rxmit));
	843	}
	844	if (len > 0) {
	845	off = p->rxmit - tp->snd_una;
	846	sack_rxmit = 1;
	847	sendalot = 1;
	848	tcpstat.tcps_sack_rexmits++;
	849	tcpstat.tcps_sack_rexmit_bytes +=
	850	min(len, tp->t_maxseg);
	851	} else {
	852	len = 0;
	853	}
	854	}
	855	after_sack_rexmit:
	856	/*
	857	* Get standard flags, and add SYN or FIN if requested by 'hidden'
	858	* state flags.
	859	*/
	860	if (tp->t_flags & TF_NEEDFIN)
	861	flags \|= TH_FIN;
	862	if (tp->t_flags & TF_NEEDSYN)
	863	flags \|= TH_SYN;
	864
	865	/*
	866	* If in persist timeout with window of 0, send 1 byte.
	867	* Otherwise, if window is small but nonzero
	868	* and timer expired, we will send what we can
	869	* and go to transmit state.
	870	*/
	871	if (tp->t_flagsext & TF_FORCE) {
	872	if (sendwin == 0) {
	873	/*
	874	* If we still have some data to send, then
	875	* clear the FIN bit. Usually this would
	876	* happen below when it realizes that we
	877	* aren't sending all the data. However,
	878	* if we have exactly 1 byte of unsent data,
	879	* then it won't clear the FIN bit below,
	880	* and if we are in persist state, we wind
	881	* up sending the packet without recording
	882	* that we sent the FIN bit.
	883	*
	884	* We can't just blindly clear the FIN bit,
	885	* because if we don't have any more data
	886	* to send then the probe will be the FIN
	887	* itself.
	888	*/
	889	if (off < so->so_snd.sb_cc)
	890	flags &= ~TH_FIN;
	891	sendwin = 1;
	892	} else {
	893	tp->t_timer[TCPT_PERSIST] = 0;
	894	tp->t_persist_stop = 0;
	895	TCP_RESET_REXMT_STATE(tp);
	896	}
	897	}
	898
	899	/*
	900	* If snd_nxt == snd_max and we have transmitted a FIN, the
	901	* offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	902	* a negative length. This can also occur when TCP opens up
	903	* its congestion window while receiving additional duplicate
	904	* acks after fast-retransmit because TCP will reset snd_nxt
	905	* to snd_max after the fast-retransmit.
	906	*
	907	* In the normal retransmit-FIN-only case, however, snd_nxt will
	908	* be set to snd_una, the offset will be 0, and the length may
	909	* wind up 0.
	910	*
	911	* If sack_rxmit is true we are retransmitting from the scoreboard
	912	* in which case len is already set.
	913	*/
	914	if (sack_rxmit == 0) {
	915	if (sack_bytes_rxmt == 0) {
	916	len = min(so->so_snd.sb_cc, sendwin) - off;
	917	} else {
	918	int32_t cwin;
	919
	920	cwin = tp->snd_cwnd -
	921	(tp->snd_nxt - tp->sack_newdata) -
	922	sack_bytes_rxmt;
	923	if (cwin < 0)
	924	cwin = 0;
	925	/*
	926	* We are inside of a SACK recovery episode and are
	927	* sending new data, having retransmitted all the
	928	* data possible in the scoreboard.
	929	*/
	930	len = min(so->so_snd.sb_cc, tp->snd_wnd)
	931	- off;
	932	/*
	933	* Don't remove this (len > 0) check !
	934	* We explicitly check for len > 0 here (although it
	935	* isn't really necessary), to work around a gcc
	936	* optimization issue - to force gcc to compute
	937	* len above. Without this check, the computation
	938	* of len is bungled by the optimizer.
	939	*/
	940	if (len > 0) {
	941	len = imin(len, cwin);
	942	} else {
	943	len = 0;
	944	}
	945	/*
	946	* At this point SACK recovery can not send any
	947	* data from scoreboard or any new data. Check
	948	* if we can do a rescue retransmit towards the
	949	* tail end of recovery window.
	950	*/
	951	if (len == 0 && cwin > 0 &&
	952	SEQ_LT(tp->snd_fack, tp->snd_recover) &&
	953	!(tp->t_flagsext & TF_RESCUE_RXT)) {
	954	len = min((tp->snd_recover - tp->snd_fack),
	955	tp->t_maxseg);
	956	len = imin(len, cwin);
	957	old_snd_nxt = tp->snd_nxt;
	958	sack_rescue_rxt = TRUE;
	959	tp->snd_nxt = tp->snd_recover - len;
	960	/*
	961	* If FIN has been sent, snd_max
	962	* must have been advanced to cover it.
	963	*/
	964	if ((tp->t_flags & TF_SENTFIN) &&
	965	tp->snd_max == tp->snd_recover)
	966	tp->snd_nxt--;
	967
	968	off = tp->snd_nxt - tp->snd_una;
	969	sendalot = 0;
	970	tp->t_flagsext \|= TF_RESCUE_RXT;
	971	}
	972	}
	973	}
	974
	975	/*
	976	* Lop off SYN bit if it has already been sent. However, if this
	977	* is SYN-SENT state and if segment contains data and if we don't
	978	* know that foreign host supports TAO, suppress sending segment.
	979	*/
	980	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
	981	if (tp->t_state == TCPS_SYN_RECEIVED && tfo_enabled(tp) && tp->snd_nxt == tp->snd_una + 1) {
	982	/* We are sending the SYN again! */
	983	off--;
	984	len++;
	985	} else {
	986	if (tp->t_state != TCPS_SYN_RECEIVED \|\| tfo_enabled(tp)) {
	987	flags &= ~TH_SYN;
	988	}
	989
	990	off--;
	991	len++;
	992	if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
	993	while (inp->inp_sndinprog_cnt == 0 &&
	994	tp->t_pktlist_head != NULL) {
	995	packetlist = tp->t_pktlist_head;
	996	packchain_listadd = tp->t_lastchain;
	997	packchain_sent++;
	998	TCP_PKTLIST_CLEAR(tp);
	999
	1000	error = tcp_ip_output(so, tp, packetlist,
	1001	packchain_listadd, tp_inp_options,
	1002	(so_options & SO_DONTROUTE),
	1003	(sack_rxmit \|\| (sack_bytes_rxmt != 0)),
	1004	isipv6);
	1005	}
	1006
	1007	/*
	1008	* tcp was closed while we were in ip,
	1009	* resume close
	1010	*/
	1011	if (inp->inp_sndinprog_cnt == 0 &&
	1012	(tp->t_flags & TF_CLOSING)) {
	1013	tp->t_flags &= ~TF_CLOSING;
	1014	(void) tcp_close(tp);
	1015	} else {
	1016	tcp_check_timer_state(tp);
	1017	}
	1018	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END,
	1019	0,0,0,0,0);
	1020	return 0;
	1021	}
	1022	}
	1023	}
	1024
	1025	/*
	1026	* Be careful not to send data and/or FIN on SYN segments.
	1027	* This measure is needed to prevent interoperability problems
	1028	* with not fully conformant TCP implementations.
	1029	*
	1030	* In case of TFO, we handle the setting of the len in
	1031	* tcp_tfo_check. In case TFO is not enabled, never ever send
	1032	* SYN+data.
	1033	*/
	1034	if ((flags & TH_SYN) && !tfo_enabled(tp)) {
	1035	len = 0;
	1036	flags &= ~TH_FIN;
	1037	}
	1038
	1039	/*
	1040	* Don't send a RST with data.
	1041	*/
	1042	if (flags & TH_RST)
	1043	len = 0;
	1044
	1045	if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp))
	1046	len = tcp_tfo_check(tp, len);
	1047
	1048	/*
	1049	* The check here used to be (len < 0). Sometimes len is zero
	1050	* when the congestion window is closed and we need to check
	1051	* if persist timer has to be set in that case. But don't set
	1052	* persist until connection is established.
	1053	*/
	1054	if (len <= 0 && !(flags & TH_SYN)) {
	1055	/*
	1056	* If FIN has been sent but not acked,
	1057	* but we haven't been called to retransmit,
	1058	* len will be < 0. Otherwise, window shrank
	1059	* after we sent into it. If window shrank to 0,
	1060	* cancel pending retransmit, pull snd_nxt back
	1061	* to (closed) window, and set the persist timer
	1062	* if it isn't already going. If the window didn't
	1063	* close completely, just wait for an ACK.
	1064	*/
	1065	len = 0;
	1066	if (sendwin == 0) {
	1067	tp->t_timer[TCPT_REXMT] = 0;
	1068	tp->t_timer[TCPT_PTO] = 0;
	1069	TCP_RESET_REXMT_STATE(tp);
	1070	tp->snd_nxt = tp->snd_una;
	1071	off = 0;
	1072	if (tp->t_timer[TCPT_PERSIST] == 0)
	1073	tcp_setpersist(tp);
	1074	}
	1075	}
	1076
	1077	/*
	1078	* Automatic sizing of send socket buffer. Increase the send
	1079	* socket buffer size if all of the following criteria are met
	1080	* 1. the receiver has enough buffer space for this data
	1081	* 2. send buffer is filled to 7/8th with data (so we actually
	1082	* have data to make use of it);
	1083	* 3. our send window (slow start and congestion controlled) is
	1084	* larger than sent but unacknowledged data in send buffer.
	1085	*/
	1086	if (tcp_do_autosendbuf == 1 &&
	1087	!INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
	1088	(so->so_snd.sb_flags & (SB_AUTOSIZE \| SB_TRIM)) == SB_AUTOSIZE &&
	1089	tcp_cansbgrow(&so->so_snd)) {
	1090	if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
	1091	so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
	1092	sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
	1093	if (sbreserve(&so->so_snd,
	1094	min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
	1095	tcp_autosndbuf_max)) == 1) {
	1096	so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
	1097	}
	1098	}
	1099	}
	1100
	1101	/*
	1102	* Truncate to the maximum segment length or enable TCP Segmentation
	1103	* Offloading (if supported by hardware) and ensure that FIN is removed
	1104	* if the length no longer contains the last data byte.
	1105	*
	1106	* TSO may only be used if we are in a pure bulk sending state.
	1107	* The presence of TCP-MD5, SACK retransmits, SACK advertisements,
	1108	* ipfw rules and IP options, as well as disabling hardware checksum
	1109	* offload prevent using TSO. With TSO the TCP header is the same
	1110	* (except for the sequence number) for all generated packets. This
	1111	* makes it impossible to transmit any options which vary per generated
	1112	* segment or packet.
	1113	*
	1114	* The length of TSO bursts is limited to TCP_MAXWIN. That limit and
	1115	* removal of FIN (if not already caught here) are handled later after
	1116	* the exact length of the TCP options are known.
	1117	*/
	1118	#if IPSEC
	1119	/*
	1120	* Pre-calculate here as we save another lookup into the darknesses
	1121	* of IPsec that way and can actually decide if TSO is ok.
	1122	*/
	1123	if (ipsec_bypass == 0)
	1124	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
	1125	#endif
	1126	if (len > tp->t_maxseg) {
	1127	if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
	1128	ip_use_randomid && kipf_count == 0 &&
	1129	dlil_filter_disable_tso_count == 0 &&
	1130	tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
	1131	sack_bytes_rxmt == 0 &&
	1132	inp->inp_options == NULL &&
	1133	inp->in6p_options == NULL
	1134	#if IPSEC
	1135	&& ipsec_optlen == 0
	1136	#endif
	1137	#if IPFIREWALL
	1138	&& (fw_enable == 0 \|\| fw_bypass)
	1139	#endif
	1140	) {
	1141	tso = 1;
	1142	sendalot = 0;
	1143	} else {
	1144	len = tp->t_maxseg;
	1145	sendalot = 1;
	1146	tso = 0;
	1147	}
	1148	}
	1149
	1150	/* Send one segment or less as a tail loss probe */
	1151	if (tp->t_flagsext & TF_SENT_TLPROBE) {
	1152	len = min(len, tp->t_maxseg);
	1153	sendalot = 0;
	1154	tso = 0;
	1155	}
	1156
	1157	#if MPTCP
	1158	if (so->so_flags & SOF_MP_SUBFLOW && off < 0) {
	1159	os_log_error(mptcp_log_handle, "%s - %lx: offset is negative! len %d off %d\n",
	1160	__func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte),
	1161	len, off);
	1162	}
	1163
	1164	if ((so->so_flags & SOF_MP_SUBFLOW) &&
	1165	!(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
	1166	int newlen = len;
	1167	if (tp->t_state >= TCPS_ESTABLISHED &&
	1168	(tp->t_mpflags & TMPF_SND_MPPRIO \|\|
	1169	tp->t_mpflags & TMPF_SND_REM_ADDR \|\|
	1170	tp->t_mpflags & TMPF_SND_MPFAIL \|\|
	1171	tp->t_mpflags & TMPF_SND_KEYS \|\|
	1172	tp->t_mpflags & TMPF_SND_JACK)) {
	1173	if (len > 0) {
	1174	len = 0;
	1175	}
	1176	/*
	1177	* On a new subflow, don't try to send again, because
	1178	* we are still waiting for the fourth ack.
	1179	*/
	1180	if (!(tp->t_mpflags & TMPF_PREESTABLISHED))
	1181	sendalot = 1;
	1182	mptcp_acknow = TRUE;
	1183	} else {
	1184	mptcp_acknow = FALSE;
	1185	}
	1186	/*
	1187	* The contiguous bytes in the subflow socket buffer can be
	1188	* discontiguous at the MPTCP level. Since only one DSS
	1189	* option can be sent in one packet, reduce length to match
	1190	* the contiguous MPTCP level. Set sendalot to send remainder.
	1191	*/
	1192	if (len > 0 && off >= 0) {
	1193	newlen = mptcp_adj_sendlen(so, off);
	1194	}
	1195
	1196	if (newlen < len) {
	1197	len = newlen;
	1198	}
	1199	}
	1200	#endif /* MPTCP */
	1201
	1202	/*
	1203	* If the socket is capable of doing unordered send,
	1204	* pull the amount of data that can be sent from the
	1205	* unordered priority queues to the serial queue in
	1206	* the socket buffer. If bytes are not yet available
	1207	* in the highest priority message, we may not be able
	1208	* to send any new data.
	1209	*/
	1210	if (so->so_flags & SOF_ENABLE_MSGS) {
	1211	if ((off + len) >
	1212	so->so_msg_state->msg_serial_bytes) {
	1213	sbpull_unordered_data(so, off, len);
	1214
	1215	/* check if len needs to be modified */
	1216	if ((off + len) >
	1217	so->so_msg_state->msg_serial_bytes) {
	1218	len = so->so_msg_state->msg_serial_bytes - off;
	1219	if (len <= 0) {
	1220	len = 0;
	1221	tcpstat.tcps_msg_sndwaithipri++;
	1222	}
	1223	}
	1224	}
	1225	}
	1226
	1227	if (sack_rxmit) {
	1228	if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
	1229	flags &= ~TH_FIN;
	1230	} else {
	1231	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
	1232	flags &= ~TH_FIN;
	1233	}
	1234	/*
	1235	* Compare available window to amount of window
	1236	* known to peer (as advertised window less
	1237	* next expected input). If the difference is at least two
	1238	* max size segments, or at least 25% of the maximum possible
	1239	* window, then want to send a window update to peer.
	1240	*/
	1241	recwin = tcp_sbspace(tp);
	1242
	1243	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
	1244	if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
	1245	recwin < (int)tp->t_maxseg) {
	1246	recwin = 0;
	1247	}
	1248	} else {
	1249	struct mptcb *mp_tp = tptomptp(tp);
	1250	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
	1251
	1252	if (recwin < (int32_t)(mp_so->so_rcv.sb_hiwat / 4) &&
	1253	recwin < (int)tp->t_maxseg) {
	1254	recwin = 0;
	1255	}
	1256	}
	1257
	1258	#if TRAFFIC_MGT
	1259	if (tcp_recv_bg == 1 \|\| IS_TCP_RECV_BG(so)) {
	1260	if (recwin > 0 && tcp_recv_throttle(tp)) {
	1261	uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
	1262	uint32_t bg_rwintop = tp->rcv_adv;
	1263	if (SEQ_LT(bg_rwintop, tp->rcv_nxt + min_iaj_win))
	1264	bg_rwintop = tp->rcv_nxt + min_iaj_win;
	1265	recwin = imin((int32_t)(bg_rwintop - tp->rcv_nxt),
	1266	recwin);
	1267	if (recwin < 0)
	1268	recwin = 0;
	1269	}
	1270	}
	1271	#endif /* TRAFFIC_MGT */
	1272
	1273	if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
	1274	recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	1275
	1276	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
	1277	if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) {
	1278	recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
	1279	}
	1280	} else {
	1281	struct mptcb *mp_tp = tptomptp(tp);
	1282
	1283	/* Don't remove what we announced at the MPTCP-layer */
	1284	if (recwin < (int32_t)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt)) {
	1285	recwin = (int32_t)(mp_tp->mpt_rcvadv - (uint32_t)mp_tp->mpt_rcvnxt);
	1286	}
	1287	}
	1288
	1289	/*
	1290	* Sender silly window avoidance. We transmit under the following
	1291	* conditions when len is non-zero:
	1292	*
	1293	* - we've timed out (e.g. persist timer)
	1294	* - we need to retransmit
	1295	* - We have a full segment (or more with TSO)
	1296	* - This is the last buffer in a write()/send() and we are
	1297	* either idle or running NODELAY
	1298	* - we have more than 1/2 the maximum send window's worth of
	1299	* data (receiver may be limiting the window size)
	1300	*/
	1301	if (len) {
	1302	if (tp->t_flagsext & TF_FORCE)
	1303	goto send;
	1304	if (SEQ_LT(tp->snd_nxt, tp->snd_max))
	1305	goto send;
	1306	if (sack_rxmit)
	1307	goto send;
	1308
	1309	/*
	1310	* If this here is the first segment after SYN/ACK and TFO
	1311	* is being used, then we always send it, regardless of Nagle,...
	1312	*/
	1313	if (tp->t_state == TCPS_SYN_RECEIVED &&
	1314	tfo_enabled(tp) &&
	1315	(tp->t_tfo_flags & TFO_F_COOKIE_VALID) &&
	1316	tp->snd_nxt == tp->iss + 1)
	1317	goto send;
	1318
	1319	/*
	1320	* Send new data on the connection only if it is
	1321	* not flow controlled
	1322	*/
	1323	if (!INP_WAIT_FOR_IF_FEEDBACK(inp) \|\|
	1324	tp->t_state != TCPS_ESTABLISHED) {
	1325	if (len >= tp->t_maxseg)
	1326	goto send;
	1327
	1328	if (!(tp->t_flags & TF_MORETOCOME) &&
	1329	(idle \|\| tp->t_flags & TF_NODELAY \|\|
	1330	(tp->t_flags & TF_MAXSEGSNT) \|\|
	1331	ALLOW_LIMITED_TRANSMIT(tp)) &&
	1332	(tp->t_flags & TF_NOPUSH) == 0 &&
	1333	(len + off >= so->so_snd.sb_cc \|\|
	1334	/*
	1335	* MPTCP needs to respect the DSS-mappings. So, it
	1336	* may be sending data that could have been
	1337	* coalesced, but cannot because of
	1338	* mptcp_adj_sendlen().
	1339	*/
	1340	so->so_flags & SOF_MP_SUBFLOW))
	1341	goto send;
	1342	if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
	1343	goto send;
	1344	} else {
	1345	tcpstat.tcps_fcholdpacket++;
	1346	}
	1347	}
	1348
	1349	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
	1350	/*
	1351	* "adv" is the amount we can increase the window,
	1352	* taking into account that we are limited by
	1353	* TCP_MAXWIN << tp->rcv_scale.
	1354	*/
	1355	int32_t adv, oldwin = 0;
	1356	adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
	1357	(tp->rcv_adv - tp->rcv_nxt);
	1358
	1359	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
	1360	oldwin = tp->rcv_adv - tp->rcv_nxt;
	1361
	1362	if (adv >= (int32_t) (2 * tp->t_maxseg)) {
	1363	/*
	1364	* Update only if the resulting scaled value of
	1365	* the window changed, or if there is a change in
	1366	* the sequence since the last ack. This avoids
	1367	* what appears as dupe ACKS (see rdar://5640997)
	1368	*
	1369	* If streaming is detected avoid sending too many
	1370	* window updates. We will depend on the delack
	1371	* timer to send a window update when needed.
	1372	*/
	1373	if (!(tp->t_flags & TF_STRETCHACK) &&
	1374	(tp->last_ack_sent != tp->rcv_nxt \|\|
	1375	((oldwin + adv) >> tp->rcv_scale) >
	1376	(oldwin >> tp->rcv_scale))) {
	1377	goto send;
	1378	}
	1379
	1380	}
	1381	if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat)
	1382	goto send;
	1383
	1384	/*
	1385	* Make sure that the delayed ack timer is set if
	1386	* we delayed sending a window update because of
	1387	* streaming detection.
	1388	*/
	1389	if ((tp->t_flags & TF_STRETCHACK) &&
	1390	!(tp->t_flags & TF_DELACK)) {
	1391	tp->t_flags \|= TF_DELACK;
	1392	tp->t_timer[TCPT_DELACK] =
	1393	OFFSET_FROM_START(tp, tcp_delack);
	1394	}
	1395	}
	1396
	1397	/*
	1398	* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
	1399	* is also a catch-all for the retransmit timer timeout case.
	1400	*/
	1401	if (tp->t_flags & TF_ACKNOW)
	1402	goto send;
	1403	if ((flags & TH_RST) \|\|
	1404	((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
	1405	goto send;
	1406	if (SEQ_GT(tp->snd_up, tp->snd_una))
	1407	goto send;
	1408	#if MPTCP
	1409	if (mptcp_acknow)
	1410	goto send;
	1411	#endif /* MPTCP */
	1412	/*
	1413	* If our state indicates that FIN should be sent
	1414	* and we have not yet done so, then we need to send.
	1415	*/
	1416	if ((flags & TH_FIN) &&
	1417	(!(tp->t_flags & TF_SENTFIN) \|\| tp->snd_nxt == tp->snd_una))
	1418	goto send;
	1419	/*
	1420	* In SACK, it is possible for tcp_output to fail to send a segment
	1421	* after the retransmission timer has been turned off. Make sure
	1422	* that the retransmission timer is set.
	1423	*/
	1424	if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
	1425	SEQ_GT(tp->snd_max, tp->snd_una) &&
	1426	tp->t_timer[TCPT_REXMT] == 0 &&
	1427	tp->t_timer[TCPT_PERSIST] == 0) {
	1428	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
	1429	tp->t_rxtcur);
	1430	goto just_return;
	1431	}
	1432	/*
	1433	* TCP window updates are not reliable, rather a polling protocol
	1434	* using ``persist'' packets is used to ensure receipt of window
	1435	* updates. The three ``states'' for the output side are:
	1436	* idle not doing retransmits or persists
	1437	* persisting to move a small or zero window
	1438	* (re)transmitting and thereby not persisting
	1439	*
	1440	* tp->t_timer[TCPT_PERSIST]
	1441	* is set when we are in persist state.
	1442	* TF_FORCE (in tp->t_flagsext)
	1443	* is set when we are called to send a persist packet.
	1444	* tp->t_timer[TCPT_REXMT]
	1445	* is set when we are retransmitting
	1446	* The output side is idle when both timers are zero.
	1447	*
	1448	* If send window is too small, there is data to transmit, and no
	1449	* retransmit or persist is pending, then go to persist state.
	1450	* If nothing happens soon, send when timer expires:
	1451	* if window is nonzero, transmit what we can,
	1452	* otherwise force out a byte.
	1453	*/
	1454	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
	1455	tp->t_timer[TCPT_PERSIST] == 0) {
	1456	TCP_RESET_REXMT_STATE(tp);
	1457	tcp_setpersist(tp);
	1458	}
	1459	just_return:
	1460	/*
	1461	* If there is no reason to send a segment, just return.
	1462	* but if there is some packets left in the packet list, send them now.
	1463	*/
	1464	while (inp->inp_sndinprog_cnt == 0 &&
	1465	tp->t_pktlist_head != NULL) {
	1466	packetlist = tp->t_pktlist_head;
	1467	packchain_listadd = tp->t_lastchain;
	1468	packchain_sent++;
	1469	TCP_PKTLIST_CLEAR(tp);
	1470
	1471	error = tcp_ip_output(so, tp, packetlist,
	1472	packchain_listadd,
	1473	tp_inp_options, (so_options & SO_DONTROUTE),
	1474	(sack_rxmit \|\| (sack_bytes_rxmt != 0)), isipv6);
	1475	}
	1476	/* tcp was closed while we were in ip; resume close */
	1477	if (inp->inp_sndinprog_cnt == 0 &&
	1478	(tp->t_flags & TF_CLOSING)) {
	1479	tp->t_flags &= ~TF_CLOSING;
	1480	(void) tcp_close(tp);
	1481	} else {
	1482	tcp_check_timer_state(tp);
	1483	}
	1484	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	1485	return 0;
	1486
	1487	send:
	1488	/*
	1489	* Set TF_MAXSEGSNT flag if the segment size is greater than
	1490	* or equal to the max segment size; clear it otherwise.
	1491	*/
	1492	if (len > 0) {
	1493	if (len >= tp->t_maxseg)
	1494	tp->t_flags \|= TF_MAXSEGSNT;
	1495	else
	1496	tp->t_flags &= ~TF_MAXSEGSNT;
	1497	}
	1498	/*
	1499	* Before ESTABLISHED, force sending of initial options
	1500	* unless TCP set not to do any options.
	1501	* NOTE: we assume that the IP/TCP header plus TCP options
	1502	* always fit in a single mbuf, leaving room for a maximum
	1503	* link header, i.e.
	1504	* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	1505	*/
	1506	optlen = 0;
	1507	#if INET6
	1508	if (isipv6)
	1509	hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	1510	else
	1511	#endif
	1512	hdrlen = sizeof (struct tcpiphdr);
	1513	if (flags & TH_SYN) {
	1514	tp->snd_nxt = tp->iss;
	1515	if ((tp->t_flags & TF_NOOPT) == 0) {
	1516	u_short mss;
	1517
	1518	opt[0] = TCPOPT_MAXSEG;
	1519	opt[1] = TCPOLEN_MAXSEG;
	1520	mss = htons((u_short) tcp_mssopt(tp));
	1521	(void)memcpy(opt + 2, &mss, sizeof(mss));
	1522	optlen = TCPOLEN_MAXSEG;
	1523
	1524	if ((tp->t_flags & TF_REQ_SCALE) &&
	1525	((flags & TH_ACK) == 0 \|\|
	1526	(tp->t_flags & TF_RCVD_SCALE))) {
	1527	((u_int32_t )(void *)(opt + optlen)) = htonl(
	1528	TCPOPT_NOP << 24 \|
	1529	TCPOPT_WINDOW << 16 \|
	1530	TCPOLEN_WINDOW << 8 \|
	1531	tp->request_r_scale);
	1532	optlen += 4;
	1533	}
	1534	#if MPTCP
	1535	if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) {
	1536	optlen = mptcp_setup_syn_opts(so, opt, optlen);
	1537	}
	1538	#endif /* MPTCP */
	1539	}
	1540	}
	1541
	1542	/*
	1543	* Send a timestamp and echo-reply if this is a SYN and our side
	1544	* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	1545	* and our peer have sent timestamps in our SYN's.
	1546	*/
	1547	if ((tp->t_flags & (TF_REQ_TSTMP\|TF_NOOPT)) == TF_REQ_TSTMP &&
	1548	(flags & TH_RST) == 0 &&
	1549	((flags & TH_ACK) == 0 \|\|
	1550	(tp->t_flags & TF_RCVD_TSTMP))) {
	1551	u_int32_t lp = (u_int32_t )(void *)(opt + optlen);
	1552
	1553	/* Form timestamp option as shown in appendix A of RFC 1323. */
	1554	*lp++ = htonl(TCPOPT_TSTAMP_HDR);
	1555	*lp++ = htonl(tcp_now);
	1556	*lp = htonl(tp->ts_recent);
	1557	optlen += TCPOLEN_TSTAMP_APPA;
	1558	}
	1559
	1560	/* Note the timestamp for receive buffer autosizing */
	1561	if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
	1562	tp->rfbuf_ts = tcp_now;
	1563
	1564	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
	1565	/*
	1566	* Tack on the SACK permitted option last.
	1567	* And do padding of options after tacking this on.
	1568	* This is because if MSS, TS, WinScale and Signatures are
	1569	* all present, we have just 2 bytes left for the SACK
	1570	* permitted option, which is just enough.
	1571	*/
	1572	/*
	1573	* If this is the first SYN of connection (not a SYN
	1574	* ACK), include SACK permitted option. If this is a
	1575	* SYN ACK, include SACK permitted option if peer has
	1576	* already done so. This is only for active connect,
	1577	* since the syncache takes care of the passive connect.
	1578	*/
	1579	if ((flags & TH_SYN) &&
	1580	(!(flags & TH_ACK) \|\| (tp->t_flags & TF_SACK_PERMIT))) {
	1581	u_char *bp;
	1582	bp = (u_char *)opt + optlen;
	1583
	1584	*bp++ = TCPOPT_SACK_PERMITTED;
	1585	*bp++ = TCPOLEN_SACK_PERMITTED;
	1586	optlen += TCPOLEN_SACK_PERMITTED;
	1587	}
	1588	}
	1589	#if MPTCP
	1590	if (so->so_flags & SOF_MP_SUBFLOW) {
	1591	/*
	1592	* It's important to piggyback acks with data as ack only packets
	1593	* may get lost and data packets that don't send Data ACKs
	1594	* still advance the subflow level ACK and therefore make it
	1595	* hard for the remote end to recover in low cwnd situations.
	1596	*/
	1597	if (len != 0) {
	1598	tp->t_mpflags \|= (TMPF_SEND_DSN \|
	1599	TMPF_MPTCP_ACKNOW);
	1600	} else {
	1601	tp->t_mpflags \|= TMPF_MPTCP_ACKNOW;
	1602	}
	1603	optlen = mptcp_setup_opts(tp, off, &opt[0], optlen, flags,
	1604	len, &mptcp_acknow);
	1605	tp->t_mpflags &= ~TMPF_SEND_DSN;
	1606	}
	1607	#endif /* MPTCP */
	1608
	1609	if (tfo_enabled(tp) && !(tp->t_flags & TF_NOOPT) &&
	1610	(flags & (TH_SYN \| TH_ACK)) == TH_SYN)
	1611	optlen += tcp_tfo_write_cookie(tp, optlen, len, opt);
	1612
	1613	if (tfo_enabled(tp) &&
	1614	(flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK) &&
	1615	(tp->t_tfo_flags & TFO_F_OFFER_COOKIE))
	1616	optlen += tcp_tfo_write_cookie_rep(tp, optlen, opt);
	1617
	1618	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
	1619	/*
	1620	* Send SACKs if necessary. This should be the last
	1621	* option processed. Only as many SACKs are sent as
	1622	* are permitted by the maximum options size.
	1623	*
	1624	* In general, SACK blocks consume 8*n+2 bytes.
	1625	* So a full size SACK blocks option is 34 bytes
	1626	* (to generate 4 SACK blocks). At a minimum,
	1627	* we need 10 bytes (to generate 1 SACK block).
	1628	* If TCP Timestamps (12 bytes) and TCP Signatures
	1629	* (18 bytes) are both present, we'll just have
	1630	* 10 bytes for SACK options 40 - (12 + 18).
	1631	*/
	1632	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
	1633	(tp->t_flags & TF_SACK_PERMIT) &&
	1634	(tp->rcv_numsacks > 0 \|\| TCP_SEND_DSACK_OPT(tp)) &&
	1635	MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
	1636	int nsack, padlen;
	1637	u_char bp = (u_char )opt + optlen;
	1638	u_int32_t *lp;
	1639
	1640	nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
	1641	nsack = min(nsack, (tp->rcv_numsacks +
	1642	(TCP_SEND_DSACK_OPT(tp) ? 1 : 0)));
	1643	sackoptlen = (2 + nsack * TCPOLEN_SACK);
	1644
	1645	/*
	1646	* First we need to pad options so that the
	1647	* SACK blocks can start at a 4-byte boundary
	1648	* (sack option and length are at a 2 byte offset).
	1649	*/
	1650	padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
	1651	optlen += padlen;
	1652	while (padlen-- > 0)
	1653	*bp++ = TCPOPT_NOP;
	1654
	1655	tcpstat.tcps_sack_send_blocks++;
	1656	*bp++ = TCPOPT_SACK;
	1657	*bp++ = sackoptlen;
	1658	lp = (u_int32_t )(void )bp;
	1659
	1660	/*
	1661	* First block of SACK option should represent
	1662	* DSACK. Prefer to send SACK information if there
	1663	* is space for only one SACK block. This will
	1664	* allow for faster recovery.
	1665	*/
	1666	if (TCP_SEND_DSACK_OPT(tp) && nsack > 0 &&
	1667	(tp->rcv_numsacks == 0 \|\| nsack > 1)) {
	1668	*lp++ = htonl(tp->t_dsack_lseq);
	1669	*lp++ = htonl(tp->t_dsack_rseq);
	1670	tcpstat.tcps_dsack_sent++;
	1671	tp->t_dsack_sent++;
	1672	nsack--;
	1673	}
	1674	VERIFY(nsack == 0 \|\| tp->rcv_numsacks >= nsack);
	1675	for (i = 0; i < nsack; i++) {
	1676	struct sackblk sack = tp->sackblks[i];
	1677	*lp++ = htonl(sack.start);
	1678	*lp++ = htonl(sack.end);
	1679	}
	1680	optlen += sackoptlen;
	1681	}
	1682	}
	1683
	1684	/* Pad TCP options to a 4 byte boundary */
	1685	if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
	1686	int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
	1687	u_char bp = (u_char )opt + optlen;
	1688
	1689	optlen += pad;
	1690	while (pad) {
	1691	*bp++ = TCPOPT_EOL;
	1692	pad--;
	1693	}
	1694	}
	1695
	1696	/*
	1697	* RFC 3168 states that:
	1698	* - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
	1699	* to handle the TCP ECE flag, even if you also later send a
	1700	* non-ECN-setup SYN/SYN-ACK.
	1701	* - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
	1702	* the ip ECT flag.
	1703	*
	1704	* It is not clear how the ECE flag would ever be set if you never
	1705	* set the IP ECT flag on outbound packets. All the same, we use
	1706	* the TE_SETUPSENT to indicate that we have committed to handling
	1707	* the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
	1708	* whether or not we should set the IP ECT flag on outbound packet
	1709	*
	1710	* For a SYN-ACK, send an ECN setup SYN-ACK
	1711	*/
	1712	if ((flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK) &&
	1713	(tp->ecn_flags & TE_ENABLE_ECN)) {
	1714	if (tp->ecn_flags & TE_SETUPRECEIVED) {
	1715	if (tcp_send_ecn_flags_on_syn(tp, so)) {
	1716	/*
	1717	* Setting TH_ECE makes this an ECN-setup
	1718	* SYN-ACK
	1719	*/
	1720	flags \|= TH_ECE;
	1721
	1722	/*
	1723	* Record that we sent the ECN-setup and
	1724	* default to setting IP ECT.
	1725	*/
	1726	tp->ecn_flags \|= (TE_SETUPSENT\|TE_SENDIPECT);
	1727	tcpstat.tcps_ecn_server_setup++;
	1728	tcpstat.tcps_ecn_server_success++;
	1729	} else {
	1730	/*
	1731	* We sent an ECN-setup SYN-ACK but it was
	1732	* dropped. Fallback to non-ECN-setup
	1733	* SYN-ACK and clear flag to indicate that
	1734	* we should not send data with IP ECT set
	1735	*
	1736	* Pretend we didn't receive an
	1737	* ECN-setup SYN.
	1738	*
	1739	* We already incremented the counter
	1740	* assuming that the ECN setup will
	1741	* succeed. Decrementing here
	1742	* tcps_ecn_server_success to correct it.
	1743	*/
	1744	if (tp->ecn_flags & TE_SETUPSENT) {
	1745	tcpstat.tcps_ecn_lost_synack++;
	1746	tcpstat.tcps_ecn_server_success--;
	1747	tp->ecn_flags \|= TE_LOST_SYNACK;
	1748	}
	1749
	1750	tp->ecn_flags &=
	1751	~(TE_SETUPRECEIVED \| TE_SENDIPECT \|
	1752	TE_SENDCWR);
	1753	}
	1754	}
	1755	} else if ((flags & (TH_SYN \| TH_ACK)) == TH_SYN &&
	1756	(tp->ecn_flags & TE_ENABLE_ECN)) {
	1757	if (tcp_send_ecn_flags_on_syn(tp, so)) {
	1758	/*
	1759	* Setting TH_ECE and TH_CWR makes this an
	1760	* ECN-setup SYN
	1761	*/
	1762	flags \|= (TH_ECE \| TH_CWR);
	1763	tcpstat.tcps_ecn_client_setup++;
	1764	tp->ecn_flags \|= TE_CLIENT_SETUP;
	1765
	1766	/*
	1767	* Record that we sent the ECN-setup and default to
	1768	* setting IP ECT.
	1769	*/
	1770	tp->ecn_flags \|= (TE_SETUPSENT \| TE_SENDIPECT);
	1771	} else {
	1772	/*
	1773	* We sent an ECN-setup SYN but it was dropped.
	1774	* Fall back to non-ECN and clear flag indicating
	1775	* we should send data with IP ECT set.
	1776	*/
	1777	if (tp->ecn_flags & TE_SETUPSENT) {
	1778	tcpstat.tcps_ecn_lost_syn++;
	1779	tp->ecn_flags \|= TE_LOST_SYN;
	1780	}
	1781	tp->ecn_flags &= ~TE_SENDIPECT;
	1782	}
	1783	}
	1784
	1785	/*
	1786	* Check if we should set the TCP CWR flag.
	1787	* CWR flag is sent when we reduced the congestion window because
	1788	* we received a TCP ECE or we performed a fast retransmit. We
	1789	* never set the CWR flag on retransmitted packets. We only set
	1790	* the CWR flag on data packets. Pure acks don't have this set.
	1791	*/
	1792	if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
	1793	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
	1794	flags \|= TH_CWR;
	1795	tp->ecn_flags &= ~TE_SENDCWR;
	1796	}
	1797
	1798	/*
	1799	* Check if we should set the TCP ECE flag.
	1800	*/
	1801	if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
	1802	flags \|= TH_ECE;
	1803	tcpstat.tcps_ecn_sent_ece++;
	1804	}
	1805
	1806
	1807	hdrlen += optlen;
	1808
	1809	/* Reset DSACK sequence numbers */
	1810	tp->t_dsack_lseq = 0;
	1811	tp->t_dsack_rseq = 0;
	1812
	1813	#if INET6
	1814	if (isipv6)
	1815	ipoptlen = ip6_optlen(inp);
	1816	else
	1817	#endif
	1818	{
	1819	if (tp_inp_options) {
	1820	ipoptlen = tp_inp_options->m_len -
	1821	offsetof(struct ipoption, ipopt_list);
	1822	} else {
	1823	ipoptlen = 0;
	1824	}
	1825	}
	1826	#if IPSEC
	1827	ipoptlen += ipsec_optlen;
	1828	#endif
	1829
	1830	/*
	1831	* Adjust data length if insertion of options will
	1832	* bump the packet length beyond the t_maxopd length.
	1833	* Clear the FIN bit because we cut off the tail of
	1834	* the segment.
	1835	*
	1836	* When doing TSO limit a burst to TCP_MAXWIN minus the
	1837	* IP, TCP and Options length to keep ip->ip_len from
	1838	* overflowing. Prevent the last segment from being
	1839	* fractional thus making them all equal sized and set
	1840	* the flag to continue sending. TSO is disabled when
	1841	* IP options or IPSEC are present.
	1842	*/
	1843	if (len + optlen + ipoptlen > tp->t_maxopd) {
	1844	/*
	1845	* If there is still more to send,
	1846	* don't close the connection.
	1847	*/
	1848	flags &= ~TH_FIN;
	1849	if (tso) {
	1850	int32_t tso_maxlen;
	1851
	1852	tso_maxlen = tp->tso_max_segment_size ?
	1853	tp->tso_max_segment_size : TCP_MAXWIN;
	1854
	1855	if (len > tso_maxlen - hdrlen - optlen) {
	1856	len = tso_maxlen - hdrlen - optlen;
	1857	len = len - (len % (tp->t_maxopd - optlen));
	1858	sendalot = 1;
	1859	} else if (tp->t_flags & TF_NEEDFIN) {
	1860	sendalot = 1;
	1861	}
	1862	} else {
	1863	len = tp->t_maxopd - optlen - ipoptlen;
	1864	sendalot = 1;
	1865	}
	1866	}
	1867
	1868	if (max_linkhdr + hdrlen > MCLBYTES)
	1869	panic("tcphdr too big");
	1870
	1871	/* Check if there is enough data in the send socket
	1872	* buffer to start measuring bandwidth
	1873	*/
	1874	if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
	1875	(tp->t_bwmeas != NULL) &&
	1876	(tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0) {
	1877	tp->t_bwmeas->bw_size = min(min(
	1878	(so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)),
	1879	tp->snd_cwnd), tp->snd_wnd);
	1880	if (tp->t_bwmeas->bw_minsize > 0 &&
	1881	tp->t_bwmeas->bw_size < tp->t_bwmeas->bw_minsize)
	1882	tp->t_bwmeas->bw_size = 0;
	1883	if (tp->t_bwmeas->bw_maxsize > 0)
	1884	tp->t_bwmeas->bw_size = min(tp->t_bwmeas->bw_size,
	1885	tp->t_bwmeas->bw_maxsize);
	1886	if (tp->t_bwmeas->bw_size > 0) {
	1887	tp->t_flagsext \|= TF_BWMEAS_INPROGRESS;
	1888	tp->t_bwmeas->bw_start = tp->snd_max;
	1889	tp->t_bwmeas->bw_ts = tcp_now;
	1890	}
	1891	}
	1892
	1893	VERIFY(inp->inp_flowhash != 0);
	1894	/*
	1895	* Grab a header mbuf, attaching a copy of data to
	1896	* be transmitted, and initialize the header from
	1897	* the template for sends on this connection.
	1898	*/
	1899	if (len) {
	1900	tp->t_pmtud_lastseg_size = len + optlen + ipoptlen;
	1901	if ((tp->t_flagsext & TF_FORCE) && len == 1)
	1902	tcpstat.tcps_sndprobe++;
	1903	else if (SEQ_LT(tp->snd_nxt, tp->snd_max) \|\| sack_rxmit) {
	1904	tcpstat.tcps_sndrexmitpack++;
	1905	tcpstat.tcps_sndrexmitbyte += len;
	1906	if (nstat_collect) {
	1907	nstat_route_tx(inp->inp_route.ro_rt, 1,
	1908	len, NSTAT_TX_FLAG_RETRANSMIT);
	1909	INP_ADD_STAT(inp, cell, wifi, wired,
	1910	txpackets, 1);
	1911	INP_ADD_STAT(inp, cell, wifi, wired,
	1912	txbytes, len);
	1913	tp->t_stat.txretransmitbytes += len;
	1914	tp->t_stat.rxmitpkts++;
	1915	}
	1916	} else {
	1917	tcpstat.tcps_sndpack++;
	1918	tcpstat.tcps_sndbyte += len;
	1919
	1920	if (nstat_collect) {
	1921	INP_ADD_STAT(inp, cell, wifi, wired,
	1922	txpackets, 1);
	1923	INP_ADD_STAT(inp, cell, wifi, wired,
	1924	txbytes, len);
	1925	}
	1926	inp_decr_sndbytes_unsent(so, len);
	1927	}
	1928	inp_set_activity_bitmap(inp);
	1929	#if MPTCP
	1930	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
	1931	tcpstat.tcps_mp_sndpacks++;
	1932	tcpstat.tcps_mp_sndbytes += len;
	1933	}
	1934	#endif /* MPTCP */
	1935	/*
	1936	* try to use the new interface that allocates all
	1937	* the necessary mbuf hdrs under 1 mbuf lock and
	1938	* avoids rescanning the socket mbuf list if
	1939	* certain conditions are met. This routine can't
	1940	* be used in the following cases...
	1941	* 1) the protocol headers exceed the capacity of
	1942	* of a single mbuf header's data area (no cluster attached)
	1943	* 2) the length of the data being transmitted plus
	1944	* the protocol headers fits into a single mbuf header's
	1945	* data area (no cluster attached)
	1946	*/
	1947	m = NULL;
	1948
	1949	/* minimum length we are going to allocate */
	1950	allocated_len = MHLEN;
	1951	if (MHLEN < hdrlen + max_linkhdr) {
	1952	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	1953	if (m == NULL) {
	1954	error = ENOBUFS;
	1955	goto out;
	1956	}
	1957	MCLGET(m, M_DONTWAIT);
	1958	if ((m->m_flags & M_EXT) == 0) {
	1959	m_freem(m);
	1960	error = ENOBUFS;
	1961	goto out;
	1962	}
	1963	m->m_data += max_linkhdr;
	1964	m->m_len = hdrlen;
	1965	allocated_len = MCLBYTES;
	1966	}
	1967	if (len <= allocated_len - hdrlen - max_linkhdr) {
	1968	if (m == NULL) {
	1969	VERIFY(allocated_len <= MHLEN);
	1970	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	1971	if (m == NULL) {
	1972	error = ENOBUFS;
	1973	goto out;
	1974	}
	1975	m->m_data += max_linkhdr;
	1976	m->m_len = hdrlen;
	1977	}
	1978	/* makes sure we still have data left to be sent at this point */
	1979	if (so->so_snd.sb_mb == NULL \|\| off < 0) {
	1980	if (m != NULL) m_freem(m);
	1981	error = 0; /* should we return an error? */
	1982	goto out;
	1983	}
	1984	m_copydata(so->so_snd.sb_mb, off, (int) len,
	1985	mtod(m, caddr_t) + hdrlen);
	1986	m->m_len += len;
	1987	} else {
	1988	uint32_t copymode;
	1989	/*
	1990	* Retain packet header metadata at the socket
	1991	* buffer if this is is an MPTCP subflow,
	1992	* otherwise move it.
	1993	*/
	1994	copymode = M_COPYM_MOVE_HDR;
	1995	#if MPTCP
	1996	if (so->so_flags & SOF_MP_SUBFLOW) {
	1997	copymode = M_COPYM_NOOP_HDR;
	1998	}
	1999	#endif /* MPTCP */
	2000	if (m != NULL) {
	2001	m->m_next = m_copym_mode(so->so_snd.sb_mb,
	2002	off, (int)len, M_DONTWAIT, copymode);
	2003	if (m->m_next == NULL) {
	2004	(void) m_free(m);
	2005	error = ENOBUFS;
	2006	goto out;
	2007	}
	2008	} else {
	2009	/*
	2010	* make sure we still have data left
	2011	* to be sent at this point
	2012	*/
	2013	if (so->so_snd.sb_mb == NULL) {
	2014	error = 0; /* should we return an error? */
	2015	goto out;
	2016	}
	2017
	2018	/*
	2019	* m_copym_with_hdrs will always return the
	2020	* last mbuf pointer and the offset into it that
	2021	* it acted on to fullfill the current request,
	2022	* whether a valid 'hint' was passed in or not.
	2023	*/
	2024	if ((m = m_copym_with_hdrs(so->so_snd.sb_mb,
	2025	off, len, M_DONTWAIT, NULL, NULL,
	2026	copymode)) == NULL) {
	2027	error = ENOBUFS;
	2028	goto out;
	2029	}
	2030	m->m_data += max_linkhdr;
	2031	m->m_len = hdrlen;
	2032	}
	2033	}
	2034	/*
	2035	* If we're sending everything we've got, set PUSH.
	2036	* (This will keep happy those implementations which only
	2037	* give data to the user when a buffer fills or
	2038	* a PUSH comes in.)
	2039	*
	2040	* On SYN-segments we should not add the PUSH-flag.
	2041	*/
	2042	if (off + len == so->so_snd.sb_cc && !(flags & TH_SYN))
	2043	flags \|= TH_PUSH;
	2044	} else {
	2045	if (tp->t_flags & TF_ACKNOW)
	2046	tcpstat.tcps_sndacks++;
	2047	else if (flags & (TH_SYN\|TH_FIN\|TH_RST))
	2048	tcpstat.tcps_sndctrl++;
	2049	else if (SEQ_GT(tp->snd_up, tp->snd_una))
	2050	tcpstat.tcps_sndurg++;
	2051	else
	2052	tcpstat.tcps_sndwinup++;
	2053
	2054	MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
	2055	if (m == NULL) {
	2056	error = ENOBUFS;
	2057	goto out;
	2058	}
	2059	if (MHLEN < (hdrlen + max_linkhdr)) {
	2060	MCLGET(m, M_DONTWAIT);
	2061	if ((m->m_flags & M_EXT) == 0) {
	2062	m_freem(m);
	2063	error = ENOBUFS;
	2064	goto out;
	2065	}
	2066	}
	2067	m->m_data += max_linkhdr;
	2068	m->m_len = hdrlen;
	2069	}
	2070	m->m_pkthdr.rcvif = 0;
	2071	#if CONFIG_MACF_NET
	2072	mac_mbuf_label_associate_inpcb(inp, m);
	2073	#endif
	2074	#if INET6
	2075	if (isipv6) {
	2076	ip6 = mtod(m, struct ip6_hdr *);
	2077	th = (struct tcphdr )(void )(ip6 + 1);
	2078	tcp_fillheaders(tp, ip6, th);
	2079	if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
	2080	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
	2081	ip6->ip6_flow \|= htonl(IPTOS_ECN_ECT0 << 20);
	2082	}
	2083	svc_flags \|= PKT_SCF_IPV6;
	2084	#if PF_ECN
	2085	m_pftag(m)->pftag_hdr = (void *)ip6;
	2086	m_pftag(m)->pftag_flags \|= PF_TAG_HDR_INET6;
	2087	#endif /* PF_ECN */
	2088	} else
	2089	#endif /* INET6 */
	2090	{
	2091	ip = mtod(m, struct ip *);
	2092	ipov = (struct ipovly *)ip;
	2093	th = (struct tcphdr )(void )(ip + 1);
	2094	/* this picks up the pseudo header (w/o the length) */
	2095	tcp_fillheaders(tp, ip, th);
	2096	if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
	2097	!SEQ_LT(tp->snd_nxt, tp->snd_max) &&
	2098	!sack_rxmit && !(flags & TH_SYN)) {
	2099	ip->ip_tos \|= IPTOS_ECN_ECT0;
	2100	}
	2101	#if PF_ECN
	2102	m_pftag(m)->pftag_hdr = (void *)ip;
	2103	m_pftag(m)->pftag_flags \|= PF_TAG_HDR_INET;
	2104	#endif /* PF_ECN */
	2105	}
	2106
	2107	/*
	2108	* Fill in fields, remembering maximum advertised
	2109	* window for use in delaying messages about window sizes.
	2110	* If resending a FIN, be sure not to use a new sequence number.
	2111	*/
	2112	if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
	2113	tp->snd_nxt == tp->snd_max)
	2114	tp->snd_nxt--;
	2115	/*
	2116	* If we are doing retransmissions, then snd_nxt will
	2117	* not reflect the first unsent octet. For ACK only
	2118	* packets, we do not want the sequence number of the
	2119	* retransmitted packet, we want the sequence number
	2120	* of the next unsent octet. So, if there is no data
	2121	* (and no SYN or FIN), use snd_max instead of snd_nxt
	2122	* when filling in ti_seq. But if we are in persist
	2123	* state, snd_max might reflect one byte beyond the
	2124	* right edge of the window, so use snd_nxt in that
	2125	* case, since we know we aren't doing a retransmission.
	2126	* (retransmit and persist are mutually exclusive...)
	2127	*
	2128	* Note the state of this retransmit segment to detect spurious
	2129	* retransmissions.
	2130	*/
	2131	if (sack_rxmit == 0) {
	2132	if (len \|\| (flags & (TH_SYN\|TH_FIN)) \|\|
	2133	tp->t_timer[TCPT_PERSIST]) {
	2134	th->th_seq = htonl(tp->snd_nxt);
	2135	if (len > 0) {
	2136	m->m_pkthdr.tx_start_seq = tp->snd_nxt;
	2137	m->m_pkthdr.pkt_flags \|= PKTF_START_SEQ;
	2138	}
	2139	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
	2140	if (SACK_ENABLED(tp) && len > 1) {
	2141	tcp_rxtseg_insert(tp, tp->snd_nxt,
	2142	(tp->snd_nxt + len - 1));
	2143	}
	2144	if (len > 0)
	2145	m->m_pkthdr.pkt_flags \|=
	2146	PKTF_TCP_REXMT;
	2147	}
	2148	} else {
	2149	th->th_seq = htonl(tp->snd_max);
	2150	}
	2151	} else {
	2152	th->th_seq = htonl(p->rxmit);
	2153	if (len > 0) {
	2154	m->m_pkthdr.pkt_flags \|=
	2155	(PKTF_TCP_REXMT \| PKTF_START_SEQ);
	2156	m->m_pkthdr.tx_start_seq = p->rxmit;
	2157	}
	2158	tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1));
	2159	p->rxmit += len;
	2160	tp->sackhint.sack_bytes_rexmit += len;
	2161	}
	2162	th->th_ack = htonl(tp->rcv_nxt);
	2163	tp->last_ack_sent = tp->rcv_nxt;
	2164	if (optlen) {
	2165	bcopy(opt, th + 1, optlen);
	2166	th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
	2167	}
	2168	th->th_flags = flags;
	2169	th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
	2170	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
	2171	if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) {
	2172	tp->rcv_adv = tp->rcv_nxt + recwin;
	2173	}
	2174	} else {
	2175	struct mptcb *mp_tp = tptomptp(tp);
	2176	if (recwin > 0) {
	2177	tp->rcv_adv = tp->rcv_nxt + recwin;
	2178	}
	2179
	2180	if (recwin > 0 && SEQ_LT(mp_tp->mpt_rcvadv, (uint32_t)mp_tp->mpt_rcvnxt + recwin)) {
	2181	mp_tp->mpt_rcvadv = (uint32_t)mp_tp->mpt_rcvnxt + recwin;
	2182	}
	2183	}
	2184
	2185	/*
	2186	* Adjust the RXWIN0SENT flag - indicate that we have advertised
	2187	* a 0 window. This may cause the remote transmitter to stall. This
	2188	* flag tells soreceive() to disable delayed acknowledgements when
	2189	* draining the buffer. This can occur if the receiver is attempting
	2190	* to read more data then can be buffered prior to transmitting on
	2191	* the connection.
	2192	*/
	2193	if (th->th_win == 0)
	2194	tp->t_flags \|= TF_RXWIN0SENT;
	2195	else
	2196	tp->t_flags &= ~TF_RXWIN0SENT;
	2197
	2198	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
	2199	th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
	2200	th->th_flags \|= TH_URG;
	2201	} else {
	2202	/*
	2203	* If no urgent pointer to send, then we pull
	2204	* the urgent pointer to the left edge of the send window
	2205	* so that it doesn't drift into the send window on sequence
	2206	* number wraparound.
	2207	*/
	2208	tp->snd_up = tp->snd_una; /* drag it along */
	2209	}
	2210
	2211	/*
	2212	* Put TCP length in extended header, and then
	2213	* checksum extended header and data.
	2214	*/
	2215	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
	2216
	2217	/*
	2218	* If this is potentially the last packet on the stream, then mark
	2219	* it in order to enable some optimizations in the underlying
	2220	* layers
	2221	*/
	2222	if (tp->t_state != TCPS_ESTABLISHED &&
	2223	(tp->t_state == TCPS_CLOSING \|\| tp->t_state == TCPS_TIME_WAIT
	2224	\|\| tp->t_state == TCPS_LAST_ACK \|\| (th->th_flags & TH_RST)))
	2225	m->m_pkthdr.pkt_flags \|= PKTF_LAST_PKT;
	2226
	2227	#if INET6
	2228	if (isipv6) {
	2229	/*
	2230	* ip6_plen is not need to be filled now, and will be filled
	2231	* in ip6_output.
	2232	*/
	2233	m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
	2234	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	2235	if (len + optlen)
	2236	th->th_sum = in_addword(th->th_sum,
	2237	htons((u_short)(optlen + len)));
	2238	}
	2239	else
	2240	#endif /* INET6 */
	2241	{
	2242	m->m_pkthdr.csum_flags = CSUM_TCP;
	2243	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	2244	if (len + optlen)
	2245	th->th_sum = in_addword(th->th_sum,
	2246	htons((u_short)(optlen + len)));
	2247	}
	2248
	2249	/*
	2250	* Enable TSO and specify the size of the segments.
	2251	* The TCP pseudo header checksum is always provided.
	2252	*/
	2253	if (tso) {
	2254	#if INET6
	2255	if (isipv6)
	2256	m->m_pkthdr.csum_flags \|= CSUM_TSO_IPV6;
	2257	else
	2258	#endif /* INET6 */
	2259	m->m_pkthdr.csum_flags \|= CSUM_TSO_IPV4;
	2260
	2261	m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
	2262	} else {
	2263	m->m_pkthdr.tso_segsz = 0;
	2264	}
	2265
	2266	/*
	2267	* In transmit state, time the transmission and arrange for
	2268	* the retransmit. In persist state, just set snd_max.
	2269	*/
	2270	if (!(tp->t_flagsext & TF_FORCE)
	2271	\|\| tp->t_timer[TCPT_PERSIST] == 0) {
	2272	tcp_seq startseq = tp->snd_nxt;
	2273
	2274	/*
	2275	* Advance snd_nxt over sequence space of this segment.
	2276	*/
	2277	if (flags & (TH_SYN\|TH_FIN)) {
	2278	if (flags & TH_SYN)
	2279	tp->snd_nxt++;
	2280	if ((flags & TH_FIN) &&
	2281	!(tp->t_flags & TF_SENTFIN)) {
	2282	tp->snd_nxt++;
	2283	tp->t_flags \|= TF_SENTFIN;
	2284	}
	2285	}
	2286	if (sack_rxmit)
	2287	goto timer;
	2288	if (sack_rescue_rxt == TRUE) {
	2289	tp->snd_nxt = old_snd_nxt;
	2290	sack_rescue_rxt = FALSE;
	2291	tcpstat.tcps_pto_in_recovery++;
	2292	} else {
	2293	tp->snd_nxt += len;
	2294	}
	2295	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
	2296	tp->snd_max = tp->snd_nxt;
	2297	tp->t_sndtime = tcp_now;
	2298	/*
	2299	* Time this transmission if not a retransmission and
	2300	* not currently timing anything.
	2301	*/
	2302	if (tp->t_rtttime == 0) {
	2303	tp->t_rtttime = tcp_now;
	2304	tp->t_rtseq = startseq;
	2305	tcpstat.tcps_segstimed++;
	2306
	2307	/* update variables related to pipe ack */
	2308	tp->t_pipeack_lastuna = tp->snd_una;
	2309	}
	2310	}
	2311
	2312	/*
	2313	* Set retransmit timer if not currently set,
	2314	* and not doing an ack or a keep-alive probe.
	2315	*/
	2316	timer:
	2317	if (tp->t_timer[TCPT_REXMT] == 0 &&
	2318	((sack_rxmit && tp->snd_nxt != tp->snd_max) \|\|
	2319	tp->snd_nxt != tp->snd_una \|\| (flags & TH_FIN))) {
	2320	if (tp->t_timer[TCPT_PERSIST]) {
	2321	tp->t_timer[TCPT_PERSIST] = 0;
	2322	tp->t_persist_stop = 0;
	2323	TCP_RESET_REXMT_STATE(tp);
	2324	}
	2325	tp->t_timer[TCPT_REXMT] =
	2326	OFFSET_FROM_START(tp, tp->t_rxtcur);
	2327	}
	2328
	2329	/*
	2330	* Set tail loss probe timeout if new data is being
	2331	* transmitted. This will be supported only when
	2332	* SACK option is enabled on a connection.
	2333	*
	2334	* Every time new data is sent PTO will get reset.
	2335	*/
	2336	if (tcp_enable_tlp && len != 0 && tp->t_state == TCPS_ESTABLISHED &&
	2337	SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
	2338	tp->snd_nxt == tp->snd_max &&
	2339	SEQ_GT(tp->snd_nxt, tp->snd_una) &&
	2340	tp->t_rxtshift == 0 &&
	2341	(tp->t_flagsext & (TF_SENT_TLPROBE\|TF_PKTS_REORDERED)) == 0) {
	2342	u_int32_t pto, srtt;
	2343
	2344	/*
	2345	* Using SRTT alone to set PTO can cause spurious
	2346	* retransmissions on wireless networks where there
	2347	* is a lot of variance in RTT. Taking variance
	2348	* into account will avoid this.
	2349	*/
	2350	srtt = tp->t_srtt >> TCP_RTT_SHIFT;
	2351	pto = ((TCP_REXMTVAL(tp)) * 3) >> 1;
	2352	pto = max (2 * srtt, pto);
	2353	if ((tp->snd_max - tp->snd_una) == tp->t_maxseg)
	2354	pto = max(pto,
	2355	(((3 * pto) >> 2) + tcp_delack * 2));
	2356	else
	2357	pto = max(10, pto);
	2358
	2359	/* if RTO is less than PTO, choose RTO instead */
	2360	if (tp->t_rxtcur < pto)
	2361	pto = tp->t_rxtcur;
	2362
	2363	tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto);
	2364	}
	2365	} else {
	2366	/*
	2367	* Persist case, update snd_max but since we are in
	2368	* persist mode (no window) we do not update snd_nxt.
	2369	*/
	2370	int xlen = len;
	2371	if (flags & TH_SYN)
	2372	++xlen;
	2373	if ((flags & TH_FIN) &&
	2374	!(tp->t_flags & TF_SENTFIN)) {
	2375	++xlen;
	2376	tp->t_flags \|= TF_SENTFIN;
	2377	}
	2378	if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
	2379	tp->snd_max = tp->snd_nxt + len;
	2380	tp->t_sndtime = tcp_now;
	2381	}
	2382	}
	2383
	2384	#if TCPDEBUG
	2385	/*
	2386	* Trace.
	2387	*/
	2388	if (so_options & SO_DEBUG)
	2389	tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
	2390	#endif
	2391
	2392	/*
	2393	* Fill in IP length and desired time to live and
	2394	* send to IP level. There should be a better way
	2395	* to handle ttl and tos; we could keep them in
	2396	* the template, but need a way to checksum without them.
	2397	*/
	2398	#if INET6
	2399	/*
	2400	* m->m_pkthdr.len should have been set before cksum calcuration,
	2401	* because in6_cksum() need it.
	2402	*/
	2403	if (isipv6) {
	2404	/*
	2405	* we separately set hoplimit for every segment, since the
	2406	* user might want to change the value via setsockopt.
	2407	* Also, desired default hop limit might be changed via
	2408	* Neighbor Discovery.
	2409	*/
	2410	ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
	2411	inp->in6p_route.ro_rt->rt_ifp : NULL);
	2412
	2413	/* TODO: IPv6 IP6TOS_ECT bit on */
	2414	KERNEL_DEBUG(DBG_LAYER_BEG,
	2415	((inp->inp_fport << 16) \| inp->inp_lport),
	2416	(((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) \|
	2417	(inp->in6p_faddr.s6_addr16[0] & 0xffff)),
	2418	sendalot,0,0);
	2419	} else
	2420	#endif /* INET6 */
	2421	{
	2422	ip->ip_len = m->m_pkthdr.len;
	2423	ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
	2424	ip->ip_tos \|= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */
	2425	KERNEL_DEBUG(DBG_LAYER_BEG,
	2426	((inp->inp_fport << 16) \| inp->inp_lport),
	2427	(((inp->inp_laddr.s_addr & 0xffff) << 16) \|
	2428	(inp->inp_faddr.s_addr & 0xffff)), 0,0,0);
	2429	}
	2430
	2431	/*
	2432	* See if we should do MTU discovery.
	2433	* Look at the flag updated on the following criterias:
	2434	* 1) Path MTU discovery is authorized by the sysctl
	2435	* 2) The route isn't set yet (unlikely but could happen)
	2436	* 3) The route is up
	2437	* 4) the MTU is not locked (if it is, then discovery has been
	2438	* disabled for that route)
	2439	*/
	2440	#if INET6
	2441	if (!isipv6)
	2442	#endif /* INET6 */
	2443	if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
	2444	ip->ip_off \|= IP_DF;
	2445
	2446	#if NECP
	2447	{
	2448	necp_kernel_policy_id policy_id;
	2449	necp_kernel_policy_id skip_policy_id;
	2450	u_int32_t route_rule_id;
	2451	if (!necp_socket_is_allowed_to_send_recv(inp, NULL, &policy_id, &route_rule_id, &skip_policy_id)) {
	2452	TCP_LOG_DROP_NECP(isipv6 ? (void )ip6 : (void )ip, th, tp, true);
	2453	m_freem(m);
	2454	error = EHOSTUNREACH;
	2455	goto out;
	2456	}
	2457	necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id);
	2458
	2459	if (net_qos_policy_restricted != 0) {
	2460	necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt,
	2461	NULL, route_rule_id);
	2462	}
	2463	}
	2464	#endif /* NECP */
	2465
	2466	#if IPSEC
	2467	if (inp->inp_sp != NULL)
	2468	ipsec_setsocket(m, so);
	2469	#endif /IPSEC/
	2470
	2471	/*
	2472	* The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
	2473	*/
	2474	lost = 0;
	2475
	2476	/*
	2477	* Embed the flow hash in pkt hdr and mark the packet as
	2478	* capable of flow controlling
	2479	*/
	2480	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
	2481	m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
	2482	m->m_pkthdr.pkt_flags \|= (PKTF_FLOW_ID \| PKTF_FLOW_LOCALSRC \| PKTF_FLOW_ADV);
	2483	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
	2484	m->m_pkthdr.tx_tcp_pid = so->last_pid;
	2485	if (so->so_flags & SOF_DELEGATED)
	2486	m->m_pkthdr.tx_tcp_e_pid = so->e_pid;
	2487	else
	2488	m->m_pkthdr.tx_tcp_e_pid = 0;
	2489
	2490	m->m_nextpkt = NULL;
	2491
	2492	if (inp->inp_last_outifp != NULL &&
	2493	!(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
	2494	/* Hint to prioritize this packet if
	2495	* 1. if the packet has no data
	2496	* 2. the interface supports transmit-start model and did
	2497	* not disable ACK prioritization.
	2498	* 3. Only ACK flag is set.
	2499	* 4. there is no outstanding data on this connection.
	2500	*/
	2501	if (tcp_prioritize_acks != 0 && len == 0 &&
	2502	(inp->inp_last_outifp->if_eflags &
	2503	(IFEF_TXSTART \| IFEF_NOACKPRI)) == IFEF_TXSTART) {
	2504	if (th->th_flags == TH_ACK &&
	2505	tp->snd_una == tp->snd_max &&
	2506	tp->t_timer[TCPT_REXMT] == 0)
	2507	svc_flags \|= PKT_SCF_TCP_ACK;
	2508	if (th->th_flags & TH_SYN)
	2509	svc_flags \|= PKT_SCF_TCP_SYN;
	2510	}
	2511	set_packet_service_class(m, so, sotc, svc_flags);
	2512	} else {
	2513	/*
	2514	* Optimization for loopback just set the mbuf
	2515	* service class
	2516	*/
	2517	(void) m_set_service_class(m, so_tc2msc(sotc));
	2518	}
	2519
	2520	TCP_LOG_TH_FLAGS(isipv6 ? (void )ip6 : (void )ip, th, tp, true,
	2521	inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
	2522	inp->inp_boundifp);
	2523
	2524	tp->t_pktlist_sentlen += len;
	2525	tp->t_lastchain++;
	2526
	2527	#if INET6
	2528	if (isipv6) {
	2529	DTRACE_TCP5(send, struct mbuf , m, struct inpcb , inp,
	2530	struct ip6 , ip6, struct tcpcb , tp, struct tcphdr *,
	2531	th);
	2532	} else
	2533	#endif /* INET6 */
	2534	{
	2535	DTRACE_TCP5(send, struct mbuf , m, struct inpcb , inp,
	2536	struct ip , ip, struct tcpcb , tp, struct tcphdr *, th);
	2537	}
	2538
	2539	if (tp->t_pktlist_head != NULL) {
	2540	tp->t_pktlist_tail->m_nextpkt = m;
	2541	tp->t_pktlist_tail = m;
	2542	} else {
	2543	packchain_newlist++;
	2544	tp->t_pktlist_head = tp->t_pktlist_tail = m;
	2545	}
	2546
	2547	if (lro_ackmore && !sackoptlen && tp->t_timer[TCPT_PERSIST] == 0 &&
	2548	(th->th_flags & TH_ACK) == TH_ACK && len == 0 &&
	2549	tp->t_state == TCPS_ESTABLISHED) {
	2550	/* For a pure ACK, see if you need to send more of them */
	2551	mnext = tcp_send_lroacks(tp, m, th);
	2552	if (mnext) {
	2553	tp->t_pktlist_tail->m_nextpkt = mnext;
	2554	if (mnext->m_nextpkt == NULL) {
	2555	tp->t_pktlist_tail = mnext;
	2556	tp->t_lastchain++;
	2557	} else {
	2558	struct mbuf tail, next;
	2559	next = mnext->m_nextpkt;
	2560	tail = next->m_nextpkt;
	2561	while (tail) {
	2562	next = tail;
	2563	tail = tail->m_nextpkt;
	2564	tp->t_lastchain++;
	2565	}
	2566	tp->t_pktlist_tail = next;
	2567	}
	2568	}
	2569	}
	2570
	2571	if (sendalot == 0 \|\| (tp->t_state != TCPS_ESTABLISHED) \|\|
	2572	(tp->snd_cwnd <= (tp->snd_wnd / 8)) \|\|
	2573	(tp->t_flags & TF_ACKNOW) \|\|
	2574	(tp->t_flagsext & TF_FORCE) \|\|
	2575	tp->t_lastchain >= tcp_packet_chaining) {
	2576	error = 0;
	2577	while (inp->inp_sndinprog_cnt == 0 &&
	2578	tp->t_pktlist_head != NULL) {
	2579	packetlist = tp->t_pktlist_head;
	2580	packchain_listadd = tp->t_lastchain;
	2581	packchain_sent++;
	2582	lost = tp->t_pktlist_sentlen;
	2583	TCP_PKTLIST_CLEAR(tp);
	2584
	2585	error = tcp_ip_output(so, tp, packetlist,
	2586	packchain_listadd, tp_inp_options,
	2587	(so_options & SO_DONTROUTE),
	2588	(sack_rxmit \|\| (sack_bytes_rxmt != 0)), isipv6);
	2589	if (error) {
	2590	/*
	2591	* Take into account the rest of unsent
	2592	* packets in the packet list for this tcp
	2593	* into "lost", since we're about to free
	2594	* the whole list below.
	2595	*/
	2596	lost += tp->t_pktlist_sentlen;
	2597	break;
	2598	} else {
	2599	lost = 0;
	2600	}
	2601	}
	2602	/* tcp was closed while we were in ip; resume close */
	2603	if (inp->inp_sndinprog_cnt == 0 &&
	2604	(tp->t_flags & TF_CLOSING)) {
	2605	tp->t_flags &= ~TF_CLOSING;
	2606	(void) tcp_close(tp);
	2607	return 0;
	2608	}
	2609	} else {
	2610	error = 0;
	2611	packchain_looped++;
	2612	tcpstat.tcps_sndtotal++;
	2613
	2614	goto again;
	2615	}
	2616	if (error) {
	2617	/*
	2618	* Assume that the packets were lost, so back out the
	2619	* sequence number advance, if any. Note that the "lost"
	2620	* variable represents the amount of user data sent during
	2621	* the recent call to ip_output_list() plus the amount of
	2622	* user data in the packet list for this tcp at the moment.
	2623	*/
	2624	if (!(tp->t_flagsext & TF_FORCE)
	2625	\|\| tp->t_timer[TCPT_PERSIST] == 0) {
	2626	/*
	2627	* No need to check for TH_FIN here because
	2628	* the TF_SENTFIN flag handles that case.
	2629	*/
	2630	if ((flags & TH_SYN) == 0) {
	2631	if (sack_rxmit) {
	2632	if (SEQ_GT((p->rxmit - lost),
	2633	tp->snd_una)) {
	2634	p->rxmit -= lost;
	2635	} else {
	2636	lost = p->rxmit - tp->snd_una;
	2637	p->rxmit = tp->snd_una;
	2638	}
	2639	tp->sackhint.sack_bytes_rexmit -= lost;
	2640	} else {
	2641	if (SEQ_GT((tp->snd_nxt - lost),
	2642	tp->snd_una))
	2643	tp->snd_nxt -= lost;
	2644	else
	2645	tp->snd_nxt = tp->snd_una;
	2646	}
	2647	}
	2648	}
	2649	out:
	2650	if (tp->t_pktlist_head != NULL)
	2651	m_freem_list(tp->t_pktlist_head);
	2652	TCP_PKTLIST_CLEAR(tp);
	2653
	2654	if (error == ENOBUFS) {
	2655	/*
	2656	* Set retransmit timer if not currently set
	2657	* when we failed to send a segment that can be
	2658	* retransmitted (i.e. not pure ack or rst)
	2659	*/
	2660	if (tp->t_timer[TCPT_REXMT] == 0 &&
	2661	tp->t_timer[TCPT_PERSIST] == 0 &&
	2662	(len != 0 \|\| (flags & (TH_SYN \| TH_FIN)) != 0 \|\|
	2663	so->so_snd.sb_cc > 0))
	2664	tp->t_timer[TCPT_REXMT] =
	2665	OFFSET_FROM_START(tp, tp->t_rxtcur);
	2666	tp->snd_cwnd = tp->t_maxseg;
	2667	tp->t_bytes_acked = 0;
	2668	tcp_check_timer_state(tp);
	2669	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	2670
	2671	tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR);
	2672	return 0;
	2673	}
	2674	if (error == EMSGSIZE) {
	2675	/*
	2676	* ip_output() will have already fixed the route
	2677	* for us. tcp_mtudisc() will, as its last action,
	2678	* initiate retransmission, so it is important to
	2679	* not do so here.
	2680	*
	2681	* If TSO was active we either got an interface
	2682	* without TSO capabilits or TSO was turned off.
	2683	* Disable it for this connection as too and
	2684	* immediatly retry with MSS sized segments generated
	2685	* by this function.
	2686	*/
	2687	if (tso)
	2688	tp->t_flags &= ~TF_TSO;
	2689
	2690	tcp_mtudisc(inp, 0);
	2691	tcp_check_timer_state(tp);
	2692
	2693	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	2694	return 0;
	2695	}
	2696	/*
	2697	* Unless this is due to interface restriction policy,
	2698	* treat EHOSTUNREACH/ENETDOWN as a soft error.
	2699	*/
	2700	if ((error == EHOSTUNREACH \|\| error == ENETDOWN) &&
	2701	TCPS_HAVERCVDSYN(tp->t_state) &&
	2702	!inp_restricted_send(inp, inp->inp_last_outifp)) {
	2703	tp->t_softerror = error;
	2704	error = 0;
	2705	}
	2706	tcp_check_timer_state(tp);
	2707	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	2708	return error;
	2709	}
	2710
	2711	tcpstat.tcps_sndtotal++;
	2712
	2713	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END,0,0,0,0,0);
	2714	if (sendalot)
	2715	goto again;
	2716
	2717	tcp_check_timer_state(tp);
	2718
	2719	return 0;
	2720	}
	2721
	2722	static int
	2723	tcp_ip_output(struct socket so, struct tcpcb tp, struct mbuf *pkt,
	2724	int cnt, struct mbuf *opt, int flags, int sack_in_progress, boolean_t isipv6)
	2725	{
	2726	int error = 0;
	2727	boolean_t chain;
	2728	boolean_t unlocked = FALSE;
	2729	boolean_t ifdenied = FALSE;
	2730	struct inpcb *inp = tp->t_inpcb;
	2731	struct ip_out_args ipoa;
	2732	struct route ro;
	2733	struct ifnet *outif = NULL;
	2734
	2735	bzero(&ipoa, sizeof(ipoa));
	2736	ipoa.ipoa_boundif = IFSCOPE_NONE;
	2737	ipoa.ipoa_flags = IPOAF_SELECT_SRCIF \| IPOAF_BOUND_SRCADDR;
	2738	ipoa.ipoa_sotc = SO_TC_UNSPEC;
	2739	ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
	2740	#if INET6
	2741	struct ip6_out_args ip6oa;
	2742	struct route_in6 ro6;
	2743
	2744	bzero(&ip6oa, sizeof(ip6oa));
	2745	ip6oa.ip6oa_boundif = IFSCOPE_NONE;
	2746	ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF \| IP6OAF_BOUND_SRCADDR;
	2747	ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
	2748	ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
	2749
	2750	struct flowadv *adv =
	2751	(isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv);
	2752	#else /* INET6 */
	2753	struct flowadv *adv = &ipoa.ipoa_flowadv;
	2754	#endif /* !INET6 */
	2755
	2756	/* If socket was bound to an ifindex, tell ip_output about it */
	2757	if (inp->inp_flags & INP_BOUND_IF) {
	2758	#if INET6
	2759	if (isipv6) {
	2760	ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index;
	2761	ip6oa.ip6oa_flags \|= IP6OAF_BOUND_IF;
	2762	} else
	2763	#endif /* INET6 */
	2764	{
	2765	ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
	2766	ipoa.ipoa_flags \|= IPOAF_BOUND_IF;
	2767	}
	2768	}
	2769
	2770	if (INP_NO_CELLULAR(inp)) {
	2771	#if INET6
	2772	if (isipv6)
	2773	ip6oa.ip6oa_flags \|= IP6OAF_NO_CELLULAR;
	2774	else
	2775	#endif /* INET6 */
	2776	ipoa.ipoa_flags \|= IPOAF_NO_CELLULAR;
	2777	}
	2778	if (INP_NO_EXPENSIVE(inp)) {
	2779	#if INET6
	2780	if (isipv6)
	2781	ip6oa.ip6oa_flags \|= IP6OAF_NO_EXPENSIVE;
	2782	else
	2783	#endif /* INET6 */
	2784	ipoa.ipoa_flags \|= IPOAF_NO_EXPENSIVE;
	2785
	2786	}
	2787	if (INP_NO_CONSTRAINED(inp)) {
	2788	#if INET6
	2789	if (isipv6)
	2790	ip6oa.ip6oa_flags \|= IP6OAF_NO_CONSTRAINED;
	2791	else
	2792	#endif /* INET6 */
	2793	ipoa.ipoa_flags \|= IPOAF_NO_CONSTRAINED;
	2794	}
	2795	if (INP_AWDL_UNRESTRICTED(inp)) {
	2796	#if INET6
	2797	if (isipv6)
	2798	ip6oa.ip6oa_flags \|= IP6OAF_AWDL_UNRESTRICTED;
	2799	else
	2800	#endif /* INET6 */
	2801	ipoa.ipoa_flags \|= IPOAF_AWDL_UNRESTRICTED;
	2802
	2803	}
	2804	#if INET6
	2805	if (INP_INTCOPROC_ALLOWED(inp) && isipv6) {
	2806	ip6oa.ip6oa_flags \|= IP6OAF_INTCOPROC_ALLOWED;
	2807	}
	2808	if (isipv6) {
	2809	ip6oa.ip6oa_sotc = so->so_traffic_class;
	2810	ip6oa.ip6oa_netsvctype = so->so_netsvctype;
	2811	} else
	2812	#endif /* INET6 */
	2813	{
	2814	ipoa.ipoa_sotc = so->so_traffic_class;
	2815	ipoa.ipoa_netsvctype = so->so_netsvctype;
	2816	}
	2817	if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
	2818	#if INET6
	2819	if (isipv6)
	2820	ip6oa.ip6oa_flags \|= IP6OAF_QOSMARKING_ALLOWED;
	2821	else
	2822	#endif /* INET6 */
	2823	ipoa.ipoa_flags \|= IPOAF_QOSMARKING_ALLOWED;
	2824	}
	2825	#if INET6
	2826	if (isipv6)
	2827	flags \|= IPV6_OUTARGS;
	2828	else
	2829	#endif /* INET6 */
	2830	flags \|= IP_OUTARGS;
	2831
	2832	/* Copy the cached route and take an extra reference */
	2833	#if INET6
	2834	if (isipv6)
	2835	in6p_route_copyout(inp, &ro6);
	2836	else
	2837	#endif /* INET6 */
	2838	inp_route_copyout(inp, &ro);
	2839
	2840	/*
	2841	* Make sure ACK/DELACK conditions are cleared before
	2842	* we unlock the socket.
	2843	*/
	2844	tp->last_ack_sent = tp->rcv_nxt;
	2845	tp->t_flags &= ~(TF_ACKNOW \| TF_DELACK);
	2846	tp->t_timer[TCPT_DELACK] = 0;
	2847	tp->t_unacksegs = 0;
	2848
	2849	/* Increment the count of outstanding send operations */
	2850	inp->inp_sndinprog_cnt++;
	2851
	2852	/*
	2853	* If allowed, unlock TCP socket while in IP
	2854	* but only if the connection is established and
	2855	* in a normal mode where reentrancy on the tcpcb won't be
	2856	* an issue:
	2857	* - there is no SACK episode
	2858	* - we're not in Fast Recovery mode
	2859	* - if we're not sending from an upcall.
	2860	*/
	2861	if (tcp_output_unlocked && !so->so_upcallusecount &&
	2862	(tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
	2863	!IN_FASTRECOVERY(tp) && !(so->so_flags & SOF_MP_SUBFLOW)) {
	2864
	2865	unlocked = TRUE;
	2866	socket_unlock(so, 0);
	2867	}
	2868
	2869	/*
	2870	* Don't send down a chain of packets when:
	2871	* - TCP chaining is disabled
	2872	* - there is an IPsec rule set
	2873	* - there is a non default rule set for the firewall
	2874	*/
	2875
	2876	chain = tcp_packet_chaining > 1
	2877	#if IPSEC
	2878	&& ipsec_bypass
	2879	#endif
	2880	#if IPFIREWALL
	2881	&& (fw_enable == 0 \|\| fw_bypass)
	2882	#endif
	2883	; // I'm important, not extraneous
	2884
	2885	while (pkt != NULL) {
	2886	struct mbuf *npkt = pkt->m_nextpkt;
	2887
	2888	if (!chain) {
	2889	pkt->m_nextpkt = NULL;
	2890	/*
	2891	* If we are not chaining, make sure to set the packet
	2892	* list count to 0 so that IP takes the right path;
	2893	* this is important for cases such as IPsec where a
	2894	* single mbuf might result in multiple mbufs as part
	2895	* of the encapsulation. If a non-zero count is passed
	2896	* down to IP, the head of the chain might change and
	2897	* we could end up skipping it (thus generating bogus
	2898	* packets). Fixing it in IP would be desirable, but
	2899	* for now this would do it.
	2900	*/
	2901	cnt = 0;
	2902	}
	2903	#if INET6
	2904	if (isipv6) {
	2905	error = ip6_output_list(pkt, cnt,
	2906	inp->in6p_outputopts, &ro6, flags, NULL, NULL,
	2907	&ip6oa);
	2908	ifdenied = (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED);
	2909	} else {
	2910	#endif /* INET6 */
	2911	error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL,
	2912	&ipoa);
	2913	ifdenied = (ipoa.ipoa_retflags & IPOARF_IFDENIED);
	2914	}
	2915
	2916	if (chain \|\| error) {
	2917	/*
	2918	* If we sent down a chain then we are done since
	2919	* the callee had taken care of everything; else
	2920	* we need to free the rest of the chain ourselves.
	2921	*/
	2922	if (!chain)
	2923	m_freem_list(npkt);
	2924	break;
	2925	}
	2926	pkt = npkt;
	2927	}
	2928
	2929	if (unlocked)
	2930	socket_lock(so, 0);
	2931
	2932	/*
	2933	* Enter flow controlled state if the connection is established
	2934	* and is not in recovery. Flow control is allowed only if there
	2935	* is outstanding data.
	2936	*
	2937	* A connection will enter suspended state even if it is in
	2938	* recovery.
	2939	*/
	2940	if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) \|\|
	2941	adv->code == FADV_SUSPENDED) &&
	2942	!(tp->t_flags & TF_CLOSING) &&
	2943	tp->t_state == TCPS_ESTABLISHED &&
	2944	SEQ_GT(tp->snd_max, tp->snd_una)) {
	2945	int rc;
	2946	rc = inp_set_fc_state(inp, adv->code);
	2947
	2948	if (rc == 1)
	2949	tcp_ccdbg_trace(tp, NULL,
	2950	((adv->code == FADV_FLOW_CONTROLLED) ?
	2951	TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND));
	2952	}
	2953
	2954	/*
	2955	* When an interface queue gets suspended, some of the
	2956	* packets are dropped. Return ENOBUFS, to update the
	2957	* pcb state.
	2958	*/
	2959	if (adv->code == FADV_SUSPENDED)
	2960	error = ENOBUFS;
	2961
	2962	VERIFY(inp->inp_sndinprog_cnt > 0);
	2963	if ( --inp->inp_sndinprog_cnt == 0) {
	2964	inp->inp_flags &= ~(INP_FC_FEEDBACK);
	2965	if (inp->inp_sndingprog_waiters > 0) {
	2966	wakeup(&inp->inp_sndinprog_cnt);
	2967	}
	2968	}
	2969
	2970	#if INET6
	2971	if (isipv6) {
	2972	/*
	2973	* When an NECP IP tunnel policy forces the outbound interface,
	2974	* ip6_output_list() informs the transport layer what is the actual
	2975	* outgoing interface
	2976	*/
	2977	if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) {
	2978	outif = ifindex2ifnet[ip6oa.ip6oa_boundif];
	2979	} else if (ro6.ro_rt != NULL) {
	2980	outif = ro6.ro_rt->rt_ifp;
	2981	}
	2982	} else
	2983	#endif /* INET6 */
	2984	if (ro.ro_rt != NULL)
	2985	outif = ro.ro_rt->rt_ifp;
	2986
	2987	if (outif != NULL && outif != inp->inp_last_outifp) {
	2988	/* Update the send byte count */
	2989	if (so->so_snd.sb_cc > 0 && so->so_snd.sb_flags & SB_SNDBYTE_CNT) {
	2990	inp_decr_sndbytes_total(so, so->so_snd.sb_cc);
	2991	inp_decr_sndbytes_allunsent(so, tp->snd_una);
	2992	so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT;
	2993	}
	2994	inp->inp_last_outifp = outif;
	2995
	2996	}
	2997
	2998	if (error != 0 && ifdenied &&
	2999	(INP_NO_CELLULAR(inp) \|\| INP_NO_EXPENSIVE(inp) \|\| INP_NO_CONSTRAINED(inp)))
	3000	soevent(so,
	3001	(SO_FILT_HINT_LOCKED\|SO_FILT_HINT_IFDENIED));
	3002
	3003	/* Synchronize cached PCB route & options */
	3004	#if INET6
	3005	if (isipv6)
	3006	in6p_route_copyin(inp, &ro6);
	3007	else
	3008	#endif /* INET6 */
	3009	inp_route_copyin(inp, &ro);
	3010
	3011	if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 &&
	3012	tp->t_inpcb->inp_route.ro_rt != NULL) {
	3013	/* If we found the route and there is an rtt on it
	3014	* reset the retransmit timer
	3015	*/
	3016	tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt);
	3017	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
	3018	}
	3019	return error;
	3020	}
	3021
	3022	int tcptv_persmin_val = TCPTV_PERSMIN;
	3023
	3024	void
	3025	tcp_setpersist(struct tcpcb *tp)
	3026	{
	3027	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
	3028
	3029	/* If a PERSIST_TIMER option was set we will limit the
	3030	* time the persist timer will be active for that connection
	3031	* in order to avoid DOS by using zero window probes.
	3032	* see rdar://5805356
	3033	*/
	3034
	3035	if (tp->t_persist_timeout != 0 &&
	3036	tp->t_timer[TCPT_PERSIST] == 0 &&
	3037	tp->t_persist_stop == 0) {
	3038	tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
	3039	}
	3040
	3041	/*
	3042	* Start/restart persistance timer.
	3043	*/
	3044	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
	3045	t * tcp_backoff[tp->t_rxtshift],
	3046	tcptv_persmin_val, TCPTV_PERSMAX, 0);
	3047	tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
	3048
	3049	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
	3050	tp->t_rxtshift++;
	3051	}
	3052
	3053	/*
	3054	* Send as many acks as data coalesced. Every other packet when stretch
	3055	* ACK is not enabled. Every 8 packets, if stretch ACK is enabled.
	3056	*/
	3057	static struct mbuf*
	3058	tcp_send_lroacks(struct tcpcb tp, struct mbuf m, struct tcphdr *th)
	3059	{
	3060	struct mbuf mnext = NULL, ack_chain = NULL, *tail = NULL;
	3061	int count = 0;
	3062	tcp_seq org_ack = ntohl(th->th_ack);
	3063	tcp_seq prev_ack = 0;
	3064	int tack_offset = 28; /* IPv6 and IP options not supported */
	3065	int twin_offset = 34; /* IPv6 and IP options not supported */
	3066	int ack_size = (tp->t_flags & TF_STRETCHACK) ?
	3067	(maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1);
	3068	int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2;
	3069	struct mbuf *prev_ack_pkt = NULL;
	3070	struct socket *so = tp->t_inpcb->inp_socket;
	3071	unsigned short winsz = ntohs(th->th_win);
	3072	unsigned int scaled_win = winsz<<tp->rcv_scale;
	3073	tcp_seq win_rtedge = org_ack + scaled_win;
	3074
	3075	count = tp->t_lropktlen/tp->t_maxseg;
	3076
	3077	prev_ack = (org_ack - tp->t_lropktlen) + ack_size;
	3078	if (prev_ack < org_ack) {
	3079	ack_chain = m_dup(m, M_DONTWAIT);
	3080	if (ack_chain) {
	3081	th->th_ack = htonl(prev_ack);
	3082	/* Keep adv window constant for duplicated ACK packets */
	3083	scaled_win = win_rtedge - prev_ack;
	3084	if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
	3085	scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	3086	th->th_win = htons(scaled_win>>tp->rcv_scale);
	3087	if (lrodebug == 5) {
	3088	printf("%s: win = %d winsz = %d sc = %d"
	3089	" lro_len %d %d\n",
	3090	__func__, scaled_win>>tp->rcv_scale, winsz,
	3091	tp->rcv_scale, tp->t_lropktlen, count);
	3092	}
	3093	tail = ack_chain;
	3094	count -= segs_acked; /* accounts for prev_ack packet */
	3095	count = (count <= segs_acked) ? 0 : count - segs_acked;
	3096	tcpstat.tcps_sndacks++;
	3097	so_tc_update_stats(m, so, m_get_service_class(m));
	3098	} else {
	3099	return NULL;
	3100	}
	3101	}
	3102	else {
	3103	tp->t_lropktlen = 0;
	3104	return NULL;
	3105	}
	3106
	3107	prev_ack_pkt = ack_chain;
	3108
	3109	while (count > 0) {
	3110	if ((prev_ack + ack_size) < org_ack) {
	3111	prev_ack += ack_size;
	3112	} else {
	3113	/*
	3114	* The last ACK sent must have the ACK number that TCP
	3115	* thinks is the last sent ACK number.
	3116	*/
	3117	prev_ack = org_ack;
	3118	}
	3119	mnext = m_dup(prev_ack_pkt, M_DONTWAIT);
	3120	if (mnext) {
	3121	/* Keep adv window constant for duplicated ACK packets */
	3122	scaled_win = win_rtedge - prev_ack;
	3123	if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
	3124	scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	3125	winsz = htons(scaled_win>>tp->rcv_scale);
	3126	if (lrodebug == 5) {
	3127	printf("%s: winsz = %d ack %x count %d\n",
	3128	__func__, scaled_win>>tp->rcv_scale,
	3129	prev_ack, count);
	3130	}
	3131	bcopy(&winsz, mtod(prev_ack_pkt, caddr_t) + twin_offset, 2);
	3132	HTONL(prev_ack);
	3133	bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4);
	3134	NTOHL(prev_ack);
	3135	tail->m_nextpkt = mnext;
	3136	tail = mnext;
	3137	count -= segs_acked;
	3138	tcpstat.tcps_sndacks++;
	3139	so_tc_update_stats(m, so, m_get_service_class(m));
	3140	} else {
	3141	if (lrodebug == 5) {
	3142	printf("%s: failed to alloc mbuf.\n", __func__);
	3143	}
	3144	break;
	3145	}
	3146	prev_ack_pkt = mnext;
	3147	}
	3148	tp->t_lropktlen = 0;
	3149	return ack_chain;
	3150	}
	3151
	3152	static int
	3153	tcp_recv_throttle (struct tcpcb *tp)
	3154	{
	3155	uint32_t base_rtt, newsize;
	3156	struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv;
	3157
	3158	if (tcp_use_rtt_recvbg == 1 &&
	3159	TSTMP_SUPPORTED(tp)) {
	3160	/*
	3161	* Timestamps are supported on this connection. Use
	3162	* RTT to look for an increase in latency.
	3163	*/
	3164
	3165	/*
	3166	* If the connection is already being throttled, leave it
	3167	* in that state until rtt comes closer to base rtt
	3168	*/
	3169	if (tp->t_flagsext & TF_RECV_THROTTLE)
	3170	return 1;
	3171
	3172	base_rtt = get_base_rtt(tp);
	3173
	3174	if (base_rtt != 0 && tp->t_rttcur != 0) {
	3175	/*
	3176	* if latency increased on a background flow,
	3177	* return 1 to start throttling.
	3178	*/
	3179	if (tp->t_rttcur > (base_rtt + target_qdelay)) {
	3180	tp->t_flagsext \|= TF_RECV_THROTTLE;
	3181	if (tp->t_recv_throttle_ts == 0)
	3182	tp->t_recv_throttle_ts = tcp_now;
	3183	/*
	3184	* Reduce the recv socket buffer size to
	3185	* minimize latecy.
	3186	*/
	3187	if (sbrcv->sb_idealsize >
	3188	tcp_recv_throttle_minwin) {
	3189	newsize = sbrcv->sb_idealsize >> 1;
	3190	/* Set a minimum of 16 K */
	3191	newsize =
	3192	max(newsize,
	3193	tcp_recv_throttle_minwin);
	3194	sbrcv->sb_idealsize = newsize;
	3195	}
	3196	return 1;
	3197	} else {
	3198	return 0;
	3199	}
	3200	}
	3201	}
	3202
	3203	/*
	3204	* Timestamps are not supported or there is no good RTT
	3205	* measurement. Use IPDV in this case.
	3206	*/
	3207	if (tp->acc_iaj > tcp_acc_iaj_react_limit)
	3208	return 1;
	3209
	3210	return 0;
	3211	}