git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2014 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
	30	* The Regents of the University of California. All rights reserved.
	31	*
	32	* Redistribution and use in source and binary forms, with or without
	33	* modification, are permitted provided that the following conditions
	34	* are met:
	35	* 1. Redistributions of source code must retain the above copyright
	36	* notice, this list of conditions and the following disclaimer.
	37	* 2. Redistributions in binary form must reproduce the above copyright
	38	* notice, this list of conditions and the following disclaimer in the
	39	* documentation and/or other materials provided with the distribution.
	40	* 3. All advertising materials mentioning features or use of this software
	41	* must display the following acknowledgement:
	42	* This product includes software developed by the University of
	43	* California, Berkeley and its contributors.
	44	* 4. Neither the name of the University nor the names of its contributors
	45	* may be used to endorse or promote products derived from this software
	46	* without specific prior written permission.
	47	*
	48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	58	* SUCH DAMAGE.
	59	*
	60	* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
	61	* $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
	62	*/
	63	/*
	64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
	65	* support for mandatory and extensible security protections. This notice
	66	* is included in support of clause 2.2 (b) of the Apple Public License,
	67	* Version 2.0.
	68	*/
	69
	70	#define _IP_VHL
	71
	72
	73	#include <sys/param.h>
	74	#include <sys/systm.h>
	75	#include <sys/kernel.h>
	76	#include <sys/sysctl.h>
	77	#include <sys/mbuf.h>
	78	#include <sys/domain.h>
	79	#include <sys/protosw.h>
	80	#include <sys/socket.h>
	81	#include <sys/socketvar.h>
	82
	83	#include <net/route.h>
	84	#include <net/ntstat.h>
	85	#include <net/if_var.h>
	86	#include <net/if.h>
	87	#include <net/if_types.h>
	88	#include <net/dlil.h>
	89
	90	#include <netinet/in.h>
	91	#include <netinet/in_systm.h>
	92	#include <netinet/in_var.h>
	93	#include <netinet/ip.h>
	94	#include <netinet/in_pcb.h>
	95	#include <netinet/ip_var.h>
	96	#include <mach/sdt.h>
	97	#if INET6
	98	#include <netinet6/in6_pcb.h>
	99	#include <netinet/ip6.h>
	100	#include <netinet6/ip6_var.h>
	101	#endif
	102	#include <netinet/tcp.h>
	103	#define TCPOUTFLAGS
	104	#include <netinet/tcp_fsm.h>
	105	#include <netinet/tcp_seq.h>
	106	#include <netinet/tcp_timer.h>
	107	#include <netinet/tcp_var.h>
	108	#include <netinet/tcpip.h>
	109	#include <netinet/tcp_cc.h>
	110	#if TCPDEBUG
	111	#include <netinet/tcp_debug.h>
	112	#endif
	113	#include <sys/kdebug.h>
	114	#include <mach/sdt.h>
	115
	116	#if IPSEC
	117	#include <netinet6/ipsec.h>
	118	#endif /* IPSEC */
	119
	120	#if CONFIG_MACF_NET
	121	#include <security/mac_framework.h>
	122	#endif /* CONFIG_MACF_NET */
	123
	124	#include <netinet/lro_ext.h>
	125	#if MPTCP
	126	#include <netinet/mptcp_var.h>
	127	#include <netinet/mptcp.h>
	128	#include <netinet/mptcp_opt.h>
	129	#endif
	130
	131	#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1) /* logged with ports and addresses on each pass through tcp_output() */
	132	#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) /* NOTE(review): presumably the matching end code; its use is not visible in this part of the file */
	133	#define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) /* combined with DBG_FUNC_START / DBG_FUNC_END in tcp_output() */
	134
	135	int path_mtu_discovery = 1; /* tested in tcp_output() before TF_PMTUD is set */
	136	SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery,
	137	CTLFLAG_RW | CTLFLAG_LOCKED, &path_mtu_discovery, 1,
	138	"Enable Path MTU Discovery");
	139
	140	int ss_fltsz = 1;
	141	SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize,
	142	CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz, 1,
	143	"Slow start flight size");
	144
	145	int ss_fltsz_local = 8; /* starts with eight segments max */
	146	SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize,
	147	CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz_local, 1,
	148	"Slow start flight size for local networks");
	149
	150	int tcp_do_tso = 1; /* must be nonzero for tcp_output() to select TSO */
	151	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
	152	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
	153
	154	int tcp_ecn_outbound = 0;
	155	SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
	156	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
	157	"Initiate ECN for outbound connections");
	158
	159	int tcp_ecn_inbound = 0;
	160	SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
	161	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0,
	162	"Allow ECN negotiation for inbound connections");
	163
	164	int tcp_packet_chaining = 50;
	165	SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain,
	166	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining, 0,
	167	"Enable TCP output packet chaining");
	168
	169	int tcp_output_unlocked = 1;
	170	SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output,
	171	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked, 0,
	172	"Unlock TCP when sending packets down to IP");
	173
	174	int tcp_do_rfc3390 = 1;
	175	SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390,
	176	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3390, 1,
	177	"Calculate intial slowstart cwnd depending on MSS");
	178
	179	int tcp_min_iaj_win = MIN_IAJ_WIN;
	180	SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win,
	181	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_min_iaj_win, 1,
	182	"Minimum recv win based on inter-packet arrival jitter");
	183
	184	int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT;
	185	SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit,
	186	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_react_limit, 1,
	187	"Accumulated IAJ when receiver starts to react");
	188
	189	uint32_t tcp_do_autosendbuf = 1; /* send-buffer auto-sizing in tcp_output() runs only when this equals 1 */
	190	SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautosndbuf,
	191	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autosendbuf, 1,
	192	"Enable send socket buffer auto-tuning");
	193
	194	uint32_t tcp_autosndbuf_inc = 8 * 1024; /* step added to sb_hiwat when the send buffer is grown */
	195	SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufinc,
	196	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autosndbuf_inc, 1,
	197	"Increment in send socket bufffer size");
	198
	199	uint32_t tcp_autosndbuf_max = 512 * 1024; /* caps the size requested from sbreserve() during auto-sizing */
	200	SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufmax,
	201	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autosndbuf_max, 1,
	202	"Maximum send socket buffer size");
	203
	204	uint32_t tcp_prioritize_acks = 1;
	205	SYSCTL_INT(_net_inet_tcp, OID_AUTO, ack_prioritize,
	206	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_prioritize_acks, 1,
	207	"Prioritize pure acks");
	208
	209	uint32_t tcp_use_rtt_recvbg = 1;
	210	SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_recvbg,
	211	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_rtt_recvbg, 1,
	212	"Use RTT for bg recv algorithm");
	213
	214	uint32_t tcp_recv_throttle_minwin = 16 * 1024;
	215	SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_throttle_minwin,
	216	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recv_throttle_minwin, 1,
	217	"Minimum recv win for throttling");
	218
	219	int32_t tcp_enable_tlp = 1;
	220	SYSCTL_INT(_net_inet_tcp, OID_AUTO, enable_tlp,
	221	CTLFLAG_RW | CTLFLAG_LOCKED,
	222	&tcp_enable_tlp, 1, "Enable Tail loss probe");
	223
	224	static int32_t packchain_newlist = 0; /* NOTE(review): packet-chaining counter; its updates are not visible in this part of the file */
	225	static int32_t packchain_looped = 0; /* NOTE(review): packet-chaining counter; its updates are not visible in this part of the file */
	226	static int32_t packchain_sent = 0; /* bumped in tcp_output() each time a pending packet list is passed to tcp_ip_output() */
	227
	228	/* temporary: for testing */
	229	#if IPSEC
	230	extern int ipsec_bypass; /* when 0, tcp_output() calls ipsec_hdrsiz_tcp() to size the IPsec headers */
	231	#endif
	232
	233	extern int slowlink_wsize; /* window correction for slow links; caps sendwin when TF_SLOWLINK is set and this is > 0 */
	234	#if IPFIREWALL
	235	extern int fw_enable; /* firewall check for packet chaining; TSO is allowed when this is 0 or fw_bypass is set */
	236	extern int fw_bypass; /* firewall check: disable packet chaining if there are rules */
	237	#endif /* IPFIREWALL */
	238
	239	extern u_int32_t dlil_filter_disable_tso_count; /* must be 0 for tcp_output() to select TSO */
	240	extern u_int32_t kipf_count; /* must be 0 for tcp_output() to select TSO */
	241	extern int tcp_recv_bg;
	242
	243	static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int, /* called from tcp_output() with the socket, tcpcb, packet list and chain count */
	244	struct mbuf *, int, int, int32_t, boolean_t); /* then the IP options mbuf, the SO_DONTROUTE bit, a SACK-in-progress flag, an int32_t and the IPv6 indicator */
	245	static struct mbuf *tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th);
	246	static int tcp_recv_throttle(struct tcpcb *tp);
	247
	248	/*
	249	* Tcp output routine: figure out what should be sent and send it.
	250	*
	251	* Returns: 0 Success
	252	* EADDRNOTAVAIL
	253	* ENOBUFS
	254	* EMSGSIZE
	255	* EHOSTUNREACH
	256	* ENETDOWN
	257	* ip_output_list:ENOMEM
	258	* ip_output_list:EADDRNOTAVAIL
	259	* ip_output_list:ENETUNREACH
	260	* ip_output_list:EHOSTUNREACH
	261	* ip_output_list:EACCES
	262	* ip_output_list:EMSGSIZE
	263	* ip_output_list:ENOBUFS
	264	* ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
	265	* ip6_output_list:EINVAL
	266	* ip6_output_list:EOPNOTSUPP
	267	* ip6_output_list:EHOSTUNREACH
	268	* ip6_output_list:EADDRNOTAVAIL
	269	* ip6_output_list:ENETUNREACH
	270	* ip6_output_list:EMSGSIZE
	271	* ip6_output_list:ENOBUFS
	272	* ip6_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
	273	*/
	274	int
	275	tcp_output(struct tcpcb *tp)
	276	{
	277	struct inpcb *inp = tp->t_inpcb;
	278	struct socket *so = inp->inp_socket;
	279	int32_t len, recwin, sendwin, off;
	280	int flags, error;
	281	struct mbuf *m;
	282	struct ip *ip = NULL;
	283	struct ipovly *ipov = NULL;
	284	#if INET6
	285	struct ip6_hdr *ip6 = NULL;
	286	#endif /* INET6 */
	287	struct tcphdr *th;
	288	u_char opt[TCP_MAXOLEN];
	289	unsigned ipoptlen, optlen, hdrlen;
	290	int idle, sendalot, lost = 0;
	291	int i, sack_rxmit;
	292	int tso = 0;
	293	int sack_bytes_rxmt;
	294	struct sackhole *p;
	295	#if IPSEC
	296	unsigned ipsec_optlen = 0;
	297	#endif /* IPSEC */
	298	int idle_time = 0;
	299	struct mbuf *packetlist = NULL;
	300	struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
	301	#if INET6
	302	int isipv6 = inp->inp_vflag & INP_IPV6 ;
	303	#endif
	304	short packchain_listadd = 0;
	305	int so_options = so->so_options;
	306	struct rtentry *rt;
	307	u_int32_t basertt, svc_flags = 0, allocated_len;
	308	u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0;
	309	struct mbuf *mnext = NULL;
	310	int sackoptlen = 0;
	311	#if MPTCP
	312	unsigned int *dlenp = NULL;
	313	u_int8_t *finp = NULL;
	314	u_int32_t *sseqp = NULL;
	315	u_int64_t dss_val = 0;
	316	boolean_t mptcp_acknow = FALSE;
	317	boolean_t early_data_sent = FALSE;
	318	#endif /* MPTCP */
	319	boolean_t cell = FALSE;
	320	boolean_t wifi = FALSE;
	321	boolean_t wired = FALSE;
	322
	323	/*
	324	* Determine length of data that should be transmitted,
	325	* and flags that will be used.
	326	* If there is some data or critical controls (SYN, RST)
	327	* to send, then transmit; otherwise, investigate further.
	328	*/
	329	idle = (tp->t_flags & TF_LASTIDLE) \|\| (tp->snd_max == tp->snd_una);
	330
	331	/* Since idle_time is signed integer, the following integer subtraction
	332	* will take care of wrap around of tcp_now
	333	*/
	334	idle_time = tcp_now - tp->t_rcvtime;
	335	if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
	336	if (CC_ALGO(tp)->after_idle != NULL)
	337	CC_ALGO(tp)->after_idle(tp);
	338	tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
	339	}
	340	tp->t_flags &= ~TF_LASTIDLE;
	341	if (idle) {
	342	if (tp->t_flags & TF_MORETOCOME) {
	343	tp->t_flags \|= TF_LASTIDLE;
	344	idle = 0;
	345	}
	346	}
	347	#if MPTCP
	348	if (tp->t_mpflags & TMPF_RESET) {
	349	tcp_check_timer_state(tp);
	350	/*
	351	* Once a RST has been sent for an MPTCP subflow,
	352	* the subflow socket stays around until deleted.
	353	* No packets such as FINs must be sent after RST.
	354	*/
	355	return (0);
	356	}
	357	#endif /* MPTCP */
	358
	359	again:
	360	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_START, 0,0,0,0,0);
	361
	362	#if INET6
	363	if (isipv6) {
	364	KERNEL_DEBUG(DBG_LAYER_BEG,
	365	((inp->inp_fport << 16) \| inp->inp_lport),
	366	(((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) \|
	367	(inp->in6p_faddr.s6_addr16[0] & 0xffff)),
	368	sendalot,0,0);
	369	} else
	370	#endif
	371
	372	{
	373	KERNEL_DEBUG(DBG_LAYER_BEG,
	374	((inp->inp_fport << 16) \| inp->inp_lport),
	375	(((inp->inp_laddr.s_addr & 0xffff) << 16) \|
	376	(inp->inp_faddr.s_addr & 0xffff)),
	377	sendalot,0,0);
	378	}
	379	/*
	380	* If the route generation id changed, we need to check that our
	381	* local (source) IP address is still valid. If it isn't either
	382	* return error or silently do nothing (assuming the address will
	383	* come back before the TCP connection times out).
	384	*/
	385	rt = inp->inp_route.ro_rt;
	386	if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
	387	struct ifnet *ifp;
	388	struct in_ifaddr *ia = NULL;
	389	struct in6_ifaddr *ia6 = NULL;
	390	int found_srcaddr = 0;
	391
	392	/* disable multipages at the socket */
	393	somultipages(so, FALSE);
	394
	395	/* Disable TSO for the socket until we know more */
	396	tp->t_flags &= ~TF_TSO;
	397
	398	soif2kcl(so, FALSE);
	399
	400	if (isipv6) {
	401	ia6 = ifa_foraddr6(&inp->in6p_laddr);
	402	if (ia6 != NULL)
	403	found_srcaddr = 1;
	404	} else {
	405	ia = ifa_foraddr(inp->inp_laddr.s_addr);
	406	if (ia != NULL)
	407	found_srcaddr = 1;
	408	}
	409
	410	/* check that the source address is still valid */
	411	if (found_srcaddr == 0) {
	412	soevent(so,
	413	(SO_FILT_HINT_LOCKED \| SO_FILT_HINT_NOSRCADDR));
	414
	415	if (tp->t_state >= TCPS_CLOSE_WAIT) {
	416	tcp_drop(tp, EADDRNOTAVAIL);
	417	return(EADDRNOTAVAIL);
	418	}
	419
	420	/* Set retransmit timer if it wasn't set,
	421	* reset Persist timer and shift register as the
	422	* advertised peer window may not be valid anymore
	423	*/
	424
	425	if (!tp->t_timer[TCPT_REXMT]) {
	426	tp->t_timer[TCPT_REXMT] =
	427	OFFSET_FROM_START(tp, tp->t_rxtcur);
	428	if (tp->t_timer[TCPT_PERSIST]) {
	429	tp->t_timer[TCPT_PERSIST] = 0;
	430	tp->t_rxtshift = 0;
	431	tp->t_persist_stop = 0;
	432	tp->t_rxtstart = 0;
	433	}
	434	}
	435
	436	if (tp->t_pktlist_head != NULL)
	437	m_freem_list(tp->t_pktlist_head);
	438	TCP_PKTLIST_CLEAR(tp);
	439
	440	/* drop connection if source address isn't available */
	441	if (so->so_flags & SOF_NOADDRAVAIL) {
	442	tcp_drop(tp, EADDRNOTAVAIL);
	443	return(EADDRNOTAVAIL);
	444	} else {
	445	tcp_check_timer_state(tp);
	446	return(0); /* silently ignore, keep data in socket: address may be back */
	447	}
	448	}
	449	if (ia != NULL)
	450	IFA_REMREF(&ia->ia_ifa);
	451
	452	if (ia6 != NULL)
	453	IFA_REMREF(&ia6->ia_ifa);
	454
	455	/*
	456	* Address is still valid; check for multipages capability
	457	* again in case the outgoing interface has changed.
	458	*/
	459	RT_LOCK(rt);
	460	if ((ifp = rt->rt_ifp) != NULL) {
	461	somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
	462	tcp_set_tso(tp, ifp);
	463	soif2kcl(so,
	464	(ifp->if_eflags & IFEF_2KCL));
	465	}
	466	if (rt->rt_flags & RTF_UP)
	467	RT_GENID_SYNC(rt);
	468	/*
	469	* See if we should do MTU discovery. Don't do it if:
	470	* 1) it is disabled via the sysctl
	471	* 2) the route isn't up
	472	* 3) the MTU is locked (if it is, then discovery
	473	* has been disabled)
	474	*/
	475
	476	if (!path_mtu_discovery \|\| ((rt != NULL) &&
	477	(!(rt->rt_flags & RTF_UP) \|\|
	478	(rt->rt_rmx.rmx_locks & RTV_MTU))))
	479	tp->t_flags &= ~TF_PMTUD;
	480	else
	481	tp->t_flags \|= TF_PMTUD;
	482
	483	RT_UNLOCK(rt);
	484	}
	485
	486	if (rt != NULL) {
	487	cell = IFNET_IS_CELLULAR(rt->rt_ifp);
	488	wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
	489	wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
	490	}
	491
	492	/*
	493	* If we've recently taken a timeout, snd_max will be greater than
	494	* snd_nxt. There may be SACK information that allows us to avoid
	495	* resending already delivered data. Adjust snd_nxt accordingly.
	496	*/
	497	if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
	498	tcp_sack_adjust(tp);
	499	sendalot = 0;
	500	off = tp->snd_nxt - tp->snd_una;
	501	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
	502
	503	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
	504	sendwin = min(sendwin, slowlink_wsize);
	505
	506	flags = tcp_outflags[tp->t_state];
	507	/*
	508	* Send any SACK-generated retransmissions. If we're explicitly
	509	* trying to send out new data (when sendalot is 1), bypass this
	510	* function. If we retransmit in fast recovery mode, decrement
	511	* snd_cwnd, since we're replacing a (future) new transmission
	512	* with a retransmission now, and we previously incremented
	513	* snd_cwnd in tcp_input().
	514	*/
	515	/*
	516	* Still in sack recovery , reset rxmit flag to zero.
	517	*/
	518	sack_rxmit = 0;
	519	sack_bytes_rxmt = 0;
	520	len = 0;
	521	p = NULL;
	522	if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
	523	(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
	524	int32_t cwin;
	525
	526	cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
	527	if (cwin < 0)
	528	cwin = 0;
	529	/* Do not retransmit SACK segments beyond snd_recover */
	530	if (SEQ_GT(p->end, tp->snd_recover)) {
	531	/*
	532	* (At least) part of sack hole extends beyond
	533	* snd_recover. Check to see if we can rexmit data
	534	* for this hole.
	535	*/
	536	if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
	537	/*
	538	* Can't rexmit any more data for this hole.
	539	* That data will be rexmitted in the next
	540	* sack recovery episode, when snd_recover
	541	* moves past p->rxmit.
	542	*/
	543	p = NULL;
	544	goto after_sack_rexmit;
	545	} else
	546	/* Can rexmit part of the current hole */
	547	len = ((int32_t)min(cwin,
	548	tp->snd_recover - p->rxmit));
	549	} else {
	550	len = ((int32_t)min(cwin, p->end - p->rxmit));
	551	}
	552	if (len > 0) {
	553	off = p->rxmit - tp->snd_una;
	554	sack_rxmit = 1;
	555	sendalot = 1;
	556	tcpstat.tcps_sack_rexmits++;
	557	tcpstat.tcps_sack_rexmit_bytes +=
	558	min(len, tp->t_maxseg);
	559	if (nstat_collect) {
	560	nstat_route_tx(inp->inp_route.ro_rt, 1,
	561	min(len, tp->t_maxseg),
	562	NSTAT_TX_FLAG_RETRANSMIT);
	563	INP_ADD_STAT(inp, cell, wifi, wired,
	564	txpackets, 1);
	565	INP_ADD_STAT(inp, cell, wifi, wired,
	566	txbytes, min(len, tp->t_maxseg));
	567	tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg);
	568	}
	569	} else {
	570	len = 0;
	571	}
	572	}
	573	after_sack_rexmit:
	574	/*
	575	* Get standard flags, and add SYN or FIN if requested by 'hidden'
	576	* state flags.
	577	*/
	578	if (tp->t_flags & TF_NEEDFIN)
	579	flags \|= TH_FIN;
	580	if (tp->t_flags & TF_NEEDSYN)
	581	flags \|= TH_SYN;
	582
	583	/*
	584	* If in persist timeout with window of 0, send 1 byte.
	585	* Otherwise, if window is small but nonzero
	586	* and timer expired, we will send what we can
	587	* and go to transmit state.
	588	*/
	589	if (tp->t_flagsext & TF_FORCE) {
	590	if (sendwin == 0) {
	591	/*
	592	* If we still have some data to send, then
	593	* clear the FIN bit. Usually this would
	594	* happen below when it realizes that we
	595	* aren't sending all the data. However,
	596	* if we have exactly 1 byte of unsent data,
	597	* then it won't clear the FIN bit below,
	598	* and if we are in persist state, we wind
	599	* up sending the packet without recording
	600	* that we sent the FIN bit.
	601	*
	602	* We can't just blindly clear the FIN bit,
	603	* because if we don't have any more data
	604	* to send then the probe will be the FIN
	605	* itself.
	606	*/
	607	if (off < so->so_snd.sb_cc)
	608	flags &= ~TH_FIN;
	609	sendwin = 1;
	610	} else {
	611	tp->t_timer[TCPT_PERSIST] = 0;
	612	tp->t_rxtshift = 0;
	613	tp->t_rxtstart = 0;
	614	tp->t_persist_stop = 0;
	615	}
	616	}
	617
	618	/*
	619	* If snd_nxt == snd_max and we have transmitted a FIN, the
	620	* offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	621	* a negative length. This can also occur when TCP opens up
	622	* its congestion window while receiving additional duplicate
	623	* acks after fast-retransmit because TCP will reset snd_nxt
	624	* to snd_max after the fast-retransmit.
	625	*
	626	* In the normal retransmit-FIN-only case, however, snd_nxt will
	627	* be set to snd_una, the offset will be 0, and the length may
	628	* wind up 0.
	629	*
	630	* If sack_rxmit is true we are retransmitting from the scoreboard
	631	* in which case len is already set.
	632	*/
	633	if (sack_rxmit == 0) {
	634	if (sack_bytes_rxmt == 0)
	635	len = min(so->so_snd.sb_cc, sendwin) - off;
	636	else {
	637	int32_t cwin;
	638
	639	/*
	640	* We are inside of a SACK recovery episode and are
	641	* sending new data, having retransmitted all the
	642	* data possible in the scoreboard.
	643	*/
	644	len = min(so->so_snd.sb_cc, tp->snd_wnd)
	645	- off;
	646	/*
	647	* Don't remove this (len > 0) check !
	648	* We explicitly check for len > 0 here (although it
	649	* isn't really necessary), to work around a gcc
	650	* optimization issue - to force gcc to compute
	651	* len above. Without this check, the computation
	652	* of len is bungled by the optimizer.
	653	*/
	654	if (len > 0) {
	655	cwin = tp->snd_cwnd -
	656	(tp->snd_nxt - tp->sack_newdata) -
	657	sack_bytes_rxmt;
	658	if (cwin < 0)
	659	cwin = 0;
	660	len = imin(len, cwin);
	661	}
	662	else
	663	len = 0;
	664	}
	665	}
	666
	667	#if MPTCP
	668	if ((tp->t_mpflags & TMPF_FASTJOIN_SEND) &&
	669	(tp->t_state == TCPS_SYN_SENT) &&
	670	(!(tp->t_flags & TF_CLOSING)) &&
	671	(so->so_snd.sb_cc != 0) &&
	672	(tp->t_rxtshift == 0)) {
	673	flags &= ~TH_SYN;
	674	flags \|= TH_ACK;
	675	off = 0;
	676	len = min(so->so_snd.sb_cc, tp->t_maxseg);
	677	early_data_sent = TRUE;
	678	} else if (early_data_sent) {
	679	/* for now, we allow only one data segment to be sent */
	680	return (0);
	681	}
	682	#endif /* MPTCP */
	683	/*
	684	* Lop off SYN bit if it has already been sent. However, if this
	685	* is SYN-SENT state and if segment contains data and if we don't
	686	* know that foreign host supports TAO, suppress sending segment.
	687	*/
	688	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
	689	if (tp->t_state != TCPS_SYN_RECEIVED)
	690	flags &= ~TH_SYN;
	691	off--, len++;
	692	if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
	693	while (inp->inp_sndinprog_cnt == 0 &&
	694	tp->t_pktlist_head != NULL) {
	695	packetlist = tp->t_pktlist_head;
	696	packchain_listadd = tp->t_lastchain;
	697	packchain_sent++;
	698	TCP_PKTLIST_CLEAR(tp);
	699
	700	error = tcp_ip_output(so, tp, packetlist,
	701	packchain_listadd, tp_inp_options,
	702	(so_options & SO_DONTROUTE),
	703	(sack_rxmit \| (sack_bytes_rxmt != 0)), 0,
	704	#if INET6
	705	isipv6);
	706	#else /* INET6 */
	707	0);
	708	#endif /* !INET6 */
	709
	710
	711	}
	712
	713	/*
	714	* tcp was closed while we were in ip,
	715	* resume close
	716	*/
	717	if (inp->inp_sndinprog_cnt == 0 &&
	718	(tp->t_flags & TF_CLOSING)) {
	719	tp->t_flags &= ~TF_CLOSING;
	720	(void) tcp_close(tp);
	721	} else {
	722	tcp_check_timer_state(tp);
	723	}
	724	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END,
	725	0,0,0,0,0);
	726	return(0);
	727	}
	728	}
	729
	730	/*
	731	* Be careful not to send data and/or FIN on SYN segments.
	732	* This measure is needed to prevent interoperability problems
	733	* with not fully conformant TCP implementations.
	734	*/
	735	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
	736	len = 0;
	737	flags &= ~TH_FIN;
	738	}
	739
	740	/*
	741	* The check here used to be (len < 0). Some times len is zero
	742	* when the congestion window is closed and we need to check
	743	* if persist timer has to be set in that case. But don't set
	744	* persist until connection is established.
	745	*/
	746	if (len <= 0 && !(flags & TH_SYN)) {
	747	/*
	748	* If FIN has been sent but not acked,
	749	* but we haven't been called to retransmit,
	750	* len will be < 0. Otherwise, window shrank
	751	* after we sent into it. If window shrank to 0,
	752	* cancel pending retransmit, pull snd_nxt back
	753	* to (closed) window, and set the persist timer
	754	* if it isn't already going. If the window didn't
	755	* close completely, just wait for an ACK.
	756	*/
	757	len = 0;
	758	if (sendwin == 0) {
	759	tp->t_timer[TCPT_REXMT] = 0;
	760	tp->t_timer[TCPT_PTO] = 0;
	761	tp->t_rxtshift = 0;
	762	tp->t_rxtstart = 0;
	763	tp->snd_nxt = tp->snd_una;
	764	off = 0;
	765	if (tp->t_timer[TCPT_PERSIST] == 0)
	766	tcp_setpersist(tp);
	767	}
	768	}
	769
	770	/*
	771	* Automatic sizing of send socket buffer. Increase the send
	772	* socket buffer size if all of the following criteria are met
	773	* 1. the receiver has enough buffer space for this data
	774	* 2. send buffer is filled to 7/8th with data (so we actually
	775	* have data to make use of it);
	776	* 3. our send window (slow start and congestion controlled) is
	777	* larger than sent but unacknowledged data in send buffer.
	778	*/
	779	basertt = get_base_rtt(tp);
	780	if (tcp_do_autosendbuf == 1 &&
	781	!INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
	782	(so->so_snd.sb_flags & (SB_AUTOSIZE \| SB_TRIM)) == SB_AUTOSIZE &&
	783	tcp_cansbgrow(&so->so_snd)) {
	784	if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
	785	so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
	786	sendwin >= (so->so_snd.sb_cc -
	787	(tp->snd_nxt - tp->snd_una))) {
	788	/* Also increase the send buffer only if the
	789	* round-trip time is not increasing because we do
	790	* not want to contribute to latency by filling
	791	* buffers.
	792	* We also do not want to hold onto application's
	793	* old data for too long. Interactive applications
	794	* would rather discard old data.
	795	*/
	796	if (tp->t_rttcur <= (basertt + 25)) {
	797	if (sbreserve(&so->so_snd,
	798	min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
	799	tcp_autosndbuf_max)) == 1) {
	800	so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
	801	}
	802	} else {
	803	so->so_snd.sb_idealsize =
	804	max(tcp_sendspace, so->so_snd.sb_hiwat -
	805	(2 * tcp_autosndbuf_inc));
	806	so->so_snd.sb_flags \|= SB_TRIM;
	807	}
	808	}
	809	}
	810
	811	/*
	812	* Truncate to the maximum segment length or enable TCP Segmentation
	813	* Offloading (if supported by hardware) and ensure that FIN is removed
	814	* if the length no longer contains the last data byte.
	815	*
	816	* TSO may only be used if we are in a pure bulk sending state.
	817	* The presence of TCP-MD5, SACK retransmits, SACK advertizements,
	818	* ipfw rules and IP options, as well as disabling hardware checksum
	819	* offload prevent using TSO. With TSO the TCP header is the same
	820	* (except for the sequence number) for all generated packets. This
	821	* makes it impossible to transmit any options which vary per generated
	822	* segment or packet.
	823	*
	824	* The length of TSO bursts is limited to TCP_MAXWIN. That limit and
	825	* removal of FIN (if not already caught here) are handled later after
	826	* the exact length of the TCP options is known.
	827	*/
	828	#if IPSEC
	829	/*
	830	* Pre-calculate here as we save another lookup into the darknesses
	831	* of IPsec that way and can actually decide if TSO is ok.
	832	*/
	833	if (ipsec_bypass == 0)
	834	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
	835	#endif
	836	if (len > tp->t_maxseg) {
	837	if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
	838	ip_use_randomid && kipf_count == 0 &&
	839	dlil_filter_disable_tso_count == 0 &&
	840	tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
	841	sack_bytes_rxmt == 0 &&
	842	inp->inp_options == NULL &&
	843	inp->in6p_options == NULL
	844	#if IPSEC
	845	&& ipsec_optlen == 0
	846	#endif
	847	#if IPFIREWALL
	848	&& (fw_enable == 0 \|\| fw_bypass)
	849	#endif
	850	) {
	851	tso = 1;
	852	sendalot = 0;
	853	} else {
	854	len = tp->t_maxseg;
	855	sendalot = 1;
	856	tso = 0;
	857	}
	858	}
	859
	860	/* Send one segment or less as a tail loss probe */
	861	if (tp->t_flagsext & TF_SENT_TLPROBE) {
	862	len = min(len, tp->t_maxseg);
	863	sendalot = 0;
	864	tso = 0;
	865	}
	866
	867	#if MPTCP
	868	if ((so->so_flags & SOF_MP_SUBFLOW) &&
	869	!(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
	870	int newlen = len;
	871	if (!(tp->t_mpflags & TMPF_PREESTABLISHED) &&
	872	(tp->t_state > TCPS_CLOSED) &&
	873	((tp->t_mpflags & TMPF_SND_MPPRIO) \|\|
	874	(tp->t_mpflags & TMPF_SND_REM_ADDR) \|\|
	875	(tp->t_mpflags & TMPF_SND_MPFAIL))) {
	876	if (len > 0) {
	877	len = 0;
	878	}
	879	sendalot = 1;
	880	mptcp_acknow = TRUE;
	881	} else {
	882	mptcp_acknow = FALSE;
	883	}
	884	/*
	885	* The contiguous bytes in the subflow socket buffer can be
	886	* discontiguous at the MPTCP level. Since only one DSS
	887	* option can be sent in one packet, reduce length to match
	888	* the contiguous MPTCP level. Set sendalot to send remainder.
	889	*/
	890	if (len > 0)
	891	newlen = mptcp_adj_sendlen(so, off, len);
	892	if (newlen < len) {
	893	len = newlen;
	894	sendalot = 1;
	895	}
	896	}
	897	#endif /* MPTCP */
	898
	899	/*
	900	* If the socket is capable of doing unordered send,
	901	* pull the amount of data that can be sent from the
	902	* unordered priority queues to the serial queue in
	903	* the socket buffer. If bytes are not yet available
	904	* in the highest priority message, we may not be able
	905	* to send any new data.
	906	*/
	907	if (so->so_flags & SOF_ENABLE_MSGS) {
	908	if ((off + len) >
	909	so->so_msg_state->msg_serial_bytes) {
	910	sbpull_unordered_data(so, off, len);
	911
	912	/* check if len needs to be modified */
	913	if ((off + len) >
	914	so->so_msg_state->msg_serial_bytes) {
	915	len = so->so_msg_state->msg_serial_bytes - off;
	916	if (len <= 0) {
	917	len = 0;
	918	tcpstat.tcps_msg_sndwaithipri++;
	919	}
	920	}
	921	}
	922	}
	923
	924	if (sack_rxmit) {
	925	if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
	926	flags &= ~TH_FIN;
	927	} else {
	928	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
	929	flags &= ~TH_FIN;
	930	}
	931
	932	recwin = tcp_sbspace(tp);
	933
	934	/*
	935	* Sender silly window avoidance. We transmit under the following
	936	* conditions when len is non-zero:
	937	*
	938	* - we've timed out (e.g. persist timer)
	939	* - we need to retransmit
	940	* - We have a full segment (or more with TSO)
	941	* - This is the last buffer in a write()/send() and we are
	942	* either idle or running NODELAY
	943	* - we have more than 1/2 the maximum send window's worth of
	944	* data (receiver may have limited the window size)
	945	*/
	946	if (len) {
	947	if (tp->t_flagsext & TF_FORCE)
	948	goto send;
	949	if (SEQ_LT(tp->snd_nxt, tp->snd_max))
	950	goto send;
	951	if (sack_rxmit)
	952	goto send;
	953
	954	/*
	955	* Send new data on the connection only if it is
	956	* not flow controlled
	957	*/
	958	if (!INP_WAIT_FOR_IF_FEEDBACK(inp) \|\|
	959	tp->t_state != TCPS_ESTABLISHED) {
	960	if (len >= tp->t_maxseg)
	961	goto send;
	962	if (!(tp->t_flags & TF_MORETOCOME) &&
	963	(idle \|\| tp->t_flags & TF_NODELAY \|\|
	964	tp->t_flags & TF_MAXSEGSNT \|\|
	965	ALLOW_LIMITED_TRANSMIT(tp)) &&
	966	(tp->t_flags & TF_NOPUSH) == 0 &&
	967	len + off >= so->so_snd.sb_cc)
	968	goto send;
	969	if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
	970	goto send;
	971	} else {
	972	tcpstat.tcps_fcholdpacket++;
	973	}
	974	}
	975
	976	/*
	977	* Compare available window to amount of window
	978	* known to peer (as advertised window less
	979	* next expected input). If the difference is at least two
	980	* max size segments, or at least 25% of the maximum possible
	981	* window, then want to send a window update to peer.
	982	* Skip this if the connection is in T/TCP half-open state.
	983	*/
	984	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
	985	/*
	986	* "adv" is the amount we can increase the window,
	987	* taking into account that we are limited by
	988	* TCP_MAXWIN << tp->rcv_scale.
	989	*/
	990	int32_t adv, oldwin = 0;
	991	adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
	992	(tp->rcv_adv - tp->rcv_nxt);
	993
	994	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
	995	oldwin = tp->rcv_adv - tp->rcv_nxt;
	996
	997	if (adv >= (int32_t) (2 * tp->t_maxseg)) {
	998	/*
	999	* Update only if the resulting scaled value of
	1000	* the window changed, or if there is a change in
	1001	* the sequence since the last ack. This avoids
	1002	* what appears as dupe ACKS (see rdar://5640997)
	1003	*
	1004	* If streaming is detected avoid sending too many
	1005	* window updates. We will depend on the delack
	1006	* timer to send a window update when needed.
	1007	*/
	1008	if (!(tp->t_flags & TF_STRETCHACK) &&
	1009	(tp->last_ack_sent != tp->rcv_nxt \|\|
	1010	((oldwin + adv) >> tp->rcv_scale) >
	1011	(oldwin >> tp->rcv_scale))) {
	1012	goto send;
	1013	}
	1014
	1015	/*
	1016	* Make sure that the delayed ack timer is set if
	1017	* we delayed sending a window update because of
	1018	* streaming detection.
	1019	*/
	1020	if ((tp->t_flags & TF_STRETCHACK) &&
	1021	!(tp->t_flags & TF_DELACK)) {
	1022	tp->t_flags \|= TF_DELACK;
	1023	tp->t_timer[TCPT_DELACK] =
	1024	OFFSET_FROM_START(tp, tcp_delack);
	1025	}
	1026	}
	1027	if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat)
	1028	goto send;
	1029	}
	1030
	1031	/*
	1032	* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
	1033	* is also a catch-all for the retransmit timer timeout case.
	1034	*/
	1035	if (tp->t_flags & TF_ACKNOW)
	1036	goto send;
	1037	if ((flags & TH_RST) \|\|
	1038	((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
	1039	goto send;
	1040	if (SEQ_GT(tp->snd_up, tp->snd_una))
	1041	goto send;
	1042	#if MPTCP
	1043	if (mptcp_acknow)
	1044	goto send;
	1045	#endif /* MPTCP */
	1046	/*
	1047	* If our state indicates that FIN should be sent
	1048	* and we have not yet done so, then we need to send.
	1049	*/
	1050	if ((flags & TH_FIN) &&
	1051	(!(tp->t_flags & TF_SENTFIN) \|\| tp->snd_nxt == tp->snd_una))
	1052	goto send;
	1053	/*
	1054	* In SACK, it is possible for tcp_output to fail to send a segment
	1055	* after the retransmission timer has been turned off. Make sure
	1056	* that the retransmission timer is set.
	1057	*/
	1058	if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
	1059	SEQ_GT(tp->snd_max, tp->snd_una) &&
	1060	tp->t_timer[TCPT_REXMT] == 0 &&
	1061	tp->t_timer[TCPT_PERSIST] == 0) {
	1062	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
	1063	tp->t_rxtcur);
	1064	goto just_return;
	1065	}
	1066	/*
	1067	* TCP window updates are not reliable, rather a polling protocol
	1068	* using ``persist'' packets is used to ensure receipt of window
	1069	* updates. The three ``states'' for the output side are:
	1070	* idle not doing retransmits or persists
	1071	* persisting to move a small or zero window
	1072	* (re)transmitting and thereby not persisting
	1073	*
	1074	* tp->t_timer[TCPT_PERSIST]
	1075	* is set when we are in persist state.
	1076	* TF_FORCE (in tp->t_flagsext)
	1077	* is set when we are called to send a persist packet.
	1078	* tp->t_timer[TCPT_REXMT]
	1079	* is set when we are retransmitting
	1080	* The output side is idle when both timers are zero.
	1081	*
	1082	* If send window is too small, there is data to transmit, and no
	1083	* retransmit or persist is pending, then go to persist state.
	1084	* If nothing happens soon, send when timer expires:
	1085	* if window is nonzero, transmit what we can,
	1086	* otherwise force out a byte.
	1087	*/
	1088	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
	1089	tp->t_timer[TCPT_PERSIST] == 0) {
	1090	tp->t_rxtshift = 0;
	1091	tp->t_rxtstart = 0;
	1092	tcp_setpersist(tp);
	1093	}
	1094	just_return:
	1095	/*
	1096	* If there is no reason to send a segment, just return,
	1097	* but if there are some packets left in the packet list, send them now.
	1098	*/
	1099	while (inp->inp_sndinprog_cnt == 0 &&
	1100	tp->t_pktlist_head != NULL) {
	1101	packetlist = tp->t_pktlist_head;
	1102	packchain_listadd = tp->t_lastchain;
	1103	packchain_sent++;
	1104	TCP_PKTLIST_CLEAR(tp);
	1105
	1106	error = tcp_ip_output(so, tp, packetlist,
	1107	packchain_listadd,
	1108	tp_inp_options, (so_options & SO_DONTROUTE),
	1109	(sack_rxmit \| (sack_bytes_rxmt != 0)), recwin,
	1110	#if INET6
	1111	isipv6);
	1112	#else /* INET6 */
	1113	0);
	1114	#endif /* !INET6 */
	1115	}
	1116	/* tcp was closed while we were in ip; resume close */
	1117	if (inp->inp_sndinprog_cnt == 0 &&
	1118	(tp->t_flags & TF_CLOSING)) {
	1119	tp->t_flags &= ~TF_CLOSING;
	1120	(void) tcp_close(tp);
	1121	} else {
	1122	tcp_check_timer_state(tp);
	1123	}
	1124	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	1125	return (0);
	1126
	1127	send:
	1128	/*
	1129	* Set TF_MAXSEGSNT flag if the segment size is greater than or
	1130	* equal to the max segment size; clear it otherwise.
	1131	*/
	1132	if (len > 0) {
	1133	if (len >= tp->t_maxseg)
	1134	tp->t_flags \|= TF_MAXSEGSNT;
	1135	else
	1136	tp->t_flags &= ~TF_MAXSEGSNT;
	1137	}
	1138	/*
	1139	* Before ESTABLISHED, force sending of initial options
	1140	* unless TCP set not to do any options.
	1141	* NOTE: we assume that the IP/TCP header plus TCP options
	1142	* always fit in a single mbuf, leaving room for a maximum
	1143	* link header, i.e.
	1144	* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	1145	*/
	1146	optlen = 0;
	1147	#if INET6
	1148	if (isipv6)
	1149	hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	1150	else
	1151	#endif
	1152	hdrlen = sizeof (struct tcpiphdr);
	1153	if (flags & TH_SYN) {
	1154	tp->snd_nxt = tp->iss;
	1155	if ((tp->t_flags & TF_NOOPT) == 0) {
	1156	u_short mss;
	1157
	1158	opt[0] = TCPOPT_MAXSEG;
	1159	opt[1] = TCPOLEN_MAXSEG;
	1160	mss = htons((u_short) tcp_mssopt(tp));
	1161	(void)memcpy(opt + 2, &mss, sizeof(mss));
	1162	optlen = TCPOLEN_MAXSEG;
	1163
	1164	if ((tp->t_flags & TF_REQ_SCALE) &&
	1165	((flags & TH_ACK) == 0 \|\|
	1166	(tp->t_flags & TF_RCVD_SCALE))) {
	1167	((u_int32_t )(void *)(opt + optlen)) = htonl(
	1168	TCPOPT_NOP << 24 \|
	1169	TCPOPT_WINDOW << 16 \|
	1170	TCPOLEN_WINDOW << 8 \|
	1171	tp->request_r_scale);
	1172	optlen += 4;
	1173	}
	1174	#if MPTCP
	1175	if (mptcp_enable) {
	1176	optlen = mptcp_setup_syn_opts(so, flags, opt,
	1177	optlen);
	1178	}
	1179	#endif /* MPTCP */
	1180	}
	1181	}
	1182
	1183	/*
	1184	* RFC 3168 states that:
	1185	* - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
	1186	* to handle the TCP ECE flag, even if you also later send a
	1187	* non-ECN-setup SYN/SYN-ACK.
	1188	* - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
	1189	* the ip ECT flag.
	1190	*
	1191	* It is not clear how the ECE flag would ever be set if you never
	1192	* set the IP ECT flag on outbound packets. All the same, we use
	1193	* the TE_SETUPSENT to indicate that we have committed to handling
	1194	* the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
	1195	* whether or not we should set the IP ECT flag on outbound packet
	1196	*
	1197	* For a SYN-ACK, send an ECN setup SYN-ACK
	1198	*/
	1199	if ((tcp_ecn_inbound \|\| (tp->t_flags & TF_ENABLE_ECN))
	1200	&& (flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK)) {
	1201	if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
	1202	if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
	1203	/* Setting TH_ECE makes this an ECN-setup SYN-ACK */
	1204	flags \|= TH_ECE;
	1205
	1206	/*
	1207	* Record that we sent the ECN-setup and
	1208	* default to setting IP ECT.
	1209	*/
	1210	tp->ecn_flags \|= (TE_SETUPSENT\|TE_SENDIPECT);
	1211	tcpstat.tcps_ecn_setup++;
	1212	} else {
	1213	/*
	1214	* We sent an ECN-setup SYN-ACK but it was
	1215	* dropped. Fallback to non-ECN-setup
	1216	* SYN-ACK and clear flag to indicate that
	1217	* we should not send data with IP ECT set
	1218	*
	1219	* Pretend we didn't receive an
	1220	* ECN-setup SYN.
	1221	*/
	1222	tp->ecn_flags &= ~TE_SETUPRECEIVED;
	1223	/*
	1224	* We already incremented the counter
	1225	* assuming that the ECN setup will
	1226	* succeed. Decrementing here to
	1227	* correct it.
	1228	*/
	1229	tcpstat.tcps_ecn_setup--;
	1230	}
	1231	}
	1232	} else if ((tcp_ecn_outbound \|\| (tp->t_flags & TF_ENABLE_ECN))
	1233	&& (flags & (TH_SYN \| TH_ACK)) == TH_SYN) {
	1234	if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
	1235	/* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
	1236	flags \|= (TH_ECE \| TH_CWR);
	1237
	1238	/*
	1239	* Record that we sent the ECN-setup and default to
	1240	* setting IP ECT.
	1241	*/
	1242	tp->ecn_flags \|= (TE_SETUPSENT \| TE_SENDIPECT);
	1243	} else {
	1244	/*
	1245	* We sent an ECN-setup SYN but it was dropped.
	1246	* Fall back to no ECN and clear flag indicating
	1247	* we should send data with IP ECT set.
	1248	*/
	1249	tp->ecn_flags &= ~TE_SENDIPECT;
	1250	}
	1251	}
	1252
	1253	/*
	1254	* Check if we should set the TCP CWR flag.
	1255	* CWR flag is sent when we reduced the congestion window because
	1256	* we received a TCP ECE or we performed a fast retransmit. We
	1257	* never set the CWR flag on retransmitted packets. We only set
	1258	* the CWR flag on data packets. Pure acks don't have this set.
	1259	*/
	1260	if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
	1261	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
	1262	flags \|= TH_CWR;
	1263	tp->ecn_flags &= ~TE_SENDCWR;
	1264	tcpstat.tcps_sent_cwr++;
	1265	}
	1266
	1267	/*
	1268	* Check if we should set the TCP ECE flag.
	1269	*/
	1270	if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
	1271	flags \|= TH_ECE;
	1272	tcpstat.tcps_sent_ece++;
	1273	}
	1274
	1275	/*
	1276	* Send a timestamp and echo-reply if this is a SYN and our side
	1277	* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	1278	* and our peer have sent timestamps in our SYN's.
	1279	*/
	1280	if ((tp->t_flags & (TF_REQ_TSTMP\|TF_NOOPT)) == TF_REQ_TSTMP &&
	1281	(flags & TH_RST) == 0 &&
	1282	((flags & TH_ACK) == 0 \|\|
	1283	(tp->t_flags & TF_RCVD_TSTMP))) {
	1284	u_int32_t lp = (u_int32_t )(void *)(opt + optlen);
	1285
	1286	/* Form timestamp option as shown in appendix A of RFC 1323. */
	1287	*lp++ = htonl(TCPOPT_TSTAMP_HDR);
	1288	*lp++ = htonl(tcp_now);
	1289	*lp = htonl(tp->ts_recent);
	1290	optlen += TCPOLEN_TSTAMP_APPA;
	1291	}
	1292
	1293	/* Note the timestamp for receive buffer autosizing */
	1294	if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
	1295	tp->rfbuf_ts = tcp_now;
	1296
	1297	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
	1298	/*
	1299	* Tack on the SACK permitted option last.
	1300	* And do padding of options after tacking this on.
	1301	* This is because if MSS, TS, WinScale and Signatures are
	1302	* all present, we have just 2 bytes left for the SACK
	1303	* permitted option, which is just enough.
	1304	*/
	1305	/*
	1306	* If this is the first SYN of connection (not a SYN
	1307	* ACK), include SACK permitted option. If this is a
	1308	* SYN ACK, include SACK permitted option if peer has
	1309	* already done so. This is only for active connect,
	1310	* since the syncache takes care of the passive connect.
	1311	*/
	1312	if ((flags & TH_SYN) &&
	1313	(!(flags & TH_ACK) \|\| (tp->t_flags & TF_SACK_PERMIT))) {
	1314	u_char *bp;
	1315	bp = (u_char *)opt + optlen;
	1316
	1317	*bp++ = TCPOPT_SACK_PERMITTED;
	1318	*bp++ = TCPOLEN_SACK_PERMITTED;
	1319	optlen += TCPOLEN_SACK_PERMITTED;
	1320	}
	1321	}
	1322	#if MPTCP
	1323	if (so->so_flags & SOF_MP_SUBFLOW) {
	1324	/*
	1325	* It's important to piggyback acks with data as ack only packets
	1326	* may get lost and data packets that don't send Data ACKs
	1327	* still advance the subflow level ACK and therefore make it
	1328	* hard for the remote end to recover in low cwnd situations.
	1329	*/
	1330	if (len != 0) {
	1331	tp->t_mpflags \|= (TMPF_SEND_DSN \|
	1332	TMPF_MPTCP_ACKNOW);
	1333	} else {
	1334	tp->t_mpflags \|= TMPF_MPTCP_ACKNOW;
	1335	}
	1336	optlen = mptcp_setup_opts(tp, off, &opt[0], optlen, flags,
	1337	len, &dlenp, &finp, &dss_val, &sseqp, &mptcp_acknow);
	1338	tp->t_mpflags &= ~TMPF_SEND_DSN;
	1339	}
	1340	#endif /* MPTCP */
	1341
	1342	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
	1343	/*
	1344	* Send SACKs if necessary. This should be the last
	1345	* option processed. Only as many SACKs are sent as
	1346	* are permitted by the maximum options size.
	1347	*
	1348	* In general, SACK blocks consume 8*n+2 bytes.
	1349	* So a full size SACK blocks option is 34 bytes
	1350	* (to generate 4 SACK blocks). At a minimum,
	1351	* we need 10 bytes (to generate 1 SACK block).
	1352	* If TCP Timestamps (12 bytes) and TCP Signatures
	1353	* (18 bytes) are both present, we'll just have
	1354	* 10 bytes for SACK options 40 - (12 + 18).
	1355	*/
	1356	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
	1357	(tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
	1358	MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
	1359	int nsack, padlen;
	1360	u_char bp = (u_char )opt + optlen;
	1361	u_int32_t *lp;
	1362
	1363	nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
	1364	nsack = min(nsack, tp->rcv_numsacks);
	1365	sackoptlen = (2 + nsack * TCPOLEN_SACK);
	1366
	1367	/*
	1368	* First we need to pad options so that the
	1369	* SACK blocks can start at a 4-byte boundary
	1370	* (sack option and length are at a 2 byte offset).
	1371	*/
	1372	padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
	1373	optlen += padlen;
	1374	while (padlen-- > 0)
	1375	*bp++ = TCPOPT_NOP;
	1376
	1377	tcpstat.tcps_sack_send_blocks++;
	1378	*bp++ = TCPOPT_SACK;
	1379	*bp++ = sackoptlen;
	1380	lp = (u_int32_t )(void )bp;
	1381	for (i = 0; i < nsack; i++) {
	1382	struct sackblk sack = tp->sackblks[i];
	1383	*lp++ = htonl(sack.start);
	1384	*lp++ = htonl(sack.end);
	1385	}
	1386	optlen += sackoptlen;
	1387	}
	1388	}
	1389
	1390	/* Pad TCP options to a 4 byte boundary */
	1391	if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
	1392	int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
	1393	u_char bp = (u_char )opt + optlen;
	1394
	1395	optlen += pad;
	1396	while (pad) {
	1397	*bp++ = TCPOPT_EOL;
	1398	pad--;
	1399	}
	1400	}
	1401
	1402	hdrlen += optlen;
	1403
	1404	#if INET6
	1405	if (isipv6)
	1406	ipoptlen = ip6_optlen(inp);
	1407	else
	1408	#endif
	1409	{
	1410	if (tp_inp_options) {
	1411	ipoptlen = tp_inp_options->m_len -
	1412	offsetof(struct ipoption, ipopt_list);
	1413	} else {
	1414	ipoptlen = 0;
	1415	}
	1416	}
	1417	#if IPSEC
	1418	ipoptlen += ipsec_optlen;
	1419	#endif
	1420
	1421	/*
	1422	* Adjust data length if insertion of options will
	1423	* bump the packet length beyond the t_maxopd length.
	1424	* Clear the FIN bit because we cut off the tail of
	1425	* the segment.
	1426	*
	1427	* When doing TSO limit a burst to TCP_MAXWIN minus the
	1428	* IP, TCP and Options length to keep ip->ip_len from
	1429	* overflowing. Prevent the last segment from being
	1430	* fractional thus making them all equal sized and set
	1431	* the flag to continue sending. TSO is disabled when
	1432	* IP options or IPSEC are present.
	1433	*/
	1434	if (len + optlen + ipoptlen > tp->t_maxopd) {
	1435	/*
	1436	* If there is still more to send,
	1437	* don't close the connection.
	1438	*/
	1439	flags &= ~TH_FIN;
	1440	if (tso) {
	1441	int32_t tso_maxlen;
	1442
	1443	tso_maxlen = tp->tso_max_segment_size ?
	1444	tp->tso_max_segment_size : TCP_MAXWIN;
	1445
	1446	if (len > tso_maxlen - hdrlen - optlen) {
	1447	len = tso_maxlen - hdrlen - optlen;
	1448	len = len - (len % (tp->t_maxopd - optlen));
	1449	sendalot = 1;
	1450	} else if (tp->t_flags & TF_NEEDFIN) {
	1451	sendalot = 1;
	1452	}
	1453	} else {
	1454	len = tp->t_maxopd - optlen - ipoptlen;
	1455	sendalot = 1;
	1456	}
	1457	}
	1458	#if MPTCP
	1459	/* Reduce the length in the DSS option to len, if it is greater than len */
	1460	if (dlenp) {
	1461	/*
	1462	* To test this path without SACK, artificially
	1463	* decrement len with something like
	1464	* if (len > 10)
	1465	len -= 10;
	1466	*/
	1467	if (ntohs(*dlenp) > len) {
	1468	*dlenp = htons(len);
	1469	/* Unset the FIN flag, if len was adjusted */
	1470	if (finp) {
	1471	*finp &= ~MDSS_F;
	1472	}
	1473	sendalot = 1;
	1474	}
	1475	}
	1476	#endif /* MPTCP */
	1477
	1478	if (max_linkhdr + hdrlen > MCLBYTES)
	1479	panic("tcphdr too big");
	1480
	1481	/* Check if there is enough data in the send socket
	1482	* buffer to start measuring bw
	1483	*/
	1484	if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
	1485	(tp->t_bwmeas != NULL) &&
	1486	(tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0 &&
	1487	(so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) >=
	1488	tp->t_bwmeas->bw_minsize) {
	1489	tp->t_bwmeas->bw_size = min(
	1490	(so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)),
	1491	tp->t_bwmeas->bw_maxsize);
	1492	tp->t_flagsext \|= TF_BWMEAS_INPROGRESS;
	1493	tp->t_bwmeas->bw_start = tp->snd_max;
	1494	tp->t_bwmeas->bw_ts = tcp_now;
	1495	}
	1496
	1497	VERIFY(inp->inp_flowhash != 0);
	1498	/*
	1499	* Grab a header mbuf, attaching a copy of data to
	1500	* be transmitted, and initialize the header from
	1501	* the template for sends on this connection.
	1502	*/
	1503	if (len) {
	1504	if ((tp->t_flagsext & TF_FORCE) && len == 1)
	1505	tcpstat.tcps_sndprobe++;
	1506	else if (SEQ_LT(tp->snd_nxt, tp->snd_max) \|\| sack_rxmit) {
	1507	tcpstat.tcps_sndrexmitpack++;
	1508	tcpstat.tcps_sndrexmitbyte += len;
	1509	if (nstat_collect) {
	1510	nstat_route_tx(inp->inp_route.ro_rt, 1,
	1511	len, NSTAT_TX_FLAG_RETRANSMIT);
	1512	INP_ADD_STAT(inp, cell, wifi, wired,
	1513	txpackets, 1);
	1514	INP_ADD_STAT(inp, cell, wifi, wired,
	1515	txbytes, len);
	1516	tp->t_stat.txretransmitbytes += len;
	1517	}
	1518	} else {
	1519	tcpstat.tcps_sndpack++;
	1520	tcpstat.tcps_sndbyte += len;
	1521
	1522	if (nstat_collect) {
	1523	INP_ADD_STAT(inp, cell, wifi, wired,
	1524	txpackets, 1);
	1525	INP_ADD_STAT(inp, cell, wifi, wired,
	1526	txbytes, len);
	1527	}
	1528	}
	1529	#if MPTCP
	1530	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
	1531	tcpstat.tcps_mp_sndpacks++;
	1532	tcpstat.tcps_mp_sndbytes += len;
	1533	}
	1534	#endif /* MPTCP */
	1535	/*
	1536	* try to use the new interface that allocates all
	1537	* the necessary mbuf hdrs under 1 mbuf lock and
	1538	* avoids rescanning the socket mbuf list if
	1539	* certain conditions are met. This routine can't
	1540	* be used in the following cases...
	1541	* 1) the protocol headers exceed the capacity of
	1542	* of a single mbuf header's data area (no cluster attached)
	1543	* 2) the length of the data being transmitted plus
	1544	* the protocol headers fits into a single mbuf header's
	1545	* data area (no cluster attached)
	1546	*/
	1547	m = NULL;
	1548
	1549	/* minimum length we are going to allocate */
	1550	allocated_len = MHLEN;
	1551	if (MHLEN < hdrlen + max_linkhdr) {
	1552	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	1553	if (m == NULL) {
	1554	error = ENOBUFS;
	1555	goto out;
	1556	}
	1557	MCLGET(m, M_DONTWAIT);
	1558	if ((m->m_flags & M_EXT) == 0) {
	1559	m_freem(m);
	1560	error = ENOBUFS;
	1561	goto out;
	1562	}
	1563	m->m_data += max_linkhdr;
	1564	m->m_len = hdrlen;
	1565	allocated_len = MCLBYTES;
	1566	}
	1567	if (len <= allocated_len - hdrlen - max_linkhdr) {
	1568	if (m == NULL) {
	1569	VERIFY(allocated_len <= MHLEN);
	1570	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	1571	if (m == NULL) {
	1572	error = ENOBUFS;
	1573	goto out;
	1574	}
	1575	m->m_data += max_linkhdr;
	1576	m->m_len = hdrlen;
	1577	}
	1578	/* makes sure we still have data left to be sent at this point */
	1579	if (so->so_snd.sb_mb == NULL \|\| off < 0) {
	1580	if (m != NULL) m_freem(m);
	1581	error = 0; /* should we return an error? */
	1582	goto out;
	1583	}
	1584	m_copydata(so->so_snd.sb_mb, off, (int) len,
	1585	mtod(m, caddr_t) + hdrlen);
	1586	m->m_len += len;
	1587	} else {
	1588	uint32_t copymode;
	1589	/*
	1590	* Retain packet header metadata at the socket
	1591	* buffer if this is an MPTCP subflow,
	1592	* otherwise move it.
	1593	*/
	1594	copymode = M_COPYM_MOVE_HDR;
	1595	#if MPTCP
	1596	if (so->so_flags & SOF_MP_SUBFLOW) {
	1597	copymode = M_COPYM_NOOP_HDR;
	1598	}
	1599	#endif /* MPTCP */
	1600	if (m != NULL) {
	1601	m->m_next = m_copym_mode(so->so_snd.sb_mb,
	1602	off, (int)len, M_DONTWAIT, copymode);
	1603	if (m->m_next == NULL) {
	1604	(void) m_free(m);
	1605	error = ENOBUFS;
	1606	goto out;
	1607	}
	1608	} else {
	1609	/*
	1610	* make sure we still have data left
	1611	* to be sent at this point
	1612	*/
	1613	if (so->so_snd.sb_mb == NULL) {
	1614	error = 0; /* should we return an error? */
	1615	goto out;
	1616	}
	1617
	1618	/*
	1619	* m_copym_with_hdrs will always return the
	1620	* last mbuf pointer and the offset into it that
	1621	* it acted on to fulfill the current request,
	1622	* whether a valid 'hint' was passed in or not.
	1623	*/
	1624	if ((m = m_copym_with_hdrs(so->so_snd.sb_mb,
	1625	off, len, M_DONTWAIT, NULL, NULL,
	1626	copymode)) == NULL) {
	1627	error = ENOBUFS;
	1628	goto out;
	1629	}
	1630	m->m_data += max_linkhdr;
	1631	m->m_len = hdrlen;
	1632	}
	1633	}
	1634	/*
	1635	* If we're sending everything we've got, set PUSH.
	1636	* (This will keep happy those implementations which only
	1637	* give data to the user when a buffer fills or
	1638	* a PUSH comes in.)
	1639	*/
	1640	if (off + len == so->so_snd.sb_cc)
	1641	flags \|= TH_PUSH;
	1642	} else {
	1643	if (tp->t_flags & TF_ACKNOW)
	1644	tcpstat.tcps_sndacks++;
	1645	else if (flags & (TH_SYN\|TH_FIN\|TH_RST))
	1646	tcpstat.tcps_sndctrl++;
	1647	else if (SEQ_GT(tp->snd_up, tp->snd_una))
	1648	tcpstat.tcps_sndurg++;
	1649	else
	1650	tcpstat.tcps_sndwinup++;
	1651
	1652	MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
	1653	if (m == NULL) {
	1654	error = ENOBUFS;
	1655	goto out;
	1656	}
	1657	if (MHLEN < (hdrlen + max_linkhdr)) {
	1658	MCLGET(m, M_DONTWAIT);
	1659	if ((m->m_flags & M_EXT) == 0) {
	1660	m_freem(m);
	1661	error = ENOBUFS;
	1662	goto out;
	1663	}
	1664	}
	1665	m->m_data += max_linkhdr;
	1666	m->m_len = hdrlen;
	1667	}
	1668	m->m_pkthdr.rcvif = 0;
	1669	#if MPTCP
	1670	/* Before opt is copied to the mbuf, set the csum field */
	1671	mptcp_output_csum(tp, m, len, hdrlen, dss_val, sseqp);
	1672	#endif /* MPTCP */
	1673	#if CONFIG_MACF_NET
	1674	mac_mbuf_label_associate_inpcb(inp, m);
	1675	#endif
	1676	#if INET6
	1677	if (isipv6) {
	1678	ip6 = mtod(m, struct ip6_hdr *);
	1679	th = (struct tcphdr )(void )(ip6 + 1);
	1680	tcp_fillheaders(tp, ip6, th);
	1681	if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
	1682	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
	1683	ip6->ip6_flow \|= htonl(IPTOS_ECN_ECT0 << 20);
	1684	}
	1685	svc_flags \|= PKT_SCF_IPV6;
	1686	#if PF_ECN
	1687	m->m_pkthdr.pf_mtag.pftag_hdr = (void *)ip6;
	1688	m->m_pkthdr.pf_mtag.pftag_flags \|= PF_TAG_HDR_INET6;
	1689	#endif /* PF_ECN */
	1690	} else
	1691	#endif /* INET6 */
	1692	{
	1693	ip = mtod(m, struct ip *);
	1694	ipov = (struct ipovly *)ip;
	1695	th = (struct tcphdr )(void )(ip + 1);
	1696	/* this picks up the pseudo header (w/o the length) */
	1697	tcp_fillheaders(tp, ip, th);
	1698	if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
	1699	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
	1700	ip->ip_tos = IPTOS_ECN_ECT0;
	1701	}
	1702	#if PF_ECN
	1703	m->m_pkthdr.pf_mtag.pftag_hdr = (void *)ip;
	1704	m->m_pkthdr.pf_mtag.pftag_flags \|= PF_TAG_HDR_INET;
	1705	#endif /* PF_ECN */
	1706	}
	1707
	1708	/*
	1709	* Fill in fields, remembering maximum advertised
	1710	* window for use in delaying messages about window sizes.
	1711	* If resending a FIN, be sure not to use a new sequence number.
	1712	*/
	1713	if (flags & TH_FIN && (tp->t_flags & TF_SENTFIN) &&
	1714	tp->snd_nxt == tp->snd_max)
	1715	tp->snd_nxt--;
	1716	/*
	1717	* If we are doing retransmissions, then snd_nxt will
	1718	* not reflect the first unsent octet. For ACK only
	1719	* packets, we do not want the sequence number of the
	1720	* retransmitted packet, we want the sequence number
	1721	* of the next unsent octet. So, if there is no data
	1722	* (and no SYN or FIN), use snd_max instead of snd_nxt
	1723	* when filling in ti_seq. But if we are in persist
	1724	* state, snd_max might reflect one byte beyond the
	1725	* right edge of the window, so use snd_nxt in that
	1726	* case, since we know we aren't doing a retransmission.
	1727	* (retransmit and persist are mutually exclusive...)
	1728	*/
	1729	if (sack_rxmit == 0) {
	1730	if (len \|\| (flags & (TH_SYN\|TH_FIN)) \|\| tp->t_timer[TCPT_PERSIST])
	1731	th->th_seq = htonl(tp->snd_nxt);
	1732	else
	1733	th->th_seq = htonl(tp->snd_max);
	1734	} else {
	1735	th->th_seq = htonl(p->rxmit);
	1736	p->rxmit += len;
	1737	tp->sackhint.sack_bytes_rexmit += len;
	1738	}
	1739	th->th_ack = htonl(tp->rcv_nxt);
	1740	tp->last_ack_sent = tp->rcv_nxt;
	1741	#if MPTCP
	1742	/* Initialize the ACK field to a value as 0 ack fields are dropped */
	1743	if (early_data_sent) {
	1744	th->th_ack = th->th_seq + 1;
	1745	}
	1746	#endif /* MPTCP */
	1747	if (optlen) {
	1748	bcopy(opt, th + 1, optlen);
	1749	th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
	1750	}
	1751	th->th_flags = flags;
	1752	/*
	1753	* Calculate receive window. Don't shrink window,
	1754	* but avoid silly window syndrome.
	1755	*/
	1756	if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && recwin < (int)tp->t_maxseg)
	1757	recwin = 0;
	1758	if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
	1759	recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
	1760	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
	1761	if (recwin > (int32_t)slowlink_wsize)
	1762	recwin = slowlink_wsize;
	1763	}
	1764
	1765	#if TRAFFIC_MGT
	1766	if (tcp_recv_bg == 1 \|\| IS_TCP_RECV_BG(so)) {
	1767	if (tcp_recv_throttle(tp)) {
	1768	uint32_t min_iaj_win =
	1769	tcp_min_iaj_win * tp->t_maxseg;
	1770	if (tp->iaj_rwintop == 0 \|\|
	1771	SEQ_LT(tp->iaj_rwintop, tp->rcv_adv))
	1772	tp->iaj_rwintop = tp->rcv_adv;
	1773	if (SEQ_LT(tp->iaj_rwintop,
	1774	tp->rcv_nxt + min_iaj_win))
	1775	tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win;
	1776	recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin);
	1777	}
	1778	}
	1779	#endif /* TRAFFIC_MGT */
	1780
	1781	if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
	1782	recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	1783	th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
	1784
	1785	/*
	1786	* Adjust the RXWIN0SENT flag - indicate that we have advertised
	1787	* a 0 window. This may cause the remote transmitter to stall. This
	1788	* flag tells soreceive() to disable delayed acknowledgements when
	1789	* draining the buffer. This can occur if the receiver is attempting
	1790	* to read more data than can be buffered prior to transmitting on
	1791	* the connection.
	1792	*/
	1793	if (th->th_win == 0)
	1794	tp->t_flags \|= TF_RXWIN0SENT;
	1795	else
	1796	tp->t_flags &= ~TF_RXWIN0SENT;
	1797	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
	1798	th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
	1799	th->th_flags \|= TH_URG;
	1800	} else {
	1801	/*
	1802	* If no urgent pointer to send, then we pull
	1803	* the urgent pointer to the left edge of the send window
	1804	* so that it doesn't drift into the send window on sequence
	1805	* number wraparound.
	1806	*/
	1807	tp->snd_up = tp->snd_una; /* drag it along */
	1808	}
	1809
	1810	/*
	1811	* Put TCP length in extended header, and then
	1812	* checksum extended header and data.
	1813	*/
	1814	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
	1815	#if INET6
	1816	if (isipv6) {
	1817	/*
	1818	* ip6_plen does not need to be filled now; it will be filled
	1819	* in ip6_output.
	1820	*/
	1821	m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
	1822	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	1823	if (len + optlen)
	1824	th->th_sum = in_addword(th->th_sum,
	1825	htons((u_short)(optlen + len)));
	1826	}
	1827	else
	1828	#endif /* INET6 */
	1829	{
	1830	m->m_pkthdr.csum_flags = CSUM_TCP;
	1831	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	1832	if (len + optlen)
	1833	th->th_sum = in_addword(th->th_sum,
	1834	htons((u_short)(optlen + len)));
	1835	}
	1836
	1837	/*
	1838	* Enable TSO and specify the size of the segments.
	1839	* The TCP pseudo header checksum is always provided.
	1840	*/
	1841	if (tso) {
	1842	#if INET6
	1843	if (isipv6)
	1844	m->m_pkthdr.csum_flags \|= CSUM_TSO_IPV6;
	1845	else
	1846	#endif /* INET6 */
	1847	m->m_pkthdr.csum_flags \|= CSUM_TSO_IPV4;
	1848
	1849	m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
	1850	} else {
	1851	m->m_pkthdr.tso_segsz = 0;
	1852	}
	1853
	1854	/*
	1855	* In transmit state, time the transmission and arrange for
	1856	* the retransmit. In persist state, just set snd_max.
	1857	*/
	1858	if (!(tp->t_flagsext & TF_FORCE)
	1859	\|\| tp->t_timer[TCPT_PERSIST] == 0) {
	1860	tcp_seq startseq = tp->snd_nxt;
	1861
	1862	/*
	1863	* Advance snd_nxt over sequence space of this segment.
	1864	*/
	1865	if (flags & (TH_SYN\|TH_FIN)) {
	1866	if (flags & TH_SYN)
	1867	tp->snd_nxt++;
	1868	if ((flags & TH_FIN) &&
	1869	!(tp->t_flags & TF_SENTFIN)) {
	1870	tp->snd_nxt++;
	1871	tp->t_flags \|= TF_SENTFIN;
	1872	}
	1873	}
	1874	if (sack_rxmit)
	1875	goto timer;
	1876	tp->snd_nxt += len;
	1877	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
	1878	tp->snd_max = tp->snd_nxt;
	1879	/*
	1880	* Time this transmission if not a retransmission and
	1881	* not currently timing anything.
	1882	*/
	1883	if (tp->t_rtttime == 0) {
	1884	tp->t_rtttime = tcp_now;
	1885	tp->t_rtseq = startseq;
	1886	tcpstat.tcps_segstimed++;
	1887	}
	1888	}
	1889
	1890	/*
	1891	* Set retransmit timer if not currently set,
	1892	* and not doing an ack or a keep-alive probe.
	1893	*/
	1894	timer:
	1895	if (tp->t_timer[TCPT_REXMT] == 0 &&
	1896	((sack_rxmit && tp->snd_nxt != tp->snd_max) \|\|
	1897	tp->snd_nxt != tp->snd_una \|\| (flags & TH_FIN))) {
	1898	if (tp->t_timer[TCPT_PERSIST]) {
	1899	tp->t_timer[TCPT_PERSIST] = 0;
	1900	tp->t_rxtshift = 0;
	1901	tp->t_rxtstart = 0;
	1902	tp->t_persist_stop = 0;
	1903	}
	1904	tp->t_timer[TCPT_REXMT] =
	1905	OFFSET_FROM_START(tp, tp->t_rxtcur);
	1906	}
	1907
	1908	/*
	1909	* Set tail loss probe timeout if new data is being
	1910	* transmitted. This will be supported only when
	1911	* SACK option is enabled on a connection.
	1912	*
	1913	* Every time new data is sent PTO will get reset.
	1914	*/
	1915	if (tcp_enable_tlp && tp->t_state == TCPS_ESTABLISHED &&
	1916	SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp)
	1917	&& tp->snd_nxt == tp->snd_max
	1918	&& SEQ_GT(tp->snd_nxt, tp->snd_una)
	1919	&& tp->t_rxtshift == 0
	1920	&& (tp->t_flagsext & (TF_SENT_TLPROBE\|TF_PKTS_REORDERED)) == 0) {
	1921	u_int32_t pto, srtt, new_rto = 0;
	1922
	1923	/*
	1924	* Using SRTT alone to set PTO can cause spurious
	1925	* retransmissions on wireless networks where there
	1926	* is a lot of variance in RTT. Taking variance
	1927	* into account will avoid this.
	1928	*/
	1929	srtt = tp->t_srtt >> TCP_RTT_SHIFT;
	1930	pto = ((TCP_REXMTVAL(tp)) * 3) >> 1;
	1931	pto = max (2 * srtt, pto);
	1932	if ((tp->snd_max - tp->snd_una) == tp->t_maxseg)
	1933	pto = max(pto,
	1934	(((3 * pto) >> 2) + tcp_delack * 2));
	1935	else
	1936	pto = max(10, pto);
	1937
	1938	/* if RTO is less than PTO, choose RTO instead */
	1939	if (tp->t_rxtcur < pto) {
	1940	/*
	1941	* Schedule PTO instead of RTO in favor of
	1942	* fast recovery.
	1943	*/
	1944	pto = tp->t_rxtcur;
	1945
	1946	/* Reset the next RTO to be after PTO. */
	1947	TCPT_RANGESET(new_rto,
	1948	(pto + TCP_REXMTVAL(tp)),
	1949	max(tp->t_rttmin, tp->t_rttcur + 2),
	1950	TCPTV_REXMTMAX, 0);
	1951	tp->t_timer[TCPT_REXMT] =
	1952	OFFSET_FROM_START(tp, new_rto);
	1953	}
	1954	tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto);
	1955	}
	1956	} else {
	1957	/*
	1958	* Persist case, update snd_max but since we are in
	1959	* persist mode (no window) we do not update snd_nxt.
	1960	*/
	1961	int xlen = len;
	1962	if (flags & TH_SYN)
	1963	++xlen;
	1964	if ((flags & TH_FIN) &&
	1965	!(tp->t_flags & TF_SENTFIN)) {
	1966	++xlen;
	1967	tp->t_flags \|= TF_SENTFIN;
	1968	}
	1969	if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
	1970	tp->snd_max = tp->snd_nxt + len;
	1971	}
	1972
	1973	#if TCPDEBUG
	1974	/*
	1975	* Trace.
	1976	*/
	1977	if (so_options & SO_DEBUG)
	1978	tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
	1979	#endif
	1980
	1981	/*
	1982	* Fill in IP length and desired time to live and
	1983	* send to IP level. There should be a better way
	1984	* to handle ttl and tos; we could keep them in
	1985	* the template, but need a way to checksum without them.
	1986	*/
	1987	#if INET6
	1988	/*
	1989	* m->m_pkthdr.len should have been set before cksum calculation,
	1990	* because in6_cksum() needs it.
	1991	*/
	1992	if (isipv6) {
	1993	/*
	1994	* we separately set hoplimit for every segment, since the
	1995	* user might want to change the value via setsockopt.
	1996	* Also, desired default hop limit might be changed via
	1997	* Neighbor Discovery.
	1998	*/
	1999	ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
	2000	inp->in6p_route.ro_rt->rt_ifp : NULL);
	2001
	2002	/* TODO: IPv6 IP6TOS_ECT bit on */
	2003	KERNEL_DEBUG(DBG_LAYER_BEG,
	2004	((inp->inp_fport << 16) \| inp->inp_lport),
	2005	(((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) \|
	2006	(inp->in6p_faddr.s6_addr16[0] & 0xffff)),
	2007	sendalot,0,0);
	2008	} else
	2009	#endif /* INET6 */
	2010	{
	2011	ip->ip_len = m->m_pkthdr.len;
	2012	ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
	2013	ip->ip_tos \|= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */
	2014	KERNEL_DEBUG(DBG_LAYER_BEG,
	2015	((inp->inp_fport << 16) \| inp->inp_lport),
	2016	(((inp->inp_laddr.s_addr & 0xffff) << 16) \|
	2017	(inp->inp_faddr.s_addr & 0xffff)), 0,0,0);
	2018	}
	2019
	2020	/*
	2021	* See if we should do MTU discovery.
	2022	* Look at the flag updated on the following criteria:
	2023	* 1) Path MTU discovery is authorized by the sysctl
	2024	* 2) The route isn't set yet (unlikely but could happen)
	2025	* 3) The route is up
	2026	* 4) the MTU is not locked (if it is, then discovery has been
	2027	* disabled for that route)
	2028	*/
	2029	#if INET6
	2030	if (!isipv6)
	2031	#endif /* INET6 */
	2032	if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
	2033	ip->ip_off \|= IP_DF;
	2034
	2035	#if NECP
	2036	{
	2037	necp_kernel_policy_id policy_id;
	2038	if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id)) {
	2039	m_freem(m);
	2040	error = EHOSTUNREACH;
	2041	goto out;
	2042	}
	2043
	2044	necp_mark_packet_from_socket(m, inp, policy_id);
	2045	}
	2046	#endif /* NECP */
	2047
	2048	#if IPSEC
	2049	if (inp->inp_sp != NULL)
	2050	ipsec_setsocket(m, so);
	2051	#endif /* IPSEC */
	2052
	2053	/*
	2054	* The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
	2055	*/
	2056	lost = 0;
	2057
	2058	/*
	2059	* Embed the flow hash in pkt hdr and mark the packet as
	2060	* capable of flow controlling
	2061	*/
	2062	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
	2063	m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
	2064	m->m_pkthdr.pkt_flags \|= PKTF_FLOW_ID \| PKTF_FLOW_LOCALSRC;
	2065	#if MPTCP
	2066	/* Disable flow advisory when using MPTCP. */
	2067	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
	2068	#endif /* MPTCP */
	2069	m->m_pkthdr.pkt_flags \|= PKTF_FLOW_ADV;
	2070	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
	2071
	2072	m->m_nextpkt = NULL;
	2073
	2074	if (inp->inp_last_outifp != NULL &&
	2075	!(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
	2076	/* Hint to prioritize this packet if
	2077	* 1. if the packet has no data
	2078	* 2. the interface supports transmit-start model and did
	2079	* not disable ACK prioritization.
	2080	* 3. Only ACK flag is set.
	2081	* 4. there is no outstanding data on this connection.
	2082	*/
	2083	if (tcp_prioritize_acks != 0 && len == 0 &&
	2084	(inp->inp_last_outifp->if_eflags &
	2085	(IFEF_TXSTART \| IFEF_NOACKPRI)) == IFEF_TXSTART &&
	2086	th->th_flags == TH_ACK && tp->snd_una == tp->snd_max &&
	2087	tp->t_timer[TCPT_REXMT] == 0) {
	2088	svc_flags \|= PKT_SCF_TCP_ACK;
	2089	}
	2090	set_packet_service_class(m, so, MBUF_SC_UNSPEC, svc_flags);
	2091	}
	2092
	2093	tp->t_pktlist_sentlen += len;
	2094	tp->t_lastchain++;
	2095
	2096	#if INET6
	2097	if (isipv6) {
	2098	DTRACE_TCP5(send, struct mbuf , m, struct inpcb , inp,
	2099	struct ip6 , ip6, struct tcpcb , tp, struct tcphdr *,
	2100	th);
	2101	} else
	2102	#endif /* INET6 */
	2103	{
	2104	DTRACE_TCP5(send, struct mbuf , m, struct inpcb , inp,
	2105	struct ip , ip, struct tcpcb , tp, struct tcphdr *, th);
	2106	}
	2107
	2108	if (tp->t_pktlist_head != NULL) {
	2109	tp->t_pktlist_tail->m_nextpkt = m;
	2110	tp->t_pktlist_tail = m;
	2111	} else {
	2112	packchain_newlist++;
	2113	tp->t_pktlist_head = tp->t_pktlist_tail = m;
	2114	}
	2115
	2116	if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) &&
	2117	((th->th_flags & TH_ACK) == TH_ACK) && (!len) &&
	2118	(tp->t_state == TCPS_ESTABLISHED)) {
	2119	/* For a pure ACK, see if you need to send more of them */
	2120	mnext = tcp_send_lroacks(tp, m, th);
	2121	if (mnext) {
	2122	tp->t_pktlist_tail->m_nextpkt = mnext;
	2123	if (mnext->m_nextpkt == NULL) {
	2124	tp->t_pktlist_tail = mnext;
	2125	tp->t_lastchain++;
	2126	} else {
	2127	struct mbuf tail, next;
	2128	next = mnext->m_nextpkt;
	2129	tail = next->m_nextpkt;
	2130	while (tail) {
	2131	next = tail;
	2132	tail = tail->m_nextpkt;
	2133	tp->t_lastchain++;
	2134	}
	2135	tp->t_pktlist_tail = next;
	2136	}
	2137	}
	2138	}
	2139
	2140	if (sendalot == 0 \|\| (tp->t_state != TCPS_ESTABLISHED) \|\|
	2141	(tp->snd_cwnd <= (tp->snd_wnd / 8)) \|\|
	2142	(tp->t_flags & (TH_PUSH \| TF_ACKNOW)) \|\|
	2143	(tp->t_flagsext & TF_FORCE) \|\|
	2144	tp->t_lastchain >= tcp_packet_chaining) {
	2145	error = 0;
	2146	while (inp->inp_sndinprog_cnt == 0 &&
	2147	tp->t_pktlist_head != NULL) {
	2148	packetlist = tp->t_pktlist_head;
	2149	packchain_listadd = tp->t_lastchain;
	2150	packchain_sent++;
	2151	lost = tp->t_pktlist_sentlen;
	2152	TCP_PKTLIST_CLEAR(tp);
	2153
	2154	error = tcp_ip_output(so, tp, packetlist,
	2155	packchain_listadd, tp_inp_options,
	2156	(so_options & SO_DONTROUTE),
	2157	(sack_rxmit \| (sack_bytes_rxmt != 0)), recwin,
	2158	#if INET6
	2159	isipv6);
	2160	#else /* INET6 */
	2161	0);
	2162	#endif /* !INET6 */
	2163	if (error) {
	2164	/*
	2165	* Take into account the rest of unsent
	2166	* packets in the packet list for this tcp
	2167	* into "lost", since we're about to free
	2168	* the whole list below.
	2169	*/
	2170	lost += tp->t_pktlist_sentlen;
	2171	break;
	2172	} else {
	2173	lost = 0;
	2174	}
	2175	}
	2176	/* tcp was closed while we were in ip; resume close */
	2177	if (inp->inp_sndinprog_cnt == 0 &&
	2178	(tp->t_flags & TF_CLOSING)) {
	2179	tp->t_flags &= ~TF_CLOSING;
	2180	(void) tcp_close(tp);
	2181	return (0);
	2182	}
	2183	} else {
	2184	error = 0;
	2185	packchain_looped++;
	2186	tcpstat.tcps_sndtotal++;
	2187
	2188	goto again;
	2189	}
	2190	if (error) {
	2191	/*
	2192	* Assume that the packets were lost, so back out the
	2193	* sequence number advance, if any. Note that the "lost"
	2194	* variable represents the amount of user data sent during
	2195	* the recent call to ip_output_list() plus the amount of
	2196	* user data in the packet list for this tcp at the moment.
	2197	*/
	2198	if (!(tp->t_flagsext & TF_FORCE)
	2199	\|\| tp->t_timer[TCPT_PERSIST] == 0) {
	2200	/*
	2201	* No need to check for TH_FIN here because
	2202	* the TF_SENTFIN flag handles that case.
	2203	*/
	2204	if ((flags & TH_SYN) == 0) {
	2205	if (sack_rxmit) {
	2206	if (SEQ_GT((p->rxmit - lost),
	2207	tp->snd_una)) {
	2208	p->rxmit -= lost;
	2209	} else {
	2210	lost = p->rxmit - tp->snd_una;
	2211	p->rxmit = tp->snd_una;
	2212	}
	2213	tp->sackhint.sack_bytes_rexmit -= lost;
	2214	} else {
	2215	if (SEQ_GT((tp->snd_nxt - lost),
	2216	tp->snd_una))
	2217	tp->snd_nxt -= lost;
	2218	else
	2219	tp->snd_nxt = tp->snd_una;
	2220	}
	2221	}
	2222	}
	2223	out:
	2224	if (tp->t_pktlist_head != NULL)
	2225	m_freem_list(tp->t_pktlist_head);
	2226	TCP_PKTLIST_CLEAR(tp);
	2227
	2228	if (error == ENOBUFS) {
	2229	if (!tp->t_timer[TCPT_REXMT] &&
	2230	!tp->t_timer[TCPT_PERSIST])
	2231	tp->t_timer[TCPT_REXMT] =
	2232	OFFSET_FROM_START(tp, tp->t_rxtcur);
	2233	tp->snd_cwnd = tp->t_maxseg;
	2234	tp->t_bytes_acked = 0;
	2235	tcp_check_timer_state(tp);
	2236	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	2237
	2238	tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR);
	2239	return (0);
	2240	}
	2241	if (error == EMSGSIZE) {
	2242	/*
	2243	* ip_output() will have already fixed the route
	2244	* for us. tcp_mtudisc() will, as its last action,
	2245	* initiate retransmission, so it is important to
	2246	* not do so here.
	2247	*
	2248	* If TSO was active we either got an interface
	2249	* without TSO capabilities or TSO was turned off.
	2250	* Disable it for this connection as well and
	2251	* immediately retry with MSS sized segments generated
	2252	* by this function.
	2253	*/
	2254	if (tso)
	2255	tp->t_flags &= ~TF_TSO;
	2256
	2257	tcp_mtudisc(inp, 0);
	2258	tcp_check_timer_state(tp);
	2259
	2260	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	2261	return 0;
	2262	}
	2263	/*
	2264	* Unless this is due to interface restriction policy,
	2265	* treat EHOSTUNREACH/ENETDOWN as a soft error.
	2266	*/
	2267	if ((error == EHOSTUNREACH \|\| error == ENETDOWN) &&
	2268	TCPS_HAVERCVDSYN(tp->t_state) &&
	2269	!inp_restricted_send(inp, inp->inp_last_outifp)) {
	2270	tp->t_softerror = error;
	2271	error = 0;
	2272	}
	2273	tcp_check_timer_state(tp);
	2274	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, 0,0,0,0,0);
	2275	return (error);
	2276	}
	2277
	2278	tcpstat.tcps_sndtotal++;
	2279
	2280	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END,0,0,0,0,0);
	2281	if (sendalot)
	2282	goto again;
	2283
	2284	tcp_check_timer_state(tp);
	2285	return (0);
	2286	}
	2287
	2288	static int
	2289	tcp_ip_output(struct socket so, struct tcpcb tp, struct mbuf *pkt,
	2290	int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin,
	2291	boolean_t isipv6)
	2292	{
	2293	int error = 0;
	2294	boolean_t chain;
	2295	boolean_t unlocked = FALSE;
	2296	boolean_t ifdenied = FALSE;
	2297	struct inpcb *inp = tp->t_inpcb;
	2298	struct ip_out_args ipoa =
	2299	{ IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF\|IPOAF_BOUND_SRCADDR, 0 };
	2300	struct route ro;
	2301	struct ifnet *outif = NULL;
	2302	#if INET6
	2303	struct ip6_out_args ip6oa =
	2304	{ IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF\|IP6OAF_BOUND_SRCADDR, 0 };
	2305	struct route_in6 ro6;
	2306	struct flowadv *adv =
	2307	(isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv);
	2308	#else /* INET6 */
	2309	struct flowadv *adv = &ipoa.ipoa_flowadv;
	2310	#endif /* !INET6 */
	2311
	2312	/* If socket was bound to an ifindex, tell ip_output about it */
	2313	if (inp->inp_flags & INP_BOUND_IF) {
	2314	#if INET6
	2315	if (isipv6) {
	2316	ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index;
	2317	ip6oa.ip6oa_flags \|= IP6OAF_BOUND_IF;
	2318	} else
	2319	#endif /* INET6 */
	2320	{
	2321	ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
	2322	ipoa.ipoa_flags \|= IPOAF_BOUND_IF;
	2323	}
	2324	}
	2325
	2326	if (INP_NO_CELLULAR(inp)) {
	2327	#if INET6
	2328	if (isipv6)
	2329	ip6oa.ip6oa_flags \|= IP6OAF_NO_CELLULAR;
	2330	else
	2331	#endif /* INET6 */
	2332	ipoa.ipoa_flags \|= IPOAF_NO_CELLULAR;
	2333	}
	2334	if (INP_NO_EXPENSIVE(inp)) {
	2335	#if INET6
	2336	if (isipv6)
	2337	ip6oa.ip6oa_flags \|= IP6OAF_NO_EXPENSIVE;
	2338	else
	2339	#endif /* INET6 */
	2340	ipoa.ipoa_flags \|= IPOAF_NO_EXPENSIVE;
	2341
	2342	}
	2343	if (INP_AWDL_UNRESTRICTED(inp)) {
	2344	#if INET6
	2345	if (isipv6)
	2346	ip6oa.ip6oa_flags \|= IP6OAF_AWDL_UNRESTRICTED;
	2347	else
	2348	#endif /* INET6 */
	2349	ipoa.ipoa_flags \|= IPOAF_AWDL_UNRESTRICTED;
	2350
	2351	}
	2352	#if INET6
	2353	if (isipv6)
	2354	flags \|= IPV6_OUTARGS;
	2355	else
	2356	#endif /* INET6 */
	2357	flags \|= IP_OUTARGS;
	2358
	2359	/* Copy the cached route and take an extra reference */
	2360	#if INET6
	2361	if (isipv6)
	2362	in6p_route_copyout(inp, &ro6);
	2363	else
	2364	#endif /* INET6 */
	2365	inp_route_copyout(inp, &ro);
	2366
	2367	/*
	2368	* Data sent (as far as we can tell).
	2369	* If this advertises a larger window than any other segment,
	2370	* then remember the size of the advertised window.
	2371	* Make sure ACK/DELACK conditions are cleared before
	2372	* we unlock the socket.
	2373	*/
	2374	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
	2375	tp->rcv_adv = tp->rcv_nxt + recwin;
	2376	tp->last_ack_sent = tp->rcv_nxt;
	2377	tp->t_flags &= ~(TF_ACKNOW \| TF_DELACK);
	2378	tp->t_timer[TCPT_DELACK] = 0;
	2379	tp->t_unacksegs = 0;
	2380
	2381	/* Increment the count of outstanding send operations */
	2382	inp->inp_sndinprog_cnt++;
	2383
	2384	/*
	2385	* If allowed, unlock TCP socket while in IP
	2386	* but only if the connection is established and
	2387	* in a normal mode where reentrancy on the tcpcb won't be
	2388	* an issue:
	2389	* - there is no SACK episode
	2390	* - we're not in Fast Recovery mode
	2391	* - if we're not sending from an upcall.
	2392	*/
	2393	if (tcp_output_unlocked && !so->so_upcallusecount &&
	2394	(tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
	2395	!IN_FASTRECOVERY(tp)) {
	2396
	2397	unlocked = TRUE;
	2398	socket_unlock(so, 0);
	2399	}
	2400
	2401	/*
	2402	* Don't send down a chain of packets when:
	2403	* - TCP chaining is disabled
	2404	* - there is an IPsec rule set
	2405	* - there is a non default rule set for the firewall
	2406	*/
	2407
	2408	chain = tcp_packet_chaining > 1
	2409	#if IPSEC
	2410	&& ipsec_bypass
	2411	#endif
	2412	#if IPFIREWALL
	2413	&& (fw_enable == 0 \|\| fw_bypass)
	2414	#endif
	2415	; // I'm important, not extraneous
	2416
	2417
	2418	while (pkt != NULL) {
	2419	struct mbuf *npkt = pkt->m_nextpkt;
	2420
	2421	if (!chain) {
	2422	pkt->m_nextpkt = NULL;
	2423	/*
	2424	* If we are not chaining, make sure to set the packet
	2425	* list count to 0 so that IP takes the right path;
	2426	* this is important for cases such as IPSec where a
	2427	* single mbuf might result in multiple mbufs as part
	2428	* of the encapsulation. If a non-zero count is passed
	2429	* down to IP, the head of the chain might change and
	2430	* we could end up skipping it (thus generating bogus
	2431	* packets). Fixing it in IP would be desirable, but
	2432	* for now this would do it.
	2433	*/
	2434	cnt = 0;
	2435	}
	2436	#if INET6
	2437	if (isipv6) {
	2438	error = ip6_output_list(pkt, cnt,
	2439	inp->in6p_outputopts, &ro6, flags, NULL, NULL,
	2440	&ip6oa);
	2441	ifdenied = (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED);
	2442	} else {
	2443	#endif /* INET6 */
	2444	error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL,
	2445	&ipoa);
	2446	ifdenied = (ipoa.ipoa_retflags & IPOARF_IFDENIED);
	2447	}
	2448
	2449	if (chain \|\| error) {
	2450	/*
	2451	* If we sent down a chain then we are done since
	2452	* the callee had taken care of everything; else
	2453	* we need to free the rest of the chain ourselves.
	2454	*/
	2455	if (!chain)
	2456	m_freem_list(npkt);
	2457	break;
	2458	}
	2459	pkt = npkt;
	2460	}
	2461
	2462	if (unlocked)
	2463	socket_lock(so, 0);
	2464
	2465	/*
	2466	* Enter flow controlled state if the connection is established
	2467	* and is not in recovery.
	2468	*
	2469	* A connection will enter suspended state even if it is in
	2470	* recovery.
	2471	*/
	2472	if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) \|\|
	2473	adv->code == FADV_SUSPENDED) &&
	2474	!(tp->t_flags & TF_CLOSING) &&
	2475	tp->t_state == TCPS_ESTABLISHED) {
	2476	int rc;
	2477	rc = inp_set_fc_state(inp, adv->code);
	2478
	2479	if (rc == 1)
	2480	tcp_ccdbg_trace(tp, NULL,
	2481	((adv->code == FADV_FLOW_CONTROLLED) ?
	2482	TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND));
	2483	}
	2484
	2485	/*
	2486	* When an interface queue gets suspended, some of the
	2487	* packets are dropped. Return ENOBUFS, to update the
	2488	* pcb state.
	2489	*/
	2490	if (adv->code == FADV_SUSPENDED)
	2491	error = ENOBUFS;
	2492
	2493	VERIFY(inp->inp_sndinprog_cnt > 0);
	2494	if ( --inp->inp_sndinprog_cnt == 0)
	2495	inp->inp_flags &= ~(INP_FC_FEEDBACK);
	2496
	2497	#if INET6
	2498	if (isipv6) {
	2499	if (ro6.ro_rt != NULL && (outif = ro6.ro_rt->rt_ifp) !=
	2500	inp->in6p_last_outifp)
	2501	inp->in6p_last_outifp = outif;
	2502	} else
	2503	#endif /* INET6 */
	2504	if (ro.ro_rt != NULL && (outif = ro.ro_rt->rt_ifp) !=
	2505	inp->inp_last_outifp)
	2506	inp->inp_last_outifp = outif;
	2507
	2508	if (error != 0 && ifdenied &&
	2509	(INP_NO_CELLULAR(inp) \|\| INP_NO_EXPENSIVE(inp)))
	2510	soevent(inp->inp_socket,
	2511	(SO_FILT_HINT_LOCKED\|SO_FILT_HINT_IFDENIED));
	2512
	2513	/* Synchronize cached PCB route & options */
	2514	#if INET6
	2515	if (isipv6)
	2516	in6p_route_copyin(inp, &ro6);
	2517	else
	2518	#endif /* INET6 */
	2519	inp_route_copyin(inp, &ro);
	2520
	2521	if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 &&
	2522	tp->t_inpcb->inp_route.ro_rt != NULL) {
	2523	/* If we found the route and there is an rtt on it
	2524	* reset the retransmit timer
	2525	*/
	2526	tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt);
	2527	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
	2528	}
	2529	return (error);
	2530	}
	2531
	2532	void
	2533	tcp_setpersist(tp)
	2534	register struct tcpcb *tp;
	2535	{
	2536	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
	2537
	2538	/* If a PERSIST_TIMER option was set we will limit the
	2539	* time the persist timer will be active for that connection
	2540	* in order to avoid DOS by using zero window probes.
	2541	* see rdar://5805356
	2542	*/
	2543
	2544	if ((tp->t_persist_timeout != 0) &&
	2545	(tp->t_timer[TCPT_PERSIST] == 0) &&
	2546	(tp->t_persist_stop == 0)) {
	2547	tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
	2548	}
	2549
	2550	/*
	2551	* Start/restart persistance timer.
	2552	*/
	2553	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
	2554	t * tcp_backoff[tp->t_rxtshift],
	2555	TCPTV_PERSMIN, TCPTV_PERSMAX, 0);
	2556	tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
	2557
	2558	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
	2559	tp->t_rxtshift++;
	2560	}
	2561
	2562	/*
	2563	* Send as many acks as data coalesced. Every other packet when stretch
	2564	* ACK is not enabled. Every 8 packets, if stretch ACK is enabled.
	2565	*/
	2566	static struct mbuf*
	2567	tcp_send_lroacks(struct tcpcb tp, struct mbuf m, struct tcphdr *th)
	2568	{
	2569	struct mbuf mnext = NULL, ack_chain = NULL, *tail = NULL;
	2570	int count = 0;
	2571	tcp_seq org_ack = ntohl(th->th_ack);
	2572	tcp_seq prev_ack = 0;
	2573	int tack_offset = 28; /* XXX IPv6 and IP options not supported */
	2574	int twin_offset = 34; /* XXX IPv6 and IP options not supported */
	2575	int ack_size = (tp->t_flags & TF_STRETCHACK) ?
	2576	(maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1);
	2577	int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2;
	2578	struct mbuf *prev_ack_pkt = NULL;
	2579	struct socket *so = tp->t_inpcb->inp_socket;
	2580	unsigned short winsz = ntohs(th->th_win);
	2581	unsigned int scaled_win = winsz<<tp->rcv_scale;
	2582	tcp_seq win_rtedge = org_ack + scaled_win;
	2583
	2584	count = tp->t_lropktlen/tp->t_maxseg;
	2585
	2586	prev_ack = (org_ack - tp->t_lropktlen) + ack_size;
	2587	if (prev_ack < org_ack) {
	2588	ack_chain = m_dup(m, M_DONTWAIT);
	2589	if (ack_chain) {
	2590	th->th_ack = htonl(prev_ack);
	2591	/* Keep adv window constant for duplicated ACK packets */
	2592	scaled_win = win_rtedge - prev_ack;
	2593	if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
	2594	scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	2595	th->th_win = htons(scaled_win>>tp->rcv_scale);
	2596	if (lrodebug == 5) {
	2597	printf("%s: win = %d winsz = %d sc = %d"
	2598	" lro_len %d %d\n",
	2599	__func__, scaled_win>>tp->rcv_scale, winsz,
	2600	tp->rcv_scale, tp->t_lropktlen, count);
	2601	}
	2602	tail = ack_chain;
	2603	count -= segs_acked; /* accounts for prev_ack packet */
	2604	count = (count <= segs_acked) ? 0 : count - segs_acked;
	2605	tcpstat.tcps_sndacks++;
	2606	so_tc_update_stats(m, so, m_get_service_class(m));
	2607	} else {
	2608	return NULL;
	2609	}
	2610	}
	2611	else {
	2612	tp->t_lropktlen = 0;
	2613	return NULL;
	2614	}
	2615
	2616	prev_ack_pkt = ack_chain;
	2617
	2618	while (count > 0) {
	2619	if ((prev_ack + ack_size) < org_ack) {
	2620	prev_ack += ack_size;
	2621	} else {
	2622	/*
	2623	* The last ACK sent must have the ACK number that TCP
	2624	* thinks is the last sent ACK number.
	2625	*/
	2626	prev_ack = org_ack;
	2627	}
	2628	mnext = m_dup(prev_ack_pkt, M_DONTWAIT);
	2629	if (mnext) {
	2630	/* Keep adv window constant for duplicated ACK packets */
	2631	scaled_win = win_rtedge - prev_ack;
	2632	if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
	2633	scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	2634	winsz = htons(scaled_win>>tp->rcv_scale);
	2635	if (lrodebug == 5) {
	2636	printf("%s: winsz = %d ack %x count %d\n",
	2637	__func__, scaled_win>>tp->rcv_scale,
	2638	prev_ack, count);
	2639	}
	2640	bcopy(&winsz, mtod(prev_ack_pkt, caddr_t) + twin_offset, 2);
	2641	HTONL(prev_ack);
	2642	bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4);
	2643	NTOHL(prev_ack);
	2644	tail->m_nextpkt = mnext;
	2645	tail = mnext;
	2646	count -= segs_acked;
	2647	tcpstat.tcps_sndacks++;
	2648	so_tc_update_stats(m, so, m_get_service_class(m));
	2649	} else {
	2650	if (lrodebug == 5) {
	2651	printf("%s: failed to alloc mbuf.\n", __func__);
	2652	}
	2653	break;
	2654	}
	2655	prev_ack_pkt = mnext;
	2656	}
	2657	tp->t_lropktlen = 0;
	2658	return ack_chain;
	2659	}
	2660
	2661	static int
	2662	tcp_recv_throttle (struct tcpcb *tp)
	2663	{
	2664	uint32_t base_rtt, newsize;
	2665	int32_t qdelay;
	2666	struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv;
	2667
	2668	if (tcp_use_rtt_recvbg == 1 &&
	2669	TSTMP_SUPPORTED(tp)) {
	2670	/*
	2671	* Timestamps are supported on this connection. Use
	2672	* RTT to look for an increase in latency.
	2673	*/
	2674
	2675	/*
	2676	* If the connection is already being throttled, leave it
	2677	* in that state until rtt comes closer to base rtt
	2678	*/
	2679	if (tp->t_flagsext & TF_RECV_THROTTLE)
	2680	return (1);
	2681
	2682	base_rtt = get_base_rtt(tp);
	2683
	2684	if (base_rtt != 0 && tp->t_rttcur != 0) {
	2685	qdelay = tp->t_rttcur - base_rtt;
	2686	/*
	2687	* if latency increased on a background flow,
	2688	* return 1 to start throttling.
	2689	*/
	2690	if (qdelay > target_qdelay) {
	2691	tp->t_flagsext \|= TF_RECV_THROTTLE;
	2692
	2693	/*
	2694	* Reduce the recv socket buffer size to
	2695	* minimize latecy.
	2696	*/
	2697	if (sbrcv->sb_idealsize >
	2698	tcp_recv_throttle_minwin) {
	2699	newsize = sbrcv->sb_idealsize >> 1;
	2700	/* Set a minimum of 16 K */
	2701	newsize =
	2702	max(newsize,
	2703	tcp_recv_throttle_minwin);
	2704	sbrcv->sb_idealsize = newsize;
	2705	}
	2706	return (1);
	2707	} else {
	2708	return (0);
	2709	}
	2710	}
	2711	}
	2712
	2713	/*
	2714	* Timestamps are not supported or there is no good RTT
	2715	* measurement. Use IPDV in this case.
	2716	*/
	2717	if (tp->acc_iaj > tcp_acc_iaj_react_limit)
	2718	return (1);
	2719
	2720	return (0);
	2721	}