1c79356b 1/*
c910b4d9 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
9bccf70c 61 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
1c79356b 62 */
2d21ac55
A
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
1c79356b 69
1c79356b
A
70#define _IP_VHL
71
1c79356b
A
72
73#include <sys/param.h>
74#include <sys/systm.h>
9bccf70c
A
75#include <sys/kernel.h>
76#include <sys/sysctl.h>
1c79356b
A
77#include <sys/mbuf.h>
78#include <sys/domain.h>
79#include <sys/protosw.h>
80#include <sys/socket.h>
81#include <sys/socketvar.h>
82
83#include <net/route.h>
2d21ac55 84#include <net/if_var.h>
1c79356b
A
85
86#include <netinet/in.h>
87#include <netinet/in_systm.h>
2d21ac55 88#include <netinet/in_var.h>
1c79356b 89#include <netinet/ip.h>
9bccf70c 90#include <netinet/in_pcb.h>
1c79356b
A
91#include <netinet/ip_var.h>
92#if INET6
9bccf70c 93#include <netinet6/in6_pcb.h>
1c79356b 94#include <netinet/ip6.h>
1c79356b
A
95#include <netinet6/ip6_var.h>
96#endif
1c79356b
A
97#include <netinet/tcp.h>
98#define TCPOUTFLAGS
99#include <netinet/tcp_fsm.h>
100#include <netinet/tcp_seq.h>
101#include <netinet/tcp_timer.h>
102#include <netinet/tcp_var.h>
103#include <netinet/tcpip.h>
104#if TCPDEBUG
105#include <netinet/tcp_debug.h>
106#endif
107#include <sys/kdebug.h>
108
9bccf70c
A
109#if IPSEC
110#include <netinet6/ipsec.h>
111#endif /*IPSEC*/
fa4905b1 112
2d21ac55
A
113#if CONFIG_MACF_NET
114#include <security/mac_framework.h>
115#endif /* CONFIG_MACF_NET */
116
1c79356b
A
117#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
118#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
119#define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
120
121
122#ifdef notyet
123extern struct mbuf *m_copypack();
124#endif
125
2d21ac55 126int path_mtu_discovery = 1;
9bccf70c
A
127SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
128 &path_mtu_discovery, 1, "Enable Path MTU Discovery");
129
130int ss_fltsz = 1;
131SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
132 &ss_fltsz, 1, "Slow start flight size");
133
2d21ac55 134int ss_fltsz_local = 8; /* starts with eight segments max */
9bccf70c
A
135SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
136 &ss_fltsz_local, 1, "Slow start flight size for local networks");
137
138int tcp_do_newreno = 0;
139SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
140 0, "Enable NewReno Algorithms");
141
b0d623f7
A
142int tcp_do_tso = 1;
143SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
144 &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
145
146
2d21ac55
A
147int tcp_ecn_outbound = 0;
148SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW, &tcp_ecn_outbound,
149 0, "Initiate ECN for outbound connections");
150
151int tcp_ecn_inbound = 0;
152SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW, &tcp_ecn_inbound,
153 0, "Allow ECN negotiation for inbound connections");
154
91447636
A
155int tcp_packet_chaining = 50;
156SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining,
157 0, "Enable TCP output packet chaining");
158
2d21ac55
A
159int tcp_output_unlocked = 1;
160SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW, &tcp_output_unlocked,
161 0, "Unlock TCP when sending packets down to IP");
162
b0d623f7
A
163static int32_t packchain_newlist = 0;
164static int32_t packchain_looped = 0;
165static int32_t packchain_sent = 0;
9bccf70c
A
166
167/* temporary: for testing */
168#if IPSEC
169extern int ipsec_bypass;
170#endif
1c79356b 171
d12e1678 172extern int slowlink_wsize; /* window correction for slow links */
4a3eedf9 173#if IPFIREWALL
2d21ac55
A
174extern int fw_enable; /* firewall check for packet chaining */
175 extern int fw_bypass; /* firewall check: disable packet chaining if there are rules */
4a3eedf9 176#endif /* IPFIREWALL */
55e303ae 177
91447636 178extern vm_size_t so_cache_zone_element_size;
b0d623f7
A
179#if RANDOM_IP_ID
180extern int ip_use_randomid;
181#endif /* RANDOM_IP_ID */
182extern u_int32_t dlil_filter_count;
183extern u_int32_t kipf_count;
91447636 184
2d21ac55 185static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
b0d623f7 186 struct mbuf *, int, int);
2d21ac55 187
91447636
A
188static __inline__ u_int16_t
189get_socket_id(struct socket * s)
190{
191 u_int16_t val;
192
193 if (so_cache_zone_element_size == 0) {
194 return (0);
195 }
b0d623f7 196 val = (u_int16_t)(((uintptr_t)s) / so_cache_zone_element_size);
91447636
A
197 if (val == 0) {
198 val = 0xffff;
199 }
200 return (val);
201}
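/*
 * Illustrative note on get_socket_id() (example values are assumed, not
 * taken from a real run): the id is simply the socket pointer divided by
 * the cache zone element size, truncated to 16 bits.  For instance, with
 * an element size of 0x300, a socket at 0x12345600 yields
 * (0x12345600 / 0x300) & 0xffff.  A computed value of 0 is remapped to
 * 0xffff so that an id of 0 can mean "no id".
 */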
d12e1678 202
1c79356b
A
203/*
204 * Tcp output routine: figure out what should be sent and send it.
2d21ac55
A
205 *
206 * Returns: 0 Success
207 * EADDRNOTAVAIL
208 * ENOBUFS
209 * EMSGSIZE
210 * EHOSTUNREACH
211 * ENETDOWN
212 * ip_output_list:ENOMEM
213 * ip_output_list:EADDRNOTAVAIL
214 * ip_output_list:ENETUNREACH
215 * ip_output_list:EHOSTUNREACH
216 * ip_output_list:EACCES
217 * ip_output_list:EMSGSIZE
218 * ip_output_list:ENOBUFS
219 * ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
220 * ip6_output:??? [IPV6 only]
1c79356b
A
221 */
222int
8ad349bb 223tcp_output(struct tcpcb *tp)
1c79356b 224{
8ad349bb 225 struct socket *so = tp->t_inpcb->inp_socket;
b0d623f7
A
226 int32_t len, recwin, sendwin, off;
227 int flags, error;
1c79356b
A
228 register struct mbuf *m;
229 struct ip *ip = NULL;
9bccf70c 230 register struct ipovly *ipov = NULL;
1c79356b
A
231#if INET6
232 struct ip6_hdr *ip6 = NULL;
233#endif /* INET6 */
9bccf70c 234 register struct tcphdr *th;
1c79356b
A
235 u_char opt[TCP_MAXOLEN];
236 unsigned ipoptlen, optlen, hdrlen;
2d21ac55 237 int idle, sendalot, lost = 0;
8ad349bb 238 int i, sack_rxmit;
b0d623f7 239 int tso = 0;
8ad349bb
A
240 int sack_bytes_rxmt;
241 struct sackhole *p;
b0d623f7
A
242#ifdef IPSEC
243 unsigned ipsec_optlen = 0;
244#endif
9bccf70c 245 int maxburst = TCP_MAXBURST;
55e303ae 246 int last_off = 0;
fa4905b1 247 int m_off;
2d21ac55
A
248 struct mbuf *m_last = NULL;
249 struct mbuf *m_head = NULL;
250 struct mbuf *packetlist = NULL;
251 struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
9bccf70c 252#if INET6
55e303ae 253 int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ;
2d21ac55 254 struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts;
9bccf70c 255#endif
91447636
A
256 short packchain_listadd = 0;
257 u_int16_t socket_id = get_socket_id(so);
2d21ac55
A
258 int so_options = so->so_options;
259 struct rtentry *rt;
55e303ae 260
1c79356b
A
261 /*
262 * Determine length of data that should be transmitted,
263 * and flags that will be used.
264 * If there is some data or critical controls (SYN, RST)
265 * to send, then transmit; otherwise, investigate further.
266 */
8ad349bb 267 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
9bccf70c 268 if (idle && tp->t_rcvtime >= tp->t_rxtcur) {
1c79356b
A
269 /*
270 * We have been idle for "a while" and no acks are
271 * expected to clock out any data we send --
272 * slow start to get ack "clock" running again.
8ad349bb 273 *
9bccf70c
A
274 * Set the slow-start flight size depending on whether
275 * this is a local network or not.
8ad349bb 276 */
9bccf70c
A
277 if (
278#if INET6
279 (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
280 (!isipv6 &&
281#endif
282 in_localaddr(tp->t_inpcb->inp_faddr)
283#if INET6
284 )
285#endif
286 )
287 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
288 else
289 tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
290 }
8ad349bb
A
291 tp->t_flags &= ~TF_LASTIDLE;
292 if (idle) {
293 if (tp->t_flags & TF_MORETOCOME) {
294 tp->t_flags |= TF_LASTIDLE;
295 idle = 0;
296 }
297 }
1c79356b 298again:
55e303ae
A
299 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
300
301#if INET6
302 if (isipv6) {
303
304 KERNEL_DEBUG(DBG_LAYER_BEG,
305 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
306 (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
307 (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
308 sendalot,0,0);
309 }
310 else
311#endif
312
313 {
314 KERNEL_DEBUG(DBG_LAYER_BEG,
315 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
316 (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
317 (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
318 sendalot,0,0);
319 /*
320 * If the route generation id changed, we need to check that our
321 * local (source) IP address is still valid. If it isn't, either
322 * return error or silently do nothing (assuming the address will
323 * come back before the TCP connection times out).
324 */
2d21ac55 325 rt = tp->t_inpcb->inp_route.ro_rt;
b0d623f7
A
326 if (rt != NULL && (!(rt->rt_flags & RTF_UP) ||
327 rt->generation_id != route_generation)) {
2d21ac55 328 struct ifnet *ifp;
b0d623f7 329 struct in_ifaddr *ia;
2d21ac55
A
330
331 /* disable multipages at the socket */
332 somultipages(so, FALSE);
55e303ae 333
b0d623f7
A
334 /* Disable TSO for the socket until we know more */
335 tp->t_flags &= ~TF_TSO;
336
55e303ae 337 /* check that the source address is still valid */
b0d623f7 338 if ((ia = ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr)) == NULL) {
2d21ac55 339
55e303ae 340 if (tp->t_state >= TCPS_CLOSE_WAIT) {
2d21ac55 341 tcp_drop(tp, EADDRNOTAVAIL);
55e303ae
A
342 return(EADDRNOTAVAIL);
343 }
344
345 /* set Retransmit timer if it wasn't set
346 * reset Persist timer and shift register as the
347 * advertised peer window may not be valid anymore
348 */
349
350 if (!tp->t_timer[TCPT_REXMT]) {
351 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
352 if (tp->t_timer[TCPT_PERSIST]) {
353 tp->t_timer[TCPT_PERSIST] = 0;
354 tp->t_rxtshift = 0;
355 }
356 }
357
2d21ac55
A
358 if (tp->t_pktlist_head != NULL)
359 m_freem_list(tp->t_pktlist_head);
360 TCP_PKTLIST_CLEAR(tp);
361
362 /* drop connection if source address isn't available */
363 if (so->so_flags & SOF_NOADDRAVAIL) {
364 tcp_drop(tp, EADDRNOTAVAIL);
55e303ae 365 return(EADDRNOTAVAIL);
2d21ac55 366 }
55e303ae 367 else
2d21ac55 368 return(0); /* silently ignore, keep data in socket: address may be back */
55e303ae 369 }
b0d623f7 370 ifafree(&ia->ia_ifa);
2d21ac55
A
371
372 /*
373 * Address is still valid; check for multipages capability
374 * again in case the outgoing interface has changed.
375 */
b0d623f7
A
376 RT_LOCK(rt);
377 if ((ifp = rt->rt_ifp) != NULL) {
2d21ac55 378 somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
b0d623f7
A
379 tcp_set_tso(tp, ifp);
380 }
381 if (rt->rt_flags & RTF_UP)
2d21ac55
A
382 rt->generation_id = route_generation;
383 /*
384 * See if we should do MTU discovery. Don't do it if:
385 * 1) it is disabled via the sysctl
386 * 2) the route isn't up
387 * 3) the MTU is locked (if it is, then discovery has been
388 * disabled)
389 */
390
391 if (!path_mtu_discovery || ((rt != NULL) &&
392 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
393 tp->t_flags &= ~TF_PMTUD;
394 else
395 tp->t_flags |= TF_PMTUD;
396
b0d623f7 397 RT_UNLOCK(rt);
55e303ae
A
398 }
399 }
8ad349bb
A
400
401 /*
402 * If we've recently taken a timeout, snd_max will be greater than
403 * snd_nxt. There may be SACK information that allows us to avoid
404 * resending already delivered data. Adjust snd_nxt accordingly.
405 */
406 if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
407 tcp_sack_adjust(tp);
1c79356b
A
408 sendalot = 0;
409 off = tp->snd_nxt - tp->snd_una;
8ad349bb
A
410 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
411
d12e1678 412 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
8ad349bb 413 sendwin = min(sendwin, slowlink_wsize);
1c79356b
A
414
415 flags = tcp_outflags[tp->t_state];
8ad349bb
A
416 /*
417 * Send any SACK-generated retransmissions. If we're explicitly trying
418 * to send out new data (when sendalot is 1), bypass this function.
419 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
420 * we're replacing a (future) new transmission with a retransmission
421 * now, and we previously incremented snd_cwnd in tcp_input().
422 */
423 /*
424 * Still in SACK recovery; reset the rxmit flag to zero.
425 */
426 sack_rxmit = 0;
427 sack_bytes_rxmt = 0;
428 len = 0;
429 p = NULL;
430 if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
431 (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
b0d623f7 432 int32_t cwin;
8ad349bb
A
433
434 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
435 if (cwin < 0)
436 cwin = 0;
437 /* Do not retransmit SACK segments beyond snd_recover */
438 if (SEQ_GT(p->end, tp->snd_recover)) {
439 /*
440 * (At least) part of sack hole extends beyond
441 * snd_recover. Check to see if we can rexmit data
442 * for this hole.
443 */
444 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
445 /*
446 * Can't rexmit any more data for this hole.
447 * That data will be rexmitted in the next
448 * sack recovery episode, when snd_recover
449 * moves past p->rxmit.
450 */
451 p = NULL;
452 goto after_sack_rexmit;
453 } else
454 /* Can rexmit part of the current hole */
b0d623f7 455 len = ((int32_t)min(cwin,
8ad349bb
A
456 tp->snd_recover - p->rxmit));
457 } else
b0d623f7 458 len = ((int32_t)min(cwin, p->end - p->rxmit));
8ad349bb 459 if (len > 0) {
b0d623f7 460 off = p->rxmit - tp->snd_una; /* update off only if we really transmit SACK data */
8ad349bb
A
461 sack_rxmit = 1;
462 sendalot = 1;
463 tcpstat.tcps_sack_rexmits++;
464 tcpstat.tcps_sack_rexmit_bytes +=
465 min(len, tp->t_maxseg);
466 }
b0d623f7
A
467 else
468 len = 0;
8ad349bb
A
469 }
470after_sack_rexmit:
1c79356b
A
471 /*
472 * Get standard flags, and add SYN or FIN if requested by 'hidden'
473 * state flags.
474 */
475 if (tp->t_flags & TF_NEEDFIN)
476 flags |= TH_FIN;
477 if (tp->t_flags & TF_NEEDSYN)
478 flags |= TH_SYN;
479
480 /*
481 * If in persist timeout with window of 0, send 1 byte.
482 * Otherwise, if window is small but nonzero
483 * and timer expired, we will send what we can
484 * and go to transmit state.
485 */
486 if (tp->t_force) {
8ad349bb 487 if (sendwin == 0) {
1c79356b
A
488 /*
489 * If we still have some data to send, then
490 * clear the FIN bit. Usually this would
491 * happen below when it realizes that we
492 * aren't sending all the data. However,
493 * if we have exactly 1 byte of unsent data,
494 * then it won't clear the FIN bit below,
495 * and if we are in persist state, we wind
496 * up sending the packet without recording
497 * that we sent the FIN bit.
498 *
499 * We can't just blindly clear the FIN bit,
500 * because if we don't have any more data
501 * to send then the probe will be the FIN
502 * itself.
503 */
504 if (off < so->so_snd.sb_cc)
505 flags &= ~TH_FIN;
8ad349bb 506 sendwin = 1;
1c79356b
A
507 } else {
508 tp->t_timer[TCPT_PERSIST] = 0;
509 tp->t_rxtshift = 0;
510 }
511 }
512
8ad349bb
A
513 /*
514 * If snd_nxt == snd_max and we have transmitted a FIN, the
515 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
516 * a negative length. This can also occur when TCP opens up
517 * its congestion window while receiving additional duplicate
518 * acks after fast-retransmit because TCP will reset snd_nxt
519 * to snd_max after the fast-retransmit.
520 *
521 * In the normal retransmit-FIN-only case, however, snd_nxt will
522 * be set to snd_una, the offset will be 0, and the length may
523 * wind up 0.
524 *
525 * If sack_rxmit is true we are retransmitting from the scoreboard
526 * in which case len is already set.
527 */
528 if (sack_rxmit == 0) {
529 if (sack_bytes_rxmt == 0)
b0d623f7 530 len = min(so->so_snd.sb_cc, sendwin) - off;
8ad349bb 531 else {
b0d623f7 532 int32_t cwin;
8ad349bb
A
533
534 /*
535 * We are inside of a SACK recovery episode and are
536 * sending new data, having retransmitted all the
537 * data possible in the scoreboard.
538 */
b0d623f7
A
539 len = min(so->so_snd.sb_cc, tp->snd_wnd)
540 - off;
8ad349bb
A
541 /*
542 * Don't remove this (len > 0) check !
543 * We explicitly check for len > 0 here (although it
544 * isn't really necessary), to work around a gcc
545 * optimization issue - to force gcc to compute
546 * len above. Without this check, the computation
547 * of len is bungled by the optimizer.
548 */
549 if (len > 0) {
550 cwin = tp->snd_cwnd -
551 (tp->snd_nxt - tp->sack_newdata) -
552 sack_bytes_rxmt;
553 if (cwin < 0)
554 cwin = 0;
b0d623f7 555 len = imin(len, cwin);
8ad349bb 556 }
b0d623f7
A
557 else
558 len = 0;
8ad349bb 559 }
1c79356b
A
560 }
561
562 /*
563 * Lop off SYN bit if it has already been sent. However, if this
564 * is SYN-SENT state and if segment contains data and if we don't
565 * know that foreign host supports TAO, suppress sending segment.
566 */
567 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
568 flags &= ~TH_SYN;
569 off--, len++;
8ad349bb 570 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
2d21ac55
A
571 while (!(tp->t_flags & TF_SENDINPROG) &&
572 tp->t_pktlist_head != NULL) {
573 packetlist = tp->t_pktlist_head;
574 packchain_listadd = tp->t_lastchain;
575 packchain_sent++;
576 TCP_PKTLIST_CLEAR(tp);
577 tp->t_flags |= TF_SENDINPROG;
578
579 error = tcp_ip_output(so, tp, packetlist,
580 packchain_listadd, tp_inp_options,
b0d623f7 581 (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)));
2d21ac55
A
582
583 tp->t_flags &= ~TF_SENDINPROG;
584 }
585 /* tcp was closed while we were in ip; resume close */
586 if ((tp->t_flags &
587 (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
588 tp->t_flags &= ~TF_CLOSING;
589 (void) tcp_close(tp);
91447636 590 }
2d21ac55
A
591 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
592 0,0,0,0,0);
593 return 0;
1c79356b
A
594 }
595 }
596
597 /*
8ad349bb 598 * Be careful not to send data and/or FIN on SYN segments.
1c79356b
A
599 * This measure is needed to prevent interoperability problems
600 * with not fully conformant TCP implementations.
601 */
8ad349bb 602 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
1c79356b
A
603 len = 0;
604 flags &= ~TH_FIN;
605 }
606
607 if (len < 0) {
608 /*
609 * If FIN has been sent but not acked,
610 * but we haven't been called to retransmit,
8ad349bb 611 * len will be < 0. Otherwise, window shrank
1c79356b
A
612 * after we sent into it. If window shrank to 0,
613 * cancel pending retransmit, pull snd_nxt back
614 * to (closed) window, and set the persist timer
615 * if it isn't already going. If the window didn't
616 * close completely, just wait for an ACK.
617 */
618 len = 0;
8ad349bb 619 if (sendwin == 0) {
1c79356b
A
620 tp->t_timer[TCPT_REXMT] = 0;
621 tp->t_rxtshift = 0;
622 tp->snd_nxt = tp->snd_una;
623 if (tp->t_timer[TCPT_PERSIST] == 0)
624 tcp_setpersist(tp);
625 }
626 }
8ad349bb
A
627
628 /*
b0d623f7
A
629 * Truncate to the maximum segment length or enable TCP Segmentation
630 * Offloading (if supported by hardware) and ensure that FIN is removed
631 * if the length no longer contains the last data byte.
632 *
633 * TSO may only be used if we are in a pure bulk sending state. The
634 * presence of TCP-MD5, SACK retransmits, SACK advertisements, ipfw rules
635 * and IP options prevent using TSO. With TSO the TCP header is the same
636 * (except for the sequence number) for all generated packets. This
637 * makes it impossible to transmit any options which vary per generated
638 * segment or packet.
639 *
640 * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
641 * removal of FIN (if not already caught here) are handled later, after
642 * the exact length of the TCP options is known.
643 */
644#if IPSEC
645 /*
646 * Pre-calculate here as we save another lookup into the darknesses
647 * of IPsec that way and can actually decide if TSO is ok.
8ad349bb 648 */
b0d623f7
A
649 if (ipsec_bypass == 0)
650 ipsec_optlen = ipsec_hdrsiz_tcp(tp);
651#endif
652
1c79356b 653 if (len > tp->t_maxseg) {
b0d623f7
A
654 if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
655#if RANDOM_IP_ID
656 ip_use_randomid &&
657#endif /* RANDOM_IP_ID */
658 kipf_count == 0 && dlil_filter_count == 0 &&
659 tp->rcv_numsacks == 0 && sack_rxmit == 0 && sack_bytes_rxmt == 0 &&
660 tp->t_inpcb->inp_options == NULL &&
661 tp->t_inpcb->in6p_options == NULL
662#if IPSEC
663 && ipsec_optlen == 0
664#endif
665#if IPFIREWALL
666 && (fw_enable == 0 || fw_bypass)
667#endif
668 ) {
669 tso = 1;
670 sendalot = 0;
671 } else {
672 len = tp->t_maxseg;
673 sendalot = 1;
674 tso = 0;
675 }
1c79356b 676 }
8ad349bb
A
677 if (sack_rxmit) {
678 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
679 flags &= ~TH_FIN;
680 } else {
681 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
682 flags &= ~TH_FIN;
683 }
1c79356b 684
2d21ac55 685 recwin = tcp_sbspace(tp);
1c79356b
A
686
687 /*
8ad349bb
A
688 * Sender silly window avoidance. We transmit under the following
689 * conditions when len is non-zero:
690 *
b0d623f7 691 * - We have a full segment (or more with TSO)
8ad349bb
A
692 * - This is the last buffer in a write()/send() and we are
693 * either idle or running NODELAY
694 * - we've timed out (e.g. persist timer)
695 * - we have more than 1/2 the maximum send window's worth of
696 * data (the receiver may be limited by the window size)
697 * - we need to retransmit
1c79356b
A
698 */
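	/*
	 * Worked example of the checks below (segment sizes assumed for
	 * illustration): with t_maxseg == 1460 and 512 bytes left in
	 * so_snd, that lone 512-byte chunk is sent only if it completes
	 * the pending write while the connection is idle or TF_NODELAY
	 * is set, if t_force is set (persist probe), if it is at least
	 * half of max_sndwnd, or if it is a retransmission
	 * (snd_nxt < snd_max); otherwise we wait for more data or an ACK
	 * rather than emit a silly, small segment.
	 */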
699 if (len) {
b0d623f7 700 if (len >= tp->t_maxseg) {
2d21ac55 701 tp->t_flags |= TF_MAXSEGSNT;
1c79356b 702 goto send;
2d21ac55 703 }
1c79356b 704 if (!(tp->t_flags & TF_MORETOCOME) &&
2d21ac55 705 (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) &&
1c79356b 706 (tp->t_flags & TF_NOPUSH) == 0 &&
2d21ac55
A
707 len + off >= so->so_snd.sb_cc) {
708 tp->t_flags &= ~TF_MAXSEGSNT;
1c79356b 709 goto send;
2d21ac55
A
710 }
711 if (tp->t_force) {
712 tp->t_flags &= ~TF_MAXSEGSNT;
1c79356b 713 goto send;
2d21ac55
A
714 }
715 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
716 tp->t_flags &= ~TF_MAXSEGSNT;
1c79356b 717 goto send;
2d21ac55
A
718 }
719 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
720 tp->t_flags &= ~TF_MAXSEGSNT;
8ad349bb 721 goto send;
2d21ac55 722 }
8ad349bb 723 if (sack_rxmit)
1c79356b
A
724 goto send;
725 }
726
727 /*
728 * Compare available window to amount of window
729 * known to peer (as advertised window less
730 * next expected input). If the difference is at least two
731 * max size segments, or at least 50% of the maximum possible
732 * window, then we want to send a window update to the peer.
8ad349bb 733 * Skip this if the connection is in T/TCP half-open state.
1c79356b 734 */
8ad349bb 735 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
1c79356b
A
736 /*
737 * "adv" is the amount we can increase the window,
738 * taking into account that we are limited by
739 * TCP_MAXWIN << tp->rcv_scale.
740 */
b0d623f7 741 int32_t adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
1c79356b
A
742 (tp->rcv_adv - tp->rcv_nxt);
743
b0d623f7 744 if (adv >= (int32_t) (2 * tp->t_maxseg)) {
4a3eedf9
A
745
746 /*
747 * Update only if the resulting scaled value of the window changed, or
748 * if there is a change in the sequence since the last ack.
749 * This avoids what appears as dupe ACKS (see rdar://5640997)
750 */
751
752 if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin))
753 goto send;
754 }
b0d623f7 755 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
4a3eedf9 756 goto send;
1c79356b
A
757 }
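	/*
	 * Worked example for the window-update test above (numbers are
	 * assumed): if 8192 bytes of the previous offer are still
	 * outstanding (rcv_adv - rcv_nxt == 8192) and tcp_sbspace() now
	 * reports recwin == 32768, then adv == 32768 - 8192 == 24576.
	 * With t_maxseg == 1460 that is >= 2 * 1460, so an ACK carrying
	 * the larger window goes out, subject to the dupe-ACK check above.
	 */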
758
759 /*
8ad349bb
A
760 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
761 * is also a catch-all for the retransmit timer timeout case.
1c79356b
A
762 */
763 if (tp->t_flags & TF_ACKNOW)
764 goto send;
765 if ((flags & TH_RST) ||
766 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
767 goto send;
768 if (SEQ_GT(tp->snd_up, tp->snd_una))
769 goto send;
770 /*
771 * If our state indicates that FIN should be sent
8ad349bb 772 * and we have not yet done so, then we need to send.
1c79356b
A
773 */
774 if (flags & TH_FIN &&
775 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
776 goto send;
8ad349bb
A
777 /*
778 * In SACK, it is possible for tcp_output to fail to send a segment
779 * after the retransmission timer has been turned off. Make sure
780 * that the retransmission timer is set.
781 */
cf7d32b8 782 if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) &&
8ad349bb
A
783 tp->t_timer[TCPT_REXMT] == 0 &&
784 tp->t_timer[TCPT_PERSIST] == 0) {
785 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
786 goto just_return;
787 }
1c79356b
A
788 /*
789 * TCP window updates are not reliable, rather a polling protocol
790 * using ``persist'' packets is used to ensure receipt of window
791 * updates. The three ``states'' for the output side are:
792 * idle not doing retransmits or persists
793 * persisting to move a small or zero window
794 * (re)transmitting and thereby not persisting
795 *
796 * tp->t_timer[TCPT_PERSIST]
797 * is set when we are in persist state.
798 * tp->t_force
799 * is set when we are called to send a persist packet.
800 * tp->t_timer[TCPT_REXMT]
801 * is set when we are retransmitting
802 * The output side is idle when both timers are zero.
803 *
804 * If send window is too small, there is data to transmit, and no
805 * retransmit or persist is pending, then go to persist state.
806 * If nothing happens soon, send when timer expires:
807 * if window is nonzero, transmit what we can,
808 * otherwise force out a byte.
809 */
810 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
811 tp->t_timer[TCPT_PERSIST] == 0) {
812 tp->t_rxtshift = 0;
813 tcp_setpersist(tp);
814 }
8ad349bb 815just_return:
1c79356b 816 /*
91447636
A
817 * If there is no reason to send a segment, just return.
818 * but if there are any packets left in the packet list, send them now.
1c79356b 819 */
2d21ac55
A
820 while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) {
821 packetlist = tp->t_pktlist_head;
822 packchain_listadd = tp->t_lastchain;
823 packchain_sent++;
824 TCP_PKTLIST_CLEAR(tp);
825 tp->t_flags |= TF_SENDINPROG;
826
827 error = tcp_ip_output(so, tp, packetlist, packchain_listadd,
b0d623f7 828 tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)));
2d21ac55
A
829
830 tp->t_flags &= ~TF_SENDINPROG;
831 }
832 /* tcp was closed while we were in ip; resume close */
833 if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
834 tp->t_flags &= ~TF_CLOSING;
835 (void) tcp_close(tp);
91447636 836 }
1c79356b
A
837 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
838 return (0);
839
840send:
841 /*
842 * Before ESTABLISHED, force sending of initial options
843 * unless TCP set not to do any options.
844 * NOTE: we assume that the IP/TCP header plus TCP options
845 * always fit in a single mbuf, leaving room for a maximum
846 * link header, i.e.
9bccf70c 847 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
1c79356b
A
848 */
849 optlen = 0;
850#if INET6
851 if (isipv6)
9bccf70c 852 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
1c79356b
A
853 else
854#endif
855 hdrlen = sizeof (struct tcpiphdr);
856 if (flags & TH_SYN) {
857 tp->snd_nxt = tp->iss;
858 if ((tp->t_flags & TF_NOOPT) == 0) {
859 u_short mss;
860
861 opt[0] = TCPOPT_MAXSEG;
862 opt[1] = TCPOLEN_MAXSEG;
9bccf70c 863 mss = htons((u_short) tcp_mssopt(tp));
1c79356b
A
864 (void)memcpy(opt + 2, &mss, sizeof(mss));
865 optlen = TCPOLEN_MAXSEG;
866
867 if ((tp->t_flags & TF_REQ_SCALE) &&
868 ((flags & TH_ACK) == 0 ||
869 (tp->t_flags & TF_RCVD_SCALE))) {
870 *((u_int32_t *)(opt + optlen)) = htonl(
871 TCPOPT_NOP << 24 |
872 TCPOPT_WINDOW << 16 |
873 TCPOLEN_WINDOW << 8 |
874 tp->request_r_scale);
875 optlen += 4;
876 }
877 }
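		/*
		 * Illustrative layout of the SYN options built above (the
		 * MSS and scale values are assumed): the MSS option is
		 * kind 2, length 4, followed by the 16-bit MSS, and the
		 * window-scale word packs NOP(1), kind 3, length 3 and
		 * the shift count.  For mss == 1460 and
		 * request_r_scale == 5 the bytes are:
		 *
		 *   02 04 05 b4   MSS option (0x05b4 == 1460)
		 *   01 03 03 05   NOP + window scale, shift == 5
		 *
		 * i.e. the second word is htonl((TCPOPT_NOP << 24) |
		 * (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | 5).
		 */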
2d21ac55 878
1c79356b 879 }
2d21ac55
A
880
881 /*
882 RFC 3168 states that:
883 - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
884 to handle the TCP ECE flag, even if you also later send a
885 non-ECN-setup SYN/SYN-ACK.
886 - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
887 the ip ECT flag.
888
889 It is not clear how the ECE flag would ever be set if you never
890 set the IP ECT flag on outbound packets. All the same, we use
891 the TE_SETUPSENT to indicate that we have committed to handling
892 the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
893 whether or not we should set the IP ECT flag on outbound packets.
894 */
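	/*
	 * Summary of the RFC 3168 handshake implemented below:
	 *   SYN      : ECE|CWR set          (ECN-setup SYN)
	 *   SYN-ACK  : ECE set              (ECN-setup SYN-ACK)
	 *   data     : IP TOS carries ECT(0); CWR is set on the first
	 *              new data segment after a window reduction
	 *   receiver : echoes ECE until it sees CWR
	 */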
895 /*
896 * For a SYN-ACK, send an ECN setup SYN-ACK
897 */
898 if (tcp_ecn_inbound && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
899 if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
900 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
901 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
902 flags |= TH_ECE;
903
904 /*
905 * Record that we sent the ECN-setup and default to
906 * setting IP ECT.
907 */
908 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
909 }
910 else {
911 /*
912 * We sent an ECN-setup SYN-ACK but it was dropped.
913 * Fall back to a non-ECN-setup SYN-ACK and clear the flag
914 * to indicate we should not send data with IP ECT set.
915 *
916 * Pretend we didn't receive an ECN-setup SYN.
917 */
918 tp->ecn_flags &= ~TE_SETUPRECEIVED;
919 }
920 }
921 }
922 else if (tcp_ecn_outbound && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
923 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
924 /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
925 flags |= (TH_ECE | TH_CWR);
926
927 /*
928 * Record that we sent the ECN-setup and default to
929 * setting IP ECT.
930 */
931 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
932 }
933 else {
934 /*
935 * We sent an ECN-setup SYN but it was dropped.
936 * Fall back to no ECN and clear flag indicating
937 * we should send data with IP ECT set.
938 */
939 tp->ecn_flags &= ~TE_SENDIPECT;
940 }
941 }
942
943 /*
944 * Check if we should set the TCP CWR flag.
945 * CWR flag is sent when we reduced the congestion window because
946 * we received a TCP ECE or we performed a fast retransmit. We
947 * never set the CWR flag on retransmitted packets. We only set
948 * the CWR flag on data packets. Pure acks don't have this set.
949 */
950 if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
951 !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
952 flags |= TH_CWR;
953 tp->ecn_flags &= ~TE_SENDCWR;
954 }
955
956 /*
957 * Check if we should set the TCP ECE flag.
958 */
959 if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
960 flags |= TH_ECE;
961 }
1c79356b
A
962
963 /*
964 * Send a timestamp and echo-reply if this is a SYN and our side
965 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
966 * and our peer have sent timestamps in our SYN's.
967 */
968 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
969 (flags & TH_RST) == 0 &&
970 ((flags & TH_ACK) == 0 ||
971 (tp->t_flags & TF_RCVD_TSTMP))) {
972 u_int32_t *lp = (u_int32_t *)(opt + optlen);
973
974 /* Form timestamp option as shown in appendix A of RFC 1323. */
975 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
976 *lp++ = htonl(tcp_now);
977 *lp = htonl(tp->ts_recent);
978 optlen += TCPOLEN_TSTAMP_APPA;
979 }
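	/*
	 * Resulting 12-byte timestamp option (appendix A of RFC 1323),
	 * shown for illustration; the TSval/TSecr values are whatever
	 * tcp_now and ts_recent hold:
	 *
	 *   01 01 08 0a  TSval[4]  TSecr[4]
	 *
	 * i.e. two NOPs for alignment, kind 8, length 10, then the two
	 * 32-bit timestamps; optlen grows by TCPOLEN_TSTAMP_APPA (12).
	 */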
980
8ad349bb
A
981 if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) {
982 /*
983 * Tack on the SACK permitted option *last*.
984 * And do padding of options after tacking this on.
985 * This is because if MSS, TS, WinScale and Signatures are
986 * all present, we have just 2 bytes left for the SACK
987 * permitted option, which is just enough.
1c79356b 988 */
1c79356b 989 /*
8ad349bb
A
990 * If this is the first SYN of connection (not a SYN
991 * ACK), include SACK permitted option. If this is a
992 * SYN ACK, include SACK permitted option if peer has
993 * already done so. This is only for active connect,
994 * since the syncache takes care of the passive connect.
1c79356b 995 */
8ad349bb
A
996 if ((flags & TH_SYN) &&
997 (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
998 u_char *bp;
999 bp = (u_char *)opt + optlen;
1000
1001 *bp++ = TCPOPT_SACK_PERMITTED;
1002 *bp++ = TCPOLEN_SACK_PERMITTED;
1003 optlen += TCPOLEN_SACK_PERMITTED;
1004 }
1c79356b
A
1005
1006 /*
8ad349bb
A
1007 * Send SACKs if necessary. This should be the last
1008 * option processed. Only as many SACKs are sent as
1009 * are permitted by the maximum options size.
1010 *
1011 * In general, SACK blocks consume 8*n+2 bytes.
1012 * So a full size SACK blocks option is 34 bytes
1013 * (to generate 4 SACK blocks). At a minimum,
1014 * we need 10 bytes (to generate 1 SACK block).
1015 * If TCP Timestamps (12 bytes) and TCP Signatures
1016 * (18 bytes) are both present, we'll just have
1017 * 10 bytes for SACK options 40 - (12 + 18).
1c79356b 1018 */
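		/*
		 * Worked sizing example (option mix assumed): with only
		 * the 12-byte timestamp option ahead of us, optlen == 12,
		 * so up to (40 - 12 - 2) / 8 == 3 SACK blocks fit,
		 * sackoptlen == 2 + 3 * 8 == 26, and
		 * padlen == (40 - 12 - 26) % 4 == 2 NOP bytes are emitted
		 * first so the blocks start on a 4-byte boundary.
		 */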
8ad349bb
A
1019 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1020 (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
1021 MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
1022 int nsack, sackoptlen, padlen;
1023 u_char *bp = (u_char *)opt + optlen;
1024 u_int32_t *lp;
1c79356b 1025
8ad349bb
A
1026 nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
1027 nsack = min(nsack, tp->rcv_numsacks);
1028 sackoptlen = (2 + nsack * TCPOLEN_SACK);
1029
1030 /*
1031 * First we need to pad options so that the
1032 * SACK blocks can start at a 4-byte boundary
1033 * (sack option and length are at a 2 byte offset).
1034 */
1035 padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
1036 optlen += padlen;
1037 while (padlen-- > 0)
1038 *bp++ = TCPOPT_NOP;
1039
1040 tcpstat.tcps_sack_send_blocks++;
1041 *bp++ = TCPOPT_SACK;
1042 *bp++ = sackoptlen;
1043 lp = (u_int32_t *)bp;
1044 for (i = 0; i < nsack; i++) {
1045 struct sackblk sack = tp->sackblks[i];
1046 *lp++ = htonl(sack.start);
1047 *lp++ = htonl(sack.end);
1c79356b 1048 }
8ad349bb 1049 optlen += sackoptlen;
1c79356b 1050 }
8ad349bb
A
1051 }
1052
1053 /* Pad TCP options to a 4 byte boundary */
1054 if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
1055 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
1056 u_char *bp = (u_char *)opt + optlen;
1057
1058 optlen += pad;
1059 while (pad) {
1060 *bp++ = TCPOPT_EOL;
1061 pad--;
1062 }
1063 }
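	/*
	 * Padding example (optlen value assumed): if optlen ended up at
	 * 19, pad == 4 - (19 % 4) == 1, so one TCPOPT_EOL byte brings it
	 * to 20 and the header length later encoded in th_off is
	 * (sizeof (struct tcphdr) + 20) >> 2 == 10 32-bit words.
	 */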
1064
1065 hdrlen += optlen;
1c79356b 1066
1c79356b
A
1067#if INET6
1068 if (isipv6)
1069 ipoptlen = ip6_optlen(tp->t_inpcb);
1070 else
1071#endif
9bccf70c 1072 {
2d21ac55
A
1073 if (tp_inp_options) {
1074 ipoptlen = tp_inp_options->m_len -
1c79356b 1075 offsetof(struct ipoption, ipopt_list);
8ad349bb 1076 } else
9bccf70c 1077 ipoptlen = 0;
1c79356b
A
1078 }
1079#if IPSEC
b0d623f7 1080 ipoptlen += ipsec_optlen;
1c79356b
A
1081#endif
1082
1083 /*
1084 * Adjust data length if insertion of options will
1085 * bump the packet length beyond the t_maxopd length.
1086 * Clear the FIN bit because we cut off the tail of
1087 * the segment.
b0d623f7
A
1088 *
1089 * When doing TSO limit a burst to TCP_MAXWIN minus the
1090 * IP, TCP and Options length to keep ip->ip_len from
1091 * overflowing. Prevent the last segment from being
1092 * fractional thus making them all equal sized and set
1093 * the flag to continue sending. TSO is disabled when
1094 * IP options or IPSEC are present.
1c79356b
A
1095 */
1096 if (len + optlen + ipoptlen > tp->t_maxopd) {
1097 /*
1098 * If there is still more to send, don't close the connection.
1099 */
1100 flags &= ~TH_FIN;
b0d623f7
A
1101 if (tso) {
1102 int32_t tso_maxlen;
1103
1104 tso_maxlen = tp->tso_max_segment_size ? tp->tso_max_segment_size : TCP_MAXWIN;
1105
1106 if (len > tso_maxlen - hdrlen - optlen) {
1107 len = tso_maxlen - hdrlen - optlen;
1108 len = len - (len % (tp->t_maxopd - optlen));
1109 sendalot = 1;
1110 } else if (tp->t_flags & TF_NEEDFIN)
1111 sendalot = 1;
1112 } else {
1113 len = tp->t_maxopd - optlen - ipoptlen;
1114 sendalot = 1;
1115 }
1c79356b
A
1116 }
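	/*
	 * Worked example of the TSO clamp above (sizes assumed): with
	 * tso_max_segment_size unset (tso_maxlen == TCP_MAXWIN == 65535),
	 * hdrlen == 52 (IPv4 + TCP headers plus 12 bytes of timestamp
	 * options), optlen == 12 and t_maxopd == 1460, the burst is first
	 * limited to 65535 - 52 - 12 == 65471 bytes and then rounded down
	 * to a multiple of 1460 - 12 == 1448, i.e. 45 full segments ==
	 * 65160 bytes, with sendalot set so the remainder goes out on the
	 * next pass.
	 */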
1117
1118/*#ifdef DIAGNOSTIC*/
9bccf70c
A
1119#if INET6
1120 if (max_linkhdr + hdrlen > MCLBYTES)
1121 panic("tcphdr too big");
1122#else
1c79356b
A
1123 if (max_linkhdr + hdrlen > MHLEN)
1124 panic("tcphdr too big");
9bccf70c 1125#endif
1c79356b
A
1126/*#endif*/
1127
1128 /*
1129 * Grab a header mbuf, attaching a copy of data to
1130 * be transmitted, and initialize the header from
1131 * the template for sends on this connection.
1132 */
1133 if (len) {
1134 if (tp->t_force && len == 1)
1135 tcpstat.tcps_sndprobe++;
b0d623f7 1136 else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
1c79356b
A
1137 tcpstat.tcps_sndrexmitpack++;
1138 tcpstat.tcps_sndrexmitbyte += len;
1139 } else {
1140 tcpstat.tcps_sndpack++;
1141 tcpstat.tcps_sndbyte += len;
1142 }
1143#ifdef notyet
1144 if ((m = m_copypack(so->so_snd.sb_mb, off,
1145 (int)len, max_linkhdr + hdrlen)) == 0) {
1146 error = ENOBUFS;
1147 goto out;
1148 }
1149 /*
1150 * m_copypack left space for our hdr; use it.
1151 */
1152 m->m_len += hdrlen;
1153 m->m_data -= hdrlen;
1154#else
9bccf70c
A
1155 /*
1156 * try to use the new interface that allocates all
1157 * the necessary mbuf hdrs under 1 mbuf lock and
1158 * avoids rescanning the socket mbuf list if
1159 * certain conditions are met. This routine can't
1160 * be used in the following cases...
1161 * 1) the protocol headers exceed the capacity of
1162 * of a single mbuf header's data area (no cluster attached)
1163 * 2) the length of the data being transmitted plus
1164 * the protocol headers fits into a single mbuf header's
1165 * data area (no cluster attached)
1166 */
fa4905b1 1167 m = NULL;
1c79356b
A
1168#if INET6
1169 if (MHLEN < hdrlen + max_linkhdr) {
2d21ac55 1170 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
fa4905b1
A
1171 if (m == NULL) {
1172 error = ENOBUFS;
1173 goto out;
1174 }
1c79356b
A
1175 MCLGET(m, M_DONTWAIT);
1176 if ((m->m_flags & M_EXT) == 0) {
1177 m_freem(m);
1178 error = ENOBUFS;
1179 goto out;
1180 }
fa4905b1
A
1181 m->m_data += max_linkhdr;
1182 m->m_len = hdrlen;
1c79356b
A
1183 }
1184#endif
1c79356b 1185 if (len <= MHLEN - hdrlen - max_linkhdr) {
fa4905b1 1186 if (m == NULL) {
2d21ac55 1187 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
fa4905b1
A
1188 if (m == NULL) {
1189 error = ENOBUFS;
1190 goto out;
1191 }
1192 m->m_data += max_linkhdr;
1193 m->m_len = hdrlen;
1194 }
55e303ae 1195 /* make sure we still have data left to be sent at this point */
b0d623f7 1196 if (so->so_snd.sb_mb == NULL || off < 0) {
55e303ae
A
1197 if (m != NULL) m_freem(m);
1198 error = 0; /* should we return an error? */
1199 goto out;
1200 }
1c79356b
A
1201 m_copydata(so->so_snd.sb_mb, off, (int) len,
1202 mtod(m, caddr_t) + hdrlen);
1203 m->m_len += len;
1204 } else {
fa4905b1
A
1205 if (m != NULL) {
1206 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
1207 if (m->m_next == 0) {
1208 (void) m_free(m);
1209 error = ENOBUFS;
1210 goto out;
1211 }
1212 } else {
9bccf70c
A
1213 /*
1214 * determine whether the mbuf pointer and offset passed back by the 'last' call
1215 * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1216 * changed (due to an incoming ACK for instance), or the offset into the chain we
1217 * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1218 * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1219 * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1220 * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list.
1221 * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1222 */
b0d623f7 1223 if (m_head != so->so_snd.sb_mb || sack_rxmit || last_off != off)
fa4905b1
A
1224 m_last = NULL;
1225 last_off = off + len;
1226 m_head = so->so_snd.sb_mb;
55e303ae
A
1227
1228 /* make sure we still have data left to be sent at this point */
1229 if (m_head == NULL) {
1230 error = 0; /* should we return an error? */
1231 goto out;
1232 }
1233
9bccf70c
A
1234 /*
1235 * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1236 * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1237 */
b0d623f7 1238 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_last, &m_off)) == NULL) {
fa4905b1
A
1239 error = ENOBUFS;
1240 goto out;
1241 }
1242 m->m_data += max_linkhdr;
1243 m->m_len = hdrlen;
1c79356b
A
1244 }
1245 }
1246#endif
1247 /*
1248 * If we're sending everything we've got, set PUSH.
1249 * (This will keep happy those implementations which only
1250 * give data to the user when a buffer fills or
1251 * a PUSH comes in.)
1252 */
1253 if (off + len == so->so_snd.sb_cc)
1254 flags |= TH_PUSH;
1255 } else {
1256 if (tp->t_flags & TF_ACKNOW)
1257 tcpstat.tcps_sndacks++;
1258 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1259 tcpstat.tcps_sndctrl++;
1260 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1261 tcpstat.tcps_sndurg++;
1262 else
1263 tcpstat.tcps_sndwinup++;
1264
2d21ac55 1265 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
1c79356b
A
1266 if (m == NULL) {
1267 error = ENOBUFS;
1268 goto out;
1269 }
1270#if INET6
9bccf70c
A
1271 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1272 MHLEN >= hdrlen) {
1c79356b
A
1273 MH_ALIGN(m, hdrlen);
1274 } else
1275#endif
1276 m->m_data += max_linkhdr;
1277 m->m_len = hdrlen;
1278 }
91447636 1279 m->m_pkthdr.rcvif = 0;
2d21ac55
A
1280#if CONFIG_MACF_NET
1281 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
1282#endif
1c79356b
A
1283#if INET6
1284 if (isipv6) {
1285 ip6 = mtod(m, struct ip6_hdr *);
1286 th = (struct tcphdr *)(ip6 + 1);
9bccf70c
A
1287 tcp_fillheaders(tp, ip6, th);
1288 } else
1c79356b 1289#endif /* INET6 */
9bccf70c
A
1290 {
1291 ip = mtod(m, struct ip *);
1292 ipov = (struct ipovly *)ip;
1293 th = (struct tcphdr *)(ip + 1);
1294 /* this picks up the pseudo header (w/o the length) */
1295 tcp_fillheaders(tp, ip, th);
2d21ac55
A
1296 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1297 !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1298 ip->ip_tos = IPTOS_ECN_ECT0;
1299 }
1c79356b 1300 }
1c79356b
A
1301
1302 /*
1303 * Fill in fields, remembering maximum advertised
1304 * window for use in delaying messages about window sizes.
1305 * If resending a FIN, be sure not to use a new sequence number.
1306 */
1307 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1308 tp->snd_nxt == tp->snd_max)
1309 tp->snd_nxt--;
1310 /*
1311 * If we are doing retransmissions, then snd_nxt will
1312 * not reflect the first unsent octet. For ACK only
1313 * packets, we do not want the sequence number of the
1314 * retransmitted packet, we want the sequence number
1315 * of the next unsent octet. So, if there is no data
1316 * (and no SYN or FIN), use snd_max instead of snd_nxt
1317 * when filling in ti_seq. But if we are in persist
1318 * state, snd_max might reflect one byte beyond the
1319 * right edge of the window, so use snd_nxt in that
1320 * case, since we know we aren't doing a retransmission.
1321 * (retransmit and persist are mutually exclusive...)
1322 */
8ad349bb
A
1323 if (sack_rxmit == 0) {
1324 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
1325 th->th_seq = htonl(tp->snd_nxt);
1326 else
1327 th->th_seq = htonl(tp->snd_max);
1328 } else {
1329 th->th_seq = htonl(p->rxmit);
1330 p->rxmit += len;
1331 tp->sackhint.sack_bytes_rexmit += len;
1332 }
1c79356b 1333 th->th_ack = htonl(tp->rcv_nxt);
4a3eedf9
A
1334 tp->last_ack_sent = tp->rcv_nxt;
1335
1c79356b
A
1336 if (optlen) {
1337 bcopy(opt, th + 1, optlen);
1338 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1339 }
1340 th->th_flags = flags;
1341 /*
1342 * Calculate receive window. Don't shrink window,
1343 * but avoid silly window syndrome.
1344 */
b0d623f7 1345 if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && recwin < (int)tp->t_maxseg)
8ad349bb 1346 recwin = 0;
b0d623f7
A
1347 if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
1348 recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
d12e1678 1349 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
b0d623f7 1350 if (recwin > (int32_t)slowlink_wsize)
8ad349bb 1351 recwin = slowlink_wsize;
b0d623f7 1352 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
d12e1678
A
1353 }
1354 else {
b0d623f7
A
1355 if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
1356 recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
8ad349bb 1357 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
d12e1678 1358 }
91447636 1359
8ad349bb
A
1360 /*
1361 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1362 * a 0 window. This may cause the remote transmitter to stall. This
1363 * flag tells soreceive() to disable delayed acknowledgements when
1364 * draining the buffer. This can occur if the receiver is attempting
1365 * to read more data than can be buffered prior to transmitting on
1366 * the connection.
1367 */
1368 if (recwin == 0)
1369 tp->t_flags |= TF_RXWIN0SENT;
1370 else
1371 tp->t_flags &= ~TF_RXWIN0SENT;
1c79356b
A
1372 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1373 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1374 th->th_flags |= TH_URG;
1375 } else
1376 /*
1377 * If no urgent pointer to send, then we pull
1378 * the urgent pointer to the left edge of the send window
1379 * so that it doesn't drift into the send window on sequence
1380 * number wraparound.
1381 */
1382 tp->snd_up = tp->snd_una; /* drag it along */
1383
1384 /*
1385 * Put TCP length in extended header, and then
1386 * checksum extended header and data.
1387 */
9bccf70c 1388 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1c79356b 1389#if INET6
9bccf70c
A
1390 if (isipv6)
1391 /*
1392 * ip6_plen does not need to be filled in now; it will be filled
1393 * in ip6_output.
1394 */
1c79356b
A
1395 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1396 sizeof(struct tcphdr) + optlen + len);
9bccf70c 1397 else
1c79356b 1398#endif /* INET6 */
0b4e3aa0 1399 {
9bccf70c
A
1400 m->m_pkthdr.csum_flags = CSUM_TCP;
1401 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1402 if (len + optlen)
1403 th->th_sum = in_addword(th->th_sum,
1404 htons((u_short)(optlen + len)));
2d21ac55 1405 }
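	/*
	 * Note on the IPv4 offload path above: tcp_fillheaders() already
	 * seeded th_sum with the pseudo-header sum (without the variable
	 * length portion, per the comment where the header is filled in),
	 * so the length of the TCP options plus payload is folded in here
	 * with in_addword().  The remaining one's-complement sum over the
	 * TCP header and data is left to the hardware (CSUM_TCP) or to
	 * the software fallback in the IP output path.
	 */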
1c79356b 1406
b0d623f7
A
1407 /*
1408 * Enable TSO and specify the size of the segments.
1409 * The TCP pseudo header checksum is always provided.
1410 * XXX: Fixme: This is currently not the case for IPv6.
1411 */
1412 if (tso) {
1413#if INET6
1414 if (isipv6)
1415 m->m_pkthdr.csum_flags = CSUM_TSO_IPV6;
1416 else
1417#endif /* INET6 */
1418 m->m_pkthdr.csum_flags = CSUM_TSO_IPV4;
1419
1420 m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
1421 }
1422 else
1423 m->m_pkthdr.tso_segsz = 0;
1424
1c79356b
A
1425 /*
1426 * In transmit state, time the transmission and arrange for
1427 * the retransmit. In persist state, just set snd_max.
1428 */
1429 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1430 tcp_seq startseq = tp->snd_nxt;
1431
1432 /*
1433 * Advance snd_nxt over sequence space of this segment.
1434 */
1435 if (flags & (TH_SYN|TH_FIN)) {
1436 if (flags & TH_SYN)
1437 tp->snd_nxt++;
1438 if (flags & TH_FIN) {
1439 tp->snd_nxt++;
1440 tp->t_flags |= TF_SENTFIN;
1441 }
1442 }
8ad349bb
A
1443 if (sack_rxmit)
1444 goto timer;
1c79356b
A
1445 tp->snd_nxt += len;
1446 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1447 tp->snd_max = tp->snd_nxt;
1448 /*
1449 * Time this transmission if not a retransmission and
1450 * not currently timing anything.
1451 */
9bccf70c
A
1452 if (tp->t_rtttime == 0) {
1453 tp->t_rtttime = 1;
1c79356b
A
1454 tp->t_rtseq = startseq;
1455 tcpstat.tcps_segstimed++;
1456 }
1457 }
1458
1459 /*
1460 * Set retransmit timer if not currently set,
1461 * and not doing an ack or a keep-alive probe.
1462 * Initial value for retransmit timer is smoothed
1463 * round-trip time + 2 * round-trip time variance.
1464 * Initialize shift counter which is used for backoff
1465 * of retransmit time.
1466 */
8ad349bb 1467timer:
1c79356b 1468 if (tp->t_timer[TCPT_REXMT] == 0 &&
8ad349bb
A
1469 ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1470 tp->snd_nxt != tp->snd_una)) {
1c79356b
A
1471 if (tp->t_timer[TCPT_PERSIST]) {
1472 tp->t_timer[TCPT_PERSIST] = 0;
1473 tp->t_rxtshift = 0;
1474 }
8ad349bb 1475 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1c79356b 1476 }
8ad349bb
A
1477 } else {
1478 /*
1479 * Persist case, update snd_max but since we are in
1480 * persist mode (no window) we do not update snd_nxt.
1481 */
1482 int xlen = len;
1483 if (flags & TH_SYN)
1484 ++xlen;
1485 if (flags & TH_FIN) {
1486 ++xlen;
1487 tp->t_flags |= TF_SENTFIN;
1488 }
1489 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1c79356b 1490 tp->snd_max = tp->snd_nxt + len;
8ad349bb 1491 }
1c79356b
A
1492
1493#if TCPDEBUG
1494 /*
1495 * Trace.
1496 */
2d21ac55 1497 if (so_options & SO_DEBUG)
9bccf70c
A
1498 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1499#endif
1c79356b
A
1500
1501 /*
1502 * Fill in IP length and desired time to live and
1503 * send to IP level. There should be a better way
1504 * to handle ttl and tos; we could keep them in
1505 * the template, but need a way to checksum without them.
1506 */
9bccf70c
A
1507 /*
1508 * m->m_pkthdr.len should have been set before the cksum calculation,
1509 * because in6_cksum() needs it.
1510 */
1c79356b
A
1511#if INET6
1512 if (isipv6) {
9bccf70c 1513 /*
1c79356b
A
1514 * we separately set hoplimit for every segment, since the
1515 * user might want to change the value via setsockopt.
1516 * Also, desired default hop limit might be changed via
1517 * Neighbor Discovery.
1518 */
1519 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
1520 tp->t_inpcb->in6p_route.ro_rt ?
1521 tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1522 : NULL);
1523
1524 /* TODO: IPv6 IP6TOS_ECT bit on */
1525#if IPSEC
9bccf70c
A
1526 if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
1527 m_freem(m);
1528 error = ENOBUFS;
1529 goto out;
1530 }
1c79356b 1531#endif /*IPSEC*/
91447636 1532 m->m_pkthdr.socket_id = socket_id;
1c79356b 1533 error = ip6_output(m,
2d21ac55 1534 inp6_pktopts,
1c79356b 1535 &tp->t_inpcb->in6p_route,
2d21ac55 1536 (so_options & SO_DONTROUTE), NULL, NULL, 0);
1c79356b
A
1537 } else
1538#endif /* INET6 */
9bccf70c 1539 {
1c79356b
A
1540 ip->ip_len = m->m_pkthdr.len;
1541#if INET6
55e303ae 1542 if (isipv6)
9bccf70c
A
1543 ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
1544 tp->t_inpcb->in6p_route.ro_rt ?
1545 tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1546 : NULL);
1547 else
1c79356b
A
1548#endif /* INET6 */
1549 ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */
2d21ac55 1550 ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK); /* XXX */
1c79356b 1551
0b4e3aa0 1552
9bccf70c
A
1553#if INET6
1554 if (isipv6) {
1555 KERNEL_DEBUG(DBG_LAYER_BEG,
1556 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1557 (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
1558 (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
1559 0,0,0);
1560 }
1561 else
1562#endif
1563 {
1564 KERNEL_DEBUG(DBG_LAYER_BEG,
1565 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1566 (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
1567 (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
1568 0,0,0);
1569 }
1570
1c79356b 1571 /*
2d21ac55
A
1572 * See if we should do MTU discovery.
1573 * Look at the flag, which is updated based on the following criteria:
1574 * 1) Path MTU discovery is authorized by the sysctl
1575 * 2) The route isn't set yet (unlikely but could happen)
1576 * 3) The route is up
1577 * 4) the MTU is not locked (if it is, then discovery has been
1578 * disabled for that route)
1c79356b 1579 */
2d21ac55
A
1580
1581 if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
1c79356b 1582 ip->ip_off |= IP_DF;
2d21ac55 1583
1c79356b 1584#if IPSEC
9bccf70c
A
1585 if (ipsec_bypass == 0)
1586 ipsec_setsocket(m, so);
1c79356b 1587#endif /*IPSEC*/
91447636
A
1588
1589 /*
1590 * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1591 */
2d21ac55 1592 lost = 0;
91447636 1593 m->m_pkthdr.socket_id = socket_id;
2d21ac55
A
1594 m->m_nextpkt = NULL;
1595 tp->t_pktlist_sentlen += len;
1596 tp->t_lastchain++;
1597 if (tp->t_pktlist_head != NULL) {
1598 tp->t_pktlist_tail->m_nextpkt = m;
1599 tp->t_pktlist_tail = m;
1600 } else {
91447636 1601 packchain_newlist++;
2d21ac55 1602 tp->t_pktlist_head = tp->t_pktlist_tail = m;
91447636
A
1603 }
1604
2d21ac55
A
1605 if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
1606 (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
1607 (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 ||
1608 tp->t_lastchain >= tcp_packet_chaining) {
1609 error = 0;
1610 while (!(tp->t_flags & TF_SENDINPROG) &&
1611 tp->t_pktlist_head != NULL) {
1612 packetlist = tp->t_pktlist_head;
1613 packchain_listadd = tp->t_lastchain;
1614 packchain_sent++;
1615 lost = tp->t_pktlist_sentlen;
1616 TCP_PKTLIST_CLEAR(tp);
1617 tp->t_flags |= TF_SENDINPROG;
1618
1619 error = tcp_ip_output(so, tp, packetlist,
1620 packchain_listadd, tp_inp_options,
b0d623f7 1621 (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)));
2d21ac55
A
1622
1623 tp->t_flags &= ~TF_SENDINPROG;
1624 if (error) {
1625 /*
1626 * Take into account the rest of unsent
1627 * packets in the packet list for this tcp
1628 * into "lost", since we're about to free
1629 * the whole list below.
1630 */
1631 lost += tp->t_pktlist_sentlen;
1632 break;
1633 } else {
1634 lost = 0;
1635 }
1636 }
1637 /* tcp was closed while we were in ip; resume close */
1638 if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
1639 tp->t_flags &= ~TF_CLOSING;
1640 (void) tcp_close(tp);
1641 return (0);
1642 }
91447636
A
1643 }
1644 else {
1645 error = 0;
1646 packchain_looped++;
1647 tcpstat.tcps_sndtotal++;
2d21ac55 1648
8ad349bb
A
1649 if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1650 tp->rcv_adv = tp->rcv_nxt + recwin;
91447636
A
1651 tp->last_ack_sent = tp->rcv_nxt;
1652 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1653 goto again;
1654 }
1655 }
1c79356b 1656 if (error) {
9bccf70c 1657 /*
2d21ac55
A
1658 * Assume that the packets were lost, so back out the
1659 * sequence number advance, if any. Note that the "lost"
1660 * variable represents the amount of user data sent during
1661 * the recent call to ip_output_list() plus the amount of
1662 * user data in the packet list for this tcp at the moment.
9bccf70c 1663 */
8ad349bb 1664 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
9bccf70c
A
1665 /*
1666 * No need to check for TH_FIN here because
1667 * the TF_SENTFIN flag handles that case.
1668 */
8ad349bb
A
1669 if ((flags & TH_SYN) == 0) {
1670 if (sack_rxmit) {
2d21ac55
A
1671 p->rxmit -= lost;
1672 tp->sackhint.sack_bytes_rexmit -= lost;
8ad349bb 1673 } else
2d21ac55 1674 tp->snd_nxt -= lost;
8ad349bb 1675 }
9bccf70c 1676 }
1c79356b 1677out:
2d21ac55
A
1678 if (tp->t_pktlist_head != NULL)
1679 m_freem_list(tp->t_pktlist_head);
1680 TCP_PKTLIST_CLEAR(tp);
1681
1c79356b 1682 if (error == ENOBUFS) {
b0d623f7
A
1683 if (!tp->t_timer[TCPT_REXMT] &&
1684 !tp->t_timer[TCPT_PERSIST])
1685 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1686
1687 tp->snd_cwnd = tp->t_maxseg;
1688 tp->t_bytes_acked = 0;
1689
1c79356b
A
1690 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1691 return (0);
1692 }
1c79356b
A
1693 if (error == EMSGSIZE) {
1694 /*
1695 * ip_output() will have already fixed the route
1696 * for us. tcp_mtudisc() will, as its last action,
1697 * initiate retransmission, so it is important to
1698 * not do so here.
b0d623f7
A
1699 *
1700 * If TSO was active we either got an interface
1701 * without TSO capabilities or TSO was turned off.
1702 * Disable it for this connection too and
1703 * immediately retry with MSS-sized segments generated
1704 * by this function.
1c79356b 1705 */
b0d623f7
A
1706 if (tso)
1707 tp->t_flags &= ~TF_TSO;
1708
1c79356b
A
1709 tcp_mtudisc(tp->t_inpcb, 0);
1710 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1711 return 0;
1712 }
1c79356b
A
1713 if ((error == EHOSTUNREACH || error == ENETDOWN)
1714 && TCPS_HAVERCVDSYN(tp->t_state)) {
1715 tp->t_softerror = error;
1716 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1717 return (0);
1718 }
1719 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1720 return (error);
1721 }
2d21ac55 1722
1c79356b
A
1723 tcpstat.tcps_sndtotal++;
1724
1725 /*
1726 * Data sent (as far as we can tell).
1727 * If this advertises a larger window than any other segment,
1728 * then remember the size of the advertised window.
1729 * Any pending ACK has now been sent.
1730 */
b0d623f7 1731 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
8ad349bb 1732 tp->rcv_adv = tp->rcv_nxt + recwin;
1c79356b
A
1733 tp->last_ack_sent = tp->rcv_nxt;
1734 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
55e303ae 1735
91447636
A
1736 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
1737 if (sendalot && (!tcp_do_newreno || --maxburst))
1c79356b 1738 goto again;
1c79356b
A
1739 return (0);
1740}
1741
2d21ac55
A
1742static int
1743tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
b0d623f7 1744 int cnt, struct mbuf *opt, int flags, int sack_in_progress)
2d21ac55
A
1745{
1746 int error = 0;
1747 boolean_t chain;
1748 boolean_t unlocked = FALSE;
c910b4d9
A
1749 struct inpcb *inp = tp->t_inpcb;
1750 struct ip_out_args ipoa;
b0d623f7 1751 struct route ro;
c910b4d9
A
1752
1753 /* If socket was bound to an ifindex, tell ip_output about it */
1754 ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
1755 inp->inp_boundif : IFSCOPE_NONE;
1756 flags |= IP_OUTARGS;
2d21ac55 1757
b0d623f7
A
1758 /* Copy the cached route and take an extra reference */
1759 inp_route_copyout(inp, &ro);
1760
1761 /*
1762 * Make sure ACK/DELACK conditions are cleared before
4a3eedf9
A
1763 * we unlock the socket.
1764 */
4a3eedf9 1765 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
b0d623f7 1766
2d21ac55 1767 /*
b0d623f7 1768 * If allowed, unlock TCP socket while in IP
2d21ac55
A
1769 * but only if the connection is established and
1770 * if we're not sending from an upcall.
b0d623f7 1771 */
2d21ac55 1772 if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) &&
b0d623f7
A
1773 (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0)) {
1774 unlocked = TRUE;
1775 socket_unlock(so, 0);
2d21ac55 1776 }
593a1d5f 1777
2d21ac55
A
1778 /*
1779 * Don't send down a chain of packets when:
1780 * - TCP chaining is disabled
1781 * - there is an IPsec rule set
1782 * - there is a non-default rule set for the firewall
1783 */
1784
4a3eedf9 1785 chain = tcp_packet_chaining > 1
2d21ac55 1786#if IPSEC
4a3eedf9
A
1787 && ipsec_bypass
1788#endif
1789#if IPFIREWALL
1790 && (fw_enable == 0 || fw_bypass)
2d21ac55 1791#endif
4a3eedf9
A
1792 ; // I'm important, not extraneous
1793
2d21ac55
A
1794
1795 while (pkt != NULL) {
1796 struct mbuf *npkt = pkt->m_nextpkt;
1797
1798 if (!chain) {
1799 pkt->m_nextpkt = NULL;
1800 /*
1801 * If we are not chaining, make sure to set the packet
1802 * list count to 0 so that IP takes the right path;
1803 * this is important for cases such as IPSec where a
1804 * single mbuf might result in multiple mbufs as part
1805 * of the encapsulation. If a non-zero count is passed
1806 * down to IP, the head of the chain might change and
1807 * we could end up skipping it (thus generating bogus
1808 * packets). Fixing it in IP would be desirable, but
1809 * for now this would do it.
1810 */
1811 cnt = 0;
1812 }
b0d623f7 1813 error = ip_output_list(pkt, cnt, opt, &ro, flags, 0, &ipoa);
2d21ac55
A
1814 if (chain || error) {
1815 /*
1816 * If we sent down a chain then we are done since
1817 * the callee had taken care of everything; else
1818 * we need to free the rest of the chain ourselves.
1819 */
1820 if (!chain)
1821 m_freem_list(npkt);
1822 break;
1823 }
1824 pkt = npkt;
1825 }
1826
1827 if (unlocked)
1828 socket_lock(so, 0);
1829
b0d623f7
A
1830 /* Synchronize cached PCB route */
1831 inp_route_copyin(inp, &ro);
1832
2d21ac55
A
1833 return (error);
1834}
1835
1c79356b
A
1836void
1837tcp_setpersist(tp)
1838 register struct tcpcb *tp;
1839{
9bccf70c 1840 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1c79356b
A
1841
1842 if (tp->t_timer[TCPT_REXMT])
9bccf70c 1843 panic("tcp_setpersist: retransmit pending");
1c79356b
A
1844 /*
1845 * Start/restart the persistence timer.
1846 */
1847 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1848 t * tcp_backoff[tp->t_rxtshift],
1849 TCPTV_PERSMIN, TCPTV_PERSMAX);
1850 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1851 tp->t_rxtshift++;
1852}
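/*
 * Worked example of the persist backoff above (timer values assumed):
 * with t_srtt == 1600 and t_rttvar == 400 the base value is
 * t == ((1600 >> 2) + 400) >> 1 == 400 ticks; after three unanswered
 * probes (t_rxtshift == 3) the requested interval is
 * 400 * tcp_backoff[3], which TCPT_RANGESET() clamps to stay within
 * [TCPTV_PERSMIN, TCPTV_PERSMAX].
 */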