2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
78 #include <sys/domain.h>
79 #include <sys/protosw.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
83 #include <net/route.h>
84 #include <net/if_var.h>
86 #include <netinet/in.h>
87 #include <netinet/in_systm.h>
88 #include <netinet/in_var.h>
89 #include <netinet/ip.h>
90 #include <netinet/in_pcb.h>
91 #include <netinet/ip_var.h>
93 #include <netinet6/in6_pcb.h>
94 #include <netinet/ip6.h>
95 #include <netinet6/ip6_var.h>
97 #include <netinet/tcp.h>
99 #include <netinet/tcp_fsm.h>
100 #include <netinet/tcp_seq.h>
101 #include <netinet/tcp_timer.h>
102 #include <netinet/tcp_var.h>
103 #include <netinet/tcpip.h>
105 #include <netinet/tcp_debug.h>
107 #include <sys/kdebug.h>
110 #include <netinet6/ipsec.h>
114 #include <security/mac_framework.h>
115 #endif /* MAC_SOCKET */
117 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
118 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
119 #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
123 extern struct mbuf
*m_copypack();
126 int path_mtu_discovery
= 1;
127 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, path_mtu_discovery
, CTLFLAG_RW
,
128 &path_mtu_discovery
, 1, "Enable Path MTU Discovery");
131 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, slowstart_flightsize
, CTLFLAG_RW
,
132 &ss_fltsz
, 1, "Slow start flight size");
134 int ss_fltsz_local
= 8; /* starts with eight segments max */
135 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, local_slowstart_flightsize
, CTLFLAG_RW
,
136 &ss_fltsz_local
, 1, "Slow start flight size for local networks");
138 int tcp_do_newreno
= 0;
139 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, newreno
, CTLFLAG_RW
, &tcp_do_newreno
,
140 0, "Enable NewReno Algorithms");
143 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, tso
, CTLFLAG_RW
,
144 &tcp_do_tso
, 0, "Enable TCP Segmentation Offload");
147 int tcp_ecn_outbound
= 0;
148 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, ecn_initiate_out
, CTLFLAG_RW
, &tcp_ecn_outbound
,
149 0, "Initiate ECN for outbound connections");
151 int tcp_ecn_inbound
= 0;
152 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, ecn_negotiate_in
, CTLFLAG_RW
, &tcp_ecn_inbound
,
153 0, "Allow ECN negotiation for inbound connections");
155 int tcp_packet_chaining
= 50;
156 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, packetchain
, CTLFLAG_RW
, &tcp_packet_chaining
,
157 0, "Enable TCP output packet chaining");
159 int tcp_output_unlocked
= 1;
160 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, socket_unlocked_on_output
, CTLFLAG_RW
, &tcp_output_unlocked
,
161 0, "Unlock TCP when sending packets down to IP");
163 static int32_t packchain_newlist
= 0;
164 static int32_t packchain_looped
= 0;
165 static int32_t packchain_sent
= 0;
167 /* temporary: for testing */
169 extern int ipsec_bypass
;
172 extern int slowlink_wsize
; /* window correction for slow links */
174 extern int fw_enable
; /* firewall check for packet chaining */
175 extern int fw_bypass
; /* firewall check: disable packet chaining if there is rules */
176 #endif /* IPFIREWALL */
178 extern vm_size_t so_cache_zone_element_size
;
180 extern int ip_use_randomid
;
181 #endif /* RANDOM_IP_ID */
182 extern u_int32_t dlil_filter_count
;
183 extern u_int32_t kipf_count
;
185 static int tcp_ip_output(struct socket
*, struct tcpcb
*, struct mbuf
*, int,
186 struct mbuf
*, int, int);
188 static __inline__ u_int16_t
189 get_socket_id(struct socket
* s
)
193 if (so_cache_zone_element_size
== 0) {
196 val
= (u_int16_t
)(((uintptr_t)s
) / so_cache_zone_element_size
);
/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Returns:
 *	ip_output_list:ENOMEM
 *	ip_output_list:EADDRNOTAVAIL
 *	ip_output_list:ENETUNREACH
 *	ip_output_list:EHOSTUNREACH
 *	ip_output_list:EACCES
 *	ip_output_list:EMSGSIZE
 *	ip_output_list:ENOBUFS
 *	ip_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 *	ip6_output:???			[IPV6 only]
 */
223 tcp_output(struct tcpcb
*tp
)
225 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
226 int32_t len
, recwin
, sendwin
, off
;
228 register struct mbuf
*m
;
229 struct ip
*ip
= NULL
;
230 register struct ipovly
*ipov
= NULL
;
232 struct ip6_hdr
*ip6
= NULL
;
234 register struct tcphdr
*th
;
235 u_char opt
[TCP_MAXOLEN
];
236 unsigned ipoptlen
, optlen
, hdrlen
;
237 int idle
, sendalot
, lost
= 0;
243 unsigned ipsec_optlen
= 0;
245 int maxburst
= TCP_MAXBURST
;
248 struct mbuf
*m_last
= NULL
;
249 struct mbuf
*m_head
= NULL
;
250 struct mbuf
*packetlist
= NULL
;
251 struct mbuf
*tp_inp_options
= tp
->t_inpcb
->inp_depend4
.inp4_options
;
253 int isipv6
= tp
->t_inpcb
->inp_vflag
& INP_IPV6
;
254 struct ip6_pktopts
*inp6_pktopts
= tp
->t_inpcb
->inp_depend6
.inp6_outputopts
;
256 short packchain_listadd
= 0;
257 u_int16_t socket_id
= get_socket_id(so
);
258 int so_options
= so
->so_options
;
262 * Determine length of data that should be transmitted,
263 * and flags that will be used.
264 * If there is some data or critical controls (SYN, RST)
265 * to send, then transmit; otherwise, investigate further.
267 idle
= (tp
->t_flags
& TF_LASTIDLE
) || (tp
->snd_max
== tp
->snd_una
);
268 if (idle
&& tp
->t_rcvtime
>= tp
->t_rxtcur
) {
270 * We have been idle for "a while" and no acks are
271 * expected to clock out any data we send --
272 * slow start to get ack "clock" running again.
274 * Set the slow-start flight size depending on whether
275 * this is a local network or not.
279 (isipv6
&& in6_localaddr(&tp
->t_inpcb
->in6p_faddr
)) ||
282 in_localaddr(tp
->t_inpcb
->inp_faddr
)
287 tp
->snd_cwnd
= tp
->t_maxseg
* ss_fltsz_local
;
289 tp
->snd_cwnd
= tp
->t_maxseg
* ss_fltsz
;
291 tp
->t_flags
&= ~TF_LASTIDLE
;
293 if (tp
->t_flags
& TF_MORETOCOME
) {
294 tp
->t_flags
|= TF_LASTIDLE
;
299 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_START
, 0,0,0,0,0);
304 KERNEL_DEBUG(DBG_LAYER_BEG
,
305 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
306 (((tp
->t_inpcb
->in6p_laddr
.s6_addr16
[0] & 0xffff) << 16) |
307 (tp
->t_inpcb
->in6p_faddr
.s6_addr16
[0] & 0xffff)),
314 KERNEL_DEBUG(DBG_LAYER_BEG
,
315 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
316 (((tp
->t_inpcb
->inp_laddr
.s_addr
& 0xffff) << 16) |
317 (tp
->t_inpcb
->inp_faddr
.s_addr
& 0xffff)),
320 * If the route generation id changed, we need to check that our
321 * local (source) IP address is still valid. If it isn't either
322 * return error or silently do nothing (assuming the address will
323 * come back before the TCP connection times out).
325 rt
= tp
->t_inpcb
->inp_route
.ro_rt
;
326 if (rt
!= NULL
&& (!(rt
->rt_flags
& RTF_UP
) ||
327 rt
->generation_id
!= route_generation
)) {
329 struct in_ifaddr
*ia
;
331 /* disable multipages at the socket */
332 somultipages(so
, FALSE
);
334 /* Disable TSO for the socket until we know more */
335 tp
->t_flags
&= ~TF_TSO
;
337 /* check that the source address is still valid */
338 if ((ia
= ifa_foraddr(tp
->t_inpcb
->inp_laddr
.s_addr
)) == NULL
) {
340 if (tp
->t_state
>= TCPS_CLOSE_WAIT
) {
341 tcp_drop(tp
, EADDRNOTAVAIL
);
342 return(EADDRNOTAVAIL
);
345 /* set Retransmit timer if it wasn't set
346 * reset Persist timer and shift register as the
347 * adversed peer window may not be valid anymore
350 if (!tp
->t_timer
[TCPT_REXMT
]) {
351 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
352 if (tp
->t_timer
[TCPT_PERSIST
]) {
353 tp
->t_timer
[TCPT_PERSIST
] = 0;
358 if (tp
->t_pktlist_head
!= NULL
)
359 m_freem_list(tp
->t_pktlist_head
);
360 TCP_PKTLIST_CLEAR(tp
);
362 /* drop connection if source address isn't available */
363 if (so
->so_flags
& SOF_NOADDRAVAIL
) {
364 tcp_drop(tp
, EADDRNOTAVAIL
);
365 return(EADDRNOTAVAIL
);
368 return(0); /* silently ignore, keep data in socket: address may be back */
370 ifafree(&ia
->ia_ifa
);
373 * Address is still valid; check for multipages capability
374 * again in case the outgoing interface has changed.
377 if ((ifp
= rt
->rt_ifp
) != NULL
) {
378 somultipages(so
, (ifp
->if_hwassist
& IFNET_MULTIPAGES
));
379 tcp_set_tso(tp
, ifp
);
381 if (rt
->rt_flags
& RTF_UP
)
382 rt
->generation_id
= route_generation
;
384 * See if we should do MTU discovery. Don't do it if:
385 * 1) it is disabled via the sysctl
386 * 2) the route isn't up
387 * 3) the MTU is locked (if it is, then discovery has been
391 if (!path_mtu_discovery
|| ((rt
!= NULL
) &&
392 (!(rt
->rt_flags
& RTF_UP
) || (rt
->rt_rmx
.rmx_locks
& RTV_MTU
))))
393 tp
->t_flags
&= ~TF_PMTUD
;
395 tp
->t_flags
|= TF_PMTUD
;
402 * If we've recently taken a timeout, snd_max will be greater than
403 * snd_nxt. There may be SACK information that allows us to avoid
404 * resending already delivered data. Adjust snd_nxt accordingly.
406 if (tp
->sack_enable
&& SEQ_LT(tp
->snd_nxt
, tp
->snd_max
))
409 off
= tp
->snd_nxt
- tp
->snd_una
;
410 sendwin
= min(tp
->snd_wnd
, tp
->snd_cwnd
);
412 if (tp
->t_flags
& TF_SLOWLINK
&& slowlink_wsize
> 0)
413 sendwin
= min(sendwin
, slowlink_wsize
);
415 flags
= tcp_outflags
[tp
->t_state
];
417 * Send any SACK-generated retransmissions. If we're explicitly trying
418 * to send out new data (when sendalot is 1), bypass this function.
419 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
420 * we're replacing a (future) new transmission with a retransmission
421 * now, and we previously incremented snd_cwnd in tcp_input().
424 * Still in sack recovery , reset rxmit flag to zero.
430 if (tp
->sack_enable
&& IN_FASTRECOVERY(tp
) &&
431 (p
= tcp_sack_output(tp
, &sack_bytes_rxmt
))) {
434 cwin
= min(tp
->snd_wnd
, tp
->snd_cwnd
) - sack_bytes_rxmt
;
437 /* Do not retransmit SACK segments beyond snd_recover */
438 if (SEQ_GT(p
->end
, tp
->snd_recover
)) {
440 * (At least) part of sack hole extends beyond
441 * snd_recover. Check to see if we can rexmit data
444 if (SEQ_GEQ(p
->rxmit
, tp
->snd_recover
)) {
446 * Can't rexmit any more data for this hole.
447 * That data will be rexmitted in the next
448 * sack recovery episode, when snd_recover
449 * moves past p->rxmit.
452 goto after_sack_rexmit
;
454 /* Can rexmit part of the current hole */
455 len
= ((int32_t)min(cwin
,
456 tp
->snd_recover
- p
->rxmit
));
458 len
= ((int32_t)min(cwin
, p
->end
- p
->rxmit
));
460 off
= p
->rxmit
- tp
->snd_una
; /* update off only if we really transmit SACK data */
463 tcpstat
.tcps_sack_rexmits
++;
464 tcpstat
.tcps_sack_rexmit_bytes
+=
465 min(len
, tp
->t_maxseg
);
472 * Get standard flags, and add SYN or FIN if requested by 'hidden'
475 if (tp
->t_flags
& TF_NEEDFIN
)
477 if (tp
->t_flags
& TF_NEEDSYN
)
481 * If in persist timeout with window of 0, send 1 byte.
482 * Otherwise, if window is small but nonzero
483 * and timer expired, we will send what we can
484 * and go to transmit state.
489 * If we still have some data to send, then
490 * clear the FIN bit. Usually this would
491 * happen below when it realizes that we
492 * aren't sending all the data. However,
493 * if we have exactly 1 byte of unsent data,
494 * then it won't clear the FIN bit below,
495 * and if we are in persist state, we wind
496 * up sending the packet without recording
497 * that we sent the FIN bit.
499 * We can't just blindly clear the FIN bit,
500 * because if we don't have any more data
501 * to send then the probe will be the FIN
504 if (off
< so
->so_snd
.sb_cc
)
508 tp
->t_timer
[TCPT_PERSIST
] = 0;
514 * If snd_nxt == snd_max and we have transmitted a FIN, the
515 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
516 * a negative length. This can also occur when TCP opens up
517 * its congestion window while receiving additional duplicate
518 * acks after fast-retransmit because TCP will reset snd_nxt
519 * to snd_max after the fast-retransmit.
521 * In the normal retransmit-FIN-only case, however, snd_nxt will
522 * be set to snd_una, the offset will be 0, and the length may
525 * If sack_rxmit is true we are retransmitting from the scoreboard
526 * in which case len is already set.
528 if (sack_rxmit
== 0) {
529 if (sack_bytes_rxmt
== 0)
530 len
= min(so
->so_snd
.sb_cc
, sendwin
) - off
;
535 * We are inside of a SACK recovery episode and are
536 * sending new data, having retransmitted all the
537 * data possible in the scoreboard.
539 len
= min(so
->so_snd
.sb_cc
, tp
->snd_wnd
)
542 * Don't remove this (len > 0) check !
543 * We explicitly check for len > 0 here (although it
544 * isn't really necessary), to work around a gcc
545 * optimization issue - to force gcc to compute
546 * len above. Without this check, the computation
547 * of len is bungled by the optimizer.
550 cwin
= tp
->snd_cwnd
-
551 (tp
->snd_nxt
- tp
->sack_newdata
) -
555 len
= imin(len
, cwin
);
563 * Lop off SYN bit if it has already been sent. However, if this
564 * is SYN-SENT state and if segment contains data and if we don't
565 * know that foreign host supports TAO, suppress sending segment.
567 if ((flags
& TH_SYN
) && SEQ_GT(tp
->snd_nxt
, tp
->snd_una
)) {
570 if (len
> 0 && tp
->t_state
== TCPS_SYN_SENT
) {
571 while (!(tp
->t_flags
& TF_SENDINPROG
) &&
572 tp
->t_pktlist_head
!= NULL
) {
573 packetlist
= tp
->t_pktlist_head
;
574 packchain_listadd
= tp
->t_lastchain
;
576 TCP_PKTLIST_CLEAR(tp
);
577 tp
->t_flags
|= TF_SENDINPROG
;
579 error
= tcp_ip_output(so
, tp
, packetlist
,
580 packchain_listadd
, tp_inp_options
,
581 (so_options
& SO_DONTROUTE
), (sack_rxmit
| (sack_bytes_rxmt
!= 0)));
583 tp
->t_flags
&= ~TF_SENDINPROG
;
585 /* tcp was closed while we were in ip; resume close */
587 (TF_CLOSING
|TF_SENDINPROG
)) == TF_CLOSING
) {
588 tp
->t_flags
&= ~TF_CLOSING
;
589 (void) tcp_close(tp
);
591 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
,
598 * Be careful not to send data and/or FIN on SYN segments.
599 * This measure is needed to prevent interoperability problems
600 * with not fully conformant TCP implementations.
602 if ((flags
& TH_SYN
) && (tp
->t_flags
& TF_NOOPT
)) {
609 * If FIN has been sent but not acked,
610 * but we haven't been called to retransmit,
611 * len will be < 0. Otherwise, window shrank
612 * after we sent into it. If window shrank to 0,
613 * cancel pending retransmit, pull snd_nxt back
614 * to (closed) window, and set the persist timer
615 * if it isn't already going. If the window didn't
616 * close completely, just wait for an ACK.
620 tp
->t_timer
[TCPT_REXMT
] = 0;
622 tp
->snd_nxt
= tp
->snd_una
;
623 if (tp
->t_timer
[TCPT_PERSIST
] == 0)
629 * Truncate to the maximum segment length or enable TCP Segmentation
630 * Offloading (if supported by hardware) and ensure that FIN is removed
631 * if the length no longer contains the last data byte.
633 * TSO may only be used if we are in a pure bulk sending state. The
634 * presence of TCP-MD5, SACK retransmits, SACK advertizements, ipfw rules
635 * and IP options prevent using TSO. With TSO the TCP header is the same
636 * (except for the sequence number) for all generated packets. This
637 * makes it impossible to transmit any options which vary per generated
640 * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
641 * removal of FIN (if not already catched here) are handled later after
642 * the exact length of the TCP options are known.
646 * Pre-calculate here as we save another lookup into the darknesses
647 * of IPsec that way and can actually decide if TSO is ok.
649 if (ipsec_bypass
== 0)
650 ipsec_optlen
= ipsec_hdrsiz_tcp(tp
);
653 if (len
> tp
->t_maxseg
) {
654 if ((tp
->t_flags
& TF_TSO
) && tcp_do_tso
&&
657 #endif /* RANDOM_IP_ID */
658 kipf_count
== 0 && dlil_filter_count
== 0 &&
659 tp
->rcv_numsacks
== 0 && sack_rxmit
== 0 && sack_bytes_rxmt
== 0 &&
660 tp
->t_inpcb
->inp_options
== NULL
&&
661 tp
->t_inpcb
->in6p_options
== NULL
666 && (fw_enable
== 0 || fw_bypass
)
678 if (SEQ_LT(p
->rxmit
+ len
, tp
->snd_una
+ so
->so_snd
.sb_cc
))
681 if (SEQ_LT(tp
->snd_nxt
+ len
, tp
->snd_una
+ so
->so_snd
.sb_cc
))
685 recwin
= tcp_sbspace(tp
);
688 * Sender silly window avoidance. We transmit under the following
689 * conditions when len is non-zero:
691 * - We have a full segment (or more with TSO)
692 * - This is the last buffer in a write()/send() and we are
693 * either idle or running NODELAY
694 * - we've timed out (e.g. persist timer)
695 * - we have more then 1/2 the maximum send window's worth of
696 * data (receiver may be limited the window size)
697 * - we need to retransmit
700 if (len
>= tp
->t_maxseg
) {
701 tp
->t_flags
|= TF_MAXSEGSNT
;
704 if (!(tp
->t_flags
& TF_MORETOCOME
) &&
705 (idle
|| tp
->t_flags
& TF_NODELAY
|| tp
->t_flags
& TF_MAXSEGSNT
) &&
706 (tp
->t_flags
& TF_NOPUSH
) == 0 &&
707 len
+ off
>= so
->so_snd
.sb_cc
) {
708 tp
->t_flags
&= ~TF_MAXSEGSNT
;
712 tp
->t_flags
&= ~TF_MAXSEGSNT
;
715 if (len
>= tp
->max_sndwnd
/ 2 && tp
->max_sndwnd
> 0) {
716 tp
->t_flags
&= ~TF_MAXSEGSNT
;
719 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_max
)) {
720 tp
->t_flags
&= ~TF_MAXSEGSNT
;
728 * Compare available window to amount of window
729 * known to peer (as advertised window less
730 * next expected input). If the difference is at least two
731 * max size segments, or at least 50% of the maximum possible
732 * window, then want to send a window update to peer.
733 * Skip this if the connection is in T/TCP half-open state.
735 if (recwin
> 0 && !(tp
->t_flags
& TF_NEEDSYN
)) {
737 * "adv" is the amount we can increase the window,
738 * taking into account that we are limited by
739 * TCP_MAXWIN << tp->rcv_scale.
741 int32_t adv
= imin(recwin
, (int)TCP_MAXWIN
<< tp
->rcv_scale
) -
742 (tp
->rcv_adv
- tp
->rcv_nxt
);
744 if (adv
>= (int32_t) (2 * tp
->t_maxseg
)) {
747 * Update only if the resulting scaled value of the window changed, or
748 * if there is a change in the sequence since the last ack.
749 * This avoids what appears as dupe ACKS (see rdar://5640997)
752 if ((tp
->last_ack_sent
!= tp
->rcv_nxt
) || (((recwin
+ adv
) >> tp
->rcv_scale
) > recwin
))
755 if (2 * adv
>= (int32_t) so
->so_rcv
.sb_hiwat
)
760 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
761 * is also a catch-all for the retransmit timer timeout case.
763 if (tp
->t_flags
& TF_ACKNOW
)
765 if ((flags
& TH_RST
) ||
766 ((flags
& TH_SYN
) && (tp
->t_flags
& TF_NEEDSYN
) == 0))
768 if (SEQ_GT(tp
->snd_up
, tp
->snd_una
))
771 * If our state indicates that FIN should be sent
772 * and we have not yet done so, then we need to send.
774 if (flags
& TH_FIN
&&
775 ((tp
->t_flags
& TF_SENTFIN
) == 0 || tp
->snd_nxt
== tp
->snd_una
))
778 * In SACK, it is possible for tcp_output to fail to send a segment
779 * after the retransmission timer has been turned off. Make sure
780 * that the retransmission timer is set.
782 if (tp
->sack_enable
&& (tp
->t_state
>= TCPS_ESTABLISHED
) && SEQ_GT(tp
->snd_max
, tp
->snd_una
) &&
783 tp
->t_timer
[TCPT_REXMT
] == 0 &&
784 tp
->t_timer
[TCPT_PERSIST
] == 0) {
785 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
789 * TCP window updates are not reliable, rather a polling protocol
790 * using ``persist'' packets is used to insure receipt of window
791 * updates. The three ``states'' for the output side are:
792 * idle not doing retransmits or persists
793 * persisting to move a small or zero window
794 * (re)transmitting and thereby not persisting
796 * tp->t_timer[TCPT_PERSIST]
797 * is set when we are in persist state.
799 * is set when we are called to send a persist packet.
800 * tp->t_timer[TCPT_REXMT]
801 * is set when we are retransmitting
802 * The output side is idle when both timers are zero.
804 * If send window is too small, there is data to transmit, and no
805 * retransmit or persist is pending, then go to persist state.
806 * If nothing happens soon, send when timer expires:
807 * if window is nonzero, transmit what we can,
808 * otherwise force out a byte.
810 if (so
->so_snd
.sb_cc
&& tp
->t_timer
[TCPT_REXMT
] == 0 &&
811 tp
->t_timer
[TCPT_PERSIST
] == 0) {
817 * If there is no reason to send a segment, just return.
818 * but if there is some packets left in the packet list, send them now.
820 while (!(tp
->t_flags
& TF_SENDINPROG
) && tp
->t_pktlist_head
!= NULL
) {
821 packetlist
= tp
->t_pktlist_head
;
822 packchain_listadd
= tp
->t_lastchain
;
824 TCP_PKTLIST_CLEAR(tp
);
825 tp
->t_flags
|= TF_SENDINPROG
;
827 error
= tcp_ip_output(so
, tp
, packetlist
, packchain_listadd
,
828 tp_inp_options
, (so_options
& SO_DONTROUTE
), (sack_rxmit
| (sack_bytes_rxmt
!= 0)));
830 tp
->t_flags
&= ~TF_SENDINPROG
;
832 /* tcp was closed while we were in ip; resume close */
833 if ((tp
->t_flags
& (TF_CLOSING
|TF_SENDINPROG
)) == TF_CLOSING
) {
834 tp
->t_flags
&= ~TF_CLOSING
;
835 (void) tcp_close(tp
);
837 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
842 * Before ESTABLISHED, force sending of initial options
843 * unless TCP set not to do any options.
844 * NOTE: we assume that the IP/TCP header plus TCP options
845 * always fit in a single mbuf, leaving room for a maximum
847 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
852 hdrlen
= sizeof (struct ip6_hdr
) + sizeof (struct tcphdr
);
855 hdrlen
= sizeof (struct tcpiphdr
);
856 if (flags
& TH_SYN
) {
857 tp
->snd_nxt
= tp
->iss
;
858 if ((tp
->t_flags
& TF_NOOPT
) == 0) {
861 opt
[0] = TCPOPT_MAXSEG
;
862 opt
[1] = TCPOLEN_MAXSEG
;
863 mss
= htons((u_short
) tcp_mssopt(tp
));
864 (void)memcpy(opt
+ 2, &mss
, sizeof(mss
));
865 optlen
= TCPOLEN_MAXSEG
;
867 if ((tp
->t_flags
& TF_REQ_SCALE
) &&
868 ((flags
& TH_ACK
) == 0 ||
869 (tp
->t_flags
& TF_RCVD_SCALE
))) {
870 *((u_int32_t
*)(opt
+ optlen
)) = htonl(
872 TCPOPT_WINDOW
<< 16 |
873 TCPOLEN_WINDOW
<< 8 |
874 tp
->request_r_scale
);
882 RFC 3168 states that:
883 - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
884 to handle the TCP ECE flag, even if you also later send a
885 non-ECN-setup SYN/SYN-ACK.
886 - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
889 It is not clear how the ECE flag would ever be set if you never
890 set the IP ECT flag on outbound packets. All the same, we use
891 the TE_SETUPSENT to indicate that we have committed to handling
892 the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
893 whether or not we should set the IP ECT flag on outbound packets.
896 * For a SYN-ACK, send an ECN setup SYN-ACK
898 if (tcp_ecn_inbound
&& (flags
& (TH_SYN
| TH_ACK
)) == (TH_SYN
| TH_ACK
)) {
899 if ((tp
->ecn_flags
& TE_SETUPRECEIVED
) != 0) {
900 if ((tp
->ecn_flags
& TE_SETUPSENT
) == 0) {
901 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
905 * Record that we sent the ECN-setup and default to
908 tp
->ecn_flags
|= (TE_SETUPSENT
| TE_SENDIPECT
);
912 * We sent an ECN-setup SYN-ACK but it was dropped.
913 * Fallback to non-ECN-setup SYN-ACK and clear flag
914 * that to indicate we should not send data with IP ECT set.
916 * Pretend we didn't receive an ECN-setup SYN.
918 tp
->ecn_flags
&= ~TE_SETUPRECEIVED
;
922 else if (tcp_ecn_outbound
&& (flags
& (TH_SYN
| TH_ACK
)) == TH_SYN
) {
923 if ((tp
->ecn_flags
& TE_SETUPSENT
) == 0) {
924 /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
925 flags
|= (TH_ECE
| TH_CWR
);
928 * Record that we sent the ECN-setup and default to
931 tp
->ecn_flags
|= (TE_SETUPSENT
| TE_SENDIPECT
);
935 * We sent an ECN-setup SYN but it was dropped.
936 * Fall back to no ECN and clear flag indicating
937 * we should send data with IP ECT set.
939 tp
->ecn_flags
&= ~TE_SENDIPECT
;
944 * Check if we should set the TCP CWR flag.
945 * CWR flag is sent when we reduced the congestion window because
946 * we received a TCP ECE or we performed a fast retransmit. We
947 * never set the CWR flag on retransmitted packets. We only set
948 * the CWR flag on data packets. Pure acks don't have this set.
950 if ((tp
->ecn_flags
& TE_SENDCWR
) != 0 && len
!= 0 &&
951 !SEQ_LT(tp
->snd_nxt
, tp
->snd_max
)) {
953 tp
->ecn_flags
&= ~TE_SENDCWR
;
957 * Check if we should set the TCP ECE flag.
959 if ((tp
->ecn_flags
& TE_SENDECE
) != 0 && len
== 0) {
964 * Send a timestamp and echo-reply if this is a SYN and our side
965 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
966 * and our peer have sent timestamps in our SYN's.
968 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_NOOPT
)) == TF_REQ_TSTMP
&&
969 (flags
& TH_RST
) == 0 &&
970 ((flags
& TH_ACK
) == 0 ||
971 (tp
->t_flags
& TF_RCVD_TSTMP
))) {
972 u_int32_t
*lp
= (u_int32_t
*)(opt
+ optlen
);
974 /* Form timestamp option as shown in appendix A of RFC 1323. */
975 *lp
++ = htonl(TCPOPT_TSTAMP_HDR
);
976 *lp
++ = htonl(tcp_now
);
977 *lp
= htonl(tp
->ts_recent
);
978 optlen
+= TCPOLEN_TSTAMP_APPA
;
981 if (tp
->sack_enable
&& ((tp
->t_flags
& TF_NOOPT
) == 0)) {
983 * Tack on the SACK permitted option *last*.
984 * And do padding of options after tacking this on.
985 * This is because of MSS, TS, WinScale and Signatures are
986 * all present, we have just 2 bytes left for the SACK
987 * permitted option, which is just enough.
990 * If this is the first SYN of connection (not a SYN
991 * ACK), include SACK permitted option. If this is a
992 * SYN ACK, include SACK permitted option if peer has
993 * already done so. This is only for active connect,
994 * since the syncache takes care of the passive connect.
996 if ((flags
& TH_SYN
) &&
997 (!(flags
& TH_ACK
) || (tp
->t_flags
& TF_SACK_PERMIT
))) {
999 bp
= (u_char
*)opt
+ optlen
;
1001 *bp
++ = TCPOPT_SACK_PERMITTED
;
1002 *bp
++ = TCPOLEN_SACK_PERMITTED
;
1003 optlen
+= TCPOLEN_SACK_PERMITTED
;
1007 * Send SACKs if necessary. This should be the last
1008 * option processed. Only as many SACKs are sent as
1009 * are permitted by the maximum options size.
1011 * In general, SACK blocks consume 8*n+2 bytes.
1012 * So a full size SACK blocks option is 34 bytes
1013 * (to generate 4 SACK blocks). At a minimum,
1014 * we need 10 bytes (to generate 1 SACK block).
1015 * If TCP Timestamps (12 bytes) and TCP Signatures
1016 * (18 bytes) are both present, we'll just have
1017 * 10 bytes for SACK options 40 - (12 + 18).
1019 if (TCPS_HAVEESTABLISHED(tp
->t_state
) &&
1020 (tp
->t_flags
& TF_SACK_PERMIT
) && tp
->rcv_numsacks
> 0 &&
1021 MAX_TCPOPTLEN
- optlen
- 2 >= TCPOLEN_SACK
) {
1022 int nsack
, sackoptlen
, padlen
;
1023 u_char
*bp
= (u_char
*)opt
+ optlen
;
1026 nsack
= (MAX_TCPOPTLEN
- optlen
- 2) / TCPOLEN_SACK
;
1027 nsack
= min(nsack
, tp
->rcv_numsacks
);
1028 sackoptlen
= (2 + nsack
* TCPOLEN_SACK
);
1031 * First we need to pad options so that the
1032 * SACK blocks can start at a 4-byte boundary
1033 * (sack option and length are at a 2 byte offset).
1035 padlen
= (MAX_TCPOPTLEN
- optlen
- sackoptlen
) % 4;
1037 while (padlen
-- > 0)
1040 tcpstat
.tcps_sack_send_blocks
++;
1041 *bp
++ = TCPOPT_SACK
;
1043 lp
= (u_int32_t
*)bp
;
1044 for (i
= 0; i
< nsack
; i
++) {
1045 struct sackblk sack
= tp
->sackblks
[i
];
1046 *lp
++ = htonl(sack
.start
);
1047 *lp
++ = htonl(sack
.end
);
1049 optlen
+= sackoptlen
;
1053 /* Pad TCP options to a 4 byte boundary */
1054 if (optlen
< MAX_TCPOPTLEN
&& (optlen
% sizeof(u_int32_t
))) {
1055 int pad
= sizeof(u_int32_t
) - (optlen
% sizeof(u_int32_t
));
1056 u_char
*bp
= (u_char
*)opt
+ optlen
;
1069 ipoptlen
= ip6_optlen(tp
->t_inpcb
);
1073 if (tp_inp_options
) {
1074 ipoptlen
= tp_inp_options
->m_len
-
1075 offsetof(struct ipoption
, ipopt_list
);
1080 ipoptlen
+= ipsec_optlen
;
1084 * Adjust data length if insertion of options will
1085 * bump the packet length beyond the t_maxopd length.
1086 * Clear the FIN bit because we cut off the tail of
1089 * When doing TSO limit a burst to TCP_MAXWIN minus the
1090 * IP, TCP and Options length to keep ip->ip_len from
1091 * overflowing. Prevent the last segment from being
1092 * fractional thus making them all equal sized and set
1093 * the flag to continue sending. TSO is disabled when
1094 * IP options or IPSEC are present.
1096 if (len
+ optlen
+ ipoptlen
> tp
->t_maxopd
) {
1098 * If there is still more to send, don't close the connection.
1104 tso_maxlen
= tp
->tso_max_segment_size
? tp
->tso_max_segment_size
: TCP_MAXWIN
;
1106 if (len
> tso_maxlen
- hdrlen
- optlen
) {
1107 len
= tso_maxlen
- hdrlen
- optlen
;
1108 len
= len
- (len
% (tp
->t_maxopd
- optlen
));
1110 } else if (tp
->t_flags
& TF_NEEDFIN
)
1113 len
= tp
->t_maxopd
- optlen
- ipoptlen
;
1118 /*#ifdef DIAGNOSTIC*/
1120 if (max_linkhdr
+ hdrlen
> MCLBYTES
)
1121 panic("tcphdr too big");
1123 if (max_linkhdr
+ hdrlen
> MHLEN
)
1124 panic("tcphdr too big");
1129 * Grab a header mbuf, attaching a copy of data to
1130 * be transmitted, and initialize the header from
1131 * the template for sends on this connection.
1134 if (tp
->t_force
&& len
== 1)
1135 tcpstat
.tcps_sndprobe
++;
1136 else if (SEQ_LT(tp
->snd_nxt
, tp
->snd_max
) || sack_rxmit
) {
1137 tcpstat
.tcps_sndrexmitpack
++;
1138 tcpstat
.tcps_sndrexmitbyte
+= len
;
1140 tcpstat
.tcps_sndpack
++;
1141 tcpstat
.tcps_sndbyte
+= len
;
1144 if ((m
= m_copypack(so
->so_snd
.sb_mb
, off
,
1145 (int)len
, max_linkhdr
+ hdrlen
)) == 0) {
1150 * m_copypack left space for our hdr; use it.
1153 m
->m_data
-= hdrlen
;
1156 * try to use the new interface that allocates all
1157 * the necessary mbuf hdrs under 1 mbuf lock and
1158 * avoids rescanning the socket mbuf list if
1159 * certain conditions are met. This routine can't
1160 * be used in the following cases...
1161 * 1) the protocol headers exceed the capacity of
1162 * a single mbuf header's data area (no cluster attached)
1163 * 2) the length of the data being transmitted plus
1164 * the protocol headers fits into a single mbuf header's
1165 * data area (no cluster attached)
1169 if (MHLEN
< hdrlen
+ max_linkhdr
) {
1170 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1175 MCLGET(m
, M_DONTWAIT
);
1176 if ((m
->m_flags
& M_EXT
) == 0) {
1181 m
->m_data
+= max_linkhdr
;
1185 if (len
<= MHLEN
- hdrlen
- max_linkhdr
) {
1187 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1192 m
->m_data
+= max_linkhdr
;
1195 /* makes sure we still have data left to be sent at this point */
1196 if (so
->so_snd
.sb_mb
== NULL
|| off
< 0) {
1197 if (m
!= NULL
) m_freem(m
);
1198 error
= 0; /* should we return an error? */
1201 m_copydata(so
->so_snd
.sb_mb
, off
, (int) len
,
1202 mtod(m
, caddr_t
) + hdrlen
);
1206 m
->m_next
= m_copy(so
->so_snd
.sb_mb
, off
, (int) len
);
1207 if (m
->m_next
== 0) {
1214 * determine whether the mbuf pointer and offset passed back by the 'last' call
1215 * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1216 * changed (due to an incoming ACK for instance), or the offset into the chain we
1217 * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1218 * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1219 * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1220 * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list.
1221 * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1223 if (m_head
!= so
->so_snd
.sb_mb
|| sack_rxmit
|| last_off
!= off
)
1225 last_off
= off
+ len
;
1226 m_head
= so
->so_snd
.sb_mb
;
1228 /* makes sure we still have data left to be sent at this point */
1229 if (m_head
== NULL
) {
1230 error
= 0; /* should we return an error? */
1235 * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1236 * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1238 if ((m
= m_copym_with_hdrs(so
->so_snd
.sb_mb
, off
, len
, M_DONTWAIT
, &m_last
, &m_off
)) == NULL
) {
1242 m
->m_data
+= max_linkhdr
;
1248 * If we're sending everything we've got, set PUSH.
1249 * (This will keep happy those implementations which only
1250 * give data to the user when a buffer fills or
1253 if (off
+ len
== so
->so_snd
.sb_cc
)
1256 if (tp
->t_flags
& TF_ACKNOW
)
1257 tcpstat
.tcps_sndacks
++;
1258 else if (flags
& (TH_SYN
|TH_FIN
|TH_RST
))
1259 tcpstat
.tcps_sndctrl
++;
1260 else if (SEQ_GT(tp
->snd_up
, tp
->snd_una
))
1261 tcpstat
.tcps_sndurg
++;
1263 tcpstat
.tcps_sndwinup
++;
1265 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1271 if (isipv6
&& (MHLEN
< hdrlen
+ max_linkhdr
) &&
1273 MH_ALIGN(m
, hdrlen
);
1276 m
->m_data
+= max_linkhdr
;
1279 m
->m_pkthdr
.rcvif
= 0;
1281 mac_mbuf_label_associate_inpcb(tp
->t_inpcb
, m
);
1285 ip6
= mtod(m
, struct ip6_hdr
*);
1286 th
= (struct tcphdr
*)(ip6
+ 1);
1287 tcp_fillheaders(tp
, ip6
, th
);
1291 ip
= mtod(m
, struct ip
*);
1292 ipov
= (struct ipovly
*)ip
;
1293 th
= (struct tcphdr
*)(ip
+ 1);
1294 /* this picks up the pseudo header (w/o the length) */
1295 tcp_fillheaders(tp
, ip
, th
);
1296 if ((tp
->ecn_flags
& TE_SENDIPECT
) != 0 && len
&&
1297 !SEQ_LT(tp
->snd_nxt
, tp
->snd_max
)) {
1298 ip
->ip_tos
= IPTOS_ECN_ECT0
;
1303 * Fill in fields, remembering maximum advertised
1304 * window for use in delaying messages about window sizes.
1305 * If resending a FIN, be sure not to use a new sequence number.
1307 if (flags
& TH_FIN
&& tp
->t_flags
& TF_SENTFIN
&&
1308 tp
->snd_nxt
== tp
->snd_max
)
1311 * If we are doing retransmissions, then snd_nxt will
1312 * not reflect the first unsent octet. For ACK only
1313 * packets, we do not want the sequence number of the
1314 * retransmitted packet, we want the sequence number
1315 * of the next unsent octet. So, if there is no data
1316 * (and no SYN or FIN), use snd_max instead of snd_nxt
1317 * when filling in ti_seq. But if we are in persist
1318 * state, snd_max might reflect one byte beyond the
1319 * right edge of the window, so use snd_nxt in that
1320 * case, since we know we aren't doing a retransmission.
1321 * (retransmit and persist are mutually exclusive...)
1323 if (sack_rxmit
== 0) {
1324 if (len
|| (flags
& (TH_SYN
|TH_FIN
)) || tp
->t_timer
[TCPT_PERSIST
])
1325 th
->th_seq
= htonl(tp
->snd_nxt
);
1327 th
->th_seq
= htonl(tp
->snd_max
);
1329 th
->th_seq
= htonl(p
->rxmit
);
1331 tp
->sackhint
.sack_bytes_rexmit
+= len
;
1333 th
->th_ack
= htonl(tp
->rcv_nxt
);
1334 tp
->last_ack_sent
= tp
->rcv_nxt
;
1337 bcopy(opt
, th
+ 1, optlen
);
1338 th
->th_off
= (sizeof (struct tcphdr
) + optlen
) >> 2;
1340 th
->th_flags
= flags
;
1342 * Calculate receive window. Don't shrink window,
1343 * but avoid silly window syndrome.
1345 if (recwin
< (int32_t)(so
->so_rcv
.sb_hiwat
/ 4) && recwin
< (int)tp
->t_maxseg
)
1347 if (recwin
< (int32_t)(tp
->rcv_adv
- tp
->rcv_nxt
))
1348 recwin
= (int32_t)(tp
->rcv_adv
- tp
->rcv_nxt
);
1349 if (tp
->t_flags
& TF_SLOWLINK
&& slowlink_wsize
> 0) {
1350 if (recwin
> (int32_t)slowlink_wsize
)
1351 recwin
= slowlink_wsize
;
1352 th
->th_win
= htons((u_short
) (recwin
>>tp
->rcv_scale
));
1355 if (recwin
> (int32_t)(TCP_MAXWIN
<< tp
->rcv_scale
))
1356 recwin
= (int32_t)(TCP_MAXWIN
<< tp
->rcv_scale
);
1357 th
->th_win
= htons((u_short
) (recwin
>>tp
->rcv_scale
));
1361 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1362 * a 0 window. This may cause the remote transmitter to stall. This
1363 * flag tells soreceive() to disable delayed acknowledgements when
1364 * draining the buffer. This can occur if the receiver is attempting
1365 * to read more data than can be buffered prior to transmitting on
1369 tp
->t_flags
|= TF_RXWIN0SENT
;
1371 tp
->t_flags
&= ~TF_RXWIN0SENT
;
1372 if (SEQ_GT(tp
->snd_up
, tp
->snd_nxt
)) {
1373 th
->th_urp
= htons((u_short
)(tp
->snd_up
- tp
->snd_nxt
));
1374 th
->th_flags
|= TH_URG
;
1377 * If no urgent pointer to send, then we pull
1378 * the urgent pointer to the left edge of the send window
1379 * so that it doesn't drift into the send window on sequence
1380 * number wraparound.
1382 tp
->snd_up
= tp
->snd_una
; /* drag it along */
1385 * Put TCP length in extended header, and then
1386 * checksum extended header and data.
1388 m
->m_pkthdr
.len
= hdrlen
+ len
; /* in6_cksum() need this */
1392 * ip6_plen does not need to be filled now, and will be filled
1395 th
->th_sum
= in6_cksum(m
, IPPROTO_TCP
, sizeof(struct ip6_hdr
),
1396 sizeof(struct tcphdr
) + optlen
+ len
);
1400 m
->m_pkthdr
.csum_flags
= CSUM_TCP
;
1401 m
->m_pkthdr
.csum_data
= offsetof(struct tcphdr
, th_sum
);
1403 th
->th_sum
= in_addword(th
->th_sum
,
1404 htons((u_short
)(optlen
+ len
)));
1408 * Enable TSO and specify the size of the segments.
1409 * The TCP pseudo header checksum is always provided.
1410 * XXX: Fixme: This is currently not the case for IPv6.
1415 m
->m_pkthdr
.csum_flags
= CSUM_TSO_IPV6
;
1418 m
->m_pkthdr
.csum_flags
= CSUM_TSO_IPV4
;
1420 m
->m_pkthdr
.tso_segsz
= tp
->t_maxopd
- optlen
;
1423 m
->m_pkthdr
.tso_segsz
= 0;
1426 * In transmit state, time the transmission and arrange for
1427 * the retransmit. In persist state, just set snd_max.
1429 if (tp
->t_force
== 0 || tp
->t_timer
[TCPT_PERSIST
] == 0) {
1430 tcp_seq startseq
= tp
->snd_nxt
;
1433 * Advance snd_nxt over sequence space of this segment.
1435 if (flags
& (TH_SYN
|TH_FIN
)) {
1438 if (flags
& TH_FIN
) {
1440 tp
->t_flags
|= TF_SENTFIN
;
1446 if (SEQ_GT(tp
->snd_nxt
, tp
->snd_max
)) {
1447 tp
->snd_max
= tp
->snd_nxt
;
1449 * Time this transmission if not a retransmission and
1450 * not currently timing anything.
1452 if (tp
->t_rtttime
== 0) {
1454 tp
->t_rtseq
= startseq
;
1455 tcpstat
.tcps_segstimed
++;
1460 * Set retransmit timer if not currently set,
1461 * and not doing an ack or a keep-alive probe.
1462 * Initial value for retransmit timer is smoothed
1463 * round-trip time + 2 * round-trip time variance.
1464 * Initialize shift counter which is used for backoff
1465 * of retransmit time.
1468 if (tp
->t_timer
[TCPT_REXMT
] == 0 &&
1469 ((sack_rxmit
&& tp
->snd_nxt
!= tp
->snd_max
) ||
1470 tp
->snd_nxt
!= tp
->snd_una
)) {
1471 if (tp
->t_timer
[TCPT_PERSIST
]) {
1472 tp
->t_timer
[TCPT_PERSIST
] = 0;
1475 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
1479 * Persist case, update snd_max but since we are in
1480 * persist mode (no window) we do not update snd_nxt.
1485 if (flags
& TH_FIN
) {
1487 tp
->t_flags
|= TF_SENTFIN
;
1489 if (SEQ_GT(tp
->snd_nxt
+ xlen
, tp
->snd_max
))
1490 tp
->snd_max
= tp
->snd_nxt
+ len
;
1497 if (so_options
& SO_DEBUG
)
1498 tcp_trace(TA_OUTPUT
, tp
->t_state
, tp
, mtod(m
, void *), th
, 0);
1502 * Fill in IP length and desired time to live and
1503 * send to IP level. There should be a better way
1504 * to handle ttl and tos; we could keep them in
1505 * the template, but need a way to checksum without them.
1508 * m->m_pkthdr.len should have been set before checksum calculation,
1509 * because in6_cksum() needs it.
1514 * we separately set hoplimit for every segment, since the
1515 * user might want to change the value via setsockopt.
1516 * Also, desired default hop limit might be changed via
1517 * Neighbor Discovery.
1519 ip6
->ip6_hlim
= in6_selecthlim(tp
->t_inpcb
,
1520 tp
->t_inpcb
->in6p_route
.ro_rt
?
1521 tp
->t_inpcb
->in6p_route
.ro_rt
->rt_ifp
1524 /* TODO: IPv6 IP6TOS_ECT bit on */
1526 if (ipsec_bypass
== 0 && ipsec_setsocket(m
, so
) != 0) {
1532 m
->m_pkthdr
.socket_id
= socket_id
;
1533 error
= ip6_output(m
,
1535 &tp
->t_inpcb
->in6p_route
,
1536 (so_options
& SO_DONTROUTE
), NULL
, NULL
, 0);
1540 ip
->ip_len
= m
->m_pkthdr
.len
;
1543 ip
->ip_ttl
= in6_selecthlim(tp
->t_inpcb
,
1544 tp
->t_inpcb
->in6p_route
.ro_rt
?
1545 tp
->t_inpcb
->in6p_route
.ro_rt
->rt_ifp
1549 ip
->ip_ttl
= tp
->t_inpcb
->inp_ip_ttl
; /* XXX */
1550 ip
->ip_tos
|= (tp
->t_inpcb
->inp_ip_tos
& ~IPTOS_ECN_MASK
); /* XXX */
1555 KERNEL_DEBUG(DBG_LAYER_BEG
,
1556 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
1557 (((tp
->t_inpcb
->in6p_laddr
.s6_addr16
[0] & 0xffff) << 16) |
1558 (tp
->t_inpcb
->in6p_faddr
.s6_addr16
[0] & 0xffff)),
1564 KERNEL_DEBUG(DBG_LAYER_BEG
,
1565 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
1566 (((tp
->t_inpcb
->inp_laddr
.s_addr
& 0xffff) << 16) |
1567 (tp
->t_inpcb
->inp_faddr
.s_addr
& 0xffff)),
1572 * See if we should do MTU discovery.
1573 * Look at the flag updated on the following criteria:
1574 * 1) Path MTU discovery is authorized by the sysctl
1575 * 2) The route isn't set yet (unlikely but could happen)
1576 * 3) The route is up
1577 * 4) the MTU is not locked (if it is, then discovery has been
1578 * disabled for that route)
1581 if (path_mtu_discovery
&& (tp
->t_flags
& TF_PMTUD
))
1582 ip
->ip_off
|= IP_DF
;
1585 if (ipsec_bypass
== 0)
1586 ipsec_setsocket(m
, so
);
1590 * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1593 m
->m_pkthdr
.socket_id
= socket_id
;
1594 m
->m_nextpkt
= NULL
;
1595 tp
->t_pktlist_sentlen
+= len
;
1597 if (tp
->t_pktlist_head
!= NULL
) {
1598 tp
->t_pktlist_tail
->m_nextpkt
= m
;
1599 tp
->t_pktlist_tail
= m
;
1601 packchain_newlist
++;
1602 tp
->t_pktlist_head
= tp
->t_pktlist_tail
= m
;
1605 if (sendalot
== 0 || (tp
->t_state
!= TCPS_ESTABLISHED
) ||
1606 (tp
->snd_cwnd
<= (tp
->snd_wnd
/ 8)) ||
1607 (tp
->t_flags
& (TH_PUSH
| TF_ACKNOW
)) || tp
->t_force
!= 0 ||
1608 tp
->t_lastchain
>= tcp_packet_chaining
) {
1610 while (!(tp
->t_flags
& TF_SENDINPROG
) &&
1611 tp
->t_pktlist_head
!= NULL
) {
1612 packetlist
= tp
->t_pktlist_head
;
1613 packchain_listadd
= tp
->t_lastchain
;
1615 lost
= tp
->t_pktlist_sentlen
;
1616 TCP_PKTLIST_CLEAR(tp
);
1617 tp
->t_flags
|= TF_SENDINPROG
;
1619 error
= tcp_ip_output(so
, tp
, packetlist
,
1620 packchain_listadd
, tp_inp_options
,
1621 (so_options
& SO_DONTROUTE
), (sack_rxmit
| (sack_bytes_rxmt
!= 0)));
1623 tp
->t_flags
&= ~TF_SENDINPROG
;
1626 * Take into account the rest of unsent
1627 * packets in the packet list for this tcp
1628 * into "lost", since we're about to free
1629 * the whole list below.
1631 lost
+= tp
->t_pktlist_sentlen
;
1637 /* tcp was closed while we were in ip; resume close */
1638 if ((tp
->t_flags
& (TF_CLOSING
|TF_SENDINPROG
)) == TF_CLOSING
) {
1639 tp
->t_flags
&= ~TF_CLOSING
;
1640 (void) tcp_close(tp
);
1647 tcpstat
.tcps_sndtotal
++;
1649 if (recwin
> 0 && SEQ_GT(tp
->rcv_nxt
+recwin
, tp
->rcv_adv
))
1650 tp
->rcv_adv
= tp
->rcv_nxt
+ recwin
;
1651 tp
->last_ack_sent
= tp
->rcv_nxt
;
1652 tp
->t_flags
&= ~(TF_ACKNOW
|TF_DELACK
);
1658 * Assume that the packets were lost, so back out the
1659 * sequence number advance, if any. Note that the "lost"
1660 * variable represents the amount of user data sent during
1661 * the recent call to ip_output_list() plus the amount of
1662 * user data in the packet list for this tcp at the moment.
1664 if (tp
->t_force
== 0 || tp
->t_timer
[TCPT_PERSIST
] == 0) {
1666 * No need to check for TH_FIN here because
1667 * the TF_SENTFIN flag handles that case.
1669 if ((flags
& TH_SYN
) == 0) {
1672 tp
->sackhint
.sack_bytes_rexmit
-= lost
;
1674 tp
->snd_nxt
-= lost
;
1678 if (tp
->t_pktlist_head
!= NULL
)
1679 m_freem_list(tp
->t_pktlist_head
);
1680 TCP_PKTLIST_CLEAR(tp
);
1682 if (error
== ENOBUFS
) {
1683 if (!tp
->t_timer
[TCPT_REXMT
] &&
1684 !tp
->t_timer
[TCPT_PERSIST
])
1685 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
1687 tp
->snd_cwnd
= tp
->t_maxseg
;
1688 tp
->t_bytes_acked
= 0;
1690 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1693 if (error
== EMSGSIZE
) {
1695 * ip_output() will have already fixed the route
1696 * for us. tcp_mtudisc() will, as its last action,
1697 * initiate retransmission, so it is important to
1700 * If TSO was active we either got an interface
1701 * without TSO capabilities or TSO was turned off.
1702 * Disable it for this connection as well and
1703 * immediately retry with MSS sized segments generated
1707 tp
->t_flags
&= ~TF_TSO
;
1709 tcp_mtudisc(tp
->t_inpcb
, 0);
1710 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1713 if ((error
== EHOSTUNREACH
|| error
== ENETDOWN
)
1714 && TCPS_HAVERCVDSYN(tp
->t_state
)) {
1715 tp
->t_softerror
= error
;
1716 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1719 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1723 tcpstat
.tcps_sndtotal
++;
1726 * Data sent (as far as we can tell).
1727 * If this advertises a larger window than any other segment,
1728 * then remember the size of the advertised window.
1729 * Any pending ACK has now been sent.
1731 if (recwin
> 0 && SEQ_GT(tp
->rcv_nxt
+ recwin
, tp
->rcv_adv
))
1732 tp
->rcv_adv
= tp
->rcv_nxt
+ recwin
;
1733 tp
->last_ack_sent
= tp
->rcv_nxt
;
1734 tp
->t_flags
&= ~(TF_ACKNOW
|TF_DELACK
);
1736 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
,0,0,0,0,0);
1737 if (sendalot
&& (!tcp_do_newreno
|| --maxburst
))
1743 tcp_ip_output(struct socket
*so
, struct tcpcb
*tp
, struct mbuf
*pkt
,
1744 int cnt
, struct mbuf
*opt
, int flags
, int sack_in_progress
)
1748 boolean_t unlocked
= FALSE
;
1749 struct inpcb
*inp
= tp
->t_inpcb
;
1750 struct ip_out_args ipoa
;
1753 /* If socket was bound to an ifindex, tell ip_output about it */
1754 ipoa
.ipoa_ifscope
= (inp
->inp_flags
& INP_BOUND_IF
) ?
1755 inp
->inp_boundif
: IFSCOPE_NONE
;
1756 flags
|= IP_OUTARGS
;
1758 /* Copy the cached route and take an extra reference */
1759 inp_route_copyout(inp
, &ro
);
1762 * Make sure ACK/DELACK conditions are cleared before
1763 * we unlock the socket.
1765 tp
->t_flags
&= ~(TF_ACKNOW
| TF_DELACK
);
1768 * If allowed, unlock TCP socket while in IP
1769 * but only if the connection is established and
1770 * if we're not sending from an upcall.
1772 if (tcp_output_unlocked
&& ((so
->so_flags
& SOF_UPCALLINUSE
) == 0) &&
1773 (tp
->t_state
== TCPS_ESTABLISHED
) && (sack_in_progress
== 0)) {
1775 socket_unlock(so
, 0);
1779 * Don't send down a chain of packets when:
1780 * - TCP chaining is disabled
1781 * - there is an IPsec rule set
1782 * - there is a non default rule set for the firewall
1785 chain
= tcp_packet_chaining
> 1
1790 && (fw_enable
== 0 || fw_bypass
)
1792 ; // I'm important, not extraneous
1795 while (pkt
!= NULL
) {
1796 struct mbuf
*npkt
= pkt
->m_nextpkt
;
1799 pkt
->m_nextpkt
= NULL
;
1801 * If we are not chaining, make sure to set the packet
1802 * list count to 0 so that IP takes the right path;
1803 * this is important for cases such as IPSec where a
1804 * single mbuf might result in multiple mbufs as part
1805 * of the encapsulation. If a non-zero count is passed
1806 * down to IP, the head of the chain might change and
1807 * we could end up skipping it (thus generating bogus
1808 * packets). Fixing it in IP would be desirable, but
1809 * for now this would do it.
1813 error
= ip_output_list(pkt
, cnt
, opt
, &ro
, flags
, 0, &ipoa
);
1814 if (chain
|| error
) {
1816 * If we sent down a chain then we are done since
1817 * the callee had taken care of everything; else
1818 * we need to free the rest of the chain ourselves.
1830 /* Synchronize cached PCB route */
1831 inp_route_copyin(inp
, &ro
);
1838 register struct tcpcb
*tp
;
1840 int t
= ((tp
->t_srtt
>> 2) + tp
->t_rttvar
) >> 1;
1842 if (tp
->t_timer
[TCPT_REXMT
])
1843 panic("tcp_setpersist: retransmit pending");
1845 * Start/restart persistence timer.
1847 TCPT_RANGESET(tp
->t_timer
[TCPT_PERSIST
],
1848 t
* tcp_backoff
[tp
->t_rxtshift
],
1849 TCPTV_PERSMIN
, TCPTV_PERSMAX
);
1850 if (tp
->t_rxtshift
< TCP_MAXRXTSHIFT
)