bsd/netinet/tcp_output.c

   1 /*
   2  * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_output.c        8.4 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #define _IP_VHL
  71
  72
  73 #include <sys/param.h>
  74 #include <sys/systm.h>
  75 #include <sys/kernel.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/domain.h>
  79 #include <sys/protosw.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82
  83 #include <net/route.h>
  84 #include <net/ntstat.h>
  85 #include <net/if_var.h>
  86
  87 #include <netinet/in.h>
  88 #include <netinet/in_systm.h>
  89 #include <netinet/in_var.h>
  90 #include <netinet/ip.h>
  91 #include <netinet/in_pcb.h>
  92 #include <netinet/ip_var.h>
  93 #include <mach/sdt.h>
  94 #if INET6
  95 #include <netinet6/in6_pcb.h>
  96 #include <netinet/ip6.h>
  97 #include <netinet6/ip6_var.h>
  98 #endif
  99 #include <netinet/tcp.h>
 100 #define TCPOUTFLAGS
 101 #include <netinet/tcp_fsm.h>
 102 #include <netinet/tcp_seq.h>
 103 #include <netinet/tcp_timer.h>
 104 #include <netinet/tcp_var.h>
 105 #include <netinet/tcpip.h>
 106 #include <netinet/tcp_cc.h>
 107 #if TCPDEBUG
 108 #include <netinet/tcp_debug.h>
 109 #endif
 110 #include <sys/kdebug.h>
 111 #include <mach/sdt.h>
 112
 113 #if IPSEC
 114 #include <netinet6/ipsec.h>
 115 #endif /*IPSEC*/
 116
 117 #if CONFIG_MACF_NET
 118 #include <security/mac_framework.h>
 119 #endif /* MAC_SOCKET */
 120
     /*
      * kdebug trace codes, all built with NETDBG_CODE() in the DBG_NETTCP class.
      * tcp_output() passes DBG_FNC_TCP_OUTPUT (or'ed with DBG_FUNC_START or
      * DBG_FUNC_END) and DBG_LAYER_BEG to KERNEL_DEBUG.
      */
 121 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 1)
 122 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 3)
 123 #define DBG_FNC_TCP_OUTPUT      NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
 124
     /* This declaration is only seen by the compiler when `notyet` is defined. */
 125 #ifdef notyet
 126 extern struct mbuf *m_copypack();
 127 #endif
 128
     /*
      * The tunables in this group are exposed through SYSCTL_INT() under the
      * _net_inet_tcp node with CTLFLAG_RW | CTLFLAG_LOCKED.
      */
     /*
      * path_mtu_discovery (default 1): while revalidating a changed route,
      * tcp_output() clears TF_PMTUD when this is 0, or when the route is down
      * or has its MTU locked, and sets TF_PMTUD otherwise.
      */
 129 int path_mtu_discovery = 1;
 130 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW | CTLFLAG_LOCKED,
 131         &path_mtu_discovery, 1, "Enable Path MTU Discovery");
 132
     /*
      * Slow start flight size (default 1).
      * NOTE(review): presumably counted in segments like ss_fltsz_local - confirm.
      */
 133 int ss_fltsz = 1;
 134 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED,
 135         &ss_fltsz, 1, "Slow start flight size");
 136
     /* Slow start flight size used for local networks (per the description string). */
 137 int ss_fltsz_local = 8; /* starts with eight segments max */
 138 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED,
 139         &ss_fltsz_local, 1, "Slow start flight size for local networks");
 140
     /*
      * tcp_do_tso (default 1): global TSO switch.  tcp_output() only keeps a
      * send larger than t_maxseg as a TSO burst when this is nonzero and the
      * connection has TF_TSO set, among other conditions.
      * NOTE(review): the value argument given to SYSCTL_INT() is 0 here but 1
      * for the sysctls above - confirm whether it matters when a pointer is
      * supplied.
      */
 141 int     tcp_do_tso = 1;
 142 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
 143         &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
 144
 145
     /*
      * ECN switches, both 0 by default; their meaning is taken from the
      * description strings passed to SYSCTL_INT().
      */
 146 int     tcp_ecn_outbound = 0;
 147 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound,
 148         0, "Initiate ECN for outbound connections");
 149
 150 int     tcp_ecn_inbound = 0;
 151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound,
 152         0, "Allow ECN negotiation for inbound connections");
 153
     /*
      * NOTE(review): the description reads like an on/off switch but the
      * default is 50; presumably a limit on the number of chained packets -
      * confirm against the code that reads it.
      */
 154 int     tcp_packet_chaining = 50;
 155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining,
 156         0, "Enable TCP output packet chaining");
 157
     /* Default 1; per its description, unlock TCP while packets are sent down to IP. */
 158 int     tcp_output_unlocked = 1;
 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked,
 160         0, "Unlock TCP when sending packets down to IP");
 161
     /*
      * tcp_do_rfc3390 (default 1): read-write sysctl "rfc3390" under the
      * _net_inet_tcp node.  Per its description it selects computing the
      * initial slow-start congestion window from the MSS.
      */
 162 int tcp_do_rfc3390 = 1;
 163 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW | CTLFLAG_LOCKED,
 164         &tcp_do_rfc3390, 1, "Calculate initial slowstart cwnd depending on MSS");
 165
     /*
      * IAJ stands for inter-packet arrival jitter (see the description string
      * below).  Both tunables take their defaults from macros defined
      * elsewhere and are exposed read-write under _net_inet_tcp.
      */
 166 int tcp_min_iaj_win = MIN_IAJ_WIN;
 167 SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win, CTLFLAG_RW | CTLFLAG_LOCKED,
 168         &tcp_min_iaj_win, 1, "Minimum recv win based on inter-packet arrival jitter");
 169
 170 int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT;
 171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit, CTLFLAG_RW | CTLFLAG_LOCKED,
 172         &tcp_acc_iaj_react_limit, 1, "Accumulated IAJ when receiver starts to react");
 173
     /*
      * File-private packet-chaining counters.  packchain_sent is incremented
      * by tcp_output() each time it hands a queued packet list to
      * tcp_ip_output().
      */
 174 static int32_t packchain_newlist = 0;
 175 static int32_t packchain_looped = 0;
 176 static int32_t packchain_sent = 0;
 177
     /* Globals defined in other files that tcp_output() and its helpers consult. */
 178 /* temporary: for testing */
 179 #if IPSEC
     /* tcp_output() only calls ipsec_hdrsiz_tcp() when this is 0. */
 180 extern int ipsec_bypass;
 181 #endif
 182
     /* When TF_SLOWLINK is set and this is > 0, tcp_output() caps sendwin to it. */
 183 extern int slowlink_wsize;      /* window correction for slow links */
 184 #if IPFIREWALL
     /* tcp_output() allows TSO only when fw_enable is 0 or fw_bypass is nonzero. */
 185 extern int fw_enable;           /* firewall check for packet chaining */
 186 extern int fw_bypass;           /* firewall check: disable packet chaining if there are rules */
 187 #endif /* IPFIREWALL */
 188
     /* Divisor used by get_socket_id(); a value of 0 makes it return 0. */
 189 extern vm_size_t        so_cache_zone_element_size;
 190 #if RANDOM_IP_ID
     /* Must be nonzero for tcp_output() to choose TSO when RANDOM_IP_ID is built in. */
 191 extern int              ip_use_randomid;
 192 #endif /* RANDOM_IP_ID */
     /* Both counts must be 0 for tcp_output() to choose TSO. */
 193 extern u_int32_t dlil_filter_count;
 194 extern u_int32_t kipf_count;
 195 extern int tcp_recv_bg;
 196
     /*
      * File-local helper that tcp_output() calls with a queued packet list.
      * The prototype gives no parameter names; see that call for the argument
      * order (socket, tcpcb, packet list, chain count, IP options mbuf,
      * SO_DONTROUTE flag, SACK-in-progress flag, and a final int32_t).
      */
 197 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
 198     struct mbuf *, int, int, int32_t);
 199
     /* Forward declaration; the definition follows get_socket_id() below. */
 200 static inline int is_tcp_recv_bg(struct socket *so);
 201
 202 /* Map a socket pointer to a 16-bit id; 0 is returned only when the zone element size is 0. */
 203 static __inline__ u_int16_t
 204 get_socket_id(struct socket * s)
 205 {
 206         const uintptr_t         so_addr = (uintptr_t)s;
 207         u_int16_t               id = 0;
 208         if (so_cache_zone_element_size != 0) {
 209                 /* Truncated quotient of address by element size; a zero result becomes 0xffff. */
 210                 id = (u_int16_t)(so_addr / so_cache_zone_element_size);
 211                 if (id == 0)
 212                         id = 0xffff;
 213         }
 214         return (id);
 215 }
 216
 217 /* Nonzero when TRAFFIC_MGT_TCP_RECVBG is set in the socket's so_traffic_mgt_flags. */
 218 static inline int is_tcp_recv_bg(struct socket *so) {
 219         const int recv_bg = so->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG;
 220         return (recv_bg);
 221 }
 222
 223 /*
 224  * Tcp output routine: figure out what should be sent and send it.
 225  *
 226  * Returns:     0                       Success
 227  *              EADDRNOTAVAIL
 228  *              ENOBUFS
 229  *              EMSGSIZE
 230  *              EHOSTUNREACH
 231  *              ENETDOWN
 232  *      ip_output_list:ENOMEM
 233  *      ip_output_list:EADDRNOTAVAIL
 234  *      ip_output_list:ENETUNREACH
 235  *      ip_output_list:EHOSTUNREACH
 236  *      ip_output_list:EACCES
 237  *      ip_output_list:EMSGSIZE
 238  *      ip_output_list:ENOBUFS
 239  *      ip_output_list:???              [ignorable: mostly IPSEC/firewall/DLIL]
 240  *      ip6_output:???                  [IPV6 only]
 241  */
 242 int
 243 tcp_output(struct tcpcb *tp)
 244 {
 245         struct socket *so = tp->t_inpcb->inp_socket;
 246         int32_t len, recwin, sendwin, off;
 247         int flags, error;
 248         register struct mbuf *m;
 249         struct ip *ip = NULL;
 250         register struct ipovly *ipov = NULL;
 251 #if INET6
 252         struct ip6_hdr *ip6 = NULL;
 253 #endif /* INET6 */
 254         register struct tcphdr *th;
 255         u_char opt[TCP_MAXOLEN];
 256         unsigned ipoptlen, optlen, hdrlen;
 257         int idle, sendalot, lost = 0;
 258         int i, sack_rxmit;
 259         int tso = 0;
 260         int sack_bytes_rxmt;
 261         struct sackhole *p;
 262 #ifdef IPSEC
 263         unsigned ipsec_optlen = 0;
 264 #endif
 265         int    last_off = 0;
 266         int    m_off;
 267         int    idle_time = 0;
 268         struct mbuf *m_lastm = NULL;
 269         struct mbuf *m_head = NULL;
 270         struct mbuf *packetlist = NULL;
 271         struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
 272 #if INET6
 273         int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ;
 274         struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts;
 275 #endif
 276         short packchain_listadd = 0;
 277         u_int16_t       socket_id = get_socket_id(so);
 278         int so_options = so->so_options;
 279         struct rtentry *rt;
 280
 281         /*
 282          * Determine length of data that should be transmitted,
 283          * and flags that will be used.
 284          * If there is some data or critical controls (SYN, RST)
 285          * to send, then transmit; otherwise, investigate further.
 286          */
 287         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 288
 289         /* Since idle_time is signed integer, the following integer subtraction
 290          * will take care of wrap around of tcp_now
 291          */
 292         idle_time = tcp_now - tp->t_rcvtime;
 293         if (idle && idle_time >= tp->t_rxtcur) {
 294                 if (CC_ALGO(tp)->after_idle != NULL)
 295                         CC_ALGO(tp)->after_idle(tp);
 296                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
 297                         struct tcpcb *, tp, struct tcphdr *, NULL,
 298                         int32_t, TCP_CC_IDLE_TIMEOUT);
 299         }
 300         tp->t_flags &= ~TF_LASTIDLE;
 301         if (idle) {
 302                 if (tp->t_flags & TF_MORETOCOME) {
 303                         tp->t_flags |= TF_LASTIDLE;
 304                         idle = 0;
 305                 }
 306         }
 307 again:
 308         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 309
 310 #if INET6
 311         if (isipv6) {
 312
 313                 KERNEL_DEBUG(DBG_LAYER_BEG,
 314                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 315                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 316                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
 317                      sendalot,0,0);
 318         }
 319         else
 320 #endif
 321
 322         {
 323                 KERNEL_DEBUG(DBG_LAYER_BEG,
 324                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 325                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
 326                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
 327                      sendalot,0,0);
 328         /*
 329          * If the route generation id changed, we need to check that our
 330          * local (source) IP address is still valid. If it isn't either
 331          * return error or silently do nothing (assuming the address will
 332          * come back before the TCP connection times out).
 333          */
 334         rt = tp->t_inpcb->inp_route.ro_rt;
 335         if (rt != NULL && (!(rt->rt_flags & RTF_UP) ||
 336             rt->generation_id != route_generation)) {
 337                 struct ifnet *ifp;
 338                 struct in_ifaddr *ia;
 339
 340                 /* disable multipages at the socket */
 341                 somultipages(so, FALSE);
 342
 343                 /* Disable TSO for the socket until we know more */
 344                 tp->t_flags &= ~TF_TSO;
 345
 346                 /* check that the source address is still valid */
 347                 if ((ia = ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr)) == NULL) {
 348
 349                         if (tp->t_state >= TCPS_CLOSE_WAIT) {
 350                                 tcp_drop(tp, EADDRNOTAVAIL);
 351                                 return(EADDRNOTAVAIL);
 352                         }
 353
 354                         /* set Retransmit  timer if it wasn't set
 355                          * reset Persist timer and shift register as the
 356                          * advertised peer window may not be valid anymore
 357                          */
 358
 359                         if (!tp->t_timer[TCPT_REXMT]) {
 360                                 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
 361                                 if (tp->t_timer[TCPT_PERSIST]) {
 362                                         tp->t_timer[TCPT_PERSIST] = 0;
 363                                         tp->t_rxtshift = 0;
 364                                         tp->t_persist_stop = 0;
 365                                         tp->rxt_start = 0;
 366                                 }
 367                         }
 368
 369                         if (tp->t_pktlist_head != NULL)
 370                                 m_freem_list(tp->t_pktlist_head);
 371                         TCP_PKTLIST_CLEAR(tp);
 372
 373                         /* drop connection if source address isn't available */
 374                         if (so->so_flags & SOF_NOADDRAVAIL) {
 375                                 tcp_drop(tp, EADDRNOTAVAIL);
 376                                 return(EADDRNOTAVAIL);
 377                         }
 378                         else {
 379                                 tcp_check_timer_state(tp);
 380                                 return(0); /* silently ignore, keep data in socket: address may be back */
 381                         }
 382                 }
 383                 IFA_REMREF(&ia->ia_ifa);
 384
 385                 /*
 386                  * Address is still valid; check for multipages capability
 387                  * again in case the outgoing interface has changed.
 388                  */
 389                 RT_LOCK(rt);
 390                 if ((ifp = rt->rt_ifp) != NULL) {
 391                         somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
 392                         tcp_set_tso(tp, ifp);
 393                 }
 394                 if (rt->rt_flags & RTF_UP)
 395                         rt->generation_id = route_generation;
 396                 /*
 397                  * See if we should do MTU discovery. Don't do it if:
 398                  *      1) it is disabled via the sysctl
 399                  *      2) the route isn't up
 400                  *      3) the MTU is locked (if it is, then discovery has been
 401                  *         disabled)
 402                  */
 403
 404                 if (!path_mtu_discovery || ((rt != NULL) &&
 405                     (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
 406                         tp->t_flags &= ~TF_PMTUD;
 407                 else
 408                         tp->t_flags |= TF_PMTUD;
 409
 410                 RT_UNLOCK(rt);
 411         }
 412         }
 413
 414         /*
 415          * If we've recently taken a timeout, snd_max will be greater than
 416          * snd_nxt.  There may be SACK information that allows us to avoid
 417          * resending already delivered data.  Adjust snd_nxt accordingly.
 418          */
 419         if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
 420                 tcp_sack_adjust(tp);
 421         sendalot = 0;
 422         off = tp->snd_nxt - tp->snd_una;
 423         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 424
 425         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
 426                 sendwin = min(sendwin, slowlink_wsize);
 427
 428         flags = tcp_outflags[tp->t_state];
 429         /*
 430          * Send any SACK-generated retransmissions.  If we're explicitly trying
 431          * to send out new data (when sendalot is 1), bypass this function.
 432          * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 433          * we're replacing a (future) new transmission with a retransmission
 434          * now, and we previously incremented snd_cwnd in tcp_input().
 435          */
 436         /*
 437          * Still in sack recovery , reset rxmit flag to zero.
 438          */
 439         sack_rxmit = 0;
 440         sack_bytes_rxmt = 0;
 441         len = 0;
 442         p = NULL;
 443         if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
 444             (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 445                 int32_t cwin;
 446
 447                 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 448                 if (cwin < 0)
 449                         cwin = 0;
 450                 /* Do not retransmit SACK segments beyond snd_recover */
 451                 if (SEQ_GT(p->end, tp->snd_recover)) {
 452                         /*
 453                          * (At least) part of sack hole extends beyond
 454                          * snd_recover. Check to see if we can rexmit data
 455                          * for this hole.
 456                          */
 457                         if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 458                                 /*
 459                                  * Can't rexmit any more data for this hole.
 460                                  * That data will be rexmitted in the next
 461                                  * sack recovery episode, when snd_recover
 462                                  * moves past p->rxmit.
 463                                  */
 464                                 p = NULL;
 465                                 goto after_sack_rexmit;
 466                         } else
 467                                 /* Can rexmit part of the current hole */
 468                                 len = ((int32_t)min(cwin,
 469                                                    tp->snd_recover - p->rxmit));
 470                 } else
 471                         len = ((int32_t)min(cwin, p->end - p->rxmit));
 472                 if (len > 0) {
 473                         off = p->rxmit - tp->snd_una; /* update off only if we really transmit SACK data */
 474                         sack_rxmit = 1;
 475                         sendalot = 1;
 476                         tcpstat.tcps_sack_rexmits++;
 477                         tcpstat.tcps_sack_rexmit_bytes +=
 478                             min(len, tp->t_maxseg);
 479                         if (nstat_collect) {
 480                                 nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, min(len, tp->t_maxseg), NSTAT_TX_FLAG_RETRANSMIT);
 481                                 locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
 482                                 locked_add_64(&tp->t_inpcb->inp_stat->txbytes, min(len, tp->t_maxseg));
 483                                 tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg);
 484                         }
 485                 }
 486                 else
 487                         len = 0;
 488         }
 489 after_sack_rexmit:
 490         /*
 491          * Get standard flags, and add SYN or FIN if requested by 'hidden'
 492          * state flags.
 493          */
 494         if (tp->t_flags & TF_NEEDFIN)
 495                 flags |= TH_FIN;
 496         if (tp->t_flags & TF_NEEDSYN)
 497                 flags |= TH_SYN;
 498
 499         /*
 500          * If in persist timeout with window of 0, send 1 byte.
 501          * Otherwise, if window is small but nonzero
 502          * and timer expired, we will send what we can
 503          * and go to transmit state.
 504          */
 505         if (tp->t_force) {
 506                 if (sendwin == 0) {
 507                         /*
 508                          * If we still have some data to send, then
 509                          * clear the FIN bit.  Usually this would
 510                          * happen below when it realizes that we
 511                          * aren't sending all the data.  However,
 512                          * if we have exactly 1 byte of unsent data,
 513                          * then it won't clear the FIN bit below,
 514                          * and if we are in persist state, we wind
 515                          * up sending the packet without recording
 516                          * that we sent the FIN bit.
 517                          *
 518                          * We can't just blindly clear the FIN bit,
 519                          * because if we don't have any more data
 520                          * to send then the probe will be the FIN
 521                          * itself.
 522                          */
 523                         if (off < so->so_snd.sb_cc)
 524                                 flags &= ~TH_FIN;
 525                         sendwin = 1;
 526                 } else {
 527                         tp->t_timer[TCPT_PERSIST] = 0;
 528                         tp->t_rxtshift = 0;
 529                         tp->rxt_start = 0;
 530                         tp->t_persist_stop = 0;
 531                 }
 532         }
 533
 534         /*
 535          * If snd_nxt == snd_max and we have transmitted a FIN, the
 536          * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 537          * a negative length.  This can also occur when TCP opens up
 538          * its congestion window while receiving additional duplicate
 539          * acks after fast-retransmit because TCP will reset snd_nxt
 540          * to snd_max after the fast-retransmit.
 541          *
 542          * In the normal retransmit-FIN-only case, however, snd_nxt will
 543          * be set to snd_una, the offset will be 0, and the length may
 544          * wind up 0.
 545          *
 546          * If sack_rxmit is true we are retransmitting from the scoreboard
 547          * in which case len is already set.
 548          */
 549         if (sack_rxmit == 0) {
 550                 if (sack_bytes_rxmt == 0)
 551                         len = min(so->so_snd.sb_cc, sendwin) - off;
 552                 else {
 553                         int32_t cwin;
 554
 555                         /*
 556                          * We are inside of a SACK recovery episode and are
 557                          * sending new data, having retransmitted all the
 558                          * data possible in the scoreboard.
 559                          */
 560                         len = min(so->so_snd.sb_cc, tp->snd_wnd)
 561                                - off;
 562                         /*
 563                          * Don't remove this (len > 0) check !
 564                          * We explicitly check for len > 0 here (although it
 565                          * isn't really necessary), to work around a gcc
 566                          * optimization issue - to force gcc to compute
 567                          * len above. Without this check, the computation
 568                          * of len is bungled by the optimizer.
 569                          */
 570                         if (len > 0) {
 571                                 cwin = tp->snd_cwnd -
 572                                         (tp->snd_nxt - tp->sack_newdata) -
 573                                         sack_bytes_rxmt;
 574                                 if (cwin < 0)
 575                                         cwin = 0;
 576                                 len = imin(len, cwin);
 577                         }
 578                         else
 579                                 len = 0;
 580                 }
 581         }
 582
 583         /*
 584          * Lop off SYN bit if it has already been sent.  However, if this
 585          * is SYN-SENT state and if segment contains data and if we don't
 586          * know that foreign host supports TAO, suppress sending segment.
 587          */
 588         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 589                 flags &= ~TH_SYN;
 590                 off--, len++;
 591                 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
 592                         while (!(tp->t_flags & TF_SENDINPROG) &&
 593                             tp->t_pktlist_head != NULL) {
 594                                 packetlist = tp->t_pktlist_head;
 595                                 packchain_listadd = tp->t_lastchain;
 596                                 packchain_sent++;
 597                                 TCP_PKTLIST_CLEAR(tp);
 598                                 tp->t_flags |= TF_SENDINPROG;
 599
 600                                 error = tcp_ip_output(so, tp, packetlist,
 601                                     packchain_listadd, tp_inp_options,
 602                                     (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), 0);
 603
 604                                 tp->t_flags &= ~TF_SENDINPROG;
 605                         }
 606                         /* tcp was closed while we were in ip; resume close */
 607                         if ((tp->t_flags &
 608                             (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 609                                 tp->t_flags &= ~TF_CLOSING;
 610                                 (void) tcp_close(tp);
 611                         } else {
 612                                 tcp_check_timer_state(tp);
 613                         }
 614                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
 615                             0,0,0,0,0);
 616                         return 0;
 617                 }
 618         }
 619
 620         /*
 621          * Be careful not to send data and/or FIN on SYN segments.
 622          * This measure is needed to prevent interoperability problems
 623          * with not fully conformant TCP implementations.
 624          */
 625         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 626                 len = 0;
 627                 flags &= ~TH_FIN;
 628         }
 629
 630         /* The check here used to be (len < 0). Sometimes len is zero when
 631          * the congestion window is closed and we need to check if persist timer
 632          * has to be set in that case. But don't set persist until connection
 633          * is established.
 634          */
 635         if (len <= 0 && !(flags & TH_SYN)) {
 636                 /*
 637                  * If FIN has been sent but not acked,
 638                  * but we haven't been called to retransmit,
 639                  * len will be < 0.  Otherwise, window shrank
 640                  * after we sent into it.  If window shrank to 0,
 641                  * cancel pending retransmit, pull snd_nxt back
 642                  * to (closed) window, and set the persist timer
 643                  * if it isn't already going.  If the window didn't
 644                  * close completely, just wait for an ACK.
 645                  */
 646                 len = 0;
 647                 if (sendwin == 0) {
 648                         tp->t_timer[TCPT_REXMT] = 0;
 649                         tp->t_rxtshift = 0;
 650                         tp->rxt_start = 0;
 651                         tp->snd_nxt = tp->snd_una;
 652                         if (tp->t_timer[TCPT_PERSIST] == 0)
 653                                 tcp_setpersist(tp);
 654                 }
 655         }
 656
 657         /*
 658          * Truncate to the maximum segment length or enable TCP Segmentation
 659          * Offloading (if supported by hardware) and ensure that FIN is removed
 660          * if the length no longer contains the last data byte.
 661          *
 662          * TSO may only be used if we are in a pure bulk sending state.  The
 663          * presence of TCP-MD5, SACK retransmits, SACK advertisements, ipfw rules
 664          * and IP options prevent using TSO.  With TSO the TCP header is the same
 665          * (except for the sequence number) for all generated packets.  This
 666          * makes it impossible to transmit any options which vary per generated
 667          * segment or packet.
 668          *
 669          * The length of TSO bursts is limited to TCP_MAXWIN.  That limit and
 670          * removal of FIN (if not already caught here) are handled later after
 671          * the exact length of the TCP options are known.
 672          */
 673 #if IPSEC
 674         /*
 675          * Pre-calculate here as we save another lookup into the darknesses
 676          * of IPsec that way and can actually decide if TSO is ok.
 677          */
 678         if (ipsec_bypass == 0)
 679                 ipsec_optlen = ipsec_hdrsiz_tcp(tp);
 680 #endif
 681
 682         if (len > tp->t_maxseg) {
 683                 if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
 684 #if RANDOM_IP_ID
 685                     ip_use_randomid &&
 686 #endif /* RANDOM_IP_ID */
 687                     kipf_count == 0 && dlil_filter_count == 0 &&
 688                     tp->rcv_numsacks == 0 && sack_rxmit == 0  && sack_bytes_rxmt == 0 &&
 689                     tp->t_inpcb->inp_options == NULL &&
 690                     tp->t_inpcb->in6p_options == NULL
 691 #if IPSEC
 692                     && ipsec_optlen == 0
 693 #endif
 694 #if IPFIREWALL
 695                     && (fw_enable == 0 || fw_bypass)
 696 #endif
 697                     ) {
 698                         tso = 1;
 699                         sendalot = 0;
 700                 } else {
 701                         len = tp->t_maxseg;
 702                         sendalot = 1;
 703                         tso = 0;
 704                 }
 705         }
 706         if (sack_rxmit) {
 707                 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 708                         flags &= ~TH_FIN;
 709         } else {
 710                 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 711                         flags &= ~TH_FIN;
 712         }
 713
 714         recwin = tcp_sbspace(tp);
 715
 716         /*
 717          * Sender silly window avoidance.   We transmit under the following
 718          * conditions when len is non-zero:
 719          *
 720          *      - We have a full segment (or more with TSO)
 721          *      - This is the last buffer in a write()/send() and we are
 722          *        either idle or running NODELAY
 723          *      - we've timed out (e.g. persist timer)
 724          *      - we have more than 1/2 the maximum send window's worth of
 725          *        data (receiver may have limited the window size)
 726          *      - we need to retransmit
 727          */
 728         if (len) {
 729                 if (len >= tp->t_maxseg) {
 730                         tp->t_flags |= TF_MAXSEGSNT;
 731                         goto send;
 732                 }
 733                 if (!(tp->t_flags & TF_MORETOCOME) &&
 734                     (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) &&
 735                     (tp->t_flags & TF_NOPUSH) == 0 &&
 736                     len + off >= so->so_snd.sb_cc) {
 737                         tp->t_flags &= ~TF_MAXSEGSNT;
 738                         goto send;
 739                 }
 740                 if (tp->t_force) {
 741                         tp->t_flags &= ~TF_MAXSEGSNT;
 742                         goto send;
 743                 }
 744                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 745                         tp->t_flags &= ~TF_MAXSEGSNT;
 746                         goto send;
 747                 }
 748                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 749                         tp->t_flags &= ~TF_MAXSEGSNT;
 750                         goto send;
 751                 }
 752                 if (sack_rxmit)
 753                         goto send;
 754         }
 755
 756         /*
 757          * Compare available window to amount of window
 758          * known to peer (as advertised window less
 759          * next expected input).  If the difference is at least two
 760          * max size segments, or at least 50% of the maximum possible
 761          * window, then want to send a window update to peer.
 762          * Skip this if the connection is in T/TCP half-open state.
 763          */
 764         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
 765                 /*
 766                  * "adv" is the amount we can increase the window,
 767                  * taking into account that we are limited by
 768                  * TCP_MAXWIN << tp->rcv_scale.
 769                  */
 770                 int32_t adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
 771                         (tp->rcv_adv - tp->rcv_nxt);
 772
 773                 if (adv >= (int32_t) (2 * tp->t_maxseg)) {
 774
 775                         /*
 776                          * Update only if the resulting scaled value of the window changed, or
 777                          * if there is a change in the sequence since the last ack.
 778                          * This avoids what appears as dupe ACKS (see rdar://5640997)
 779                          */
 780
 781                         if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin))
 782                                 goto send;
 783                 }
 784                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
 785                                 goto send;
 786         }
 787
 788         /*
 789          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 790          * is also a catch-all for the retransmit timer timeout case.
 791          */
 792         if (tp->t_flags & TF_ACKNOW)
 793                 goto send;
 794         if ((flags & TH_RST) ||
 795             ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 796                 goto send;
 797         if (SEQ_GT(tp->snd_up, tp->snd_una))
 798                 goto send;
 799         /*
 800          * If our state indicates that FIN should be sent
 801          * and we have not yet done so, then we need to send.
 802          */
 803         if (flags & TH_FIN &&
 804             ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 805                 goto send;
 806         /*
 807          * In SACK, it is possible for tcp_output to fail to send a segment
 808          * after the retransmission timer has been turned off.  Make sure
 809          * that the retransmission timer is set.
 810          */
 811         if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) &&
 812                 tp->t_timer[TCPT_REXMT] == 0 &&
 813             tp->t_timer[TCPT_PERSIST] == 0) {
 814                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
 815                         goto just_return;
 816         }
 817         /*
 818          * TCP window updates are not reliable, rather a polling protocol
 819          * using ``persist'' packets is used to insure receipt of window
 820          * updates.  The three ``states'' for the output side are:
 821          *      idle                    not doing retransmits or persists
 822          *      persisting              to move a small or zero window
 823          *      (re)transmitting        and thereby not persisting
 824          *
 825          * tp->t_timer[TCPT_PERSIST]
 826          *      is set when we are in persist state.
 827          * tp->t_force
 828          *      is set when we are called to send a persist packet.
 829          * tp->t_timer[TCPT_REXMT]
 830          *      is set when we are retransmitting
 831          * The output side is idle when both timers are zero.
 832          *
 833          * If send window is too small, there is data to transmit, and no
 834          * retransmit or persist is pending, then go to persist state.
 835          * If nothing happens soon, send when timer expires:
 836          * if window is nonzero, transmit what we can,
 837          * otherwise force out a byte.
 838          */
 839         if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 840             tp->t_timer[TCPT_PERSIST] == 0) {
 841                 tp->t_rxtshift = 0;
 842                 tp->rxt_start = 0;
 843                 tcp_setpersist(tp);
 844         }
 845 just_return:
 846         /*
 847          * If there is no reason to send a segment, just return.
 848          * but if there is some packets left in the packet list, send them now.
 849          */
 850         while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) {
 851                 packetlist = tp->t_pktlist_head;
 852                 packchain_listadd = tp->t_lastchain;
 853                 packchain_sent++;
 854                 TCP_PKTLIST_CLEAR(tp);
 855                 tp->t_flags |= TF_SENDINPROG;
 856
 857                 error = tcp_ip_output(so, tp, packetlist, packchain_listadd,
 858                     tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin);
 859
 860                 tp->t_flags &= ~TF_SENDINPROG;
 861         }
 862         /* tcp was closed while we were in ip; resume close */
 863         if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 864                 tp->t_flags &= ~TF_CLOSING;
 865                 (void) tcp_close(tp);
 866         } else {
 867                 tcp_check_timer_state(tp);
 868         }
 869         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 870         return (0);
 871
 872 send:
 873         /*
 874          * Before ESTABLISHED, force sending of initial options
 875          * unless TCP set not to do any options.
 876          * NOTE: we assume that the IP/TCP header plus TCP options
 877          * always fit in a single mbuf, leaving room for a maximum
 878          * link header, i.e.
 879          *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 880          */
 881         optlen = 0;
 882 #if INET6
 883         if (isipv6)
 884                 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 885         else
 886 #endif
 887         hdrlen = sizeof (struct tcpiphdr);
 888         if (flags & TH_SYN) {
 889                 tp->snd_nxt = tp->iss;
 890                 if ((tp->t_flags & TF_NOOPT) == 0) {
 891                         u_short mss;
 892
 893                         opt[0] = TCPOPT_MAXSEG;
 894                         opt[1] = TCPOLEN_MAXSEG;
 895                         mss = htons((u_short) tcp_mssopt(tp));
 896                         (void)memcpy(opt + 2, &mss, sizeof(mss));
 897                         optlen = TCPOLEN_MAXSEG;
 898
 899                         if ((tp->t_flags & TF_REQ_SCALE) &&
 900                             ((flags & TH_ACK) == 0 ||
 901                             (tp->t_flags & TF_RCVD_SCALE))) {
 902                                 *((u_int32_t *)(opt + optlen)) = htonl(
 903                                         TCPOPT_NOP << 24 |
 904                                         TCPOPT_WINDOW << 16 |
 905                                         TCPOLEN_WINDOW << 8 |
 906                                         tp->request_r_scale);
 907                                 optlen += 4;
 908                         }
 909                 }
 910
 911         }
 912
 913         /*
 914           RFC 3168 states that:
 915            - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
 916            to handle the TCP ECE flag, even if you also later send a
 917            non-ECN-setup SYN/SYN-ACK.
 918            - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
 919            the ip ECT flag.
 920
 921            It is not clear how the ECE flag would ever be set if you never
 922            set the IP ECT flag on outbound packets. All the same, we use
 923            the TE_SETUPSENT to indicate that we have committed to handling
 924            the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
 925            whether or not we should set the IP ECT flag on outbound packets.
 926          */
 927         /*
 928          * For a SYN-ACK, send an ECN setup SYN-ACK
 929          */
 930         if (tcp_ecn_inbound && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 931                 if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
 932                         if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 933                                 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
 934                                 flags |= TH_ECE;
 935
 936                                 /*
 937                                  * Record that we sent the ECN-setup and default to
 938                                  * setting IP ECT.
 939                                  */
 940                                 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 941                         }
 942                         else {
 943                                 /*
 944                                  * We sent an ECN-setup SYN-ACK but it was dropped.
 945                                  * Fall back to a non-ECN-setup SYN-ACK and clear the flag
 946                                  * to indicate that we should not send data with IP ECT set.
 947                                  *
 948                                  * Pretend we didn't receive an ECN-setup SYN.
 949                                  */
 950                                 tp->ecn_flags &= ~TE_SETUPRECEIVED;
 951                         }
 952                 }
 953         }
 954         else if (tcp_ecn_outbound && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
 955                 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 956                         /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
 957                         flags |= (TH_ECE | TH_CWR);
 958
 959                         /*
 960                          * Record that we sent the ECN-setup and default to
 961                          * setting IP ECT.
 962                          */
 963                         tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 964                 }
 965                 else {
 966                         /*
 967                          * We sent an ECN-setup SYN but it was dropped.
 968                          * Fall back to no ECN and clear flag indicating
 969                          * we should send data with IP ECT set.
 970                          */
 971                         tp->ecn_flags &= ~TE_SENDIPECT;
 972                 }
 973         }
 974
 975         /*
 976          * Check if we should set the TCP CWR flag.
 977          * CWR flag is sent when we reduced the congestion window because
 978          * we received a TCP ECE or we performed a fast retransmit. We
 979          * never set the CWR flag on retransmitted packets. We only set
 980          * the CWR flag on data packets. Pure acks don't have this set.
 981          */
 982         if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
 983                 !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 984                 flags |= TH_CWR;
 985                 tp->ecn_flags &= ~TE_SENDCWR;
 986         }
 987
 988         /*
 989          * Check if we should set the TCP ECE flag.
 990          */
 991         if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
 992                 flags |= TH_ECE;
 993         }
 994
 995         /*
 996          * Send a timestamp and echo-reply if this is a SYN and our side
 997          * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
 998          * and our peer have sent timestamps in our SYN's.
 999          */
1000         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1001             (flags & TH_RST) == 0 &&
1002             ((flags & TH_ACK) == 0 ||
1003              (tp->t_flags & TF_RCVD_TSTMP))) {
1004                 u_int32_t *lp = (u_int32_t *)(opt + optlen);
1005
1006                 /* Form timestamp option as shown in appendix A of RFC 1323. */
1007                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1008                 *lp++ = htonl(tcp_now);
1009                 *lp   = htonl(tp->ts_recent);
1010                 optlen += TCPOLEN_TSTAMP_APPA;
1011         }
1012
1013         if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) {
1014                 /*
1015                  * Tack on the SACK permitted option *last*.
1016                  * And do padding of options after tacking this on.
1017                  * This is because if MSS, TS, WinScale and Signatures are
1018                  * all present, we have just 2 bytes left for the SACK
1019                  * permitted option, which is just enough.
1020                  */
1021                 /*
1022                  * If this is the first SYN of connection (not a SYN
1023                  * ACK), include SACK permitted option.  If this is a
1024                  * SYN ACK, include SACK permitted option if peer has
1025                  * already done so. This is only for active connect,
1026                  * since the syncache takes care of the passive connect.
1027                  */
1028                 if ((flags & TH_SYN) &&
1029                     (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
1030                         u_char *bp;
1031                         bp = (u_char *)opt + optlen;
1032
1033                         *bp++ = TCPOPT_SACK_PERMITTED;
1034                         *bp++ = TCPOLEN_SACK_PERMITTED;
1035                         optlen += TCPOLEN_SACK_PERMITTED;
1036                 }
1037
1038                 /*
1039                  * Send SACKs if necessary.  This should be the last
1040                  * option processed.  Only as many SACKs are sent as
1041                  * are permitted by the maximum options size.
1042                  *
1043                  * In general, SACK blocks consume 8*n+2 bytes.
1044                  * So a full size SACK blocks option is 34 bytes
1045                  * (to generate 4 SACK blocks).  At a minimum,
1046                  * we need 10 bytes (to generate 1 SACK block).
1047                  * If TCP Timestamps (12 bytes) and TCP Signatures
1048                  * (18 bytes) are both present, we'll just have
1049                  * 10 bytes for SACK options 40 - (12 + 18).
1050                  */
1051                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1052                     (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
1053                     MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
1054                         int nsack, sackoptlen, padlen;
1055                         u_char *bp = (u_char *)opt + optlen;
1056                         u_int32_t *lp;
1057
1058                         nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
1059                         nsack = min(nsack, tp->rcv_numsacks);
1060                         sackoptlen = (2 + nsack * TCPOLEN_SACK);
1061
1062                         /*
1063                          * First we need to pad options so that the
1064                          * SACK blocks can start at a 4-byte boundary
1065                          * (sack option and length are at a 2 byte offset).
1066                          */
1067                         padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
1068                         optlen += padlen;
1069                         while (padlen-- > 0)
1070                                 *bp++ = TCPOPT_NOP;
1071
1072                         tcpstat.tcps_sack_send_blocks++;
1073                         *bp++ = TCPOPT_SACK;
1074                         *bp++ = sackoptlen;
1075                         lp = (u_int32_t *)bp;
1076                         for (i = 0; i < nsack; i++) {
1077                                 struct sackblk sack = tp->sackblks[i];
1078                                 *lp++ = htonl(sack.start);
1079                                 *lp++ = htonl(sack.end);
1080                         }
1081                         optlen += sackoptlen;
1082                 }
1083         }
1084
1085         /* Pad TCP options to a 4 byte boundary */
1086         if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
1087                 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
1088                 u_char *bp = (u_char *)opt + optlen;
1089
1090                 optlen += pad;
1091                 while (pad) {
1092                         *bp++ = TCPOPT_EOL;
1093                         pad--;
1094                 }
1095         }
1096
1097         hdrlen += optlen;
1098
1099 #if INET6
1100         if (isipv6)
1101                 ipoptlen = ip6_optlen(tp->t_inpcb);
1102         else
1103 #endif
1104         {
1105                 if (tp_inp_options) {
1106                         ipoptlen = tp_inp_options->m_len -
1107                                 offsetof(struct ipoption, ipopt_list);
1108                 } else
1109                         ipoptlen = 0;
1110         }
1111 #if IPSEC
1112                 ipoptlen += ipsec_optlen;
1113 #endif
1114
1115         /*
1116          * Adjust data length if insertion of options will
1117          * bump the packet length beyond the t_maxopd length.
1118          * Clear the FIN bit because we cut off the tail of
1119          * the segment.
1120          *
1121          * When doing TSO limit a burst to TCP_MAXWIN minus the
1122          * IP, TCP and Options length to keep ip->ip_len from
1123          * overflowing.  Prevent the last segment from being
1124          * fractional thus making them all equal sized and set
1125          * the flag to continue sending.  TSO is disabled when
1126          * IP options or IPSEC are present.
1127          */
1128         if (len + optlen + ipoptlen > tp->t_maxopd) {
1129                 /*
1130                  * If there is still more to send, don't close the connection.
1131                  */
1132                 flags &= ~TH_FIN;
1133                 if (tso) {
1134                         int32_t tso_maxlen;
1135
1136                         tso_maxlen = tp->tso_max_segment_size ? tp->tso_max_segment_size : TCP_MAXWIN;
1137
1138                         if (len > tso_maxlen - hdrlen - optlen) {
1139                                 len = tso_maxlen - hdrlen - optlen;
1140                                 len = len - (len % (tp->t_maxopd - optlen));
1141                                 sendalot = 1;
1142                         } else if (tp->t_flags & TF_NEEDFIN)
1143                                 sendalot = 1;
1144                 } else {
1145                         len = tp->t_maxopd - optlen - ipoptlen;
1146                         sendalot = 1;
1147                 }
1148         }
1149
1150 /*#ifdef DIAGNOSTIC*/
1151 #if INET6
1152         if (max_linkhdr + hdrlen > MCLBYTES)
1153                 panic("tcphdr too big");
1154 #else
1155         if (max_linkhdr + hdrlen > MHLEN)
1156                 panic("tcphdr too big");
1157 #endif
1158 /*#endif*/
1159
1160         /*
1161          * Grab a header mbuf, attaching a copy of data to
1162          * be transmitted, and initialize the header from
1163          * the template for sends on this connection.
1164          */
1165         if (len) {
1166                 if (tp->t_force && len == 1)
1167                         tcpstat.tcps_sndprobe++;
1168                 else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
1169                         tcpstat.tcps_sndrexmitpack++;
1170                         tcpstat.tcps_sndrexmitbyte += len;
1171                         if (nstat_collect) {
1172                                 nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, len, NSTAT_TX_FLAG_RETRANSMIT);
1173                                 locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
1174                                 locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len);
1175                                 tp->t_stat.txretransmitbytes += len;
1176                         }
1177                 } else {
1178                         tcpstat.tcps_sndpack++;
1179                         tcpstat.tcps_sndbyte += len;
1180                         if (nstat_collect) {
1181                                 locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
1182                                 locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len);
1183                         }
1184                 }
1185 #ifdef notyet
1186                 if ((m = m_copypack(so->so_snd.sb_mb, off,
1187                     (int)len, max_linkhdr + hdrlen)) == 0) {
1188                         error = ENOBUFS;
1189                         goto out;
1190                 }
1191                 /*
1192                  * m_copypack left space for our hdr; use it.
1193                  */
1194                 m->m_len += hdrlen;
1195                 m->m_data -= hdrlen;
1196 #else
1197                 /*
1198                  * try to use the new interface that allocates all
1199                  * the necessary mbuf hdrs under 1 mbuf lock and
1200                  * avoids rescanning the socket mbuf list if
1201                  * certain conditions are met.  This routine can't
1202                  * be used in the following cases...
1203                  * 1) the protocol headers exceed the capacity of
1204                  * a single mbuf header's data area (no cluster attached)
1205                  * 2) the length of the data being transmitted plus
1206                  * the protocol headers fits into a single mbuf header's
1207                  * data area (no cluster attached)
1208                  */
1209                 m = NULL;
1210 #if INET6
1211                 if (MHLEN < hdrlen + max_linkhdr) {
1212                         MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1213                         if (m == NULL) {
1214                                 error = ENOBUFS;
1215                                 goto out;
1216                         }
1217                         MCLGET(m, M_DONTWAIT);
1218                         if ((m->m_flags & M_EXT) == 0) {
1219                                 m_freem(m);
1220                                 error = ENOBUFS;
1221                                 goto out;
1222                         }
1223                         m->m_data += max_linkhdr;
1224                         m->m_len = hdrlen;
1225                 }
1226 #endif
1227                 if (len <= MHLEN - hdrlen - max_linkhdr) {
1228                         if (m == NULL) {
1229                                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1230                                 if (m == NULL) {
1231                                         error = ENOBUFS;
1232                                         goto out;
1233                                 }
1234                                 m->m_data += max_linkhdr;
1235                                 m->m_len = hdrlen;
1236                         }
1237                         /* makes sure we still have data left to be sent at this point */
1238                         if (so->so_snd.sb_mb == NULL || off < 0) {
1239                                 if (m != NULL)  m_freem(m);
1240                                 error = 0; /* should we return an error? */
1241                                 goto out;
1242                         }
1243                         m_copydata(so->so_snd.sb_mb, off, (int) len,
1244                             mtod(m, caddr_t) + hdrlen);
1245                         m->m_len += len;
1246                 } else {
1247                         if (m != NULL) {
1248                                 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
1249                                 if (m->m_next == 0) {
1250                                         (void) m_free(m);
1251                                         error = ENOBUFS;
1252                                         goto out;
1253                                 }
1254                         } else {
1255                                 /*
1256                                  * determine whether the mbuf pointer and offset passed back by the 'last' call
1257                                  * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1258                                  * changed (due to an incoming ACK for instance), or the offset into the chain we
1259                                  * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1260                                  * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1261                                  * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1262                                  * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list).
1263                                  * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1264                                  */
1265                                 if (m_head != so->so_snd.sb_mb || sack_rxmit || last_off != off)
1266                                         m_lastm = NULL;
1267                                 last_off = off + len;
1268                                 m_head = so->so_snd.sb_mb;
1269
1270                                 /* makes sure we still have data left to be sent at this point */
1271                                 if (m_head == NULL) {
1272                                         error = 0; /* should we return an error? */
1273                                         goto out;
1274                                 }
1275
1276                                 /*
1277                                  * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1278                                  * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1279                                  */
1280                                 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_lastm, &m_off)) == NULL) {
1281                                         error = ENOBUFS;
1282                                         goto out;
1283                                 }
1284                                 m->m_data += max_linkhdr;
1285                                 m->m_len = hdrlen;
1286                         }
1287                 }
1288 #endif
1289                 /*
1290                  * If we're sending everything we've got, set PUSH.
1291                  * (This will keep happy those implementations which only
1292                  * give data to the user when a buffer fills or
1293                  * a PUSH comes in.)
1294                  */
1295                 if (off + len == so->so_snd.sb_cc)
1296                         flags |= TH_PUSH;
1297         } else {
1298                 if (tp->t_flags & TF_ACKNOW)
1299                         tcpstat.tcps_sndacks++;
1300                 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1301                         tcpstat.tcps_sndctrl++;
1302                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1303                         tcpstat.tcps_sndurg++;
1304                 else
1305                         tcpstat.tcps_sndwinup++;
1306
1307                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1308                 if (m == NULL) {
1309                         error = ENOBUFS;
1310                         goto out;
1311                 }
1312 #if INET6
1313                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1314                     MHLEN >= hdrlen) {
1315                         MH_ALIGN(m, hdrlen);
1316                 } else
1317 #endif
1318                 m->m_data += max_linkhdr;
1319                 m->m_len = hdrlen;
1320         }
1321         m->m_pkthdr.rcvif = 0;
1322 #if CONFIG_MACF_NET
1323         mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
1324 #endif
1325 #if INET6
1326         if (isipv6) {
1327                 ip6 = mtod(m, struct ip6_hdr *);
1328                 th = (struct tcphdr *)(ip6 + 1);
1329                 tcp_fillheaders(tp, ip6, th);
1330                 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1331                         !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1332                         ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
1333                 }
1334         } else
1335 #endif /* INET6 */
1336         {
1337                 ip = mtod(m, struct ip *);
1338                 ipov = (struct ipovly *)ip;
1339                 th = (struct tcphdr *)(ip + 1);
1340                 /* this picks up the pseudo header (w/o the length) */
1341                 tcp_fillheaders(tp, ip, th);
1342                 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1343                         !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1344                         ip->ip_tos = IPTOS_ECN_ECT0;
1345                 }
1346         }
1347
1348         /*
1349          * Fill in fields, remembering maximum advertised
1350          * window for use in delaying messages about window sizes.
1351          * If resending a FIN, be sure not to use a new sequence number.
1352          */
1353         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1354             tp->snd_nxt == tp->snd_max)
1355                 tp->snd_nxt--;
1356         /*
1357          * If we are doing retransmissions, then snd_nxt will
1358          * not reflect the first unsent octet.  For ACK only
1359          * packets, we do not want the sequence number of the
1360          * retransmitted packet, we want the sequence number
1361          * of the next unsent octet.  So, if there is no data
1362          * (and no SYN or FIN), use snd_max instead of snd_nxt
1363          * when filling in ti_seq.  But if we are in persist
1364          * state, snd_max might reflect one byte beyond the
1365          * right edge of the window, so use snd_nxt in that
1366          * case, since we know we aren't doing a retransmission.
1367          * (retransmit and persist are mutually exclusive...)
1368          */
1369         if (sack_rxmit == 0) {
1370                 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
1371                         th->th_seq = htonl(tp->snd_nxt);
1372                 else
1373                         th->th_seq = htonl(tp->snd_max);
1374         } else {
1375                 th->th_seq = htonl(p->rxmit);
1376                 p->rxmit += len;
1377                 tp->sackhint.sack_bytes_rexmit += len;
1378         }
1379         th->th_ack = htonl(tp->rcv_nxt);
1380         tp->last_ack_sent = tp->rcv_nxt;
1381
1382         if (optlen) {
1383                 bcopy(opt, th + 1, optlen);
1384                 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1385         }
1386         th->th_flags = flags;
1387         /*
1388          * Calculate receive window.  Don't shrink window,
1389          * but avoid silly window syndrome.
1390          */
1391         if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && recwin < (int)tp->t_maxseg)
1392                 recwin = 0;
1393         if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
1394                 recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
1395         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
1396                 if (recwin > (int32_t)slowlink_wsize)
1397                         recwin = slowlink_wsize;
1398         }
1399
1400 #if TRAFFIC_MGT
1401         if (tcp_recv_bg == 1  || is_tcp_recv_bg(so)) {
1402                 if (tp->acc_iaj > tcp_acc_iaj_react_limit) {
1403                         uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
1404                         if (tp->iaj_rwintop == 0 ||
1405                                 SEQ_LT(tp->iaj_rwintop, tp->rcv_adv))
1406                                 tp->iaj_rwintop = tp->rcv_adv;
1407                         if (SEQ_LT(tp->iaj_rwintop, tp->rcv_nxt + min_iaj_win))
1408                                 tp->iaj_rwintop =  tp->rcv_nxt + min_iaj_win;
1409                         recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin);
1410                 }
1411         }
1412 #endif /* TRAFFIC_MGT */
1413
1414         if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
1415                 recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
1416         th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1417
1418         /*
1419          * Adjust the RXWIN0SENT flag - indicate that we have advertised
1420          * a 0 window.  This may cause the remote transmitter to stall.  This
1421          * flag tells soreceive() to disable delayed acknowledgements when
1422          * draining the buffer.  This can occur if the receiver is attempting
1423          * to read more data than can be buffered prior to transmitting on
1424          * the connection.
1425          */
1426         if (th->th_win == 0)
1427                 tp->t_flags |= TF_RXWIN0SENT;
1428         else
1429                 tp->t_flags &= ~TF_RXWIN0SENT;
1430         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1431                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1432                 th->th_flags |= TH_URG;
1433         } else
1434                 /*
1435                  * If no urgent pointer to send, then we pull
1436                  * the urgent pointer to the left edge of the send window
1437                  * so that it doesn't drift into the send window on sequence
1438                  * number wraparound.
1439                  */
1440                 tp->snd_up = tp->snd_una;               /* drag it along */
1441
1442         /*
1443          * Put TCP length in extended header, and then
1444          * checksum extended header and data.
1445          */
1446         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1447 #if INET6
1448         if (isipv6) {
1449                 /*
1450                  * ip6_plen does not need to be filled now; it will be filled
1451                  * in ip6_output.
1452                  */
1453                 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
1454                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1455                 if (len + optlen)
1456                         th->th_sum = in_addword(th->th_sum,
1457                                 htons((u_short)(optlen + len)));
1458         }
1459         else
1460 #endif /* INET6 */
1461         {
1462                 m->m_pkthdr.csum_flags = CSUM_TCP;
1463                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1464                 if (len + optlen)
1465                         th->th_sum = in_addword(th->th_sum,
1466                                 htons((u_short)(optlen + len)));
1467         }
1468
1469         /*
1470          * Enable TSO and specify the size of the segments.
1471          * The TCP pseudo header checksum is always provided.
1472          */
1473         if (tso) {
1474 #if INET6
1475                 if (isipv6)
1476                         m->m_pkthdr.csum_flags = CSUM_TSO_IPV6;
1477                 else
1478 #endif /* INET6 */
1479                         m->m_pkthdr.csum_flags = CSUM_TSO_IPV4;
1480
1481                 m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
1482         }
1483         else
1484                 m->m_pkthdr.tso_segsz = 0;
1485
1486         /*
1487          * In transmit state, time the transmission and arrange for
1488          * the retransmit.  In persist state, just set snd_max.
1489          */
1490         if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1491                 tcp_seq startseq = tp->snd_nxt;
1492
1493                 /*
1494                  * Advance snd_nxt over sequence space of this segment.
1495                  */
1496                 if (flags & (TH_SYN|TH_FIN)) {
1497                         if (flags & TH_SYN)
1498                                 tp->snd_nxt++;
1499                         if (flags & TH_FIN) {
1500                                 tp->snd_nxt++;
1501                                 tp->t_flags |= TF_SENTFIN;
1502                         }
1503                 }
1504                 if (sack_rxmit)
1505                         goto timer;
1506                 tp->snd_nxt += len;
1507                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1508                         tp->snd_max = tp->snd_nxt;
1509                         /*
1510                          * Time this transmission if not a retransmission and
1511                          * not currently timing anything.
1512                          */
1513                         if (tp->t_rtttime == 0) {
1514                                 tp->t_rtttime = tcp_now;
1515                                 tp->t_rtseq = startseq;
1516                                 tcpstat.tcps_segstimed++;
1517                         }
1518                 }
1519
1520                 /*
1521                  * Set retransmit timer if not currently set,
1522                  * and not doing an ack or a keep-alive probe.
1523                  * Initial value for retransmit timer is smoothed
1524                  * round-trip time + 2 * round-trip time variance.
1525                  * Initialize shift counter which is used for backoff
1526                  * of retransmit time.
1527                  */
1528 timer:
1529                 if (tp->t_timer[TCPT_REXMT] == 0 &&
1530                     ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1531                         tp->snd_nxt != tp->snd_una)) {
1532                         if (tp->t_timer[TCPT_PERSIST]) {
1533                                 tp->t_timer[TCPT_PERSIST] = 0;
1534                                 tp->t_rxtshift = 0;
1535                                 tp->rxt_start = 0;
1536                                 tp->t_persist_stop = 0;
1537                         }
1538                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1539                 }
1540         } else {
1541                 /*
1542                  * Persist case, update snd_max but since we are in
1543                  * persist mode (no window) we do not update snd_nxt.
1544                  */
1545                 int xlen = len;
1546                 if (flags & TH_SYN)
1547                         ++xlen;
1548                 if (flags & TH_FIN) {
1549                         ++xlen;
1550                         tp->t_flags |= TF_SENTFIN;
1551                 }
1552                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1553                         tp->snd_max = tp->snd_nxt + len;
1554         }
1555
1556 #if TCPDEBUG
1557         /*
1558          * Trace.
1559          */
1560         if (so_options & SO_DEBUG)
1561                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1562 #endif
1563
1564         /*
1565          * Fill in IP length and desired time to live and
1566          * send to IP level.  There should be a better way
1567          * to handle ttl and tos; we could keep them in
1568          * the template, but need a way to checksum without them.
1569          */
1570         /*
1571          * m->m_pkthdr.len should have been set before cksum calculation,
1572          * because in6_cksum() needs it.
1573          */
1574 #if INET6
1575         if (isipv6) {
1576                 struct rtentry *rt6;
1577                 struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 };
1578                 unsigned int outif;
1579
1580                 KERNEL_DEBUG(DBG_LAYER_BEG,
1581                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1582                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
1583                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
1584                      0,0,0);
1585                 /*
1586                  * we separately set hoplimit for every segment, since the
1587                  * user might want to change the value via setsockopt.
1588                  * Also, desired default hop limit might be changed via
1589                  * Neighbor Discovery.
1590                  */
1591                 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
1592                                                tp->t_inpcb->in6p_route.ro_rt ?
1593                                                tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1594                                                : NULL);
1595
1596                 /* TODO: IPv6 IP6TOS_ECT bit on */
1597 #if IPSEC
1598                 if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
1599                         m_freem(m);
1600                         error = ENOBUFS;
1601                         goto out;
1602                 }
1603 #endif /*IPSEC*/
1604                 m->m_pkthdr.socket_id = socket_id;
1605
1606                 rt6 = tp->t_inpcb->in6p_route.ro_rt;
1607                 if (rt6 != NULL && rt6->rt_ifp != NULL
1608                         && rt6->rt_ifp != lo_ifp)
1609                         set_packet_tclass(m, so, MBUF_TC_UNSPEC, 1);
1610
1611                 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, struct ip6_hdr *, ip6,
1612                         struct tcpcb *, tp, struct tcphdr *, th);
1613
1614                 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
1615                         ip6oa.ip6oa_boundif = tp->t_inpcb->inp_boundif;
1616
1617                 ip6oa.ip6oa_nocell = (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
1618
1619                 error = ip6_output(m, inp6_pktopts, &tp->t_inpcb->in6p_route,
1620                     (so_options & SO_DONTROUTE) | IPV6_OUTARGS, NULL, NULL,
1621                     &ip6oa);
1622
1623                 /* Refresh rt6 as we may have lost the route while in ip6_output() */
1624                 if ((rt6 = tp->t_inpcb->in6p_route.ro_rt) != NULL &&
1625                     (outif = rt6->rt_ifp->if_index) != tp->t_inpcb->in6p_last_outif)
1626                         tp->t_inpcb->in6p_last_outif = outif;
1627         } else
1628 #endif /* INET6 */
1629     {
1630         ip->ip_len = m->m_pkthdr.len;
1631         ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;   /* XXX */
1632         ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK);      /* XXX */
1633
1634
1635         KERNEL_DEBUG(DBG_LAYER_BEG,
1636              ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1637              (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
1638               (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
1639              0,0,0);
1640
1641         /*
1642          * See if we should do MTU discovery.
1643          * Look at the flag updated on the following criteria:
1644          *      1) Path MTU discovery is authorized by the sysctl
1645          *      2) The route isn't set yet (unlikely but could happen)
1646          *      3) The route is up
1647          *      4) the MTU is not locked (if it is, then discovery has been
1648          *         disabled for that route)
1649          */
1650
1651         if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
1652                 ip->ip_off |= IP_DF;
1653
1654 #if IPSEC
1655         if (ipsec_bypass == 0)
1656                 ipsec_setsocket(m, so);
1657 #endif /*IPSEC*/
1658
1659         /*
1660          * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1661          */
1662         lost = 0;
1663         m->m_pkthdr.socket_id = socket_id;
1664         m->m_nextpkt = NULL;
1665
1666         if (tp->t_inpcb->inp_route.ro_rt != NULL &&
1667                 tp->t_inpcb->inp_route.ro_rt->rt_ifp != NULL &&
1668                 tp->t_inpcb->inp_route.ro_rt->rt_ifp != lo_ifp)
1669                 set_packet_tclass(m, so, MBUF_TC_UNSPEC, 0);
1670
1671         tp->t_pktlist_sentlen += len;
1672         tp->t_lastchain++;
1673
1674         DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb,
1675                 struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th);
1676
1677         if (tp->t_pktlist_head != NULL) {
1678                 tp->t_pktlist_tail->m_nextpkt = m;
1679                 tp->t_pktlist_tail = m;
1680         } else {
1681                 packchain_newlist++;
1682                 tp->t_pktlist_head = tp->t_pktlist_tail = m;
1683         }
1684
1685         if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
1686               (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
1687               (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 ||
1688               tp->t_lastchain >= tcp_packet_chaining) {
1689                 error = 0;
1690                 while (!(tp->t_flags & TF_SENDINPROG) &&
1691                     tp->t_pktlist_head != NULL) {
1692                         packetlist = tp->t_pktlist_head;
1693                         packchain_listadd = tp->t_lastchain;
1694                         packchain_sent++;
1695                         lost = tp->t_pktlist_sentlen;
1696                         TCP_PKTLIST_CLEAR(tp);
1697                         tp->t_flags |= TF_SENDINPROG;
1698
1699                         error = tcp_ip_output(so, tp, packetlist,
1700                             packchain_listadd, tp_inp_options,
1701                             (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin);
1702
1703                         tp->t_flags &= ~TF_SENDINPROG;
1704                         if (error) {
1705                                 /*
1706                                  * Take into account the rest of unsent
1707                                  * packets in the packet list for this tcp
1708                                  * into "lost", since we're about to free
1709                                  * the whole list below.
1710                                  */
1711                                 lost += tp->t_pktlist_sentlen;
1712                                 break;
1713                         } else {
1714                                 lost = 0;
1715                         }
1716                 }
1717                 /* tcp was closed while we were in ip; resume close */
1718                 if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
1719                         tp->t_flags &= ~TF_CLOSING;
1720                         (void) tcp_close(tp);
1721                         return (0);
1722                 }
1723         }
1724         else {
1725                 error = 0;
1726                 packchain_looped++;
1727                 tcpstat.tcps_sndtotal++;
1728
1729                 goto again;
1730         }
1731    }
1732         if (error) {
1733                 /*
1734                  * Assume that the packets were lost, so back out the
1735                  * sequence number advance, if any.  Note that the "lost"
1736                  * variable represents the amount of user data sent during
1737                  * the recent call to ip_output_list() plus the amount of
1738                  * user data in the packet list for this tcp at the moment.
1739                  */
1740                 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1741                         /*
1742                          * No need to check for TH_FIN here because
1743                          * the TF_SENTFIN flag handles that case.
1744                          */
1745                         if ((flags & TH_SYN) == 0) {
1746                                 if (sack_rxmit) {
1747                                         p->rxmit -= lost;
1748                                         tp->sackhint.sack_bytes_rexmit -= lost;
1749                                 } else
1750                                         tp->snd_nxt -= lost;
1751                         }
1752                 }
1753 out:
1754                 if (tp->t_pktlist_head != NULL)
1755                         m_freem_list(tp->t_pktlist_head);
1756                 TCP_PKTLIST_CLEAR(tp);
1757
1758                 if (error == ENOBUFS) {
1759                         if (!tp->t_timer[TCPT_REXMT] &&
1760                                  !tp->t_timer[TCPT_PERSIST])
1761                                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1762
1763                         tp->snd_cwnd = tp->t_maxseg;
1764                         tp->t_bytes_acked = 0;
1765
1766                         tcp_check_timer_state(tp);
1767                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1768
1769                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
1770                                 struct tcpcb *, tp, struct tcphdr *, NULL,
1771                                 int32_t, TCP_CC_OUTPUT_ERROR);
1772                         return (0);
1773                 }
1774                 if (error == EMSGSIZE) {
1775                         /*
1776                          * ip_output() will have already fixed the route
1777                          * for us.  tcp_mtudisc() will, as its last action,
1778                          * initiate retransmission, so it is important to
1779                          * not do so here.
1780                          *
1781                          * If TSO was active we either got an interface
1782                          * without TSO capabilities or TSO was turned off.
1783                          * Disable it for this connection as well and
1784                          * immediately retry with MSS sized segments generated
1785                          * by this function.
1786                          */
1787                         if (tso)
1788                                 tp->t_flags &= ~TF_TSO;
1789
1790                         tcp_mtudisc(tp->t_inpcb, 0);
1791                         tcp_check_timer_state(tp);
1792
1793                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1794                         return 0;
1795                 }
1796                 if ((error == EHOSTUNREACH || error == ENETDOWN)
1797                     && TCPS_HAVERCVDSYN(tp->t_state)) {
1798                         tp->t_softerror = error;
1799                         tcp_check_timer_state(tp);
1800                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1801                         return (0);
1802                 }
1803                 tcp_check_timer_state(tp);
1804                 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1805                 return (error);
1806         }
1807
1808         tcpstat.tcps_sndtotal++;
1809
1810 #if INET6
1811         /*
1812          * Data sent (as far as we can tell).
1813          * If this advertises a larger window than any other segment,
1814          * then remember the size of the advertised window.
1815          * Make sure ACK/DELACK conditions are cleared before
1816          * we unlock the socket.
1817          *  NOTE: for now, this is done in tcp_ip_output for IPv4
1818          */
1819         if (isipv6) {
1820                 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
1821                         tp->rcv_adv = tp->rcv_nxt + recwin;
1822                 tp->last_ack_sent = tp->rcv_nxt;
1823                 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1824                 tp->t_timer[TCPT_DELACK] = 0;
1825                 tp->t_unacksegs = 0;
1826         }
1827 #endif
1828
1829         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
1830         if (sendalot)
1831                 goto again;
1832
1833         tcp_check_timer_state(tp);
1834         return (0);
1835 }
1836
1837 static int
1838 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
1839     int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin)
1840 {
1841         int error = 0;
1842         boolean_t chain;
1843         boolean_t unlocked = FALSE;
1844         struct inpcb *inp = tp->t_inpcb;
1845         struct ip_out_args ipoa;
1846         struct route ro;
1847         unsigned int outif;
1848
1849         /* If socket was bound to an ifindex, tell ip_output about it */
1850         ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ?
1851             inp->inp_boundif : IFSCOPE_NONE;
1852         ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
1853         flags |= IP_OUTARGS;
1854
1855         /* Copy the cached route and take an extra reference */
1856         inp_route_copyout(inp, &ro);
1857
1858         /*
1859          * Data sent (as far as we can tell).
1860          * If this advertises a larger window than any other segment,
1861          * then remember the size of the advertised window.
1862          * Make sure ACK/DELACK conditions are cleared before
1863          * we unlock the socket.
1864          */
1865         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
1866                 tp->rcv_adv = tp->rcv_nxt + recwin;
1867         tp->last_ack_sent = tp->rcv_nxt;
1868         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1869         tp->t_timer[TCPT_DELACK] = 0;
1870         tp->t_unacksegs = 0;
1871
1872         /*
1873          * If allowed, unlock TCP socket while in IP
1874          * but only if the connection is established and
1875          * in a normal mode where reentrancy on the tcpcb won't be
1876          * an issue:
1877          * - there is no SACK episode
1878          * - we're not in Fast Recovery mode
1879          * - if we're not sending from an upcall.
1880          */
1881         if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) &&
1882             (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
1883             ((tp->t_flags & TF_FASTRECOVERY) == 0)) {
1884                 unlocked = TRUE;
1885                 socket_unlock(so, 0);
1886         }
1887
1888         /*
1889          * Don't send down a chain of packets when:
1890          * - TCP chaining is disabled
1891          * - there is an IPsec rule set
1892          * - there is a non default rule set for the firewall
1893          */
1894
1895         chain = tcp_packet_chaining > 1
1896 #if IPSEC
1897                 && ipsec_bypass
1898 #endif
1899 #if IPFIREWALL
1900                 && (fw_enable == 0 || fw_bypass)
1901 #endif
1902                 ; // I'm important, not extraneous
1903
1904
1905         while (pkt != NULL) {
1906                 struct mbuf *npkt = pkt->m_nextpkt;
1907
1908                 if (!chain) {
1909                         pkt->m_nextpkt = NULL;
1910                         /*
1911                          * If we are not chaining, make sure to set the packet
1912                          * list count to 0 so that IP takes the right path;
1913                          * this is important for cases such as IPSec where a
1914                          * single mbuf might result in multiple mbufs as part
1915                          * of the encapsulation.  If a non-zero count is passed
1916                          * down to IP, the head of the chain might change and
1917                          * we could end up skipping it (thus generating bogus
1918                          * packets).  Fixing it in IP would be desirable, but
1919                          * for now this would do it.
1920                          */
1921                         cnt = 0;
1922                 }
1923                 error = ip_output_list(pkt, cnt, opt, &ro, flags, 0, &ipoa);
1924                 if (chain || error) {
1925                         /*
1926                          * If we sent down a chain then we are done since
1927                          * the callee had taken care of everything; else
1928                          * we need to free the rest of the chain ourselves.
1929                          */
1930                         if (!chain)
1931                                 m_freem_list(npkt);
1932                         break;
1933                 }
1934                 pkt = npkt;
1935         }
1936
1937         if (unlocked)
1938                 socket_lock(so, 0);
1939
1940         if (ro.ro_rt != NULL &&
1941             (outif = ro.ro_rt->rt_ifp->if_index) != inp->inp_last_outif)
1942                 inp->inp_last_outif = outif;
1943
1944         /* Synchronize cached PCB route */
1945         inp_route_copyin(inp, &ro);
1946
1947         return (error);
1948 }
1949
1950 void
1951 tcp_setpersist(tp)
1952         register struct tcpcb *tp;
1953 {
1954         int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1955
1956         /* If a PERSIST_TIMER option was set we will limit the
1957          * time the persist timer will be active for that connection
1958          * in order to avoid DOS by using zero window probes.
1959          * see rdar://5805356
1960          */
1961
1962         if ((tp->t_persist_timeout != 0) &&
1963             (tp->t_timer[TCPT_PERSIST] == 0) &&
1964             (tp->t_persist_stop == 0)) {
1965                 tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
1966         }
1967
1968         /*
1969          * Start/restart persistance timer.
1970          */
1971         TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1972             t * tcp_backoff[tp->t_rxtshift],
1973             TCPTV_PERSMIN, TCPTV_PERSMAX,
1974             TCP_ADD_REXMTSLOP(tp));
1975         tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
1976
1977         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1978                 tp->t_rxtshift++;
1979 }