2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
78 #include <sys/domain.h>
79 #include <sys/protosw.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
83 #include <net/route.h>
84 #include <net/if_var.h>
86 #include <netinet/in.h>
87 #include <netinet/in_systm.h>
88 #include <netinet/in_var.h>
89 #include <netinet/ip.h>
90 #include <netinet/in_pcb.h>
91 #include <netinet/ip_var.h>
93 #include <netinet6/in6_pcb.h>
94 #include <netinet/ip6.h>
95 #include <netinet6/ip6_var.h>
97 #include <netinet/tcp.h>
99 #include <netinet/tcp_fsm.h>
100 #include <netinet/tcp_seq.h>
101 #include <netinet/tcp_timer.h>
102 #include <netinet/tcp_var.h>
103 #include <netinet/tcpip.h>
105 #include <netinet/tcp_debug.h>
107 #include <sys/kdebug.h>
110 #include <netinet6/ipsec.h>
114 #include <security/mac_framework.h>
115 #endif /* MAC_SOCKET */
117 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
118 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
119 #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
123 extern struct mbuf
*m_copypack();
126 int path_mtu_discovery
= 1;
127 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, path_mtu_discovery
, CTLFLAG_RW
,
128 &path_mtu_discovery
, 1, "Enable Path MTU Discovery");
131 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, slowstart_flightsize
, CTLFLAG_RW
,
132 &ss_fltsz
, 1, "Slow start flight size");
134 int ss_fltsz_local
= 8; /* starts with eight segments max */
135 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, local_slowstart_flightsize
, CTLFLAG_RW
,
136 &ss_fltsz_local
, 1, "Slow start flight size for local networks");
138 int tcp_do_newreno
= 0;
139 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, newreno
, CTLFLAG_RW
, &tcp_do_newreno
,
140 0, "Enable NewReno Algorithms");
143 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, tso
, CTLFLAG_RW
,
144 &tcp_do_tso
, 0, "Enable TCP Segmentation Offload");
147 int tcp_ecn_outbound
= 0;
148 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, ecn_initiate_out
, CTLFLAG_RW
, &tcp_ecn_outbound
,
149 0, "Initiate ECN for outbound connections");
151 int tcp_ecn_inbound
= 0;
152 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, ecn_negotiate_in
, CTLFLAG_RW
, &tcp_ecn_inbound
,
153 0, "Allow ECN negotiation for inbound connections");
155 int tcp_packet_chaining
= 50;
156 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, packetchain
, CTLFLAG_RW
, &tcp_packet_chaining
,
157 0, "Enable TCP output packet chaining");
159 int tcp_output_unlocked
= 1;
160 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, socket_unlocked_on_output
, CTLFLAG_RW
, &tcp_output_unlocked
,
161 0, "Unlock TCP when sending packets down to IP");
163 static int32_t packchain_newlist
= 0;
164 static int32_t packchain_looped
= 0;
165 static int32_t packchain_sent
= 0;
167 /* temporary: for testing */
169 extern int ipsec_bypass
;
172 extern int slowlink_wsize
; /* window correction for slow links */
174 extern int fw_enable
; /* firewall check for packet chaining */
175 extern int fw_bypass
; /* firewall check: disable packet chaining if there is rules */
176 #endif /* IPFIREWALL */
178 extern vm_size_t so_cache_zone_element_size
;
180 extern int ip_use_randomid
;
181 #endif /* RANDOM_IP_ID */
182 extern u_int32_t dlil_filter_count
;
183 extern u_int32_t kipf_count
;
185 static int tcp_ip_output(struct socket
*, struct tcpcb
*, struct mbuf
*, int,
186 struct mbuf
*, int, int);
188 static __inline__ u_int16_t
189 get_socket_id(struct socket
* s
)
193 if (so_cache_zone_element_size
== 0) {
196 val
= (u_int16_t
)(((uintptr_t)s
) / so_cache_zone_element_size
);
/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Returns:
 *	ip_output_list:ENOMEM
 *	ip_output_list:EADDRNOTAVAIL
 *	ip_output_list:ENETUNREACH
 *	ip_output_list:EHOSTUNREACH
 *	ip_output_list:EACCES
 *	ip_output_list:EMSGSIZE
 *	ip_output_list:ENOBUFS
 *	ip_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 *	ip6_output:???			[IPV6 only]
 */
223 tcp_output(struct tcpcb
*tp
)
225 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
226 int32_t len
, recwin
, sendwin
, off
;
228 register struct mbuf
*m
;
229 struct ip
*ip
= NULL
;
230 register struct ipovly
*ipov
= NULL
;
232 struct ip6_hdr
*ip6
= NULL
;
234 register struct tcphdr
*th
;
235 u_char opt
[TCP_MAXOLEN
];
236 unsigned ipoptlen
, optlen
, hdrlen
;
237 int idle
, sendalot
, lost
= 0;
243 unsigned ipsec_optlen
= 0;
245 int maxburst
= TCP_MAXBURST
;
248 struct mbuf
*m_last
= NULL
;
249 struct mbuf
*m_head
= NULL
;
250 struct mbuf
*packetlist
= NULL
;
251 struct mbuf
*tp_inp_options
= tp
->t_inpcb
->inp_depend4
.inp4_options
;
253 int isipv6
= tp
->t_inpcb
->inp_vflag
& INP_IPV6
;
254 struct ip6_pktopts
*inp6_pktopts
= tp
->t_inpcb
->inp_depend6
.inp6_outputopts
;
256 short packchain_listadd
= 0;
257 u_int16_t socket_id
= get_socket_id(so
);
258 int so_options
= so
->so_options
;
262 * Determine length of data that should be transmitted,
263 * and flags that will be used.
264 * If there is some data or critical controls (SYN, RST)
265 * to send, then transmit; otherwise, investigate further.
267 idle
= (tp
->t_flags
& TF_LASTIDLE
) || (tp
->snd_max
== tp
->snd_una
);
268 if (idle
&& tp
->t_rcvtime
>= tp
->t_rxtcur
) {
270 * We have been idle for "a while" and no acks are
271 * expected to clock out any data we send --
272 * slow start to get ack "clock" running again.
274 * Set the slow-start flight size depending on whether
275 * this is a local network or not.
279 (isipv6
&& in6_localaddr(&tp
->t_inpcb
->in6p_faddr
)) ||
282 in_localaddr(tp
->t_inpcb
->inp_faddr
)
287 tp
->snd_cwnd
= tp
->t_maxseg
* ss_fltsz_local
;
289 tp
->snd_cwnd
= tp
->t_maxseg
* ss_fltsz
;
291 tp
->t_flags
&= ~TF_LASTIDLE
;
293 if (tp
->t_flags
& TF_MORETOCOME
) {
294 tp
->t_flags
|= TF_LASTIDLE
;
299 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_START
, 0,0,0,0,0);
304 KERNEL_DEBUG(DBG_LAYER_BEG
,
305 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
306 (((tp
->t_inpcb
->in6p_laddr
.s6_addr16
[0] & 0xffff) << 16) |
307 (tp
->t_inpcb
->in6p_faddr
.s6_addr16
[0] & 0xffff)),
314 KERNEL_DEBUG(DBG_LAYER_BEG
,
315 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
316 (((tp
->t_inpcb
->inp_laddr
.s_addr
& 0xffff) << 16) |
317 (tp
->t_inpcb
->inp_faddr
.s_addr
& 0xffff)),
320 * If the route generation id changed, we need to check that our
321 * local (source) IP address is still valid. If it isn't either
322 * return error or silently do nothing (assuming the address will
323 * come back before the TCP connection times out).
325 rt
= tp
->t_inpcb
->inp_route
.ro_rt
;
326 if (rt
!= NULL
&& (!(rt
->rt_flags
& RTF_UP
) ||
327 rt
->generation_id
!= route_generation
)) {
329 struct in_ifaddr
*ia
;
331 /* disable multipages at the socket */
332 somultipages(so
, FALSE
);
334 /* Disable TSO for the socket until we know more */
335 tp
->t_flags
&= ~TF_TSO
;
337 /* check that the source address is still valid */
338 if ((ia
= ifa_foraddr(tp
->t_inpcb
->inp_laddr
.s_addr
)) == NULL
) {
340 if (tp
->t_state
>= TCPS_CLOSE_WAIT
) {
341 tcp_drop(tp
, EADDRNOTAVAIL
);
342 return(EADDRNOTAVAIL
);
345 /* set Retransmit timer if it wasn't set
346 * reset Persist timer and shift register as the
347 * adversed peer window may not be valid anymore
350 if (!tp
->t_timer
[TCPT_REXMT
]) {
351 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
352 if (tp
->t_timer
[TCPT_PERSIST
]) {
353 tp
->t_timer
[TCPT_PERSIST
] = 0;
358 if (tp
->t_pktlist_head
!= NULL
)
359 m_freem_list(tp
->t_pktlist_head
);
360 TCP_PKTLIST_CLEAR(tp
);
362 /* drop connection if source address isn't available */
363 if (so
->so_flags
& SOF_NOADDRAVAIL
) {
364 tcp_drop(tp
, EADDRNOTAVAIL
);
365 return(EADDRNOTAVAIL
);
368 return(0); /* silently ignore, keep data in socket: address may be back */
370 ifafree(&ia
->ia_ifa
);
373 * Address is still valid; check for multipages capability
374 * again in case the outgoing interface has changed.
377 if ((ifp
= rt
->rt_ifp
) != NULL
) {
378 somultipages(so
, (ifp
->if_hwassist
& IFNET_MULTIPAGES
));
379 tcp_set_tso(tp
, ifp
);
381 if (rt
->rt_flags
& RTF_UP
)
382 rt
->generation_id
= route_generation
;
384 * See if we should do MTU discovery. Don't do it if:
385 * 1) it is disabled via the sysctl
386 * 2) the route isn't up
387 * 3) the MTU is locked (if it is, then discovery has been
391 if (!path_mtu_discovery
|| ((rt
!= NULL
) &&
392 (!(rt
->rt_flags
& RTF_UP
) || (rt
->rt_rmx
.rmx_locks
& RTV_MTU
))))
393 tp
->t_flags
&= ~TF_PMTUD
;
395 tp
->t_flags
|= TF_PMTUD
;
402 * If we've recently taken a timeout, snd_max will be greater than
403 * snd_nxt. There may be SACK information that allows us to avoid
404 * resending already delivered data. Adjust snd_nxt accordingly.
406 if (tp
->sack_enable
&& SEQ_LT(tp
->snd_nxt
, tp
->snd_max
))
409 off
= tp
->snd_nxt
- tp
->snd_una
;
410 sendwin
= min(tp
->snd_wnd
, tp
->snd_cwnd
);
412 if (tp
->t_flags
& TF_SLOWLINK
&& slowlink_wsize
> 0)
413 sendwin
= min(sendwin
, slowlink_wsize
);
415 flags
= tcp_outflags
[tp
->t_state
];
417 * Send any SACK-generated retransmissions. If we're explicitly trying
418 * to send out new data (when sendalot is 1), bypass this function.
419 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
420 * we're replacing a (future) new transmission with a retransmission
421 * now, and we previously incremented snd_cwnd in tcp_input().
424 * Still in sack recovery , reset rxmit flag to zero.
430 if (tp
->sack_enable
&& IN_FASTRECOVERY(tp
) &&
431 (p
= tcp_sack_output(tp
, &sack_bytes_rxmt
))) {
434 cwin
= min(tp
->snd_wnd
, tp
->snd_cwnd
) - sack_bytes_rxmt
;
437 /* Do not retransmit SACK segments beyond snd_recover */
438 if (SEQ_GT(p
->end
, tp
->snd_recover
)) {
440 * (At least) part of sack hole extends beyond
441 * snd_recover. Check to see if we can rexmit data
444 if (SEQ_GEQ(p
->rxmit
, tp
->snd_recover
)) {
446 * Can't rexmit any more data for this hole.
447 * That data will be rexmitted in the next
448 * sack recovery episode, when snd_recover
449 * moves past p->rxmit.
452 goto after_sack_rexmit
;
454 /* Can rexmit part of the current hole */
455 len
= ((int32_t)min(cwin
,
456 tp
->snd_recover
- p
->rxmit
));
458 len
= ((int32_t)min(cwin
, p
->end
- p
->rxmit
));
460 off
= p
->rxmit
- tp
->snd_una
; /* update off only if we really transmit SACK data */
463 tcpstat
.tcps_sack_rexmits
++;
464 tcpstat
.tcps_sack_rexmit_bytes
+=
465 min(len
, tp
->t_maxseg
);
472 * Get standard flags, and add SYN or FIN if requested by 'hidden'
475 if (tp
->t_flags
& TF_NEEDFIN
)
477 if (tp
->t_flags
& TF_NEEDSYN
)
481 * If in persist timeout with window of 0, send 1 byte.
482 * Otherwise, if window is small but nonzero
483 * and timer expired, we will send what we can
484 * and go to transmit state.
489 * If we still have some data to send, then
490 * clear the FIN bit. Usually this would
491 * happen below when it realizes that we
492 * aren't sending all the data. However,
493 * if we have exactly 1 byte of unsent data,
494 * then it won't clear the FIN bit below,
495 * and if we are in persist state, we wind
496 * up sending the packet without recording
497 * that we sent the FIN bit.
499 * We can't just blindly clear the FIN bit,
500 * because if we don't have any more data
501 * to send then the probe will be the FIN
504 if (off
< so
->so_snd
.sb_cc
)
508 tp
->t_timer
[TCPT_PERSIST
] = 0;
514 * If snd_nxt == snd_max and we have transmitted a FIN, the
515 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
516 * a negative length. This can also occur when TCP opens up
517 * its congestion window while receiving additional duplicate
518 * acks after fast-retransmit because TCP will reset snd_nxt
519 * to snd_max after the fast-retransmit.
521 * In the normal retransmit-FIN-only case, however, snd_nxt will
522 * be set to snd_una, the offset will be 0, and the length may
525 * If sack_rxmit is true we are retransmitting from the scoreboard
526 * in which case len is already set.
528 if (sack_rxmit
== 0) {
529 if (sack_bytes_rxmt
== 0)
530 len
= min(so
->so_snd
.sb_cc
, sendwin
) - off
;
535 * We are inside of a SACK recovery episode and are
536 * sending new data, having retransmitted all the
537 * data possible in the scoreboard.
539 len
= min(so
->so_snd
.sb_cc
, tp
->snd_wnd
)
542 * Don't remove this (len > 0) check !
543 * We explicitly check for len > 0 here (although it
544 * isn't really necessary), to work around a gcc
545 * optimization issue - to force gcc to compute
546 * len above. Without this check, the computation
547 * of len is bungled by the optimizer.
550 cwin
= tp
->snd_cwnd
-
551 (tp
->snd_nxt
- tp
->sack_newdata
) -
555 len
= imin(len
, cwin
);
563 * Lop off SYN bit if it has already been sent. However, if this
564 * is SYN-SENT state and if segment contains data and if we don't
565 * know that foreign host supports TAO, suppress sending segment.
567 if ((flags
& TH_SYN
) && SEQ_GT(tp
->snd_nxt
, tp
->snd_una
)) {
570 if (len
> 0 && tp
->t_state
== TCPS_SYN_SENT
) {
571 while (!(tp
->t_flags
& TF_SENDINPROG
) &&
572 tp
->t_pktlist_head
!= NULL
) {
573 packetlist
= tp
->t_pktlist_head
;
574 packchain_listadd
= tp
->t_lastchain
;
576 TCP_PKTLIST_CLEAR(tp
);
577 tp
->t_flags
|= TF_SENDINPROG
;
579 error
= tcp_ip_output(so
, tp
, packetlist
,
580 packchain_listadd
, tp_inp_options
,
581 (so_options
& SO_DONTROUTE
), (sack_rxmit
| (sack_bytes_rxmt
!= 0)));
583 tp
->t_flags
&= ~TF_SENDINPROG
;
585 /* tcp was closed while we were in ip; resume close */
587 (TF_CLOSING
|TF_SENDINPROG
)) == TF_CLOSING
) {
588 tp
->t_flags
&= ~TF_CLOSING
;
589 (void) tcp_close(tp
);
591 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
,
598 * Be careful not to send data and/or FIN on SYN segments.
599 * This measure is needed to prevent interoperability problems
600 * with not fully conformant TCP implementations.
602 if ((flags
& TH_SYN
) && (tp
->t_flags
& TF_NOOPT
)) {
609 * If FIN has been sent but not acked,
610 * but we haven't been called to retransmit,
611 * len will be < 0. Otherwise, window shrank
612 * after we sent into it. If window shrank to 0,
613 * cancel pending retransmit, pull snd_nxt back
614 * to (closed) window, and set the persist timer
615 * if it isn't already going. If the window didn't
616 * close completely, just wait for an ACK.
620 tp
->t_timer
[TCPT_REXMT
] = 0;
622 tp
->snd_nxt
= tp
->snd_una
;
623 if (tp
->t_timer
[TCPT_PERSIST
] == 0)
629 * Truncate to the maximum segment length or enable TCP Segmentation
630 * Offloading (if supported by hardware) and ensure that FIN is removed
631 * if the length no longer contains the last data byte.
633 * TSO may only be used if we are in a pure bulk sending state. The
634 * presence of TCP-MD5, SACK retransmits, SACK advertizements, ipfw rules
635 * and IP options prevent using TSO. With TSO the TCP header is the same
636 * (except for the sequence number) for all generated packets. This
637 * makes it impossible to transmit any options which vary per generated
640 * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
641 * removal of FIN (if not already catched here) are handled later after
642 * the exact length of the TCP options are known.
646 * Pre-calculate here as we save another lookup into the darknesses
647 * of IPsec that way and can actually decide if TSO is ok.
649 if (ipsec_bypass
== 0)
650 ipsec_optlen
= ipsec_hdrsiz_tcp(tp
);
653 if (len
> tp
->t_maxseg
) {
654 if ((tp
->t_flags
& TF_TSO
) && tcp_do_tso
&&
657 #endif /* RANDOM_IP_ID */
658 kipf_count
== 0 && dlil_filter_count
== 0 &&
659 tp
->rcv_numsacks
== 0 && sack_rxmit
== 0 && sack_bytes_rxmt
== 0 &&
660 tp
->t_inpcb
->inp_options
== NULL
&&
661 tp
->t_inpcb
->in6p_options
== NULL
666 && (fw_enable
== 0 || fw_bypass
)
678 if (SEQ_LT(p
->rxmit
+ len
, tp
->snd_una
+ so
->so_snd
.sb_cc
))
681 if (SEQ_LT(tp
->snd_nxt
+ len
, tp
->snd_una
+ so
->so_snd
.sb_cc
))
685 recwin
= tcp_sbspace(tp
);
688 * Sender silly window avoidance. We transmit under the following
689 * conditions when len is non-zero:
691 * - We have a full segment (or more with TSO)
692 * - This is the last buffer in a write()/send() and we are
693 * either idle or running NODELAY
694 * - we've timed out (e.g. persist timer)
695 * - we have more then 1/2 the maximum send window's worth of
696 * data (receiver may be limited the window size)
697 * - we need to retransmit
700 if (len
>= tp
->t_maxseg
) {
701 tp
->t_flags
|= TF_MAXSEGSNT
;
704 if (!(tp
->t_flags
& TF_MORETOCOME
) &&
705 (idle
|| tp
->t_flags
& TF_NODELAY
|| tp
->t_flags
& TF_MAXSEGSNT
) &&
706 (tp
->t_flags
& TF_NOPUSH
) == 0 &&
707 len
+ off
>= so
->so_snd
.sb_cc
) {
708 tp
->t_flags
&= ~TF_MAXSEGSNT
;
712 tp
->t_flags
&= ~TF_MAXSEGSNT
;
715 if (len
>= tp
->max_sndwnd
/ 2 && tp
->max_sndwnd
> 0) {
716 tp
->t_flags
&= ~TF_MAXSEGSNT
;
719 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_max
)) {
720 tp
->t_flags
&= ~TF_MAXSEGSNT
;
728 * Compare available window to amount of window
729 * known to peer (as advertised window less
730 * next expected input). If the difference is at least two
731 * max size segments, or at least 50% of the maximum possible
732 * window, then want to send a window update to peer.
733 * Skip this if the connection is in T/TCP half-open state.
735 if (recwin
> 0 && !(tp
->t_flags
& TF_NEEDSYN
)) {
737 * "adv" is the amount we can increase the window,
738 * taking into account that we are limited by
739 * TCP_MAXWIN << tp->rcv_scale.
741 int32_t adv
= imin(recwin
, (int)TCP_MAXWIN
<< tp
->rcv_scale
) -
742 (tp
->rcv_adv
- tp
->rcv_nxt
);
744 if (adv
>= (int32_t) (2 * tp
->t_maxseg
)) {
747 * Update only if the resulting scaled value of the window changed, or
748 * if there is a change in the sequence since the last ack.
749 * This avoids what appears as dupe ACKS (see rdar://5640997)
752 if ((tp
->last_ack_sent
!= tp
->rcv_nxt
) || (((recwin
+ adv
) >> tp
->rcv_scale
) > recwin
))
755 if (2 * adv
>= (int32_t) so
->so_rcv
.sb_hiwat
)
760 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
761 * is also a catch-all for the retransmit timer timeout case.
763 if (tp
->t_flags
& TF_ACKNOW
)
765 if ((flags
& TH_RST
) ||
766 ((flags
& TH_SYN
) && (tp
->t_flags
& TF_NEEDSYN
) == 0))
768 if (SEQ_GT(tp
->snd_up
, tp
->snd_una
))
771 * If our state indicates that FIN should be sent
772 * and we have not yet done so, then we need to send.
774 if (flags
& TH_FIN
&&
775 ((tp
->t_flags
& TF_SENTFIN
) == 0 || tp
->snd_nxt
== tp
->snd_una
))
778 * In SACK, it is possible for tcp_output to fail to send a segment
779 * after the retransmission timer has been turned off. Make sure
780 * that the retransmission timer is set.
782 if (tp
->sack_enable
&& (tp
->t_state
>= TCPS_ESTABLISHED
) && SEQ_GT(tp
->snd_max
, tp
->snd_una
) &&
783 tp
->t_timer
[TCPT_REXMT
] == 0 &&
784 tp
->t_timer
[TCPT_PERSIST
] == 0) {
785 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
789 * TCP window updates are not reliable, rather a polling protocol
790 * using ``persist'' packets is used to insure receipt of window
791 * updates. The three ``states'' for the output side are:
792 * idle not doing retransmits or persists
793 * persisting to move a small or zero window
794 * (re)transmitting and thereby not persisting
796 * tp->t_timer[TCPT_PERSIST]
797 * is set when we are in persist state.
799 * is set when we are called to send a persist packet.
800 * tp->t_timer[TCPT_REXMT]
801 * is set when we are retransmitting
802 * The output side is idle when both timers are zero.
804 * If send window is too small, there is data to transmit, and no
805 * retransmit or persist is pending, then go to persist state.
806 * If nothing happens soon, send when timer expires:
807 * if window is nonzero, transmit what we can,
808 * otherwise force out a byte.
810 if (so
->so_snd
.sb_cc
&& tp
->t_timer
[TCPT_REXMT
] == 0 &&
811 tp
->t_timer
[TCPT_PERSIST
] == 0) {
817 * If there is no reason to send a segment, just return.
818 * but if there is some packets left in the packet list, send them now.
820 while (!(tp
->t_flags
& TF_SENDINPROG
) && tp
->t_pktlist_head
!= NULL
) {
821 packetlist
= tp
->t_pktlist_head
;
822 packchain_listadd
= tp
->t_lastchain
;
824 TCP_PKTLIST_CLEAR(tp
);
825 tp
->t_flags
|= TF_SENDINPROG
;
827 error
= tcp_ip_output(so
, tp
, packetlist
, packchain_listadd
,
828 tp_inp_options
, (so_options
& SO_DONTROUTE
), (sack_rxmit
| (sack_bytes_rxmt
!= 0)));
830 tp
->t_flags
&= ~TF_SENDINPROG
;
832 /* tcp was closed while we were in ip; resume close */
833 if ((tp
->t_flags
& (TF_CLOSING
|TF_SENDINPROG
)) == TF_CLOSING
) {
834 tp
->t_flags
&= ~TF_CLOSING
;
835 (void) tcp_close(tp
);
837 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
842 * Before ESTABLISHED, force sending of initial options
843 * unless TCP set not to do any options.
844 * NOTE: we assume that the IP/TCP header plus TCP options
845 * always fit in a single mbuf, leaving room for a maximum
847 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
852 hdrlen
= sizeof (struct ip6_hdr
) + sizeof (struct tcphdr
);
855 hdrlen
= sizeof (struct tcpiphdr
);
856 if (flags
& TH_SYN
) {
857 tp
->snd_nxt
= tp
->iss
;
858 if ((tp
->t_flags
& TF_NOOPT
) == 0) {
861 opt
[0] = TCPOPT_MAXSEG
;
862 opt
[1] = TCPOLEN_MAXSEG
;
863 mss
= htons((u_short
) tcp_mssopt(tp
));
864 (void)memcpy(opt
+ 2, &mss
, sizeof(mss
));
865 optlen
= TCPOLEN_MAXSEG
;
867 if ((tp
->t_flags
& TF_REQ_SCALE
) &&
868 ((flags
& TH_ACK
) == 0 ||
869 (tp
->t_flags
& TF_RCVD_SCALE
))) {
870 *((u_int32_t
*)(opt
+ optlen
)) = htonl(
872 TCPOPT_WINDOW
<< 16 |
873 TCPOLEN_WINDOW
<< 8 |
874 tp
->request_r_scale
);
882 RFC 3168 states that:
883 - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
884 to handle the TCP ECE flag, even if you also later send a
885 non-ECN-setup SYN/SYN-ACK.
886 - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
889 It is not clear how the ECE flag would ever be set if you never
890 set the IP ECT flag on outbound packets. All the same, we use
891 the TE_SETUPSENT to indicate that we have committed to handling
892 the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
893 whether or not we should set the IP ECT flag on outbound packets.
896 * For a SYN-ACK, send an ECN setup SYN-ACK
898 if (tcp_ecn_inbound
&& (flags
& (TH_SYN
| TH_ACK
)) == (TH_SYN
| TH_ACK
)) {
899 if ((tp
->ecn_flags
& TE_SETUPRECEIVED
) != 0) {
900 if ((tp
->ecn_flags
& TE_SETUPSENT
) == 0) {
901 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
905 * Record that we sent the ECN-setup and default to
908 tp
->ecn_flags
|= (TE_SETUPSENT
| TE_SENDIPECT
);
912 * We sent an ECN-setup SYN-ACK but it was dropped.
913 * Fallback to non-ECN-setup SYN-ACK and clear flag
914 * that to indicate we should not send data with IP ECT set.
916 * Pretend we didn't receive an ECN-setup SYN.
918 tp
->ecn_flags
&= ~TE_SETUPRECEIVED
;
922 else if (tcp_ecn_outbound
&& (flags
& (TH_SYN
| TH_ACK
)) == TH_SYN
) {
923 if ((tp
->ecn_flags
& TE_SETUPSENT
) == 0) {
924 /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
925 flags
|= (TH_ECE
| TH_CWR
);
928 * Record that we sent the ECN-setup and default to
931 tp
->ecn_flags
|= (TE_SETUPSENT
| TE_SENDIPECT
);
935 * We sent an ECN-setup SYN but it was dropped.
936 * Fall back to no ECN and clear flag indicating
937 * we should send data with IP ECT set.
939 tp
->ecn_flags
&= ~TE_SENDIPECT
;
944 * Check if we should set the TCP CWR flag.
945 * CWR flag is sent when we reduced the congestion window because
946 * we received a TCP ECE or we performed a fast retransmit. We
947 * never set the CWR flag on retransmitted packets. We only set
948 * the CWR flag on data packets. Pure acks don't have this set.
950 if ((tp
->ecn_flags
& TE_SENDCWR
) != 0 && len
!= 0 &&
951 !SEQ_LT(tp
->snd_nxt
, tp
->snd_max
)) {
953 tp
->ecn_flags
&= ~TE_SENDCWR
;
957 * Check if we should set the TCP ECE flag.
959 if ((tp
->ecn_flags
& TE_SENDECE
) != 0 && len
== 0) {
964 * Send a timestamp and echo-reply if this is a SYN and our side
965 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
966 * and our peer have sent timestamps in our SYN's.
968 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_NOOPT
)) == TF_REQ_TSTMP
&&
969 (flags
& TH_RST
) == 0 &&
970 ((flags
& TH_ACK
) == 0 ||
971 (tp
->t_flags
& TF_RCVD_TSTMP
))) {
972 u_int32_t
*lp
= (u_int32_t
*)(opt
+ optlen
);
974 /* Form timestamp option as shown in appendix A of RFC 1323. */
975 *lp
++ = htonl(TCPOPT_TSTAMP_HDR
);
976 *lp
++ = htonl(tcp_now
);
977 *lp
= htonl(tp
->ts_recent
);
978 optlen
+= TCPOLEN_TSTAMP_APPA
;
981 if (tp
->sack_enable
&& ((tp
->t_flags
& TF_NOOPT
) == 0)) {
983 * Tack on the SACK permitted option *last*.
984 * And do padding of options after tacking this on.
985 * This is because of MSS, TS, WinScale and Signatures are
986 * all present, we have just 2 bytes left for the SACK
987 * permitted option, which is just enough.
990 * If this is the first SYN of connection (not a SYN
991 * ACK), include SACK permitted option. If this is a
992 * SYN ACK, include SACK permitted option if peer has
993 * already done so. This is only for active connect,
994 * since the syncache takes care of the passive connect.
996 if ((flags
& TH_SYN
) &&
997 (!(flags
& TH_ACK
) || (tp
->t_flags
& TF_SACK_PERMIT
))) {
999 bp
= (u_char
*)opt
+ optlen
;
1001 *bp
++ = TCPOPT_SACK_PERMITTED
;
1002 *bp
++ = TCPOLEN_SACK_PERMITTED
;
1003 optlen
+= TCPOLEN_SACK_PERMITTED
;
1007 * Send SACKs if necessary. This should be the last
1008 * option processed. Only as many SACKs are sent as
1009 * are permitted by the maximum options size.
1011 * In general, SACK blocks consume 8*n+2 bytes.
1012 * So a full size SACK blocks option is 34 bytes
1013 * (to generate 4 SACK blocks). At a minimum,
1014 * we need 10 bytes (to generate 1 SACK block).
1015 * If TCP Timestamps (12 bytes) and TCP Signatures
1016 * (18 bytes) are both present, we'll just have
1017 * 10 bytes for SACK options 40 - (12 + 18).
1019 if (TCPS_HAVEESTABLISHED(tp
->t_state
) &&
1020 (tp
->t_flags
& TF_SACK_PERMIT
) && tp
->rcv_numsacks
> 0 &&
1021 MAX_TCPOPTLEN
- optlen
- 2 >= TCPOLEN_SACK
) {
1022 int nsack
, sackoptlen
, padlen
;
1023 u_char
*bp
= (u_char
*)opt
+ optlen
;
1026 nsack
= (MAX_TCPOPTLEN
- optlen
- 2) / TCPOLEN_SACK
;
1027 nsack
= min(nsack
, tp
->rcv_numsacks
);
1028 sackoptlen
= (2 + nsack
* TCPOLEN_SACK
);
1031 * First we need to pad options so that the
1032 * SACK blocks can start at a 4-byte boundary
1033 * (sack option and length are at a 2 byte offset).
1035 padlen
= (MAX_TCPOPTLEN
- optlen
- sackoptlen
) % 4;
1037 while (padlen
-- > 0)
1040 tcpstat
.tcps_sack_send_blocks
++;
1041 *bp
++ = TCPOPT_SACK
;
1043 lp
= (u_int32_t
*)bp
;
1044 for (i
= 0; i
< nsack
; i
++) {
1045 struct sackblk sack
= tp
->sackblks
[i
];
1046 *lp
++ = htonl(sack
.start
);
1047 *lp
++ = htonl(sack
.end
);
1049 optlen
+= sackoptlen
;
1053 /* Pad TCP options to a 4 byte boundary */
1054 if (optlen
< MAX_TCPOPTLEN
&& (optlen
% sizeof(u_int32_t
))) {
1055 int pad
= sizeof(u_int32_t
) - (optlen
% sizeof(u_int32_t
));
1056 u_char
*bp
= (u_char
*)opt
+ optlen
;
1069 ipoptlen
= ip6_optlen(tp
->t_inpcb
);
1073 if (tp_inp_options
) {
1074 ipoptlen
= tp_inp_options
->m_len
-
1075 offsetof(struct ipoption
, ipopt_list
);
1080 ipoptlen
+= ipsec_optlen
;
1084 * Adjust data length if insertion of options will
1085 * bump the packet length beyond the t_maxopd length.
1086 * Clear the FIN bit because we cut off the tail of
1089 * When doing TSO limit a burst to TCP_MAXWIN minus the
1090 * IP, TCP and Options length to keep ip->ip_len from
1091 * overflowing. Prevent the last segment from being
1092 * fractional thus making them all equal sized and set
1093 * the flag to continue sending. TSO is disabled when
1094 * IP options or IPSEC are present.
1096 if (len
+ optlen
+ ipoptlen
> tp
->t_maxopd
) {
1098 * If there is still more to send, don't close the connection.
1104 tso_maxlen
= tp
->tso_max_segment_size
? tp
->tso_max_segment_size
: TCP_MAXWIN
;
1106 if (len
> tso_maxlen
- hdrlen
- optlen
) {
1107 len
= tso_maxlen
- hdrlen
- optlen
;
1108 len
= len
- (len
% (tp
->t_maxopd
- optlen
));
1110 } else if (tp
->t_flags
& TF_NEEDFIN
)
1113 len
= tp
->t_maxopd
- optlen
- ipoptlen
;
1118 /*#ifdef DIAGNOSTIC*/
1120 if (max_linkhdr
+ hdrlen
> MCLBYTES
)
1121 panic("tcphdr too big");
1123 if (max_linkhdr
+ hdrlen
> MHLEN
)
1124 panic("tcphdr too big");
1129 * Grab a header mbuf, attaching a copy of data to
1130 * be transmitted, and initialize the header from
1131 * the template for sends on this connection.
1134 if (tp
->t_force
&& len
== 1)
1135 tcpstat
.tcps_sndprobe
++;
1136 else if (SEQ_LT(tp
->snd_nxt
, tp
->snd_max
) || sack_rxmit
) {
1137 tcpstat
.tcps_sndrexmitpack
++;
1138 tcpstat
.tcps_sndrexmitbyte
+= len
;
1140 tcpstat
.tcps_sndpack
++;
1141 tcpstat
.tcps_sndbyte
+= len
;
1144 if ((m
= m_copypack(so
->so_snd
.sb_mb
, off
,
1145 (int)len
, max_linkhdr
+ hdrlen
)) == 0) {
1150 * m_copypack left space for our hdr; use it.
1153 m
->m_data
-= hdrlen
;
1156 * try to use the new interface that allocates all
1157 * the necessary mbuf hdrs under 1 mbuf lock and
1158 * avoids rescanning the socket mbuf list if
1159 * certain conditions are met. This routine can't
1160 * be used in the following cases...
1161 * 1) the protocol headers exceed the capacity of
1162 * a single mbuf header's data area (no cluster attached)
1163 * 2) the length of the data being transmitted plus
1164 * the protocol headers fits into a single mbuf header's
1165 * data area (no cluster attached)
1169 if (MHLEN
< hdrlen
+ max_linkhdr
) {
1170 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1175 MCLGET(m
, M_DONTWAIT
);
1176 if ((m
->m_flags
& M_EXT
) == 0) {
1181 m
->m_data
+= max_linkhdr
;
1185 if (len
<= MHLEN
- hdrlen
- max_linkhdr
) {
1187 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1192 m
->m_data
+= max_linkhdr
;
1195 /* makes sure we still have data left to be sent at this point */
1196 if (so
->so_snd
.sb_mb
== NULL
|| off
< 0) {
1197 if (m
!= NULL
) m_freem(m
);
1198 error
= 0; /* should we return an error? */
1201 m_copydata(so
->so_snd
.sb_mb
, off
, (int) len
,
1202 mtod(m
, caddr_t
) + hdrlen
);
1206 m
->m_next
= m_copy(so
->so_snd
.sb_mb
, off
, (int) len
);
1207 if (m
->m_next
== 0) {
1214 * determine whether the mbuf pointer and offset passed back by the 'last' call
1215 * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1216 * changed (due to an incoming ACK for instance), or the offset into the chain we
1217 * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1218 * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1219 * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1220 * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list.
1221 * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1223 if (m_head
!= so
->so_snd
.sb_mb
|| sack_rxmit
|| last_off
!= off
)
1225 last_off
= off
+ len
;
1226 m_head
= so
->so_snd
.sb_mb
;
1228 /* makes sure we still have data left to be sent at this point */
1229 if (m_head
== NULL
) {
1230 error
= 0; /* should we return an error? */
1235 * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1236 * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1238 if ((m
= m_copym_with_hdrs(so
->so_snd
.sb_mb
, off
, len
, M_DONTWAIT
, &m_last
, &m_off
)) == NULL
) {
1242 m
->m_data
+= max_linkhdr
;
1248 * If we're sending everything we've got, set PUSH.
1249 * (This will keep happy those implementations which only
1250 * give data to the user when a buffer fills or
1253 if (off
+ len
== so
->so_snd
.sb_cc
)
1256 if (tp
->t_flags
& TF_ACKNOW
)
1257 tcpstat
.tcps_sndacks
++;
1258 else if (flags
& (TH_SYN
|TH_FIN
|TH_RST
))
1259 tcpstat
.tcps_sndctrl
++;
1260 else if (SEQ_GT(tp
->snd_up
, tp
->snd_una
))
1261 tcpstat
.tcps_sndurg
++;
1263 tcpstat
.tcps_sndwinup
++;
1265 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1271 if (isipv6
&& (MHLEN
< hdrlen
+ max_linkhdr
) &&
1273 MH_ALIGN(m
, hdrlen
);
1276 m
->m_data
+= max_linkhdr
;
1279 m
->m_pkthdr
.rcvif
= 0;
1281 mac_mbuf_label_associate_inpcb(tp
->t_inpcb
, m
);
1285 ip6
= mtod(m
, struct ip6_hdr
*);
1286 th
= (struct tcphdr
*)(ip6
+ 1);
1287 tcp_fillheaders(tp
, ip6
, th
);
1291 ip
= mtod(m
, struct ip
*);
1292 ipov
= (struct ipovly
*)ip
;
1293 th
= (struct tcphdr
*)(ip
+ 1);
1294 /* this picks up the pseudo header (w/o the length) */
1295 tcp_fillheaders(tp
, ip
, th
);
1296 if ((tp
->ecn_flags
& TE_SENDIPECT
) != 0 && len
&&
1297 !SEQ_LT(tp
->snd_nxt
, tp
->snd_max
)) {
1298 ip
->ip_tos
= IPTOS_ECN_ECT0
;
1303 * Fill in fields, remembering maximum advertised
1304 * window for use in delaying messages about window sizes.
1305 * If resending a FIN, be sure not to use a new sequence number.
1307 if (flags
& TH_FIN
&& tp
->t_flags
& TF_SENTFIN
&&
1308 tp
->snd_nxt
== tp
->snd_max
)
1311 * If we are doing retransmissions, then snd_nxt will
1312 * not reflect the first unsent octet. For ACK only
1313 * packets, we do not want the sequence number of the
1314 * retransmitted packet, we want the sequence number
1315 * of the next unsent octet. So, if there is no data
1316 * (and no SYN or FIN), use snd_max instead of snd_nxt
1317 * when filling in ti_seq. But if we are in persist
1318 * state, snd_max might reflect one byte beyond the
1319 * right edge of the window, so use snd_nxt in that
1320 * case, since we know we aren't doing a retransmission.
1321 * (retransmit and persist are mutually exclusive...)
1323 if (sack_rxmit
== 0) {
1324 if (len
|| (flags
& (TH_SYN
|TH_FIN
)) || tp
->t_timer
[TCPT_PERSIST
])
1325 th
->th_seq
= htonl(tp
->snd_nxt
);
1327 th
->th_seq
= htonl(tp
->snd_max
);
1329 th
->th_seq
= htonl(p
->rxmit
);
1331 tp
->sackhint
.sack_bytes_rexmit
+= len
;
1333 th
->th_ack
= htonl(tp
->rcv_nxt
);
1334 tp
->last_ack_sent
= tp
->rcv_nxt
;
1337 bcopy(opt
, th
+ 1, optlen
);
1338 th
->th_off
= (sizeof (struct tcphdr
) + optlen
) >> 2;
1340 th
->th_flags
= flags
;
1342 * Calculate receive window. Don't shrink window,
1343 * but avoid silly window syndrome.
1345 if (recwin
< (int32_t)(so
->so_rcv
.sb_hiwat
/ 4) && recwin
< (int)tp
->t_maxseg
)
1347 if (recwin
< (int32_t)(tp
->rcv_adv
- tp
->rcv_nxt
))
1348 recwin
= (int32_t)(tp
->rcv_adv
- tp
->rcv_nxt
);
1349 if (tp
->t_flags
& TF_SLOWLINK
&& slowlink_wsize
> 0) {
1350 if (recwin
> (int32_t)slowlink_wsize
)
1351 recwin
= slowlink_wsize
;
1352 th
->th_win
= htons((u_short
) (recwin
>>tp
->rcv_scale
));
1355 if (recwin
> (int32_t)(TCP_MAXWIN
<< tp
->rcv_scale
))
1356 recwin
= (int32_t)(TCP_MAXWIN
<< tp
->rcv_scale
);
1357 th
->th_win
= htons((u_short
) (recwin
>>tp
->rcv_scale
));
1361 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1362 * a 0 window. This may cause the remote transmitter to stall. This
1363 * flag tells soreceive() to disable delayed acknowledgements when
1364 * draining the buffer. This can occur if the receiver is attempting
1365 * to read more data than can be buffered prior to transmitting on
1369 tp
->t_flags
|= TF_RXWIN0SENT
;
1371 tp
->t_flags
&= ~TF_RXWIN0SENT
;
1372 if (SEQ_GT(tp
->snd_up
, tp
->snd_nxt
)) {
1373 th
->th_urp
= htons((u_short
)(tp
->snd_up
- tp
->snd_nxt
));
1374 th
->th_flags
|= TH_URG
;
1377 * If no urgent pointer to send, then we pull
1378 * the urgent pointer to the left edge of the send window
1379 * so that it doesn't drift into the send window on sequence
1380 * number wraparound.
1382 tp
->snd_up
= tp
->snd_una
; /* drag it along */
1385 * Put TCP length in extended header, and then
1386 * checksum extended header and data.
1388 m
->m_pkthdr
.len
= hdrlen
+ len
; /* in6_cksum() need this */
1392 * ip6_plen does not need to be filled now, and will be filled
1395 th
->th_sum
= in6_cksum(m
, IPPROTO_TCP
, sizeof(struct ip6_hdr
),
1396 sizeof(struct tcphdr
) + optlen
+ len
);
1400 m
->m_pkthdr
.csum_flags
= CSUM_TCP
;
1401 m
->m_pkthdr
.csum_data
= offsetof(struct tcphdr
, th_sum
);
1403 th
->th_sum
= in_addword(th
->th_sum
,
1404 htons((u_short
)(optlen
+ len
)));
1408 * Enable TSO and specify the size of the segments.
1409 * The TCP pseudo header checksum is always provided.
1410 * XXX: Fixme: This is currently not the case for IPv6.
1415 m
->m_pkthdr
.csum_flags
= CSUM_TSO_IPV6
;
1418 m
->m_pkthdr
.csum_flags
= CSUM_TSO_IPV4
;
1420 m
->m_pkthdr
.tso_segsz
= tp
->t_maxopd
- optlen
;
1423 m
->m_pkthdr
.tso_segsz
= 0;
1426 * In transmit state, time the transmission and arrange for
1427 * the retransmit. In persist state, just set snd_max.
1429 if (tp
->t_force
== 0 || tp
->t_timer
[TCPT_PERSIST
] == 0) {
1430 tcp_seq startseq
= tp
->snd_nxt
;
1433 * Advance snd_nxt over sequence space of this segment.
1435 if (flags
& (TH_SYN
|TH_FIN
)) {
1438 if (flags
& TH_FIN
) {
1440 tp
->t_flags
|= TF_SENTFIN
;
1446 if (SEQ_GT(tp
->snd_nxt
, tp
->snd_max
)) {
1447 tp
->snd_max
= tp
->snd_nxt
;
1449 * Time this transmission if not a retransmission and
1450 * not currently timing anything.
1452 if (tp
->t_rtttime
== 0) {
1454 tp
->t_rtseq
= startseq
;
1455 tcpstat
.tcps_segstimed
++;
1460 * Set retransmit timer if not currently set,
1461 * and not doing an ack or a keep-alive probe.
1462 * Initial value for retransmit timer is smoothed
1463 * round-trip time + 2 * round-trip time variance.
1464 * Initialize shift counter which is used for backoff
1465 * of retransmit time.
1468 if (tp
->t_timer
[TCPT_REXMT
] == 0 &&
1469 ((sack_rxmit
&& tp
->snd_nxt
!= tp
->snd_max
) ||
1470 tp
->snd_nxt
!= tp
->snd_una
)) {
1471 if (tp
->t_timer
[TCPT_PERSIST
]) {
1472 tp
->t_timer
[TCPT_PERSIST
] = 0;
1475 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
1479 * Persist case, update snd_max but since we are in
1480 * persist mode (no window) we do not update snd_nxt.
1485 if (flags
& TH_FIN
) {
1487 tp
->t_flags
|= TF_SENTFIN
;
1489 if (SEQ_GT(tp
->snd_nxt
+ xlen
, tp
->snd_max
))
1490 tp
->snd_max
= tp
->snd_nxt
+ len
;
1497 if (so_options
& SO_DEBUG
)
1498 tcp_trace(TA_OUTPUT
, tp
->t_state
, tp
, mtod(m
, void *), th
, 0);
1502 * Fill in IP length and desired time to live and
1503 * send to IP level. There should be a better way
1504 * to handle ttl and tos; we could keep them in
1505 * the template, but need a way to checksum without them.
1508 * m->m_pkthdr.len should have been set before checksum calculation,
1509 * because in6_cksum() needs it.
1514 * we separately set hoplimit for every segment, since the
1515 * user might want to change the value via setsockopt.
1516 * Also, desired default hop limit might be changed via
1517 * Neighbor Discovery.
1519 ip6
->ip6_hlim
= in6_selecthlim(tp
->t_inpcb
,
1520 tp
->t_inpcb
->in6p_route
.ro_rt
?
1521 tp
->t_inpcb
->in6p_route
.ro_rt
->rt_ifp
1524 /* TODO: IPv6 IP6TOS_ECT bit on */
1526 if (ipsec_bypass
== 0 && ipsec_setsocket(m
, so
) != 0) {
1532 m
->m_pkthdr
.socket_id
= socket_id
;
1533 error
= ip6_output(m
,
1535 &tp
->t_inpcb
->in6p_route
,
1536 (so_options
& SO_DONTROUTE
), NULL
, NULL
, 0);
1540 ip
->ip_len
= m
->m_pkthdr
.len
;
1543 ip
->ip_ttl
= in6_selecthlim(tp
->t_inpcb
,
1544 tp
->t_inpcb
->in6p_route
.ro_rt
?
1545 tp
->t_inpcb
->in6p_route
.ro_rt
->rt_ifp
1549 ip
->ip_ttl
= tp
->t_inpcb
->inp_ip_ttl
; /* XXX */
1550 ip
->ip_tos
|= (tp
->t_inpcb
->inp_ip_tos
& ~IPTOS_ECN_MASK
); /* XXX */
1555 KERNEL_DEBUG(DBG_LAYER_BEG
,
1556 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
1557 (((tp
->t_inpcb
->in6p_laddr
.s6_addr16
[0] & 0xffff) << 16) |
1558 (tp
->t_inpcb
->in6p_faddr
.s6_addr16
[0] & 0xffff)),
1564 KERNEL_DEBUG(DBG_LAYER_BEG
,
1565 ((tp
->t_inpcb
->inp_fport
<< 16) | tp
->t_inpcb
->inp_lport
),
1566 (((tp
->t_inpcb
->inp_laddr
.s_addr
& 0xffff) << 16) |
1567 (tp
->t_inpcb
->inp_faddr
.s_addr
& 0xffff)),
1572 * See if we should do MTU discovery.
1573 * Look at the flag updated on the following criteria:
1574 * 1) Path MTU discovery is authorized by the sysctl
1575 * 2) The route isn't set yet (unlikely but could happen)
1576 * 3) The route is up
1577 * 4) the MTU is not locked (if it is, then discovery has been
1578 * disabled for that route)
1581 if (path_mtu_discovery
&& (tp
->t_flags
& TF_PMTUD
))
1582 ip
->ip_off
|= IP_DF
;
1585 if (ipsec_bypass
== 0)
1586 ipsec_setsocket(m
, so
);
1590 * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1593 m
->m_pkthdr
.socket_id
= socket_id
;
1594 m
->m_nextpkt
= NULL
;
1595 tp
->t_pktlist_sentlen
+= len
;
1597 if (tp
->t_pktlist_head
!= NULL
) {
1598 tp
->t_pktlist_tail
->m_nextpkt
= m
;
1599 tp
->t_pktlist_tail
= m
;
1601 packchain_newlist
++;
1602 tp
->t_pktlist_head
= tp
->t_pktlist_tail
= m
;
1605 if (sendalot
== 0 || (tp
->t_state
!= TCPS_ESTABLISHED
) ||
1606 (tp
->snd_cwnd
<= (tp
->snd_wnd
/ 8)) ||
1607 (tp
->t_flags
& (TH_PUSH
| TF_ACKNOW
)) || tp
->t_force
!= 0 ||
1608 tp
->t_lastchain
>= tcp_packet_chaining
) {
1610 while (!(tp
->t_flags
& TF_SENDINPROG
) &&
1611 tp
->t_pktlist_head
!= NULL
) {
1612 packetlist
= tp
->t_pktlist_head
;
1613 packchain_listadd
= tp
->t_lastchain
;
1615 lost
= tp
->t_pktlist_sentlen
;
1616 TCP_PKTLIST_CLEAR(tp
);
1617 tp
->t_flags
|= TF_SENDINPROG
;
1619 error
= tcp_ip_output(so
, tp
, packetlist
,
1620 packchain_listadd
, tp_inp_options
,
1621 (so_options
& SO_DONTROUTE
), (sack_rxmit
| (sack_bytes_rxmt
!= 0)));
1623 tp
->t_flags
&= ~TF_SENDINPROG
;
1626 * Take into account the rest of unsent
1627 * packets in the packet list for this tcp
1628 * into "lost", since we're about to free
1629 * the whole list below.
1631 lost
+= tp
->t_pktlist_sentlen
;
1637 /* tcp was closed while we were in ip; resume close */
1638 if ((tp
->t_flags
& (TF_CLOSING
|TF_SENDINPROG
)) == TF_CLOSING
) {
1639 tp
->t_flags
&= ~TF_CLOSING
;
1640 (void) tcp_close(tp
);
1647 tcpstat
.tcps_sndtotal
++;
1649 if (recwin
> 0 && SEQ_GT(tp
->rcv_nxt
+recwin
, tp
->rcv_adv
))
1650 tp
->rcv_adv
= tp
->rcv_nxt
+ recwin
;
1651 tp
->last_ack_sent
= tp
->rcv_nxt
;
1652 tp
->t_flags
&= ~(TF_ACKNOW
|TF_DELACK
);
1658 * Assume that the packets were lost, so back out the
1659 * sequence number advance, if any. Note that the "lost"
1660 * variable represents the amount of user data sent during
1661 * the recent call to ip_output_list() plus the amount of
1662 * user data in the packet list for this tcp at the moment.
1664 if (tp
->t_force
== 0 || tp
->t_timer
[TCPT_PERSIST
] == 0) {
1666 * No need to check for TH_FIN here because
1667 * the TF_SENTFIN flag handles that case.
1669 if ((flags
& TH_SYN
) == 0) {
1672 tp
->sackhint
.sack_bytes_rexmit
-= lost
;
1674 tp
->snd_nxt
-= lost
;
1678 if (tp
->t_pktlist_head
!= NULL
)
1679 m_freem_list(tp
->t_pktlist_head
);
1680 TCP_PKTLIST_CLEAR(tp
);
1682 if (error
== ENOBUFS
) {
1683 if (!tp
->t_timer
[TCPT_REXMT
] &&
1684 !tp
->t_timer
[TCPT_PERSIST
])
1685 tp
->t_timer
[TCPT_REXMT
] = tp
->t_rxtcur
;
1687 tp
->snd_cwnd
= tp
->t_maxseg
;
1688 tp
->t_bytes_acked
= 0;
1690 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1693 if (error
== EMSGSIZE
) {
1695 * ip_output() will have already fixed the route
1696 * for us. tcp_mtudisc() will, as its last action,
1697 * initiate retransmission, so it is important to
1700 * If TSO was active we either got an interface
1701 * without TSO capabilities or TSO was turned off.
1702 * Disable it for this connection as well and
1703 * immediately retry with MSS sized segments generated
1707 tp
->t_flags
&= ~TF_TSO
;
1709 tcp_mtudisc(tp
->t_inpcb
, 0);
1710 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1713 if ((error
== EHOSTUNREACH
|| error
== ENETDOWN
)
1714 && TCPS_HAVERCVDSYN(tp
->t_state
)) {
1715 tp
->t_softerror
= error
;
1716 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1719 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1723 tcpstat
.tcps_sndtotal
++;
1726 * Data sent (as far as we can tell).
1727 * If this advertises a larger window than any other segment,
1728 * then remember the size of the advertised window.
1729 * Any pending ACK has now been sent.
1731 if (recwin
> 0 && SEQ_GT(tp
->rcv_nxt
+ recwin
, tp
->rcv_adv
))
1732 tp
->rcv_adv
= tp
->rcv_nxt
+ recwin
;
1733 tp
->last_ack_sent
= tp
->rcv_nxt
;
1734 tp
->t_flags
&= ~(TF_ACKNOW
|TF_DELACK
);
1736 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT
| DBG_FUNC_END
,0,0,0,0,0);
1737 if (sendalot
&& (!tcp_do_newreno
|| --maxburst
))
1743 tcp_ip_output(struct socket
*so
, struct tcpcb
*tp
, struct mbuf
*pkt
,
1744 int cnt
, struct mbuf
*opt
, int flags
, int sack_in_progress
)
1748 boolean_t unlocked
= FALSE
;
1749 struct inpcb
*inp
= tp
->t_inpcb
;
1750 struct ip_out_args ipoa
;
1753 /* If socket was bound to an ifindex, tell ip_output about it */
1754 ipoa
.ipoa_ifscope
= (inp
->inp_flags
& INP_BOUND_IF
) ?
1755 inp
->inp_boundif
: IFSCOPE_NONE
;
1756 flags
|= IP_OUTARGS
;
1758 /* Copy the cached route and take an extra reference */
1759 inp_route_copyout(inp
, &ro
);
1762 * Make sure ACK/DELACK conditions are cleared before
1763 * we unlock the socket.
1765 tp
->t_flags
&= ~(TF_ACKNOW
| TF_DELACK
);
1768 * If allowed, unlock TCP socket while in IP
1769 * but only if the connection is established and
1770 * if we're not sending from an upcall.
1772 if (tcp_output_unlocked
&& ((so
->so_flags
& SOF_UPCALLINUSE
) == 0) &&
1773 (tp
->t_state
== TCPS_ESTABLISHED
) && (sack_in_progress
== 0)) {
1775 socket_unlock(so
, 0);
1779 * Don't send down a chain of packets when:
1780 * - TCP chaining is disabled
1781 * - there is an IPsec rule set
1782 * - there is a non default rule set for the firewall
1785 chain
= tcp_packet_chaining
> 1
1790 && (fw_enable
== 0 || fw_bypass
)
1792 ; // I'm important, not extraneous
1795 while (pkt
!= NULL
) {
1796 struct mbuf
*npkt
= pkt
->m_nextpkt
;
1799 pkt
->m_nextpkt
= NULL
;
1801 * If we are not chaining, make sure to set the packet
1802 * list count to 0 so that IP takes the right path;
1803 * this is important for cases such as IPSec where a
1804 * single mbuf might result in multiple mbufs as part
1805 * of the encapsulation. If a non-zero count is passed
1806 * down to IP, the head of the chain might change and
1807 * we could end up skipping it (thus generating bogus
1808 * packets). Fixing it in IP would be desirable, but
1809 * for now this would do it.
1813 error
= ip_output_list(pkt
, cnt
, opt
, &ro
, flags
, 0, &ipoa
);
1814 if (chain
|| error
) {
1816 * If we sent down a chain then we are done since
1817 * the callee had taken care of everything; else
1818 * we need to free the rest of the chain ourselves.
1830 /* Synchronize cached PCB route */
1831 inp_route_copyin(inp
, &ro
);
1838 register struct tcpcb
*tp
;
1840 int t
= ((tp
->t_srtt
>> 2) + tp
->t_rttvar
) >> 1;
1842 if (tp
->t_timer
[TCPT_REXMT
])
1843 panic("tcp_setpersist: retransmit pending");
1845 * Start/restart persistence timer.
1847 TCPT_RANGESET(tp
->t_timer
[TCPT_PERSIST
],
1848 t
* tcp_backoff
[tp
->t_rxtshift
],
1849 TCPTV_PERSMIN
, TCPTV_PERSMAX
);
1850 if (tp
->t_rxtshift
< TCP_MAXRXTSHIFT
)