bsd/netinet/tcp_output.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_output.c        8.4 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #define _IP_VHL
  71
  72
  73 #include <sys/param.h>
  74 #include <sys/systm.h>
  75 #include <sys/kernel.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/domain.h>
  79 #include <sys/protosw.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82
  83 #include <net/route.h>
  84 #include <net/if_var.h>
  85
  86 #include <netinet/in.h>
  87 #include <netinet/in_systm.h>
  88 #include <netinet/in_var.h>
  89 #include <netinet/ip.h>
  90 #include <netinet/in_pcb.h>
  91 #include <netinet/ip_var.h>
  92 #if INET6
  93 #include <netinet6/in6_pcb.h>
  94 #include <netinet/ip6.h>
  95 #include <netinet6/ip6_var.h>
  96 #endif
  97 #include <netinet/tcp.h>
  98 #define TCPOUTFLAGS
  99 #include <netinet/tcp_fsm.h>
 100 #include <netinet/tcp_seq.h>
 101 #include <netinet/tcp_timer.h>
 102 #include <netinet/tcp_var.h>
 103 #include <netinet/tcpip.h>
 104 #if TCPDEBUG
 105 #include <netinet/tcp_debug.h>
 106 #endif
 107 #include <sys/kdebug.h>
 108
 109 #if IPSEC
 110 #include <netinet6/ipsec.h>
 111 #endif /*IPSEC*/
 112
 113 #if CONFIG_MACF_NET
 114 #include <security/mac_framework.h>
 115 #endif /* MAC_SOCKET */
 116
 117 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 1)
 118 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 3)
 119 #define DBG_FNC_TCP_OUTPUT      NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
 120
 121
 122 #ifdef notyet
 123 extern struct mbuf *m_copypack();
 124 #endif
 125
 126 int path_mtu_discovery = 1;
 127 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
 128         &path_mtu_discovery, 1, "Enable Path MTU Discovery");
 129
 130 int ss_fltsz = 1;
 131 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
 132         &ss_fltsz, 1, "Slow start flight size");
 133
 134 int ss_fltsz_local = 8; /* starts with eight segments max */
 135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
 136         &ss_fltsz_local, 1, "Slow start flight size for local networks");
 137
 138 int     tcp_do_newreno = 0;
 139 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
 140         0, "Enable NewReno Algorithms");
 141
 142 int     tcp_ecn_outbound = 0;
 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW, &tcp_ecn_outbound,
 144         0, "Initiate ECN for outbound connections");
 145
 146 int     tcp_ecn_inbound = 0;
 147 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW, &tcp_ecn_inbound,
 148         0, "Allow ECN negotiation for inbound connections");
 149
 150 int     tcp_packet_chaining = 50;
 151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining,
 152         0, "Enable TCP output packet chaining");
 153
 154 int     tcp_output_unlocked = 1;
 155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW, &tcp_output_unlocked,
 156         0, "Unlock TCP when sending packets down to IP");
 157
 158 static long packchain_newlist = 0;
 159 static long packchain_looped = 0;
 160 static long packchain_sent = 0;
 161
 162
 163 /* temporary: for testing */
 164 #if IPSEC
 165 extern int ipsec_bypass;
 166 #endif
 167
 168 extern int slowlink_wsize;      /* window correction for slow links */
 169 extern u_long  route_generation;
 170 #if IPFIREWALL
 171 extern int fw_enable;           /* firewall check for packet chaining */
 172 extern int fw_bypass;           /* firewall check: disable packet chaining if there is rules */
 173 #endif /* IPFIREWALL */
 174
 175 extern vm_size_t        so_cache_zone_element_size;
 176
 177 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
 178     struct mbuf *, int);
 179
 180 static __inline__ u_int16_t
 181 get_socket_id(struct socket * s)
 182 {
 183         u_int16_t               val;
 184
 185         if (so_cache_zone_element_size == 0) {
 186                 return (0);
 187         }
 188         val = (u_int16_t)(((u_int32_t)s) / so_cache_zone_element_size);
 189         if (val == 0) {
 190                 val = 0xffff;
 191         }
 192         return (val);
 193 }
 194
 195 /*
 196  * Tcp output routine: figure out what should be sent and send it.
 197  *
 198  * Returns:     0                       Success
 199  *              EADDRNOTAVAIL
 200  *              ENOBUFS
 201  *              EMSGSIZE
 202  *              EHOSTUNREACH
 203  *              ENETDOWN
 204  *      ip_output_list:ENOMEM
 205  *      ip_output_list:EADDRNOTAVAIL
 206  *      ip_output_list:ENETUNREACH
 207  *      ip_output_list:EHOSTUNREACH
 208  *      ip_output_list:EACCES
 209  *      ip_output_list:EMSGSIZE
 210  *      ip_output_list:ENOBUFS
 211  *      ip_output_list:???              [ignorable: mostly IPSEC/firewall/DLIL]
 212  *      ip6_output:???                  [IPV6 only]
 213  */
 214 int
 215 tcp_output(struct tcpcb *tp)
 216 {
 217         struct socket *so = tp->t_inpcb->inp_socket;
 218         long len, recwin, sendwin;
 219         int off, flags, error;
 220         register struct mbuf *m;
 221         struct ip *ip = NULL;
 222         register struct ipovly *ipov = NULL;
 223 #if INET6
 224         struct ip6_hdr *ip6 = NULL;
 225 #endif /* INET6 */
 226         register struct tcphdr *th;
 227         u_char opt[TCP_MAXOLEN];
 228         unsigned ipoptlen, optlen, hdrlen;
 229         int idle, sendalot, lost = 0;
 230         int i, sack_rxmit;
 231         int sack_bytes_rxmt;
 232         struct sackhole *p;
 233
 234         int maxburst = TCP_MAXBURST;
 235         int    last_off = 0;
 236         int    m_off;
 237         struct mbuf *m_last = NULL;
 238         struct mbuf *m_head = NULL;
 239         struct mbuf *packetlist = NULL;
 240         struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
 241 #if INET6
 242         int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ;
 243         struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts;
 244 #endif
 245         short packchain_listadd = 0;
 246         u_int16_t       socket_id = get_socket_id(so);
 247         int so_options = so->so_options;
 248         struct rtentry *rt;
 249
 250         /*
 251          * Determine length of data that should be transmitted,
 252          * and flags that will be used.
 253          * If there is some data or critical controls (SYN, RST)
 254          * to send, then transmit; otherwise, investigate further.
 255          */
 256         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 257         if (idle && tp->t_rcvtime >= tp->t_rxtcur) {
 258                 /*
 259                  * We have been idle for "a while" and no acks are
 260                  * expected to clock out any data we send --
 261                  * slow start to get ack "clock" running again.
 262                  *
 263                  * Set the slow-start flight size depending on whether
 264                  * this is a local network or not.
 265                  */
 266                 if (
 267 #if INET6
 268                     (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
 269                     (!isipv6 &&
 270 #endif
 271                      in_localaddr(tp->t_inpcb->inp_faddr)
 272 #if INET6
 273                      )
 274 #endif
 275                     )
 276                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
 277                 else
 278                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
 279         }
 280         tp->t_flags &= ~TF_LASTIDLE;
 281         if (idle) {
 282                 if (tp->t_flags & TF_MORETOCOME) {
 283                         tp->t_flags |= TF_LASTIDLE;
 284                         idle = 0;
 285                 }
 286         }
 287 again:
 288         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 289
 290 #if INET6
 291         if (isipv6) {
 292
 293                 KERNEL_DEBUG(DBG_LAYER_BEG,
 294                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 295                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 296                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
 297                      sendalot,0,0);
 298         }
 299         else
 300 #endif
 301
 302         {
 303                 KERNEL_DEBUG(DBG_LAYER_BEG,
 304                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 305                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
 306                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
 307                      sendalot,0,0);
 308         /*
 309          * If the route generation id changed, we need to check that our
 310          * local (source) IP address is still valid. If it isn't either
 311          * return error or silently do nothing (assuming the address will
 312          * come back before the TCP connection times out).
 313          */
 314         rt = tp->t_inpcb->inp_route.ro_rt;
 315         if (rt != NULL && rt->generation_id != route_generation) {
 316                 struct ifnet *ifp;
 317
 318                 /* disable multipages at the socket */
 319                 somultipages(so, FALSE);
 320
 321                 /* check that the source address is still valid */
 322                 if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == 0) {
 323
 324                         if (tp->t_state >= TCPS_CLOSE_WAIT) {
 325                                 tcp_drop(tp, EADDRNOTAVAIL);
 326                                 return(EADDRNOTAVAIL);
 327                         }
 328
 329                         /* set Retransmit  timer if it wasn't set
 330                          * reset Persist timer and shift register as the
 331                          * adversed peer window may not be valid anymore
 332                          */
 333
 334                         if (!tp->t_timer[TCPT_REXMT]) {
 335                                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 336                                 if (tp->t_timer[TCPT_PERSIST]) {
 337                                         tp->t_timer[TCPT_PERSIST] = 0;
 338                                         tp->t_rxtshift = 0;
 339                                 }
 340                         }
 341
 342                         if (tp->t_pktlist_head != NULL)
 343                                 m_freem_list(tp->t_pktlist_head);
 344                         TCP_PKTLIST_CLEAR(tp);
 345
 346                         /* drop connection if source address isn't available */
 347                         if (so->so_flags & SOF_NOADDRAVAIL) {
 348                                 tcp_drop(tp, EADDRNOTAVAIL);
 349                                 return(EADDRNOTAVAIL);
 350                         }
 351                         else
 352                                 return(0); /* silently ignore, keep data in socket: address may be back */
 353                 }
 354
 355                 /*
 356                  * Address is still valid; check for multipages capability
 357                  * again in case the outgoing interface has changed.
 358                  */
 359                 lck_mtx_lock(rt_mtx);
 360                 rt = tp->t_inpcb->inp_route.ro_rt;
 361                 if (rt != NULL && (ifp = rt->rt_ifp) != NULL)
 362                         somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
 363                 if (rt != NULL && rt->generation_id != route_generation)
 364                         rt->generation_id = route_generation;
 365                 /*
 366                  * See if we should do MTU discovery. Don't do it if:
 367                  *      1) it is disabled via the sysctl
 368                  *      2) the route isn't up
 369                  *      3) the MTU is locked (if it is, then discovery has been
 370                  *         disabled)
 371                  */
 372
 373                 if (!path_mtu_discovery || ((rt != NULL) &&
 374                     (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
 375                         tp->t_flags &= ~TF_PMTUD;
 376                 else
 377                         tp->t_flags |= TF_PMTUD;
 378
 379                 lck_mtx_unlock(rt_mtx);
 380         }
 381         }
 382
 383         /*
 384          * If we've recently taken a timeout, snd_max will be greater than
 385          * snd_nxt.  There may be SACK information that allows us to avoid
 386          * resending already delivered data.  Adjust snd_nxt accordingly.
 387          */
 388         if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
 389                 tcp_sack_adjust(tp);
 390         sendalot = 0;
 391         off = tp->snd_nxt - tp->snd_una;
 392         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 393
 394         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
 395                 sendwin = min(sendwin, slowlink_wsize);
 396
 397         flags = tcp_outflags[tp->t_state];
 398         /*
 399          * Send any SACK-generated retransmissions.  If we're explicitly trying
 400          * to send out new data (when sendalot is 1), bypass this function.
 401          * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 402          * we're replacing a (future) new transmission with a retransmission
 403          * now, and we previously incremented snd_cwnd in tcp_input().
 404          */
 405         /*
 406          * Still in sack recovery , reset rxmit flag to zero.
 407          */
 408         sack_rxmit = 0;
 409         sack_bytes_rxmt = 0;
 410         len = 0;
 411         p = NULL;
 412         if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
 413             (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 414                 long cwin;
 415
 416                 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 417                 if (cwin < 0)
 418                         cwin = 0;
 419                 /* Do not retransmit SACK segments beyond snd_recover */
 420                 if (SEQ_GT(p->end, tp->snd_recover)) {
 421                         /*
 422                          * (At least) part of sack hole extends beyond
 423                          * snd_recover. Check to see if we can rexmit data
 424                          * for this hole.
 425                          */
 426                         if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 427                                 /*
 428                                  * Can't rexmit any more data for this hole.
 429                                  * That data will be rexmitted in the next
 430                                  * sack recovery episode, when snd_recover
 431                                  * moves past p->rxmit.
 432                                  */
 433                                 p = NULL;
 434                                 goto after_sack_rexmit;
 435                         } else
 436                                 /* Can rexmit part of the current hole */
 437                                 len = ((long)ulmin(cwin,
 438                                                    tp->snd_recover - p->rxmit));
 439                 } else
 440                         len = ((long)ulmin(cwin, p->end - p->rxmit));
 441                 off = p->rxmit - tp->snd_una;
 442                 if (len > 0) {
 443                         sack_rxmit = 1;
 444                         sendalot = 1;
 445                         tcpstat.tcps_sack_rexmits++;
 446                         tcpstat.tcps_sack_rexmit_bytes +=
 447                             min(len, tp->t_maxseg);
 448                 }
 449         }
 450 after_sack_rexmit:
 451         /*
 452          * Get standard flags, and add SYN or FIN if requested by 'hidden'
 453          * state flags.
 454          */
 455         if (tp->t_flags & TF_NEEDFIN)
 456                 flags |= TH_FIN;
 457         if (tp->t_flags & TF_NEEDSYN)
 458                 flags |= TH_SYN;
 459
 460         /*
 461          * If in persist timeout with window of 0, send 1 byte.
 462          * Otherwise, if window is small but nonzero
 463          * and timer expired, we will send what we can
 464          * and go to transmit state.
 465          */
 466         if (tp->t_force) {
 467                 if (sendwin == 0) {
 468                         /*
 469                          * If we still have some data to send, then
 470                          * clear the FIN bit.  Usually this would
 471                          * happen below when it realizes that we
 472                          * aren't sending all the data.  However,
 473                          * if we have exactly 1 byte of unsent data,
 474                          * then it won't clear the FIN bit below,
 475                          * and if we are in persist state, we wind
 476                          * up sending the packet without recording
 477                          * that we sent the FIN bit.
 478                          *
 479                          * We can't just blindly clear the FIN bit,
 480                          * because if we don't have any more data
 481                          * to send then the probe will be the FIN
 482                          * itself.
 483                          */
 484                         if (off < so->so_snd.sb_cc)
 485                                 flags &= ~TH_FIN;
 486                         sendwin = 1;
 487                 } else {
 488                         tp->t_timer[TCPT_PERSIST] = 0;
 489                         tp->t_rxtshift = 0;
 490                 }
 491         }
 492
 493         /*
 494          * If snd_nxt == snd_max and we have transmitted a FIN, the
 495          * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 496          * a negative length.  This can also occur when TCP opens up
 497          * its congestion window while receiving additional duplicate
 498          * acks after fast-retransmit because TCP will reset snd_nxt
 499          * to snd_max after the fast-retransmit.
 500          *
 501          * In the normal retransmit-FIN-only case, however, snd_nxt will
 502          * be set to snd_una, the offset will be 0, and the length may
 503          * wind up 0.
 504          *
 505          * If sack_rxmit is true we are retransmitting from the scoreboard
 506          * in which case len is already set.
 507          */
 508         if (sack_rxmit == 0) {
 509                 if (sack_bytes_rxmt == 0)
 510                         len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 511                 else {
 512                         long cwin;
 513
 514                         /*
 515                          * We are inside of a SACK recovery episode and are
 516                          * sending new data, having retransmitted all the
 517                          * data possible in the scoreboard.
 518                          */
 519                         len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
 520                                - off);
 521                         /*
 522                          * Don't remove this (len > 0) check !
 523                          * We explicitly check for len > 0 here (although it
 524                          * isn't really necessary), to work around a gcc
 525                          * optimization issue - to force gcc to compute
 526                          * len above. Without this check, the computation
 527                          * of len is bungled by the optimizer.
 528                          */
 529                         if (len > 0) {
 530                                 cwin = tp->snd_cwnd -
 531                                         (tp->snd_nxt - tp->sack_newdata) -
 532                                         sack_bytes_rxmt;
 533                                 if (cwin < 0)
 534                                         cwin = 0;
 535                                 len = lmin(len, cwin);
 536                         }
 537                 }
 538         }
 539
 540         /*
 541          * Lop off SYN bit if it has already been sent.  However, if this
 542          * is SYN-SENT state and if segment contains data and if we don't
 543          * know that foreign host supports TAO, suppress sending segment.
 544          */
 545         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 546                 flags &= ~TH_SYN;
 547                 off--, len++;
 548                 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
 549                         while (!(tp->t_flags & TF_SENDINPROG) &&
 550                             tp->t_pktlist_head != NULL) {
 551                                 packetlist = tp->t_pktlist_head;
 552                                 packchain_listadd = tp->t_lastchain;
 553                                 packchain_sent++;
 554                                 TCP_PKTLIST_CLEAR(tp);
 555                                 tp->t_flags |= TF_SENDINPROG;
 556
 557                                 error = tcp_ip_output(so, tp, packetlist,
 558                                     packchain_listadd, tp_inp_options,
 559                                     (so_options & SO_DONTROUTE));
 560
 561                                 tp->t_flags &= ~TF_SENDINPROG;
 562                         }
 563                         /* tcp was closed while we were in ip; resume close */
 564                         if ((tp->t_flags &
 565                             (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 566                                 tp->t_flags &= ~TF_CLOSING;
 567                                 (void) tcp_close(tp);
 568                         }
 569                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
 570                             0,0,0,0,0);
 571                         return 0;
 572                 }
 573         }
 574
 575         /*
 576          * Be careful not to send data and/or FIN on SYN segments.
 577          * This measure is needed to prevent interoperability problems
 578          * with not fully conformant TCP implementations.
 579          */
 580         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 581                 len = 0;
 582                 flags &= ~TH_FIN;
 583         }
 584
 585         if (len < 0) {
 586                 /*
 587                  * If FIN has been sent but not acked,
 588                  * but we haven't been called to retransmit,
 589                  * len will be < 0.  Otherwise, window shrank
 590                  * after we sent into it.  If window shrank to 0,
 591                  * cancel pending retransmit, pull snd_nxt back
 592                  * to (closed) window, and set the persist timer
 593                  * if it isn't already going.  If the window didn't
 594                  * close completely, just wait for an ACK.
 595                  */
 596                 len = 0;
 597                 if (sendwin == 0) {
 598                         tp->t_timer[TCPT_REXMT] = 0;
 599                         tp->t_rxtshift = 0;
 600                         tp->snd_nxt = tp->snd_una;
 601                         if (tp->t_timer[TCPT_PERSIST] == 0)
 602                                 tcp_setpersist(tp);
 603                 }
 604         }
 605
 606         /*
 607          * len will be >= 0 after this point.  Truncate to the maximum
 608          * segment length and ensure that FIN is removed if the length
 609          * no longer contains the last data byte.
 610          */
 611         if (len > tp->t_maxseg) {
 612                 len = tp->t_maxseg;
 613                 sendalot = 1;
 614         }
 615         if (sack_rxmit) {
 616                 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 617                         flags &= ~TH_FIN;
 618         } else {
 619                 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 620                         flags &= ~TH_FIN;
 621         }
 622
 623         recwin = tcp_sbspace(tp);
 624
 625         /*
 626          * Sender silly window avoidance.   We transmit under the following
 627          * conditions when len is non-zero:
 628          *
 629          *      - We have a full segment
 630          *      - This is the last buffer in a write()/send() and we are
 631          *        either idle or running NODELAY
 632          *      - we've timed out (e.g. persist timer)
 633          *      - we have more then 1/2 the maximum send window's worth of
 634          *        data (receiver may be limited the window size)
 635          *      - we need to retransmit
 636          */
 637         if (len) {
 638                 if (len == tp->t_maxseg) {
 639                         tp->t_flags |= TF_MAXSEGSNT;
 640                         goto send;
 641                 }
 642                 if (!(tp->t_flags & TF_MORETOCOME) &&
 643                     (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) &&
 644                     (tp->t_flags & TF_NOPUSH) == 0 &&
 645                     len + off >= so->so_snd.sb_cc) {
 646                         tp->t_flags &= ~TF_MAXSEGSNT;
 647                         goto send;
 648                 }
 649                 if (tp->t_force) {
 650                         tp->t_flags &= ~TF_MAXSEGSNT;
 651                         goto send;
 652                 }
 653                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 654                         tp->t_flags &= ~TF_MAXSEGSNT;
 655                         goto send;
 656                 }
 657                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 658                         tp->t_flags &= ~TF_MAXSEGSNT;
 659                         goto send;
 660                 }
 661                 if (sack_rxmit)
 662                         goto send;
 663         }
 664
 665         /*
 666          * Compare available window to amount of window
 667          * known to peer (as advertised window less
 668          * next expected input).  If the difference is at least two
 669          * max size segments, or at least 50% of the maximum possible
 670          * window, then want to send a window update to peer.
 671          * Skip this if the connection is in T/TCP half-open state.
 672          */
 673         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
 674                 /*
 675                  * "adv" is the amount we can increase the window,
 676                  * taking into account that we are limited by
 677                  * TCP_MAXWIN << tp->rcv_scale.
 678                  */
 679                 long adv = lmin(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
 680                         (tp->rcv_adv - tp->rcv_nxt);
 681
 682                 if (adv >= (long) (2 * tp->t_maxseg)) {
 683
 684                         /*
 685                          * Update only if the resulting scaled value of the window changed, or
 686                          * if there is a change in the sequence since the last ack.
 687                          * This avoids what appears as dupe ACKS (see rdar://5640997)
 688                          */
 689
 690                         if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin))
 691                                 goto send;
 692                 }
 693                 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
 694                                 goto send;
 695         }
 696
 697         /*
 698          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 699          * is also a catch-all for the retransmit timer timeout case.
 700          */
 701         if (tp->t_flags & TF_ACKNOW)
 702                 goto send;
 703         if ((flags & TH_RST) ||
 704             ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 705                 goto send;
 706         if (SEQ_GT(tp->snd_up, tp->snd_una))
 707                 goto send;
 708         /*
 709          * If our state indicates that FIN should be sent
 710          * and we have not yet done so, then we need to send.
 711          */
 712         if (flags & TH_FIN &&
 713             ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 714                 goto send;
 715         /*
 716          * In SACK, it is possible for tcp_output to fail to send a segment
 717          * after the retransmission timer has been turned off.  Make sure
 718          * that the retransmission timer is set.
 719          */
 720         if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) &&
 721                 tp->t_timer[TCPT_REXMT] == 0 &&
 722             tp->t_timer[TCPT_PERSIST] == 0) {
 723                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 724                         goto just_return;
 725         }
 726         /*
 727          * TCP window updates are not reliable, rather a polling protocol
 728          * using ``persist'' packets is used to ensure receipt of window
 729          * updates.  The three ``states'' for the output side are:
 730          *      idle                    not doing retransmits or persists
 731          *      persisting              to move a small or zero window
 732          *      (re)transmitting        and thereby not persisting
 733          *
 734          * tp->t_timer[TCPT_PERSIST]
 735          *      is set when we are in persist state.
 736          * tp->t_force
 737          *      is set when we are called to send a persist packet.
 738          * tp->t_timer[TCPT_REXMT]
 739          *      is set when we are retransmitting
 740          * The output side is idle when both timers are zero.
 741          *
 742          * If send window is too small, there is data to transmit, and no
 743          * retransmit or persist is pending, then go to persist state.
 744          * If nothing happens soon, send when timer expires:
 745          * if window is nonzero, transmit what we can,
 746          * otherwise force out a byte.
 747          */
 748         if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 749             tp->t_timer[TCPT_PERSIST] == 0) {
 750                 tp->t_rxtshift = 0;
 751                 tcp_setpersist(tp);
 752         }
 753 just_return:
 754         /*
 755          * If there is no reason to send a segment, just return,
 756          * but if there are some packets left in the packet list, send them now.
 757          */
 758         while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) {
 759                 packetlist = tp->t_pktlist_head;
 760                 packchain_listadd = tp->t_lastchain;
 761                 packchain_sent++;
 762                 TCP_PKTLIST_CLEAR(tp);
 763                 tp->t_flags |= TF_SENDINPROG;
 764
 765                 error = tcp_ip_output(so, tp, packetlist, packchain_listadd,
 766                     tp_inp_options, (so_options & SO_DONTROUTE));
 767
 768                 tp->t_flags &= ~TF_SENDINPROG;
 769         }
 770         /* tcp was closed while we were in ip; resume close */
 771         if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 772                 tp->t_flags &= ~TF_CLOSING;
 773                 (void) tcp_close(tp);
 774         }
 775         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 776         return (0);
 777
 778 send:
 779         /*
 780          * Before ESTABLISHED, force sending of initial options
 781          * unless TCP set not to do any options.
 782          * NOTE: we assume that the IP/TCP header plus TCP options
 783          * always fit in a single mbuf, leaving room for a maximum
 784          * link header, i.e.
 785          *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 786          */
 787         optlen = 0;
 788 #if INET6
 789         if (isipv6)
 790                 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 791         else
 792 #endif
 793         hdrlen = sizeof (struct tcpiphdr);
 794         if (flags & TH_SYN) {
 795                 tp->snd_nxt = tp->iss;
 796                 if ((tp->t_flags & TF_NOOPT) == 0) {
 797                         u_short mss;
 798
 799                         opt[0] = TCPOPT_MAXSEG;
 800                         opt[1] = TCPOLEN_MAXSEG;
 801                         mss = htons((u_short) tcp_mssopt(tp));
 802                         (void)memcpy(opt + 2, &mss, sizeof(mss));
 803                         optlen = TCPOLEN_MAXSEG;
 804
 805                         if ((tp->t_flags & TF_REQ_SCALE) &&
 806                             ((flags & TH_ACK) == 0 ||
 807                             (tp->t_flags & TF_RCVD_SCALE))) {
 808                                 *((u_int32_t *)(opt + optlen)) = htonl(
 809                                         TCPOPT_NOP << 24 |
 810                                         TCPOPT_WINDOW << 16 |
 811                                         TCPOLEN_WINDOW << 8 |
 812                                         tp->request_r_scale);
 813                                 optlen += 4;
 814                         }
 815                 }
 816
 817         }
 818
 819         /*
 820           RFC 3168 states that:
 821            - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
 822            to handle the TCP ECE flag, even if you also later send a
 823            non-ECN-setup SYN/SYN-ACK.
 824            - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
 825            the ip ECT flag.
 826
 827            It is not clear how the ECE flag would ever be set if you never
 828            set the IP ECT flag on outbound packets. All the same, we use
 829            the TE_SETUPSENT to indicate that we have committed to handling
 830            the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
 831            whether or not we should set the IP ECT flag on outbound packets.
 832          */
 833         /*
 834          * For a SYN-ACK, send an ECN setup SYN-ACK
 835          */
 836         if (tcp_ecn_inbound && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 837                 if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
 838                         if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 839                                 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
 840                                 flags |= TH_ECE;
 841
 842                                 /*
 843                                  * Record that we sent the ECN-setup and default to
 844                                  * setting IP ECT.
 845                                  */
 846                                 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 847                         }
 848                         else {
 849                                 /*
 850                                  * We sent an ECN-setup SYN-ACK but it was dropped.
 851                                  * Fall back to non-ECN-setup SYN-ACK and clear the flag
 852                                  * to indicate we should not send data with IP ECT set.
 853                                  *
 854                                  * Pretend we didn't receive an ECN-setup SYN.
 855                                  */
 856                                 tp->ecn_flags &= ~TE_SETUPRECEIVED;
 857                         }
 858                 }
 859         }
 860         else if (tcp_ecn_outbound && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
 861                 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 862                         /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
 863                         flags |= (TH_ECE | TH_CWR);
 864
 865                         /*
 866                          * Record that we sent the ECN-setup and default to
 867                          * setting IP ECT.
 868                          */
 869                         tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 870                 }
 871                 else {
 872                         /*
 873                          * We sent an ECN-setup SYN but it was dropped.
 874                          * Fall back to no ECN and clear flag indicating
 875                          * we should send data with IP ECT set.
 876                          */
 877                         tp->ecn_flags &= ~TE_SENDIPECT;
 878                 }
 879         }
 880
 881         /*
 882          * Check if we should set the TCP CWR flag.
 883          * CWR flag is sent when we reduced the congestion window because
 884          * we received a TCP ECE or we performed a fast retransmit. We
 885          * never set the CWR flag on retransmitted packets. We only set
 886          * the CWR flag on data packets. Pure acks don't have this set.
 887          */
 888         if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
 889                 !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 890                 flags |= TH_CWR;
 891                 tp->ecn_flags &= ~TE_SENDCWR;
 892         }
 893
 894         /*
 895          * Check if we should set the TCP ECE flag.
 896          */
 897         if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
 898                 flags |= TH_ECE;
 899         }
 900
 901         /*
 902          * Send a timestamp and echo-reply if this is a SYN and our side
 903          * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
 904          * and our peer have sent timestamps in our SYN's.
 905          */
 906         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 907             (flags & TH_RST) == 0 &&
 908             ((flags & TH_ACK) == 0 ||
 909              (tp->t_flags & TF_RCVD_TSTMP))) {
 910                 u_int32_t *lp = (u_int32_t *)(opt + optlen);
 911
 912                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 913                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 914                 *lp++ = htonl(tcp_now);
 915                 *lp   = htonl(tp->ts_recent);
 916                 optlen += TCPOLEN_TSTAMP_APPA;
 917         }
 918
 919         if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) {
 920                 /*
 921                  * Tack on the SACK permitted option *last*.
 922                  * And do padding of options after tacking this on.
 923                  * This is because if MSS, TS, WinScale and Signatures are
 924                  * all present, we have just 2 bytes left for the SACK
 925                  * permitted option, which is just enough.
 926                  */
 927                 /*
 928                  * If this is the first SYN of connection (not a SYN
 929                  * ACK), include SACK permitted option.  If this is a
 930                  * SYN ACK, include SACK permitted option if peer has
 931                  * already done so. This is only for active connect,
 932                  * since the syncache takes care of the passive connect.
 933                  */
 934                 if ((flags & TH_SYN) &&
 935                     (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
 936                         u_char *bp;
 937                         bp = (u_char *)opt + optlen;
 938
 939                         *bp++ = TCPOPT_SACK_PERMITTED;
 940                         *bp++ = TCPOLEN_SACK_PERMITTED;
 941                         optlen += TCPOLEN_SACK_PERMITTED;
 942                 }
 943
 944                 /*
 945                  * Send SACKs if necessary.  This should be the last
 946                  * option processed.  Only as many SACKs are sent as
 947                  * are permitted by the maximum options size.
 948                  *
 949                  * In general, SACK blocks consume 8*n+2 bytes.
 950                  * So a full size SACK blocks option is 34 bytes
 951                  * (to generate 4 SACK blocks).  At a minimum,
 952                  * we need 10 bytes (to generate 1 SACK block).
 953                  * If TCP Timestamps (12 bytes) and TCP Signatures
 954                  * (18 bytes) are both present, we'll just have
 955                  * 10 bytes for SACK options 40 - (12 + 18).
 956                  */
 957                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 958                     (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
 959                     MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
 960                         int nsack, sackoptlen, padlen;
 961                         u_char *bp = (u_char *)opt + optlen;
 962                         u_int32_t *lp;
 963
 964                         nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
 965                         nsack = min(nsack, tp->rcv_numsacks);
 966                         sackoptlen = (2 + nsack * TCPOLEN_SACK);
 967
 968                         /*
 969                          * First we need to pad options so that the
 970                          * SACK blocks can start at a 4-byte boundary
 971                          * (sack option and length are at a 2 byte offset).
 972                          */
 973                         padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
 974                         optlen += padlen;
 975                         while (padlen-- > 0)
 976                                 *bp++ = TCPOPT_NOP;
 977
 978                         tcpstat.tcps_sack_send_blocks++;
 979                         *bp++ = TCPOPT_SACK;
 980                         *bp++ = sackoptlen;
 981                         lp = (u_int32_t *)bp;
 982                         for (i = 0; i < nsack; i++) {
 983                                 struct sackblk sack = tp->sackblks[i];
 984                                 *lp++ = htonl(sack.start);
 985                                 *lp++ = htonl(sack.end);
 986                         }
 987                         optlen += sackoptlen;
 988                 }
 989         }
 990
 991         /* Pad TCP options to a 4 byte boundary */
 992         if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
 993                 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
 994                 u_char *bp = (u_char *)opt + optlen;
 995
 996                 optlen += pad;
 997                 while (pad) {
 998                         *bp++ = TCPOPT_EOL;
 999                         pad--;
1000                 }
1001         }
1002
1003         hdrlen += optlen;
1004
1005 #if INET6
1006         if (isipv6)
1007                 ipoptlen = ip6_optlen(tp->t_inpcb);
1008         else
1009 #endif
1010         {
1011                 if (tp_inp_options) {
1012                         ipoptlen = tp_inp_options->m_len -
1013                                 offsetof(struct ipoption, ipopt_list);
1014                 } else
1015                         ipoptlen = 0;
1016         }
1017 #if IPSEC
1018         if (ipsec_bypass == 0)
1019                 ipoptlen += ipsec_hdrsiz_tcp(tp);
1020 #endif
1021
1022         /*
1023          * Adjust data length if insertion of options will
1024          * bump the packet length beyond the t_maxopd length.
1025          * Clear the FIN bit because we cut off the tail of
1026          * the segment.
1027          */
1028         if (len + optlen + ipoptlen > tp->t_maxopd) {
1029                 /*
1030                  * If there is still more to send, don't close the connection.
1031                  */
1032                 flags &= ~TH_FIN;
1033                 len = tp->t_maxopd - optlen - ipoptlen;
1034                 sendalot = 1;
1035         }
1036
1037 /*#ifdef DIAGNOSTIC*/
1038 #if INET6
1039         if (max_linkhdr + hdrlen > MCLBYTES)
1040                 panic("tcphdr too big");
1041 #else
1042         if (max_linkhdr + hdrlen > MHLEN)
1043                 panic("tcphdr too big");
1044 #endif
1045 /*#endif*/
1046
1047         /*
1048          * Grab a header mbuf, attaching a copy of data to
1049          * be transmitted, and initialize the header from
1050          * the template for sends on this connection.
1051          */
1052         if (len) {
1053                 if (tp->t_force && len == 1)
1054                         tcpstat.tcps_sndprobe++;
1055                 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1056                         tcpstat.tcps_sndrexmitpack++;
1057                         tcpstat.tcps_sndrexmitbyte += len;
1058                 } else {
1059                         tcpstat.tcps_sndpack++;
1060                         tcpstat.tcps_sndbyte += len;
1061                 }
1062 #ifdef notyet
1063                 if ((m = m_copypack(so->so_snd.sb_mb, off,
1064                     (int)len, max_linkhdr + hdrlen)) == 0) {
1065                         error = ENOBUFS;
1066                         goto out;
1067                 }
1068                 /*
1069                  * m_copypack left space for our hdr; use it.
1070                  */
1071                 m->m_len += hdrlen;
1072                 m->m_data -= hdrlen;
1073 #else
1074                 /*
1075                  * try to use the new interface that allocates all
1076                  * the necessary mbuf hdrs under 1 mbuf lock and
1077                  * avoids rescanning the socket mbuf list if
1078                  * certain conditions are met.  This routine can't
1079                  * be used in the following cases...
1080                  * 1) the protocol headers exceed the capacity of
1081                  * a single mbuf header's data area (no cluster attached)
1082                  * 2) the length of the data being transmitted plus
1083                  * the protocol headers fits into a single mbuf header's
1084                  * data area (no cluster attached)
1085                  */
1086                 m = NULL;
1087 #if INET6
1088                 if (MHLEN < hdrlen + max_linkhdr) {
1089                         MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1090                         if (m == NULL) {
1091                                 error = ENOBUFS;
1092                                 goto out;
1093                         }
1094                         MCLGET(m, M_DONTWAIT);
1095                         if ((m->m_flags & M_EXT) == 0) {
1096                                 m_freem(m);
1097                                 error = ENOBUFS;
1098                                 goto out;
1099                         }
1100                         m->m_data += max_linkhdr;
1101                         m->m_len = hdrlen;
1102                 }
1103 #endif
1104                 if (len <= MHLEN - hdrlen - max_linkhdr) {
1105                         if (m == NULL) {
1106                                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1107                                 if (m == NULL) {
1108                                         error = ENOBUFS;
1109                                         goto out;
1110                                 }
1111                                 m->m_data += max_linkhdr;
1112                                 m->m_len = hdrlen;
1113                         }
1114                         /* makes sure we still have data left to be sent at this point */
1115                         if (so->so_snd.sb_mb == NULL || off == -1) {
1116                                 if (m != NULL)  m_freem(m);
1117                                 error = 0; /* should we return an error? */
1118                                 goto out;
1119                         }
1120                         m_copydata(so->so_snd.sb_mb, off, (int) len,
1121                             mtod(m, caddr_t) + hdrlen);
1122                         m->m_len += len;
1123                 } else {
1124                         if (m != NULL) {
1125                                 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
1126                                 if (m->m_next == 0) {
1127                                         (void) m_free(m);
1128                                         error = ENOBUFS;
1129                                         goto out;
1130                                 }
1131                         } else {
1132                                 /*
1133                                  * determine whether the mbuf pointer and offset passed back by the 'last' call
1134                                  * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1135                                  * changed (due to an incoming ACK for instance), or the offset into the chain we
1136                                  * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1137                                  * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1138                                  * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1139                                  * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list).
1140                                  * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1141                                  */
1142                                 if (m_head != so->so_snd.sb_mb || last_off != off)
1143                                         m_last = NULL;
1144                                 last_off = off + len;
1145                                 m_head = so->so_snd.sb_mb;
1146
1147                                 /* makes sure we still have data left to be sent at this point */
1148                                 if (m_head == NULL) {
1149                                         error = 0; /* should we return an error? */
1150                                         goto out;
1151                                 }
1152
1153                                 /*
1154                                  * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1155                                  * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1156                                  */
1157                                 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, (int) len, M_DONTWAIT, &m_last, &m_off)) == NULL) {
1158                                         error = ENOBUFS;
1159                                         goto out;
1160                                 }
1161                                 m->m_data += max_linkhdr;
1162                                 m->m_len = hdrlen;
1163                         }
1164                 }
1165 #endif
1166                 /*
1167                  * If we're sending everything we've got, set PUSH.
1168                  * (This will keep happy those implementations which only
1169                  * give data to the user when a buffer fills or
1170                  * a PUSH comes in.)
1171                  */
1172                 if (off + len == so->so_snd.sb_cc)
1173                         flags |= TH_PUSH;
1174         } else {
1175                 if (tp->t_flags & TF_ACKNOW)
1176                         tcpstat.tcps_sndacks++;
1177                 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1178                         tcpstat.tcps_sndctrl++;
1179                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1180                         tcpstat.tcps_sndurg++;
1181                 else
1182                         tcpstat.tcps_sndwinup++;
1183
1184                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1185                 if (m == NULL) {
1186                         error = ENOBUFS;
1187                         goto out;
1188                 }
1189 #if INET6
1190                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1191                     MHLEN >= hdrlen) {
1192                         MH_ALIGN(m, hdrlen);
1193                 } else
1194 #endif
1195                 m->m_data += max_linkhdr;
1196                 m->m_len = hdrlen;
1197         }
1198         m->m_pkthdr.rcvif = 0;
1199 #if CONFIG_MACF_NET
1200         mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
1201 #endif
1202 #if CONFIG_IP_EDGEHOLE
1203         ip_edgehole_mbuf_tag(tp->t_inpcb, m);
1204 #endif
1205 #if INET6
1206         if (isipv6) {
1207                 ip6 = mtod(m, struct ip6_hdr *);
1208                 th = (struct tcphdr *)(ip6 + 1);
1209                 tcp_fillheaders(tp, ip6, th);
1210         } else
1211 #endif /* INET6 */
1212         {
1213                 ip = mtod(m, struct ip *);
1214                 ipov = (struct ipovly *)ip;
1215                 th = (struct tcphdr *)(ip + 1);
1216                 /* this picks up the pseudo header (w/o the length) */
1217                 tcp_fillheaders(tp, ip, th);
1218                 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1219                         !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1220                         ip->ip_tos = IPTOS_ECN_ECT0;
1221                 }
1222         }
1223
1224         /*
1225          * Fill in fields, remembering maximum advertised
1226          * window for use in delaying messages about window sizes.
1227          * If resending a FIN, be sure not to use a new sequence number.
1228          */
1229         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1230             tp->snd_nxt == tp->snd_max)
1231                 tp->snd_nxt--;
1232         /*
1233          * If we are doing retransmissions, then snd_nxt will
1234          * not reflect the first unsent octet.  For ACK only
1235          * packets, we do not want the sequence number of the
1236          * retransmitted packet, we want the sequence number
1237          * of the next unsent octet.  So, if there is no data
1238          * (and no SYN or FIN), use snd_max instead of snd_nxt
1239          * when filling in ti_seq.  But if we are in persist
1240          * state, snd_max might reflect one byte beyond the
1241          * right edge of the window, so use snd_nxt in that
1242          * case, since we know we aren't doing a retransmission.
1243          * (retransmit and persist are mutually exclusive...)
1244          */
1245         if (sack_rxmit == 0) {
1246                 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
1247                         th->th_seq = htonl(tp->snd_nxt);
1248                 else
1249                         th->th_seq = htonl(tp->snd_max);
1250         } else {
1251                 th->th_seq = htonl(p->rxmit);
1252                 p->rxmit += len;
1253                 tp->sackhint.sack_bytes_rexmit += len;
1254         }
1255         th->th_ack = htonl(tp->rcv_nxt);
1256         tp->last_ack_sent = tp->rcv_nxt;
1257
1258         if (optlen) {
1259                 bcopy(opt, th + 1, optlen);
1260                 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1261         }
1262         th->th_flags = flags;
1263         /*
1264          * Calculate receive window.  Don't shrink window,
1265          * but avoid silly window syndrome.
1266          */
1267         if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg)
1268                 recwin = 0;
1269         if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
1270                 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
1271         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
1272                 if (recwin > (long)slowlink_wsize)
1273                         recwin = slowlink_wsize;
1274                         th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1275         }
1276         else {
1277                 if (recwin > (long)(TCP_MAXWIN << tp->rcv_scale))
1278                         recwin = (long)(TCP_MAXWIN << tp->rcv_scale);
1279                 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1280         }
1281
1282         /*
1283          * Adjust the RXWIN0SENT flag - indicate that we have advertised
1284          * a 0 window.  This may cause the remote transmitter to stall.  This
1285          * flag tells soreceive() to disable delayed acknowledgements when
1286          * draining the buffer.  This can occur if the receiver is attempting
1287          * to read more data than can be buffered prior to transmitting on
1288          * the connection.
1289          */
1290         if (recwin == 0)
1291                 tp->t_flags |= TF_RXWIN0SENT;
1292         else
1293                 tp->t_flags &= ~TF_RXWIN0SENT;
1294         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1295                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1296                 th->th_flags |= TH_URG;
1297         } else
1298                 /*
1299                  * If no urgent pointer to send, then we pull
1300                  * the urgent pointer to the left edge of the send window
1301                  * so that it doesn't drift into the send window on sequence
1302                  * number wraparound.
1303                  */
1304                 tp->snd_up = tp->snd_una;               /* drag it along */
1305
1306         /*
1307          * Put TCP length in extended header, and then
1308          * checksum extended header and data.
1309          */
1310         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1311 #if INET6
1312         if (isipv6)
1313                 /*
1314                  * ip6_plen does not need to be filled now, and will be filled
1315                  * in ip6_output.
1316                  */
1317                 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1318                                        sizeof(struct tcphdr) + optlen + len);
1319         else
1320 #endif /* INET6 */
1321         {
1322                 m->m_pkthdr.csum_flags = CSUM_TCP;
1323                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1324                 if (len + optlen)
1325                         th->th_sum = in_addword(th->th_sum,
1326                                 htons((u_short)(optlen + len)));
1327         }
1328
1329         /*
1330          * In transmit state, time the transmission and arrange for
1331          * the retransmit.  In persist state, just set snd_max.
1332          */
1333         if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1334                 tcp_seq startseq = tp->snd_nxt;
1335
1336                 /*
1337                  * Advance snd_nxt over sequence space of this segment.
1338                  */
1339                 if (flags & (TH_SYN|TH_FIN)) {
1340                         if (flags & TH_SYN)
1341                                 tp->snd_nxt++;
1342                         if (flags & TH_FIN) {
1343                                 tp->snd_nxt++;
1344                                 tp->t_flags |= TF_SENTFIN;
1345                         }
1346                 }
1347                 if (sack_rxmit)
1348                         goto timer;
1349                 tp->snd_nxt += len;
1350                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1351                         tp->snd_max = tp->snd_nxt;
1352                         /*
1353                          * Time this transmission if not a retransmission and
1354                          * not currently timing anything.
1355                          */
1356                         if (tp->t_rtttime == 0) {
1357                                 tp->t_rtttime = 1;
1358                                 tp->t_rtseq = startseq;
1359                                 tcpstat.tcps_segstimed++;
1360                         }
1361                 }
1362
1363                 /*
1364                  * Set retransmit timer if not currently set,
1365                  * and not doing an ack or a keep-alive probe.
1366                  * Initial value for retransmit timer is smoothed
1367                  * round-trip time + 2 * round-trip time variance.
1368                  * Initialize shift counter which is used for backoff
1369                  * of retransmit time.
1370                  */
1371 timer:
1372                 if (tp->t_timer[TCPT_REXMT] == 0 &&
1373                     ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1374                         tp->snd_nxt != tp->snd_una)) {
1375                         if (tp->t_timer[TCPT_PERSIST]) {
1376                                 tp->t_timer[TCPT_PERSIST] = 0;
1377                                 tp->t_rxtshift = 0;
1378                         }
1379                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1380                 }
1381         } else {
1382                 /*
1383                  * Persist case, update snd_max but since we are in
1384                  * persist mode (no window) we do not update snd_nxt.
1385                  */
1386                 int xlen = len;
1387                 if (flags & TH_SYN)
1388                         ++xlen;
1389                 if (flags & TH_FIN) {
1390                         ++xlen;
1391                         tp->t_flags |= TF_SENTFIN;
1392                 }
1393                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1394                         tp->snd_max = tp->snd_nxt + len;
1395         }
1396
1397 #if TCPDEBUG
1398         /*
1399          * Trace.
1400          */
1401         if (so_options & SO_DEBUG)
1402                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1403 #endif
1404
1405         /*
1406          * Fill in IP length and desired time to live and
1407          * send to IP level.  There should be a better way
1408          * to handle ttl and tos; we could keep them in
1409          * the template, but need a way to checksum without them.
1410          */
1411         /*
1412          * m->m_pkthdr.len should have been set before cksum calculation,
1413          * because in6_cksum() needs it.
1414          */
1415 #if INET6
1416         if (isipv6) {
1417                 /*
1418                  * we separately set hoplimit for every segment, since the
1419                  * user might want to change the value via setsockopt.
1420                  * Also, desired default hop limit might be changed via
1421                  * Neighbor Discovery.
1422                  */
1423                 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
1424                                                tp->t_inpcb->in6p_route.ro_rt ?
1425                                                tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1426                                                : NULL);
1427
1428                 /* TODO: IPv6 IP6TOS_ECT bit on */
1429 #if IPSEC
1430                 if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
1431                         m_freem(m);
1432                         error = ENOBUFS;
1433                         goto out;
1434                 }
1435 #endif /*IPSEC*/
1436                 m->m_pkthdr.socket_id = socket_id;
1437                 error = ip6_output(m,
1438                             inp6_pktopts,
1439                             &tp->t_inpcb->in6p_route,
1440                             (so_options & SO_DONTROUTE), NULL, NULL, 0);
1441         } else
1442 #endif /* INET6 */
1443     {
1444         ip->ip_len = m->m_pkthdr.len;
1445 #if INET6
1446         if (isipv6)
1447                 ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
1448                                             tp->t_inpcb->in6p_route.ro_rt ?
1449                                             tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1450                                             : NULL);
1451         else
1452 #endif /* INET6 */
1453         ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;   /* XXX */
1454         ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK);      /* XXX */
1455
1456
1457 #if INET6
1458         if (isipv6) {
1459                 KERNEL_DEBUG(DBG_LAYER_BEG,
1460                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1461                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
1462                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
1463                      0,0,0);
1464         }
1465         else
1466 #endif
1467         {
1468                 KERNEL_DEBUG(DBG_LAYER_BEG,
1469                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1470                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
1471                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
1472                      0,0,0);
1473         }
1474
1475         /*
1476          * See if we should do MTU discovery.
1477          * Look at the flag updated on the following criteria:
1478          *      1) Path MTU discovery is authorized by the sysctl
1479          *      2) The route isn't set yet (unlikely but could happen)
1480          *      3) The route is up
1481          *      4) the MTU is not locked (if it is, then discovery has been
1482          *         disabled for that route)
1483          */
1484
1485         if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
1486                 ip->ip_off |= IP_DF;
1487
1488 #if IPSEC
1489         if (ipsec_bypass == 0)
1490                 ipsec_setsocket(m, so);
1491 #endif /*IPSEC*/
1492
1493         /*
1494          * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1495          */
1496         lost = 0;
1497         m->m_pkthdr.socket_id = socket_id;
1498         m->m_nextpkt = NULL;
1499         tp->t_pktlist_sentlen += len;
1500         tp->t_lastchain++;
1501         if (tp->t_pktlist_head != NULL) {
1502                 tp->t_pktlist_tail->m_nextpkt = m;
1503                 tp->t_pktlist_tail = m;
1504         } else {
1505                 packchain_newlist++;
1506                 tp->t_pktlist_head = tp->t_pktlist_tail = m;
1507         }
1508
1509         if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
1510               (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
1511               (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 ||
1512               tp->t_lastchain >= tcp_packet_chaining) {
1513                 error = 0;
1514                 while (!(tp->t_flags & TF_SENDINPROG) &&
1515                     tp->t_pktlist_head != NULL) {
1516                         packetlist = tp->t_pktlist_head;
1517                         packchain_listadd = tp->t_lastchain;
1518                         packchain_sent++;
1519                         lost = tp->t_pktlist_sentlen;
1520                         TCP_PKTLIST_CLEAR(tp);
1521                         tp->t_flags |= TF_SENDINPROG;
1522
1523                         error = tcp_ip_output(so, tp, packetlist,
1524                             packchain_listadd, tp_inp_options,
1525                             (so_options & SO_DONTROUTE));
1526
1527                         tp->t_flags &= ~TF_SENDINPROG;
1528                         if (error) {
1529                                 /*
1530                                  * Take into account the rest of unsent
1531                                  * packets in the packet list for this tcp
1532                                  * into "lost", since we're about to free
1533                                  * the whole list below.
1534                                  */
1535                                 lost += tp->t_pktlist_sentlen;
1536                                 break;
1537                         } else {
1538                                 lost = 0;
1539                         }
1540                 }
1541                 /* tcp was closed while we were in ip; resume close */
1542                 if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
1543                         tp->t_flags &= ~TF_CLOSING;
1544                         (void) tcp_close(tp);
1545                         return (0);
1546                 }
1547         }
1548         else {
1549                 error = 0;
1550                 packchain_looped++;
1551                 tcpstat.tcps_sndtotal++;
1552
1553                 if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1554                         tp->rcv_adv = tp->rcv_nxt + recwin;
1555                 tp->last_ack_sent = tp->rcv_nxt;
1556                 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1557                 goto again;
1558         }
1559    }
1560         if (error) {
1561                 /*
1562                  * Assume that the packets were lost, so back out the
1563                  * sequence number advance, if any.  Note that the "lost"
1564                  * variable represents the amount of user data sent during
1565                  * the recent call to ip_output_list() plus the amount of
1566                  * user data in the packet list for this tcp at the moment.
1567                  */
1568                 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1569                         /*
1570                          * No need to check for TH_FIN here because
1571                          * the TF_SENTFIN flag handles that case.
1572                          */
1573                         if ((flags & TH_SYN) == 0) {
1574                                 if (sack_rxmit) {
1575                                         p->rxmit -= lost;
1576                                         tp->sackhint.sack_bytes_rexmit -= lost;
1577                                 } else
1578                                         tp->snd_nxt -= lost;
1579                         }
1580                 }
1581 out:
1582                 if (tp->t_pktlist_head != NULL)
1583                         m_freem_list(tp->t_pktlist_head);
1584                 TCP_PKTLIST_CLEAR(tp);
1585
1586                 if (error == ENOBUFS) {
1587                         if (!tp->t_timer[TCPT_REXMT] &&
1588                              !tp->t_timer[TCPT_PERSIST])
1589                                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1590                         tcp_quench(tp->t_inpcb, 0);
1591                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1592                         return (0);
1593                 }
1594                 if (error == EMSGSIZE) {
1595                         /*
1596                          * ip_output() will have already fixed the route
1597                          * for us.  tcp_mtudisc() will, as its last action,
1598                          * initiate retransmission, so it is important to
1599                          * not do so here.
1600                          */
1601                         tcp_mtudisc(tp->t_inpcb, 0);
1602                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1603                         return 0;
1604                 }
1605                 if ((error == EHOSTUNREACH || error == ENETDOWN)
1606                     && TCPS_HAVERCVDSYN(tp->t_state)) {
1607                         tp->t_softerror = error;
1608                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1609                         return (0);
1610                 }
1611                 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1612                 return (error);
1613         }
1614
1615         tcpstat.tcps_sndtotal++;
1616
1617         /*
1618          * Data sent (as far as we can tell).
1619          * If this advertises a larger window than any other segment,
1620          * then remember the size of the advertised window.
1621          * Any pending ACK has now been sent.
1622          */
1623         if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1624                 tp->rcv_adv = tp->rcv_nxt + recwin;
1625         tp->last_ack_sent = tp->rcv_nxt;
1626         tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1627
1628         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
1629         if (sendalot && (!tcp_do_newreno || --maxburst))
1630                 goto again;
1631         return (0);
1632 }
1633
1634 static int
1635 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
1636     int cnt, struct mbuf *opt, int flags)
1637 {
1638         int error = 0;
1639         boolean_t chain;
1640         boolean_t unlocked = FALSE;
1641
1642         /* Make sure ACK/DELACK conditions are cleared before
1643          * we unlock the socket.
1644          */
1645
1646         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1647         /*
1648          * If allowed, unlock TCP socket while in IP
1649          * but only if the connection is established and
1650          * if we're not sending from an upcall.
1651          */
1652
1653         if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) &&
1654             (tp->t_state == TCPS_ESTABLISHED)) {
1655                         unlocked = TRUE;
1656                         socket_unlock(so, 0);
1657         }
1658
1659         /*
1660          * Don't send down a chain of packets when:
1661          * - TCP chaining is disabled
1662          * - there is an IPsec rule set
1663          * - there is a non default rule set for the firewall
1664          */
1665
1666         chain = tcp_packet_chaining > 1
1667 #if IPSEC
1668                 && ipsec_bypass
1669 #endif
1670 #if IPFIREWALL
1671                 && (fw_enable == 0 || fw_bypass)
1672 #endif
1673                 ; // I'm important, not extraneous
1674
1675
1676         while (pkt != NULL) {
1677                 struct mbuf *npkt = pkt->m_nextpkt;
1678
1679                 if (!chain) {
1680                         pkt->m_nextpkt = NULL;
1681                         /*
1682                          * If we are not chaining, make sure to set the packet
1683                          * list count to 0 so that IP takes the right path;
1684                          * this is important for cases such as IPSec where a
1685                          * single mbuf might result in multiple mbufs as part
1686                          * of the encapsulation.  If a non-zero count is passed
1687                          * down to IP, the head of the chain might change and
1688                          * we could end up skipping it (thus generating bogus
1689                          * packets).  Fixing it in IP would be desirable, but
1690                          * for now this would do it.
1691                          */
1692                         cnt = 0;
1693                 }
1694 #if CONFIG_FORCE_OUT_IFP
1695                 error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
1696                     flags, 0, tp->t_inpcb->pdp_ifp);
1697 #else
1698                 error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
1699                     flags, 0, NULL);
1700 #endif
1701                 if (chain || error) {
1702                         /*
1703                          * If we sent down a chain then we are done since
1704                          * the callee had taken care of everything; else
1705                          * we need to free the rest of the chain ourselves.
1706                          */
1707                         if (!chain)
1708                                 m_freem_list(npkt);
1709                         break;
1710                 }
1711                 pkt = npkt;
1712         }
1713
1714         if (unlocked)
1715                 socket_lock(so, 0);
1716
1717         return (error);
1718 }
1719
1720 void
1721 tcp_setpersist(tp)
1722         register struct tcpcb *tp;
1723 {
1724         int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1725
1726         if (tp->t_timer[TCPT_REXMT])
1727                 panic("tcp_setpersist: retransmit pending");
1728         /*
1729          * Start/restart persistance timer.
1730          */
1731         TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1732             t * tcp_backoff[tp->t_rxtshift],
1733             TCPTV_PERSMIN, TCPTV_PERSMAX);
1734         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1735                 tp->t_rxtshift++;
1736 }