bsd/netinet/tcp_output.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_output.c        8.4 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #define _IP_VHL
  71
  72
  73 #include <sys/param.h>
  74 #include <sys/systm.h>
  75 #include <sys/kernel.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/domain.h>
  79 #include <sys/protosw.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82
  83 #include <net/route.h>
  84 #include <net/if_var.h>
  85
  86 #include <netinet/in.h>
  87 #include <netinet/in_systm.h>
  88 #include <netinet/in_var.h>
  89 #include <netinet/ip.h>
  90 #include <netinet/in_pcb.h>
  91 #include <netinet/ip_var.h>
  92 #if INET6
  93 #include <netinet6/in6_pcb.h>
  94 #include <netinet/ip6.h>
  95 #include <netinet6/ip6_var.h>
  96 #endif
  97 #include <netinet/tcp.h>
  98 #define TCPOUTFLAGS
  99 #include <netinet/tcp_fsm.h>
 100 #include <netinet/tcp_seq.h>
 101 #include <netinet/tcp_timer.h>
 102 #include <netinet/tcp_var.h>
 103 #include <netinet/tcpip.h>
 104 #if TCPDEBUG
 105 #include <netinet/tcp_debug.h>
 106 #endif
 107 #include <sys/kdebug.h>
 108
 109 #if IPSEC
 110 #include <netinet6/ipsec.h>
 111 #endif /*IPSEC*/
 112
 113 #if CONFIG_MACF_NET
 114 #include <security/mac_framework.h>
 115 #endif /* CONFIG_MACF_NET */
 116
 117 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 1)
 118 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 3)
 119 #define DBG_FNC_TCP_OUTPUT      NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
 120
 121
 122 #ifdef notyet
 123 extern struct mbuf *m_copypack();
 124 #endif
 125
 126 int path_mtu_discovery = 1;     /* when 0, tcp_output() clears TF_PMTUD while revalidating a stale route */
 127 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
 128         &path_mtu_discovery, 1, "Enable Path MTU Discovery");
 129
 130 int ss_fltsz = 1;       /* after an idle period, snd_cwnd = t_maxseg * ss_fltsz for non-local peers */
 131 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
 132         &ss_fltsz, 1, "Slow start flight size");
 133
 134 int ss_fltsz_local = 8; /* as ss_fltsz, but used when the peer is a local address; starts with eight segments max */
 135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
 136         &ss_fltsz_local, 1, "Slow start flight size for local networks");
 137
 138 int     tcp_do_newreno = 0;
 139 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
 140         0, "Enable NewReno Algorithms");
 141
 142 int     tcp_ecn_outbound = 0;
 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW, &tcp_ecn_outbound,
 144         0, "Initiate ECN for outbound connections");
 145
 146 int     tcp_ecn_inbound = 0;
 147 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW, &tcp_ecn_inbound,
 148         0, "Allow ECN negotiation for inbound connections");
 149
 150 int     tcp_packet_chaining = 50;       /* NOTE(review): default of 50 suggests a chain-length limit rather than an on/off flag -- confirm against its use */
 151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining,
 152         0, "Enable TCP output packet chaining");
 153
 154 int     tcp_output_unlocked = 1;
 155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW, &tcp_output_unlocked,
 156         0, "Unlock TCP when sending packets down to IP");
 157
 158 static long packchain_newlist = 0;
 159 static long packchain_looped = 0;
 160 static long packchain_sent = 0; /* incremented when tcp_output() hands the pending packet list to tcp_ip_output() */
 161
 162
 163 /* temporary: for testing */
 164 #if IPSEC
 165 extern int ipsec_bypass;
 166 #endif
 167
 168 extern int slowlink_wsize;      /* window correction for slow links */
 169 extern u_long  route_generation;
 170 #if IPFIREWALL
 171 extern int fw_enable;           /* firewall check for packet chaining */
 172 extern int fw_bypass;           /* firewall check: disable packet chaining if there is rules */
 173 #endif /* IPFIREWALL */
 174
 175 extern vm_size_t        so_cache_zone_element_size;
 176
 177 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
 178     struct mbuf *, int);
 179
 180 static __inline__ u_int16_t
 181 get_socket_id(struct socket * s)
 182 {
 183         u_int16_t               val;
 184
 185         if (so_cache_zone_element_size == 0) {
 186                 return (0);
 187         }
 188         val = (u_int16_t)(((u_int32_t)s) / so_cache_zone_element_size);
 189         if (val == 0) {
 190                 val = 0xffff;
 191         }
 192         return (val);
 193 }
 194
 195 /*
 196  * Tcp output routine: figure out what should be sent and send it.
 197  *
 198  * Returns:     0                       Success
 199  *              EADDRNOTAVAIL
 200  *              ENOBUFS
 201  *              EMSGSIZE
 202  *              EHOSTUNREACH
 203  *              ENETDOWN
 204  *      ip_output_list:ENOMEM
 205  *      ip_output_list:EADDRNOTAVAIL
 206  *      ip_output_list:ENETUNREACH
 207  *      ip_output_list:EHOSTUNREACH
 208  *      ip_output_list:EACCES
 209  *      ip_output_list:EMSGSIZE
 210  *      ip_output_list:ENOBUFS
 211  *      ip_output_list:???              [ignorable: mostly IPSEC/firewall/DLIL]
 212  *      ip6_output:???                  [IPV6 only]
 213  */
 214 int
 215 tcp_output(struct tcpcb *tp)
 216 {
 217         struct socket *so = tp->t_inpcb->inp_socket;
 218         long len, recwin, sendwin;
 219         int off, flags, error;
 220         register struct mbuf *m;
 221         struct ip *ip = NULL;
 222         register struct ipovly *ipov = NULL;
 223 #if INET6
 224         struct ip6_hdr *ip6 = NULL;
 225 #endif /* INET6 */
 226         register struct tcphdr *th;
 227         u_char opt[TCP_MAXOLEN];
 228         unsigned ipoptlen, optlen, hdrlen;
 229         int idle, sendalot, lost = 0;
 230         int i, sack_rxmit;
 231         int sack_bytes_rxmt;
 232         struct sackhole *p;
 233
 234         int maxburst = TCP_MAXBURST;
 235         int    last_off = 0;
 236         int    m_off;
 237         struct mbuf *m_last = NULL;
 238         struct mbuf *m_head = NULL;
 239         struct mbuf *packetlist = NULL;
 240         struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
 241 #if INET6
 242         int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ;
 243         struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts;
 244 #endif
 245         short packchain_listadd = 0;
 246         u_int16_t       socket_id = get_socket_id(so);
 247         int so_options = so->so_options;
 248         struct rtentry *rt;
 249
 250         /*
 251          * Determine length of data that should be transmitted,
 252          * and flags that will be used.
 253          * If there is some data or critical controls (SYN, RST)
 254          * to send, then transmit; otherwise, investigate further.
 255          */
 256         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 257         if (idle && tp->t_rcvtime >= tp->t_rxtcur) {
 258                 /*
 259                  * We have been idle for "a while" and no acks are
 260                  * expected to clock out any data we send --
 261                  * slow start to get ack "clock" running again.
 262                  *
 263                  * Set the slow-start flight size depending on whether
 264                  * this is a local network or not.
 265                  */
 266                 if (
 267 #if INET6
 268                     (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
 269                     (!isipv6 &&
 270 #endif
 271                      in_localaddr(tp->t_inpcb->inp_faddr)
 272 #if INET6
 273                      )
 274 #endif
 275                     )
 276                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
 277                 else
 278                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
 279         }
 280         tp->t_flags &= ~TF_LASTIDLE;
 281         if (idle) {
 282                 if (tp->t_flags & TF_MORETOCOME) {
 283                         tp->t_flags |= TF_LASTIDLE;
 284                         idle = 0;
 285                 }
 286         }
 287 again:
 288         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 289
 290 #if INET6
 291         if (isipv6) {
 292
 293                 KERNEL_DEBUG(DBG_LAYER_BEG,
 294                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 295                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 296                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
 297                      sendalot,0,0);
 298         }
 299         else
 300 #endif
 301
 302         {
 303                 KERNEL_DEBUG(DBG_LAYER_BEG,
 304                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 305                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
 306                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
 307                      sendalot,0,0);
 308         /*
 309          * If the route generation id changed, we need to check that our
 310          * local (source) IP address is still valid. If it isn't either
 311          * return error or silently do nothing (assuming the address will
 312          * come back before the TCP connection times out).
 313          */
 314         rt = tp->t_inpcb->inp_route.ro_rt;
 315         if (rt != NULL && rt->generation_id != route_generation) {
 316                 struct ifnet *ifp;
 317
 318                 /* disable multipages at the socket */
 319                 somultipages(so, FALSE);
 320
 321                 /* check that the source address is still valid */
 322                 if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == 0) {
 323
 324                         if (tp->t_state >= TCPS_CLOSE_WAIT) {
 325                                 tcp_drop(tp, EADDRNOTAVAIL);
 326                                 return(EADDRNOTAVAIL);
 327                         }
 328
 329                         /* set Retransmit  timer if it wasn't set
 330                          * reset Persist timer and shift register as the
 331                          * advertised peer window may not be valid anymore
 332                          */
 333
 334                         if (!tp->t_timer[TCPT_REXMT]) {
 335                                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 336                                 if (tp->t_timer[TCPT_PERSIST]) {
 337                                         tp->t_timer[TCPT_PERSIST] = 0;
 338                                         tp->t_rxtshift = 0;
 339                                 }
 340                         }
 341
 342                         if (tp->t_pktlist_head != NULL)
 343                                 m_freem_list(tp->t_pktlist_head);
 344                         TCP_PKTLIST_CLEAR(tp);
 345
 346                         /* drop connection if source address isn't available */
 347                         if (so->so_flags & SOF_NOADDRAVAIL) {
 348                                 tcp_drop(tp, EADDRNOTAVAIL);
 349                                 return(EADDRNOTAVAIL);
 350                         }
 351                         else
 352                                 return(0); /* silently ignore, keep data in socket: address may be back */
 353                 }
 354
 355                 /*
 356                  * Address is still valid; check for multipages capability
 357                  * again in case the outgoing interface has changed.
 358                  */
 359                 lck_mtx_lock(rt_mtx);
 360                 rt = tp->t_inpcb->inp_route.ro_rt;
 361                 if (rt != NULL && (ifp = rt->rt_ifp) != NULL)
 362                         somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
 363                 if (rt != NULL && rt->generation_id != route_generation)
 364                         rt->generation_id = route_generation;
 365                 /*
 366                  * See if we should do MTU discovery. Don't do it if:
 367                  *      1) it is disabled via the sysctl
 368                  *      2) the route isn't up
 369                  *      3) the MTU is locked (if it is, then discovery has been
 370                  *         disabled)
 371                  */
 372
 373                 if (!path_mtu_discovery || ((rt != NULL) &&
 374                     (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
 375                         tp->t_flags &= ~TF_PMTUD;
 376                 else
 377                         tp->t_flags |= TF_PMTUD;
 378
 379                 lck_mtx_unlock(rt_mtx);
 380         }
 381         }
 382
 383         /*
 384          * If we've recently taken a timeout, snd_max will be greater than
 385          * snd_nxt.  There may be SACK information that allows us to avoid
 386          * resending already delivered data.  Adjust snd_nxt accordingly.
 387          */
 388         if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
 389                 tcp_sack_adjust(tp);
 390         sendalot = 0;
 391         off = tp->snd_nxt - tp->snd_una;
 392         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 393
 394         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
 395                 sendwin = min(sendwin, slowlink_wsize);
 396
 397         flags = tcp_outflags[tp->t_state];
 398         /*
 399          * Send any SACK-generated retransmissions.  If we're explicitly trying
 400          * to send out new data (when sendalot is 1), bypass this function.
 401          * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 402          * we're replacing a (future) new transmission with a retransmission
 403          * now, and we previously incremented snd_cwnd in tcp_input().
 404          */
 405         /*
 406          * Still in sack recovery , reset rxmit flag to zero.
 407          */
 408         sack_rxmit = 0;
 409         sack_bytes_rxmt = 0;
 410         len = 0;
 411         p = NULL;
 412         if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
 413             (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 414                 long cwin;
 415
 416                 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 417                 if (cwin < 0)
 418                         cwin = 0;
 419                 /* Do not retransmit SACK segments beyond snd_recover */
 420                 if (SEQ_GT(p->end, tp->snd_recover)) {
 421                         /*
 422                          * (At least) part of sack hole extends beyond
 423                          * snd_recover. Check to see if we can rexmit data
 424                          * for this hole.
 425                          */
 426                         if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 427                                 /*
 428                                  * Can't rexmit any more data for this hole.
 429                                  * That data will be rexmitted in the next
 430                                  * sack recovery episode, when snd_recover
 431                                  * moves past p->rxmit.
 432                                  */
 433                                 p = NULL;
 434                                 goto after_sack_rexmit;
 435                         } else
 436                                 /* Can rexmit part of the current hole */
 437                                 len = ((long)ulmin(cwin,
 438                                                    tp->snd_recover - p->rxmit));
 439                 } else
 440                         len = ((long)ulmin(cwin, p->end - p->rxmit));
 441                 off = p->rxmit - tp->snd_una;
 442                 if (len > 0) {
 443                         sack_rxmit = 1;
 444                         sendalot = 1;
 445                         tcpstat.tcps_sack_rexmits++;
 446                         tcpstat.tcps_sack_rexmit_bytes +=
 447                             min(len, tp->t_maxseg);
 448                 }
 449         }
 450 after_sack_rexmit:
 451         /*
 452          * Get standard flags, and add SYN or FIN if requested by 'hidden'
 453          * state flags.
 454          */
 455         if (tp->t_flags & TF_NEEDFIN)
 456                 flags |= TH_FIN;
 457         if (tp->t_flags & TF_NEEDSYN)
 458                 flags |= TH_SYN;
 459
 460         /*
 461          * If in persist timeout with window of 0, send 1 byte.
 462          * Otherwise, if window is small but nonzero
 463          * and timer expired, we will send what we can
 464          * and go to transmit state.
 465          */
 466         if (tp->t_force) {
 467                 if (sendwin == 0) {
 468                         /*
 469                          * If we still have some data to send, then
 470                          * clear the FIN bit.  Usually this would
 471                          * happen below when it realizes that we
 472                          * aren't sending all the data.  However,
 473                          * if we have exactly 1 byte of unsent data,
 474                          * then it won't clear the FIN bit below,
 475                          * and if we are in persist state, we wind
 476                          * up sending the packet without recording
 477                          * that we sent the FIN bit.
 478                          *
 479                          * We can't just blindly clear the FIN bit,
 480                          * because if we don't have any more data
 481                          * to send then the probe will be the FIN
 482                          * itself.
 483                          */
 484                         if (off < so->so_snd.sb_cc)
 485                                 flags &= ~TH_FIN;
 486                         sendwin = 1;
 487                 } else {
 488                         tp->t_timer[TCPT_PERSIST] = 0;
 489                         tp->t_rxtshift = 0;
 490                 }
 491         }
 492
 493         /*
 494          * If snd_nxt == snd_max and we have transmitted a FIN, the
 495          * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 496          * a negative length.  This can also occur when TCP opens up
 497          * its congestion window while receiving additional duplicate
 498          * acks after fast-retransmit because TCP will reset snd_nxt
 499          * to snd_max after the fast-retransmit.
 500          *
 501          * In the normal retransmit-FIN-only case, however, snd_nxt will
 502          * be set to snd_una, the offset will be 0, and the length may
 503          * wind up 0.
 504          *
 505          * If sack_rxmit is true we are retransmitting from the scoreboard
 506          * in which case len is already set.
 507          */
 508         if (sack_rxmit == 0) {
 509                 if (sack_bytes_rxmt == 0)
 510                         len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 511                 else {
 512                         long cwin;
 513
 514                         /*
 515                          * We are inside of a SACK recovery episode and are
 516                          * sending new data, having retransmitted all the
 517                          * data possible in the scoreboard.
 518                          */
 519                         len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
 520                                - off);
 521                         /*
 522                          * Don't remove this (len > 0) check !
 523                          * We explicitly check for len > 0 here (although it
 524                          * isn't really necessary), to work around a gcc
 525                          * optimization issue - to force gcc to compute
 526                          * len above. Without this check, the computation
 527                          * of len is bungled by the optimizer.
 528                          */
 529                         if (len > 0) {
 530                                 cwin = tp->snd_cwnd -
 531                                         (tp->snd_nxt - tp->sack_newdata) -
 532                                         sack_bytes_rxmt;
 533                                 if (cwin < 0)
 534                                         cwin = 0;
 535                                 len = lmin(len, cwin);
 536                         }
 537                 }
 538         }
 539
 540         /*
 541          * Lop off SYN bit if it has already been sent.  However, if this
 542          * is SYN-SENT state and if segment contains data and if we don't
 543          * know that foreign host supports TAO, suppress sending segment.
 544          */
 545         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 546                 flags &= ~TH_SYN;
 547                 off--, len++;
 548                 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
 549                         while (!(tp->t_flags & TF_SENDINPROG) &&
 550                             tp->t_pktlist_head != NULL) {
 551                                 packetlist = tp->t_pktlist_head;
 552                                 packchain_listadd = tp->t_lastchain;
 553                                 packchain_sent++;
 554                                 TCP_PKTLIST_CLEAR(tp);
 555                                 tp->t_flags |= TF_SENDINPROG;
 556
 557                                 error = tcp_ip_output(so, tp, packetlist,
 558                                     packchain_listadd, tp_inp_options,
 559                                     (so_options & SO_DONTROUTE));
 560
 561                                 tp->t_flags &= ~TF_SENDINPROG;
 562                         }
 563                         /* tcp was closed while we were in ip; resume close */
 564                         if ((tp->t_flags &
 565                             (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 566                                 tp->t_flags &= ~TF_CLOSING;
 567                                 (void) tcp_close(tp);
 568                         }
 569                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
 570                             0,0,0,0,0);
 571                         return 0;
 572                 }
 573         }
 574
 575         /*
 576          * Be careful not to send data and/or FIN on SYN segments.
 577          * This measure is needed to prevent interoperability problems
 578          * with not fully conformant TCP implementations.
 579          */
 580         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 581                 len = 0;
 582                 flags &= ~TH_FIN;
 583         }
 584
 585         if (len < 0) {
 586                 /*
 587                  * If FIN has been sent but not acked,
 588                  * but we haven't been called to retransmit,
 589                  * len will be < 0.  Otherwise, window shrank
 590                  * after we sent into it.  If window shrank to 0,
 591                  * cancel pending retransmit, pull snd_nxt back
 592                  * to (closed) window, and set the persist timer
 593                  * if it isn't already going.  If the window didn't
 594                  * close completely, just wait for an ACK.
 595                  */
 596                 len = 0;
 597                 if (sendwin == 0) {
 598                         tp->t_timer[TCPT_REXMT] = 0;
 599                         tp->t_rxtshift = 0;
 600                         tp->snd_nxt = tp->snd_una;
 601                         if (tp->t_timer[TCPT_PERSIST] == 0)
 602                                 tcp_setpersist(tp);
 603                 }
 604         }
 605
 606         /*
 607          * len will be >= 0 after this point.  Truncate to the maximum
 608          * segment length and ensure that FIN is removed if the length
 609          * no longer contains the last data byte.
 610          */
 611         if (len > tp->t_maxseg) {
 612                 len = tp->t_maxseg;
 613                 sendalot = 1;
 614         }
 615         if (sack_rxmit) {
 616                 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 617                         flags &= ~TH_FIN;
 618         } else {
 619                 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 620                         flags &= ~TH_FIN;
 621         }
 622
 623         recwin = tcp_sbspace(tp);
 624
 625         /*
 626          * Sender silly window avoidance.   We transmit under the following
 627          * conditions when len is non-zero:
 628          *
 629          *      - We have a full segment
 630          *      - This is the last buffer in a write()/send() and we are
 631          *        either idle or running NODELAY
 632          *      - we've timed out (e.g. persist timer)
 633          *      - we have more than 1/2 the maximum send window's worth of
 634          *        data (receiver may be limiting the window size)
 635          *      - we need to retransmit
 636          */
 637         if (len) {
 638                 if (len == tp->t_maxseg) {
 639                         tp->t_flags |= TF_MAXSEGSNT;
 640                         goto send;
 641                 }
 642                 if (!(tp->t_flags & TF_MORETOCOME) &&
 643                     (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) &&
 644                     (tp->t_flags & TF_NOPUSH) == 0 &&
 645                     len + off >= so->so_snd.sb_cc) {
 646                         tp->t_flags &= ~TF_MAXSEGSNT;
 647                         goto send;
 648                 }
 649                 if (tp->t_force) {
 650                         tp->t_flags &= ~TF_MAXSEGSNT;
 651                         goto send;
 652                 }
 653                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 654                         tp->t_flags &= ~TF_MAXSEGSNT;
 655                         goto send;
 656                 }
 657                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 658                         tp->t_flags &= ~TF_MAXSEGSNT;
 659                         goto send;
 660                 }
 661                 if (sack_rxmit)
 662                         goto send;
 663         }
 664
 665         /*
 666          * Compare available window to amount of window
 667          * known to peer (as advertised window less
 668          * next expected input).  If the difference is at least two
 669          * max size segments, or at least 50% of the maximum possible
 670          * window, then want to send a window update to peer.
 671          * Skip this if the connection is in T/TCP half-open state.
 672          */
 673         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
 674                 /*
 675                  * "adv" is the amount we can increase the window,
 676                  * taking into account that we are limited by
 677                  * TCP_MAXWIN << tp->rcv_scale.
 678                  */
 679                 long adv = lmin(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
 680                         (tp->rcv_adv - tp->rcv_nxt);
 681
 682                 if (adv >= (long) (2 * tp->t_maxseg)) {
 683
 684                         /*
 685                          * Update only if the resulting scaled value of the window changed, or
 686                          * if there is a change in the sequence since the last ack.
 687                          * This avoids what appears as dupe ACKS (see rdar://5640997)
 688                          */
 689
 690                         if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin))
 691                                 goto send;
 692                 }
 693                 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
 694                                 goto send;
 695         }
 696
 697         /*
 698          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 699          * is also a catch-all for the retransmit timer timeout case.
 700          */
 701         if (tp->t_flags & TF_ACKNOW)
 702                 goto send;
 703         if ((flags & TH_RST) ||
 704             ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 705                 goto send;
 706         if (SEQ_GT(tp->snd_up, tp->snd_una))
 707                 goto send;
 708         /*
 709          * If our state indicates that FIN should be sent
 710          * and we have not yet done so, then we need to send.
 711          */
 712         if (flags & TH_FIN &&
 713             ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 714                 goto send;
 715         /*
 716          * In SACK, it is possible for tcp_output to fail to send a segment
 717          * after the retransmission timer has been turned off.  Make sure
 718          * that the retransmission timer is set.
 719          */
 720         if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) &&
 721                 tp->t_timer[TCPT_REXMT] == 0 &&
 722             tp->t_timer[TCPT_PERSIST] == 0) {
 723                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 724                         goto just_return;
 725         }
 726         /*
 727          * TCP window updates are not reliable, rather a polling protocol
 728          * using ``persist'' packets is used to ensure receipt of window
 729          * updates.  The three ``states'' for the output side are:
 730          *      idle                    not doing retransmits or persists
 731          *      persisting              to move a small or zero window
 732          *      (re)transmitting        and thereby not persisting
 733          *
 734          * tp->t_timer[TCPT_PERSIST]
 735          *      is set when we are in persist state.
 736          * tp->t_force
 737          *      is set when we are called to send a persist packet.
 738          * tp->t_timer[TCPT_REXMT]
 739          *      is set when we are retransmitting
 740          * The output side is idle when both timers are zero.
 741          *
 742          * If send window is too small, there is data to transmit, and no
 743          * retransmit or persist is pending, then go to persist state.
 744          * If nothing happens soon, send when timer expires:
 745          * if window is nonzero, transmit what we can,
 746          * otherwise force out a byte.
 747          */
 748         if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 749             tp->t_timer[TCPT_PERSIST] == 0) {
 750                 tp->t_rxtshift = 0;
 751                 tcp_setpersist(tp);
 752         }
 753 just_return:
 754         /*
 755          * If there is no reason to send a segment, just return;
 756          * but if there are packets left in the packet list, send them now.
 757          */
 758         while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) {
 759                 packetlist = tp->t_pktlist_head;
 760                 packchain_listadd = tp->t_lastchain;
 761                 packchain_sent++;
 762                 TCP_PKTLIST_CLEAR(tp);
 763                 tp->t_flags |= TF_SENDINPROG;
 764
 765                 error = tcp_ip_output(so, tp, packetlist, packchain_listadd,
 766                     tp_inp_options, (so_options & SO_DONTROUTE));
 767
 768                 tp->t_flags &= ~TF_SENDINPROG;
 769         }
 770         /* tcp was closed while we were in ip; resume close */
 771         if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 772                 tp->t_flags &= ~TF_CLOSING;
 773                 (void) tcp_close(tp);
 774         }
 775         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 776         return (0);
 777
 778 send:
 779         /*
 780          * Before ESTABLISHED, force sending of initial options
 781          * unless TCP set not to do any options.
 782          * NOTE: we assume that the IP/TCP header plus TCP options
 783          * always fit in a single mbuf, leaving room for a maximum
 784          * link header, i.e.
 785          *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 786          */
 787         optlen = 0;
 788 #if INET6
 789         if (isipv6)
 790                 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 791         else
 792 #endif
 793         hdrlen = sizeof (struct tcpiphdr);
 794         if (flags & TH_SYN) {
 795                 tp->snd_nxt = tp->iss;
 796                 if ((tp->t_flags & TF_NOOPT) == 0) {
 797                         u_short mss;
 798
 799                         opt[0] = TCPOPT_MAXSEG;
 800                         opt[1] = TCPOLEN_MAXSEG;
 801                         mss = htons((u_short) tcp_mssopt(tp));
 802                         (void)memcpy(opt + 2, &mss, sizeof(mss));
 803                         optlen = TCPOLEN_MAXSEG;
 804
 805                         if ((tp->t_flags & TF_REQ_SCALE) &&
 806                             ((flags & TH_ACK) == 0 ||
 807                             (tp->t_flags & TF_RCVD_SCALE))) {
 808                                 *((u_int32_t *)(opt + optlen)) = htonl(
 809                                         TCPOPT_NOP << 24 |
 810                                         TCPOPT_WINDOW << 16 |
 811                                         TCPOLEN_WINDOW << 8 |
 812                                         tp->request_r_scale);
 813                                 optlen += 4;
 814                         }
 815                 }
 816
 817         }
 818
 819         /*
 820           RFC 3168 states that:
 821            - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
 822            to handle the TCP ECE flag, even if you also later send a
 823            non-ECN-setup SYN/SYN-ACK.
 824            - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
 825            the ip ECT flag.
 826
 827            It is not clear how the ECE flag would ever be set if you never
 828            set the IP ECT flag on outbound packets. All the same, we use
 829            the TE_SETUPSENT to indicate that we have committed to handling
 830            the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
 831            whether or not we should set the IP ECT flag on outbound packets.
 832          */
 833         /*
 834          * For a SYN-ACK, send an ECN setup SYN-ACK
 835          */
 836         if (tcp_ecn_inbound && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 837                 if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
 838                         if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 839                                 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
 840                                 flags |= TH_ECE;
 841
 842                                 /*
 843                                  * Record that we sent the ECN-setup and default to
 844                                  * setting IP ECT.
 845                                  */
 846                                 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 847                         }
 848                         else {
 849                                 /*
 850                                  * We sent an ECN-setup SYN-ACK but it was dropped.
 851                                  * Fall back to a non-ECN-setup SYN-ACK and clear the flag
 852                                  * to indicate that we should not send data with IP ECT set.
 853                                  *
 854                                  * Pretend we didn't receive an ECN-setup SYN.
 855                                  */
 856                                 tp->ecn_flags &= ~TE_SETUPRECEIVED;
 857                         }
 858                 }
 859         }
 860         else if (tcp_ecn_outbound && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
 861                 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 862                         /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
 863                         flags |= (TH_ECE | TH_CWR);
 864
 865                         /*
 866                          * Record that we sent the ECN-setup and default to
 867                          * setting IP ECT.
 868                          */
 869                         tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 870                 }
 871                 else {
 872                         /*
 873                          * We sent an ECN-setup SYN but it was dropped.
 874                          * Fall back to no ECN and clear flag indicating
 875                          * we should send data with IP ECT set.
 876                          */
 877                         tp->ecn_flags &= ~TE_SENDIPECT;
 878                 }
 879         }
 880
 881         /*
 882          * Check if we should set the TCP CWR flag.
 883          * CWR flag is sent when we reduced the congestion window because
 884          * we received a TCP ECE or we performed a fast retransmit. We
 885          * never set the CWR flag on retransmitted packets. We only set
 886          * the CWR flag on data packets. Pure acks don't have this set.
 887          */
 888         if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
 889                 !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 890                 flags |= TH_CWR;
 891                 tp->ecn_flags &= ~TE_SENDCWR;
 892         }
 893
 894         /*
 895          * Check if we should set the TCP ECE flag.
 896          */
 897         if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
 898                 flags |= TH_ECE;
 899         }
 900
 901         /*
 902          * Send a timestamp and echo-reply if this is a SYN and our side
 903          * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
 904          * and our peer have sent timestamps in our SYN's.
 905          */
 906         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 907             (flags & TH_RST) == 0 &&
 908             ((flags & TH_ACK) == 0 ||
 909              (tp->t_flags & TF_RCVD_TSTMP))) {
 910                 u_int32_t *lp = (u_int32_t *)(opt + optlen);
 911
 912                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 913                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 914                 *lp++ = htonl(tcp_now);
 915                 *lp   = htonl(tp->ts_recent);
 916                 optlen += TCPOLEN_TSTAMP_APPA;
 917         }
 918
 919         if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) {
 920                 /*
 921                  * Tack on the SACK permitted option *last*.
 922                  * And do padding of options after tacking this on.
 923                  * This is because if MSS, TS, WinScale and Signatures are
 924                  * all present, we have just 2 bytes left for the SACK
 925                  * permitted option, which is just enough.
 926                  */
 927                 /*
 928                  * If this is the first SYN of connection (not a SYN
 929                  * ACK), include SACK permitted option.  If this is a
 930                  * SYN ACK, include SACK permitted option if peer has
 931                  * already done so. This is only for active connect,
 932                  * since the syncache takes care of the passive connect.
 933                  */
 934                 if ((flags & TH_SYN) &&
 935                     (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
 936                         u_char *bp;
 937                         bp = (u_char *)opt + optlen;
 938
 939                         *bp++ = TCPOPT_SACK_PERMITTED;
 940                         *bp++ = TCPOLEN_SACK_PERMITTED;
 941                         optlen += TCPOLEN_SACK_PERMITTED;
 942                 }
 943
 944                 /*
 945                  * Send SACKs if necessary.  This should be the last
 946                  * option processed.  Only as many SACKs are sent as
 947                  * are permitted by the maximum options size.
 948                  *
 949                  * In general, SACK blocks consume 8*n+2 bytes.
 950                  * So a full size SACK blocks option is 34 bytes
 951                  * (to generate 4 SACK blocks).  At a minimum,
 952                  * we need 10 bytes (to generate 1 SACK block).
 953                  * If TCP Timestamps (12 bytes) and TCP Signatures
 954                  * (18 bytes) are both present, we'll just have
 955                  * 10 bytes for SACK options 40 - (12 + 18).
 956                  */
 957                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 958                     (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
 959                     MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
 960                         int nsack, sackoptlen, padlen;
 961                         u_char *bp = (u_char *)opt + optlen;
 962                         u_int32_t *lp;
 963
 964                         nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
 965                         nsack = min(nsack, tp->rcv_numsacks);
 966                         sackoptlen = (2 + nsack * TCPOLEN_SACK);
 967
 968                         /*
 969                          * First we need to pad options so that the
 970                          * SACK blocks can start at a 4-byte boundary
 971                          * (sack option and length are at a 2 byte offset).
 972                          */
 973                         padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
 974                         optlen += padlen;
 975                         while (padlen-- > 0)
 976                                 *bp++ = TCPOPT_NOP;
 977
 978                         tcpstat.tcps_sack_send_blocks++;
 979                         *bp++ = TCPOPT_SACK;
 980                         *bp++ = sackoptlen;
 981                         lp = (u_int32_t *)bp;
 982                         for (i = 0; i < nsack; i++) {
 983                                 struct sackblk sack = tp->sackblks[i];
 984                                 *lp++ = htonl(sack.start);
 985                                 *lp++ = htonl(sack.end);
 986                         }
 987                         optlen += sackoptlen;
 988                 }
 989         }
 990
 991         /* Pad TCP options to a 4 byte boundary */
 992         if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
 993                 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
 994                 u_char *bp = (u_char *)opt + optlen;
 995
 996                 optlen += pad;
 997                 while (pad) {
 998                         *bp++ = TCPOPT_EOL;
 999                         pad--;
1000                 }
1001         }
1002
1003         hdrlen += optlen;
1004
1005 #if INET6
1006         if (isipv6)
1007                 ipoptlen = ip6_optlen(tp->t_inpcb);
1008         else
1009 #endif
1010         {
1011                 if (tp_inp_options) {
1012                         ipoptlen = tp_inp_options->m_len -
1013                                 offsetof(struct ipoption, ipopt_list);
1014                 } else
1015                         ipoptlen = 0;
1016         }
1017 #if IPSEC
1018         if (ipsec_bypass == 0)
1019                 ipoptlen += ipsec_hdrsiz_tcp(tp);
1020 #endif
1021
1022         /*
1023          * Adjust data length if insertion of options will
1024          * bump the packet length beyond the t_maxopd length.
1025          * Clear the FIN bit because we cut off the tail of
1026          * the segment.
1027          */
1028         if (len + optlen + ipoptlen > tp->t_maxopd) {
1029                 /*
1030                  * If there is still more to send, don't close the connection.
1031                  */
1032                 flags &= ~TH_FIN;
1033                 len = tp->t_maxopd - optlen - ipoptlen;
1034                 sendalot = 1;
1035         }
1036
1037 /*#ifdef DIAGNOSTIC*/
1038 #if INET6
1039         if (max_linkhdr + hdrlen > MCLBYTES)
1040                 panic("tcphdr too big");
1041 #else
1042         if (max_linkhdr + hdrlen > MHLEN)
1043                 panic("tcphdr too big");
1044 #endif
1045 /*#endif*/
1046
1047         /*
1048          * Grab a header mbuf, attaching a copy of data to
1049          * be transmitted, and initialize the header from
1050          * the template for sends on this connection.
1051          */
1052         if (len) {
1053                 if (tp->t_force && len == 1)
1054                         tcpstat.tcps_sndprobe++;
1055                 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1056                         tcpstat.tcps_sndrexmitpack++;
1057                         tcpstat.tcps_sndrexmitbyte += len;
1058                 } else {
1059                         tcpstat.tcps_sndpack++;
1060                         tcpstat.tcps_sndbyte += len;
1061                 }
1062 #ifdef notyet
1063                 if ((m = m_copypack(so->so_snd.sb_mb, off,
1064                     (int)len, max_linkhdr + hdrlen)) == 0) {
1065                         error = ENOBUFS;
1066                         goto out;
1067                 }
1068                 /*
1069                  * m_copypack left space for our hdr; use it.
1070                  */
1071                 m->m_len += hdrlen;
1072                 m->m_data -= hdrlen;
1073 #else
1074                 /*
1075                  * try to use the new interface that allocates all
1076                  * the necessary mbuf hdrs under 1 mbuf lock and
1077                  * avoids rescanning the socket mbuf list if
1078                  * certain conditions are met.  This routine can't
1079                  * be used in the following cases...
1080                  * 1) the protocol headers exceed the capacity of
1081                  * a single mbuf header's data area (no cluster attached)
1082                  * 2) the length of the data being transmitted plus
1083                  * the protocol headers fits into a single mbuf header's
1084                  * data area (no cluster attached)
1085                  */
1086                 m = NULL;
1087 #if INET6
1088                 if (MHLEN < hdrlen + max_linkhdr) {
1089                         MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1090                         if (m == NULL) {
1091                                 error = ENOBUFS;
1092                                 goto out;
1093                         }
1094                         MCLGET(m, M_DONTWAIT);
1095                         if ((m->m_flags & M_EXT) == 0) {
1096                                 m_freem(m);
1097                                 error = ENOBUFS;
1098                                 goto out;
1099                         }
1100                         m->m_data += max_linkhdr;
1101                         m->m_len = hdrlen;
1102                 }
1103 #endif
1104                 if (len <= MHLEN - hdrlen - max_linkhdr) {
1105                         if (m == NULL) {
1106                                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1107                                 if (m == NULL) {
1108                                         error = ENOBUFS;
1109                                         goto out;
1110                                 }
1111                                 m->m_data += max_linkhdr;
1112                                 m->m_len = hdrlen;
1113                         }
1114                         /* makes sure we still have data left to be sent at this point */
1115                         if (so->so_snd.sb_mb == NULL || off == -1) {
1116                                 if (m != NULL)  m_freem(m);
1117                                 error = 0; /* should we return an error? */
1118                                 goto out;
1119                         }
1120                         m_copydata(so->so_snd.sb_mb, off, (int) len,
1121                             mtod(m, caddr_t) + hdrlen);
1122                         m->m_len += len;
1123                 } else {
1124                         if (m != NULL) {
1125                                 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
1126                                 if (m->m_next == 0) {
1127                                         (void) m_free(m);
1128                                         error = ENOBUFS;
1129                                         goto out;
1130                                 }
1131                         } else {
1132                                 /*
1133                                  * determine whether the mbuf pointer and offset passed back by the 'last' call
1134                                  * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1135                                  * changed (due to an incoming ACK for instance), or the offset into the chain we
1136                                  * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1137                                  * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1138                                  * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1139                                  * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list).
1140                                  * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1141                                  */
1142                                 if (m_head != so->so_snd.sb_mb || last_off != off)
1143                                         m_last = NULL;
1144                                 last_off = off + len;
1145                                 m_head = so->so_snd.sb_mb;
1146
1147                                 /* makes sure we still have data left to be sent at this point */
1148                                 if (m_head == NULL) {
1149                                         error = 0; /* should we return an error? */
1150                                         goto out;
1151                                 }
1152
1153                                 /*
1154                                  * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1155                                  * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1156                                  */
1157                                 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, (int) len, M_DONTWAIT, &m_last, &m_off)) == NULL) {
1158                                         error = ENOBUFS;
1159                                         goto out;
1160                                 }
1161                                 m->m_data += max_linkhdr;
1162                                 m->m_len = hdrlen;
1163                         }
1164                 }
1165 #endif
1166                 /*
1167                  * If we're sending everything we've got, set PUSH.
1168                  * (This will keep happy those implementations which only
1169                  * give data to the user when a buffer fills or
1170                  * a PUSH comes in.)
1171                  */
1172                 if (off + len == so->so_snd.sb_cc)
1173                         flags |= TH_PUSH;
1174         } else {
1175                 if (tp->t_flags & TF_ACKNOW)
1176                         tcpstat.tcps_sndacks++;
1177                 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1178                         tcpstat.tcps_sndctrl++;
1179                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1180                         tcpstat.tcps_sndurg++;
1181                 else
1182                         tcpstat.tcps_sndwinup++;
1183
1184                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1185                 if (m == NULL) {
1186                         error = ENOBUFS;
1187                         goto out;
1188                 }
1189 #if INET6
1190                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1191                     MHLEN >= hdrlen) {
1192                         MH_ALIGN(m, hdrlen);
1193                 } else
1194 #endif
1195                 m->m_data += max_linkhdr;
1196                 m->m_len = hdrlen;
1197         }
1198         m->m_pkthdr.rcvif = 0;
1199 #if CONFIG_MACF_NET
1200         mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
1201 #endif
1202 #if INET6
1203         if (isipv6) {
1204                 ip6 = mtod(m, struct ip6_hdr *);
1205                 th = (struct tcphdr *)(ip6 + 1);
1206                 tcp_fillheaders(tp, ip6, th);
1207         } else
1208 #endif /* INET6 */
1209         {
1210                 ip = mtod(m, struct ip *);
1211                 ipov = (struct ipovly *)ip;
1212                 th = (struct tcphdr *)(ip + 1);
1213                 /* this picks up the pseudo header (w/o the length) */
1214                 tcp_fillheaders(tp, ip, th);
1215                 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1216                         !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1217                         ip->ip_tos = IPTOS_ECN_ECT0;
1218                 }
1219         }
1220
1221         /*
1222          * Fill in fields, remembering maximum advertised
1223          * window for use in delaying messages about window sizes.
1224          * If resending a FIN, be sure not to use a new sequence number.
1225          */
1226         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1227             tp->snd_nxt == tp->snd_max)
1228                 tp->snd_nxt--;
1229         /*
1230          * If we are doing retransmissions, then snd_nxt will
1231          * not reflect the first unsent octet.  For ACK only
1232          * packets, we do not want the sequence number of the
1233          * retransmitted packet, we want the sequence number
1234          * of the next unsent octet.  So, if there is no data
1235          * (and no SYN or FIN), use snd_max instead of snd_nxt
1236          * when filling in ti_seq.  But if we are in persist
1237          * state, snd_max might reflect one byte beyond the
1238          * right edge of the window, so use snd_nxt in that
1239          * case, since we know we aren't doing a retransmission.
1240          * (retransmit and persist are mutually exclusive...)
1241          */
1242         if (sack_rxmit == 0) {
1243                 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
1244                         th->th_seq = htonl(tp->snd_nxt);
1245                 else
1246                         th->th_seq = htonl(tp->snd_max);
1247         } else {
1248                 th->th_seq = htonl(p->rxmit);
1249                 p->rxmit += len;
1250                 tp->sackhint.sack_bytes_rexmit += len;
1251         }
1252         th->th_ack = htonl(tp->rcv_nxt);
1253         tp->last_ack_sent = tp->rcv_nxt;
1254
1255         if (optlen) {
1256                 bcopy(opt, th + 1, optlen);
1257                 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1258         }
1259         th->th_flags = flags;
1260         /*
1261          * Calculate receive window.  Don't shrink window,
1262          * but avoid silly window syndrome.
1263          */
1264         if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg)
1265                 recwin = 0;
1266         if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
1267                 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
1268         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
1269                 if (recwin > (long)slowlink_wsize)
1270                         recwin = slowlink_wsize;
1271                         th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1272         }
1273         else {
1274                 if (recwin > (long)(TCP_MAXWIN << tp->rcv_scale))
1275                         recwin = (long)(TCP_MAXWIN << tp->rcv_scale);
1276                 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1277         }
1278
1279         /*
1280          * Adjust the RXWIN0SENT flag - indicate that we have advertised
1281          * a 0 window.  This may cause the remote transmitter to stall.  This
1282          * flag tells soreceive() to disable delayed acknowledgements when
1283          * draining the buffer.  This can occur if the receiver is attempting
1284          * to read more data than can be buffered prior to transmitting on
1285          * the connection.
1286          */
1287         if (recwin == 0)
1288                 tp->t_flags |= TF_RXWIN0SENT;
1289         else
1290                 tp->t_flags &= ~TF_RXWIN0SENT;
1291         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1292                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1293                 th->th_flags |= TH_URG;
1294         } else
1295                 /*
1296                  * If no urgent pointer to send, then we pull
1297                  * the urgent pointer to the left edge of the send window
1298                  * so that it doesn't drift into the send window on sequence
1299                  * number wraparound.
1300                  */
1301                 tp->snd_up = tp->snd_una;               /* drag it along */
1302
1303         /*
1304          * Put TCP length in extended header, and then
1305          * checksum extended header and data.
1306          */
1307         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1308 #if INET6
1309         if (isipv6)
1310                 /*
1311                  * ip6_plen does not need to be filled in now; it will be filled
1312                  * in by ip6_output.
1313                  */
1314                 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1315                                        sizeof(struct tcphdr) + optlen + len);
1316         else
1317 #endif /* INET6 */
1318         {
1319                 m->m_pkthdr.csum_flags = CSUM_TCP;
1320                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1321                 if (len + optlen)
1322                         th->th_sum = in_addword(th->th_sum,
1323                                 htons((u_short)(optlen + len)));
1324         }
1325
1326         /*
1327          * In transmit state, time the transmission and arrange for
1328          * the retransmit.  In persist state, just set snd_max.
1329          */
1330         if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1331                 tcp_seq startseq = tp->snd_nxt;
1332
1333                 /*
1334                  * Advance snd_nxt over sequence space of this segment.
1335                  */
1336                 if (flags & (TH_SYN|TH_FIN)) {
1337                         if (flags & TH_SYN)
1338                                 tp->snd_nxt++;
1339                         if (flags & TH_FIN) {
1340                                 tp->snd_nxt++;
1341                                 tp->t_flags |= TF_SENTFIN;
1342                         }
1343                 }
1344                 if (sack_rxmit)
1345                         goto timer;
1346                 tp->snd_nxt += len;
1347                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1348                         tp->snd_max = tp->snd_nxt;
1349                         /*
1350                          * Time this transmission if not a retransmission and
1351                          * not currently timing anything.
1352                          */
1353                         if (tp->t_rtttime == 0) {
1354                                 tp->t_rtttime = 1;
1355                                 tp->t_rtseq = startseq;
1356                                 tcpstat.tcps_segstimed++;
1357                         }
1358                 }
1359
1360                 /*
1361                  * Set retransmit timer if not currently set,
1362                  * and not doing an ack or a keep-alive probe.
1363                  * Initial value for retransmit timer is smoothed
1364                  * round-trip time + 2 * round-trip time variance.
1365                  * Initialize shift counter which is used for backoff
1366                  * of retransmit time.
1367                  */
1368 timer:
1369                 if (tp->t_timer[TCPT_REXMT] == 0 &&
1370                     ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1371                         tp->snd_nxt != tp->snd_una)) {
1372                         if (tp->t_timer[TCPT_PERSIST]) {
1373                                 tp->t_timer[TCPT_PERSIST] = 0;
1374                                 tp->t_rxtshift = 0;
1375                         }
1376                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1377                 }
1378         } else {
1379                 /*
1380                  * Persist case, update snd_max but since we are in
1381                  * persist mode (no window) we do not update snd_nxt.
1382                  */
1383                 int xlen = len;
1384                 if (flags & TH_SYN)
1385                         ++xlen;
1386                 if (flags & TH_FIN) {
1387                         ++xlen;
1388                         tp->t_flags |= TF_SENTFIN;
1389                 }
1390                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1391                         tp->snd_max = tp->snd_nxt + len;
1392         }
1393
1394 #if TCPDEBUG
1395         /*
1396          * Trace.
1397          */
1398         if (so_options & SO_DEBUG)
1399                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1400 #endif
1401
1402         /*
1403          * Fill in IP length and desired time to live and
1404          * send to IP level.  There should be a better way
1405          * to handle ttl and tos; we could keep them in
1406          * the template, but need a way to checksum without them.
1407          */
1408         /*
1409          * m->m_pkthdr.len should have been set before cksum calculation,
1410          * because in6_cksum() needs it.
1411          */
1412 #if INET6
1413         if (isipv6) {
1414                 /*
1415                  * we separately set hoplimit for every segment, since the
1416                  * user might want to change the value via setsockopt.
1417                  * Also, desired default hop limit might be changed via
1418                  * Neighbor Discovery.
1419                  */
1420                 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
1421                                                tp->t_inpcb->in6p_route.ro_rt ?
1422                                                tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1423                                                : NULL);
1424
1425                 /* TODO: IPv6 IP6TOS_ECT bit on */
1426 #if IPSEC
1427                 if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
1428                         m_freem(m);
1429                         error = ENOBUFS;
1430                         goto out;
1431                 }
1432 #endif /*IPSEC*/
1433                 m->m_pkthdr.socket_id = socket_id;
1434                 error = ip6_output(m,
1435                             inp6_pktopts,
1436                             &tp->t_inpcb->in6p_route,
1437                             (so_options & SO_DONTROUTE), NULL, NULL, 0);
1438         } else
1439 #endif /* INET6 */
1440     {
1441         ip->ip_len = m->m_pkthdr.len;
1442 #if INET6
1443         if (isipv6)
1444                 ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
1445                                             tp->t_inpcb->in6p_route.ro_rt ?
1446                                             tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1447                                             : NULL);
1448         else
1449 #endif /* INET6 */
1450         ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;   /* XXX */
1451         ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK);      /* XXX */
1452
1453
1454 #if INET6
1455         if (isipv6) {
1456                 KERNEL_DEBUG(DBG_LAYER_BEG,
1457                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1458                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
1459                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
1460                      0,0,0);
1461         }
1462         else
1463 #endif
1464         {
1465                 KERNEL_DEBUG(DBG_LAYER_BEG,
1466                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1467                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
1468                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
1469                      0,0,0);
1470         }
1471
1472         /*
1473          * See if we should do MTU discovery.
1474          * Look at the flag updated on the following criteria:
1475          *      1) Path MTU discovery is authorized by the sysctl
1476          *      2) The route isn't set yet (unlikely but could happen)
1477          *      3) The route is up
1478          *      4) the MTU is not locked (if it is, then discovery has been
1479          *         disabled for that route)
1480          */
1481
1482         if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
1483                 ip->ip_off |= IP_DF;
1484
1485 #if IPSEC
1486         if (ipsec_bypass == 0)
1487                 ipsec_setsocket(m, so);
1488 #endif /*IPSEC*/
1489
1490         /*
1491          * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1492          */
1493         lost = 0;
1494         m->m_pkthdr.socket_id = socket_id;
1495         m->m_nextpkt = NULL;
1496         tp->t_pktlist_sentlen += len;
1497         tp->t_lastchain++;
1498         if (tp->t_pktlist_head != NULL) {
1499                 tp->t_pktlist_tail->m_nextpkt = m;
1500                 tp->t_pktlist_tail = m;
1501         } else {
1502                 packchain_newlist++;
1503                 tp->t_pktlist_head = tp->t_pktlist_tail = m;
1504         }
1505
1506         if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
1507               (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
1508               (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 ||
1509               tp->t_lastchain >= tcp_packet_chaining) {
1510                 error = 0;
1511                 while (!(tp->t_flags & TF_SENDINPROG) &&
1512                     tp->t_pktlist_head != NULL) {
1513                         packetlist = tp->t_pktlist_head;
1514                         packchain_listadd = tp->t_lastchain;
1515                         packchain_sent++;
1516                         lost = tp->t_pktlist_sentlen;
1517                         TCP_PKTLIST_CLEAR(tp);
1518                         tp->t_flags |= TF_SENDINPROG;
1519
1520                         error = tcp_ip_output(so, tp, packetlist,
1521                             packchain_listadd, tp_inp_options,
1522                             (so_options & SO_DONTROUTE));
1523
1524                         tp->t_flags &= ~TF_SENDINPROG;
1525                         if (error) {
1526                                 /*
1527                                  * Take into account the rest of unsent
1528                                  * packets in the packet list for this tcp
1529                                  * into "lost", since we're about to free
1530                                  * the whole list below.
1531                                  */
1532                                 lost += tp->t_pktlist_sentlen;
1533                                 break;
1534                         } else {
1535                                 lost = 0;
1536                         }
1537                 }
1538                 /* tcp was closed while we were in ip; resume close */
1539                 if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
1540                         tp->t_flags &= ~TF_CLOSING;
1541                         (void) tcp_close(tp);
1542                         return (0);
1543                 }
1544         }
1545         else {
1546                 error = 0;
1547                 packchain_looped++;
1548                 tcpstat.tcps_sndtotal++;
1549
1550                 if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1551                         tp->rcv_adv = tp->rcv_nxt + recwin;
1552                 tp->last_ack_sent = tp->rcv_nxt;
1553                 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1554                 goto again;
1555         }
1556    }
1557         if (error) {
1558                 /*
1559                  * Assume that the packets were lost, so back out the
1560                  * sequence number advance, if any.  Note that the "lost"
1561                  * variable represents the amount of user data sent during
1562                  * the recent call to ip_output_list() plus the amount of
1563                  * user data in the packet list for this tcp at the moment.
1564                  */
1565                 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1566                         /*
1567                          * No need to check for TH_FIN here because
1568                          * the TF_SENTFIN flag handles that case.
1569                          */
1570                         if ((flags & TH_SYN) == 0) {
1571                                 if (sack_rxmit) {
1572                                         p->rxmit -= lost;
1573                                         tp->sackhint.sack_bytes_rexmit -= lost;
1574                                 } else
1575                                         tp->snd_nxt -= lost;
1576                         }
1577                 }
1578 out:
1579                 if (tp->t_pktlist_head != NULL)
1580                         m_freem_list(tp->t_pktlist_head);
1581                 TCP_PKTLIST_CLEAR(tp);
1582
1583                 if (error == ENOBUFS) {
1584                         if (!tp->t_timer[TCPT_REXMT] &&
1585                              !tp->t_timer[TCPT_PERSIST])
1586                                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1587                         tcp_quench(tp->t_inpcb, 0);
1588                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1589                         return (0);
1590                 }
1591                 if (error == EMSGSIZE) {
1592                         /*
1593                          * ip_output() will have already fixed the route
1594                          * for us.  tcp_mtudisc() will, as its last action,
1595                          * initiate retransmission, so it is important to
1596                          * not do so here.
1597                          */
1598                         tcp_mtudisc(tp->t_inpcb, 0);
1599                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1600                         return 0;
1601                 }
1602                 if ((error == EHOSTUNREACH || error == ENETDOWN)
1603                     && TCPS_HAVERCVDSYN(tp->t_state)) {
1604                         tp->t_softerror = error;
1605                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1606                         return (0);
1607                 }
1608                 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1609                 return (error);
1610         }
1611
1612         tcpstat.tcps_sndtotal++;
1613
1614         /*
1615          * Data sent (as far as we can tell).
1616          * If this advertises a larger window than any other segment,
1617          * then remember the size of the advertised window.
1618          * Any pending ACK has now been sent.
1619          */
1620         if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1621                 tp->rcv_adv = tp->rcv_nxt + recwin;
1622         tp->last_ack_sent = tp->rcv_nxt;
1623         tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1624
1625         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
1626         if (sendalot && (!tcp_do_newreno || --maxburst))
1627                 goto again;
1628         return (0);
1629 }
1630
1631 static int
1632 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
1633     int cnt, struct mbuf *opt, int flags)
1634 {
1635         int error = 0;
1636         boolean_t chain;
1637         boolean_t unlocked = FALSE;
1638
1639         /* Make sure ACK/DELACK conditions are cleared before
1640          * we unlock the socket.
1641          */
1642
1643         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1644         /*
1645          * If allowed, unlock TCP socket while in IP
1646          * but only if the connection is established and
1647          * if we're not sending from an upcall.
1648          */
1649
1650         if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) &&
1651             (tp->t_state == TCPS_ESTABLISHED)) {
1652                         unlocked = TRUE;
1653                         socket_unlock(so, 0);
1654         }
1655
1656         /*
1657          * Don't send down a chain of packets when:
1658          * - TCP chaining is disabled
1659          * - there is an IPsec rule set
1660          * - there is a non default rule set for the firewall
1661          */
1662
1663         chain = tcp_packet_chaining > 1
1664 #if IPSEC
1665                 && ipsec_bypass
1666 #endif
1667 #if IPFIREWALL
1668                 && (fw_enable == 0 || fw_bypass)
1669 #endif
1670                 ; // I'm important, not extraneous
1671
1672
1673         while (pkt != NULL) {
1674                 struct mbuf *npkt = pkt->m_nextpkt;
1675
1676                 if (!chain) {
1677                         pkt->m_nextpkt = NULL;
1678                         /*
1679                          * If we are not chaining, make sure to set the packet
1680                          * list count to 0 so that IP takes the right path;
1681                          * this is important for cases such as IPSec where a
1682                          * single mbuf might result in multiple mbufs as part
1683                          * of the encapsulation.  If a non-zero count is passed
1684                          * down to IP, the head of the chain might change and
1685                          * we could end up skipping it (thus generating bogus
1686                          * packets).  Fixing it in IP would be desirable, but
1687                          * for now this would do it.
1688                          */
1689                         cnt = 0;
1690                 }
1691 #if CONFIG_FORCE_OUT_IFP
1692                 error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
1693                     flags, 0, tp->t_inpcb->pdp_ifp);
1694 #else
1695                 error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
1696                     flags, 0, NULL);
1697 #endif
1698                 if (chain || error) {
1699                         /*
1700                          * If we sent down a chain then we are done since
1701                          * the callee had taken care of everything; else
1702                          * we need to free the rest of the chain ourselves.
1703                          */
1704                         if (!chain)
1705                                 m_freem_list(npkt);
1706                         break;
1707                 }
1708                 pkt = npkt;
1709         }
1710
1711         if (unlocked)
1712                 socket_lock(so, 0);
1713
1714         return (error);
1715 }
1716
1717 void
1718 tcp_setpersist(tp)
1719         register struct tcpcb *tp;
1720 {
1721         int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1722
1723         if (tp->t_timer[TCPT_REXMT])
1724                 panic("tcp_setpersist: retransmit pending");
1725         /*
1726          * Start/restart persistance timer.
1727          */
1728         TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1729             t * tcp_backoff[tp->t_rxtshift],
1730             TCPTV_PERSMIN, TCPTV_PERSMAX);
1731         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1732                 tp->t_rxtshift++;
1733 }