bsd/netinet/tcp_output.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_output.c        8.4 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #define _IP_VHL
  71
  72
  73 #include <sys/param.h>
  74 #include <sys/systm.h>
  75 #include <sys/kernel.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/domain.h>
  79 #include <sys/protosw.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82
  83 #include <net/route.h>
  84 #include <net/if_var.h>
  85
  86 #include <netinet/in.h>
  87 #include <netinet/in_systm.h>
  88 #include <netinet/in_var.h>
  89 #include <netinet/ip.h>
  90 #include <netinet/in_pcb.h>
  91 #include <netinet/ip_var.h>
  92 #if INET6
  93 #include <netinet6/in6_pcb.h>
  94 #include <netinet/ip6.h>
  95 #include <netinet6/ip6_var.h>
  96 #endif
  97 #include <netinet/tcp.h>
  98 #define TCPOUTFLAGS
  99 #include <netinet/tcp_fsm.h>
 100 #include <netinet/tcp_seq.h>
 101 #include <netinet/tcp_timer.h>
 102 #include <netinet/tcp_var.h>
 103 #include <netinet/tcpip.h>
 104 #if TCPDEBUG
 105 #include <netinet/tcp_debug.h>
 106 #endif
 107 #include <sys/kdebug.h>
 108
 109 #if IPSEC
 110 #include <netinet6/ipsec.h>
 111 #endif /*IPSEC*/
 112
 113 #if CONFIG_MACF_NET
 114 #include <security/mac_framework.h>
 115 #endif /* MAC_SOCKET */
 116
 117 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 1)
 118 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 3)
 119 #define DBG_FNC_TCP_OUTPUT      NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
 120
 121
 122 #ifdef notyet
 123 extern struct mbuf *m_copypack();
 124 #endif
 125
 126 int path_mtu_discovery = 1;
 127 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
 128         &path_mtu_discovery, 1, "Enable Path MTU Discovery");
 129
 130 int ss_fltsz = 1;
 131 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
 132         &ss_fltsz, 1, "Slow start flight size");
 133
 134 int ss_fltsz_local = 8; /* starts with eight segments max */
 135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
 136         &ss_fltsz_local, 1, "Slow start flight size for local networks");
 137
 138 int     tcp_do_newreno = 0;
 139 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
 140         0, "Enable NewReno Algorithms");
 141
 142 int     tcp_ecn_outbound = 0;
 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW, &tcp_ecn_outbound,
 144         0, "Initiate ECN for outbound connections");
 145
 146 int     tcp_ecn_inbound = 0;
 147 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW, &tcp_ecn_inbound,
 148         0, "Allow ECN negotiation for inbound connections");
 149
 150 int     tcp_packet_chaining = 50;
 151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining,
 152         0, "Enable TCP output packet chaining");
 153
 154 int     tcp_output_unlocked = 1;
 155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW, &tcp_output_unlocked,
 156         0, "Unlock TCP when sending packets down to IP");
 157
 158 static long packchain_newlist = 0;
 159 static long packchain_looped = 0;
 160 static long packchain_sent = 0;
 161
 162
 163 /* temporary: for testing */
 164 #if IPSEC
 165 extern int ipsec_bypass;
 166 #endif
 167
 168 extern int slowlink_wsize;      /* window correction for slow links */
 169 extern u_long  route_generation;
 170 extern int fw_enable;           /* firewall check for packet chaining */
 171 extern int fw_bypass;           /* firewall check: disable packet chaining if there is rules */
 172
 173 extern vm_size_t        so_cache_zone_element_size;
 174
 175 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
 176     struct mbuf *, int);
 177
 178 static __inline__ u_int16_t
 179 get_socket_id(struct socket * s)
 180 {
 181         u_int16_t               val;
 182
 183         if (so_cache_zone_element_size == 0) {
 184                 return (0);
 185         }
 186         val = (u_int16_t)(((u_int32_t)s) / so_cache_zone_element_size);
 187         if (val == 0) {
 188                 val = 0xffff;
 189         }
 190         return (val);
 191 }
 192
 193 /*
 194  * Tcp output routine: figure out what should be sent and send it.
 195  *
 196  * Returns:     0                       Success
 197  *              EADDRNOTAVAIL
 198  *              ENOBUFS
 199  *              EMSGSIZE
 200  *              EHOSTUNREACH
 201  *              ENETDOWN
 202  *      ip_output_list:ENOMEM
 203  *      ip_output_list:EADDRNOTAVAIL
 204  *      ip_output_list:ENETUNREACH
 205  *      ip_output_list:EHOSTUNREACH
 206  *      ip_output_list:EACCES
 207  *      ip_output_list:EMSGSIZE
 208  *      ip_output_list:ENOBUFS
 209  *      ip_output_list:???              [ignorable: mostly IPSEC/firewall/DLIL]
 210  *      ip6_output:???                  [IPV6 only]
 211  */
 212 int
 213 tcp_output(struct tcpcb *tp)
 214 {
 215         struct socket *so = tp->t_inpcb->inp_socket;
 216         long len, recwin, sendwin;
 217         int off, flags, error;
 218         register struct mbuf *m;
 219         struct ip *ip = NULL;
 220         register struct ipovly *ipov = NULL;
 221 #if INET6
 222         struct ip6_hdr *ip6 = NULL;
 223 #endif /* INET6 */
 224         register struct tcphdr *th;
 225         u_char opt[TCP_MAXOLEN];
 226         unsigned ipoptlen, optlen, hdrlen;
 227         int idle, sendalot, lost = 0;
 228         int i, sack_rxmit;
 229         int sack_bytes_rxmt;
 230         struct sackhole *p;
 231
 232         int maxburst = TCP_MAXBURST;
 233         int    last_off = 0;
 234         int    m_off;
 235         struct mbuf *m_last = NULL;
 236         struct mbuf *m_head = NULL;
 237         struct mbuf *packetlist = NULL;
 238         struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
 239 #if INET6
 240         int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ;
 241         struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts;
 242 #endif
 243         short packchain_listadd = 0;
 244         u_int16_t       socket_id = get_socket_id(so);
 245         int so_options = so->so_options;
 246         struct rtentry *rt;
 247
 248         /*
 249          * Determine length of data that should be transmitted,
 250          * and flags that will be used.
 251          * If there is some data or critical controls (SYN, RST)
 252          * to send, then transmit; otherwise, investigate further.
 253          */
 254         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 255         if (idle && tp->t_rcvtime >= tp->t_rxtcur) {
 256                 /*
 257                  * We have been idle for "a while" and no acks are
 258                  * expected to clock out any data we send --
 259                  * slow start to get ack "clock" running again.
 260                  *
 261                  * Set the slow-start flight size depending on whether
 262                  * this is a local network or not.
 263                  */
 264                 if (
 265 #if INET6
 266                     (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
 267                     (!isipv6 &&
 268 #endif
 269                      in_localaddr(tp->t_inpcb->inp_faddr)
 270 #if INET6
 271                      )
 272 #endif
 273                     )
 274                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
 275                 else
 276                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
 277         }
 278         tp->t_flags &= ~TF_LASTIDLE;
 279         if (idle) {
 280                 if (tp->t_flags & TF_MORETOCOME) {
 281                         tp->t_flags |= TF_LASTIDLE;
 282                         idle = 0;
 283                 }
 284         }
 285 again:
 286         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 287
 288 #if INET6
 289         if (isipv6) {
 290
 291                 KERNEL_DEBUG(DBG_LAYER_BEG,
 292                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 293                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 294                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
 295                      sendalot,0,0);
 296         }
 297         else
 298 #endif
 299
 300         {
 301                 KERNEL_DEBUG(DBG_LAYER_BEG,
 302                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
 303                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
 304                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
 305                      sendalot,0,0);
 306         /*
 307          * If the route generation id changed, we need to check that our
 308          * local (source) IP address is still valid. If it isn't either
 309          * return error or silently do nothing (assuming the address will
 310          * come back before the TCP connection times out).
 311          */
 312         rt = tp->t_inpcb->inp_route.ro_rt;
 313         if (rt != NULL && rt->generation_id != route_generation) {
 314                 struct ifnet *ifp;
 315
 316                 /* disable multipages at the socket */
 317                 somultipages(so, FALSE);
 318
 319                 /* check that the source address is still valid */
 320                 if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == 0) {
 321
 322                         if (tp->t_state >= TCPS_CLOSE_WAIT) {
 323                                 tcp_drop(tp, EADDRNOTAVAIL);
 324                                 return(EADDRNOTAVAIL);
 325                         }
 326
 327                         /* set Retransmit  timer if it wasn't set
 328                          * reset Persist timer and shift register as the
  329                          * advertised peer window may not be valid anymore
 330                          */
 331
 332                         if (!tp->t_timer[TCPT_REXMT]) {
 333                                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 334                                 if (tp->t_timer[TCPT_PERSIST]) {
 335                                         tp->t_timer[TCPT_PERSIST] = 0;
 336                                         tp->t_rxtshift = 0;
 337                                 }
 338                         }
 339
 340                         if (tp->t_pktlist_head != NULL)
 341                                 m_freem_list(tp->t_pktlist_head);
 342                         TCP_PKTLIST_CLEAR(tp);
 343
 344                         /* drop connection if source address isn't available */
 345                         if (so->so_flags & SOF_NOADDRAVAIL) {
 346                                 tcp_drop(tp, EADDRNOTAVAIL);
 347                                 return(EADDRNOTAVAIL);
 348                         }
 349                         else
 350                                 return(0); /* silently ignore, keep data in socket: address may be back */
 351                 }
 352
 353                 /*
 354                  * Address is still valid; check for multipages capability
 355                  * again in case the outgoing interface has changed.
 356                  */
 357                 lck_mtx_lock(rt_mtx);
 358                 rt = tp->t_inpcb->inp_route.ro_rt;
 359                 if (rt != NULL && (ifp = rt->rt_ifp) != NULL)
 360                         somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
 361                 if (rt != NULL && rt->generation_id != route_generation)
 362                         rt->generation_id = route_generation;
 363                 /*
 364                  * See if we should do MTU discovery. Don't do it if:
 365                  *      1) it is disabled via the sysctl
 366                  *      2) the route isn't up
 367                  *      3) the MTU is locked (if it is, then discovery has been
 368                  *         disabled)
 369                  */
 370
 371                 if (!path_mtu_discovery || ((rt != NULL) &&
 372                     (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
 373                         tp->t_flags &= ~TF_PMTUD;
 374                 else
 375                         tp->t_flags |= TF_PMTUD;
 376
 377                 lck_mtx_unlock(rt_mtx);
 378         }
 379         }
 380
 381         /*
 382          * If we've recently taken a timeout, snd_max will be greater than
 383          * snd_nxt.  There may be SACK information that allows us to avoid
 384          * resending already delivered data.  Adjust snd_nxt accordingly.
 385          */
 386         if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
 387                 tcp_sack_adjust(tp);
 388         sendalot = 0;
 389         off = tp->snd_nxt - tp->snd_una;
 390         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 391
 392         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
 393                 sendwin = min(sendwin, slowlink_wsize);
 394
 395         flags = tcp_outflags[tp->t_state];
 396         /*
 397          * Send any SACK-generated retransmissions.  If we're explicitly trying
 398          * to send out new data (when sendalot is 1), bypass this function.
 399          * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 400          * we're replacing a (future) new transmission with a retransmission
 401          * now, and we previously incremented snd_cwnd in tcp_input().
 402          */
 403         /*
 404          * Still in sack recovery , reset rxmit flag to zero.
 405          */
 406         sack_rxmit = 0;
 407         sack_bytes_rxmt = 0;
 408         len = 0;
 409         p = NULL;
 410         if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
 411             (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 412                 long cwin;
 413
 414                 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 415                 if (cwin < 0)
 416                         cwin = 0;
 417                 /* Do not retransmit SACK segments beyond snd_recover */
 418                 if (SEQ_GT(p->end, tp->snd_recover)) {
 419                         /*
 420                          * (At least) part of sack hole extends beyond
 421                          * snd_recover. Check to see if we can rexmit data
 422                          * for this hole.
 423                          */
 424                         if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 425                                 /*
 426                                  * Can't rexmit any more data for this hole.
 427                                  * That data will be rexmitted in the next
 428                                  * sack recovery episode, when snd_recover
 429                                  * moves past p->rxmit.
 430                                  */
 431                                 p = NULL;
 432                                 goto after_sack_rexmit;
 433                         } else
 434                                 /* Can rexmit part of the current hole */
 435                                 len = ((long)ulmin(cwin,
 436                                                    tp->snd_recover - p->rxmit));
 437                 } else
 438                         len = ((long)ulmin(cwin, p->end - p->rxmit));
 439                 off = p->rxmit - tp->snd_una;
 440                 if (len > 0) {
 441                         sack_rxmit = 1;
 442                         sendalot = 1;
 443                         tcpstat.tcps_sack_rexmits++;
 444                         tcpstat.tcps_sack_rexmit_bytes +=
 445                             min(len, tp->t_maxseg);
 446                 }
 447         }
 448 after_sack_rexmit:
 449         /*
 450          * Get standard flags, and add SYN or FIN if requested by 'hidden'
 451          * state flags.
 452          */
 453         if (tp->t_flags & TF_NEEDFIN)
 454                 flags |= TH_FIN;
 455         if (tp->t_flags & TF_NEEDSYN)
 456                 flags |= TH_SYN;
 457
 458         /*
 459          * If in persist timeout with window of 0, send 1 byte.
 460          * Otherwise, if window is small but nonzero
 461          * and timer expired, we will send what we can
 462          * and go to transmit state.
 463          */
 464         if (tp->t_force) {
 465                 if (sendwin == 0) {
 466                         /*
 467                          * If we still have some data to send, then
 468                          * clear the FIN bit.  Usually this would
 469                          * happen below when it realizes that we
 470                          * aren't sending all the data.  However,
 471                          * if we have exactly 1 byte of unsent data,
 472                          * then it won't clear the FIN bit below,
 473                          * and if we are in persist state, we wind
 474                          * up sending the packet without recording
 475                          * that we sent the FIN bit.
 476                          *
 477                          * We can't just blindly clear the FIN bit,
 478                          * because if we don't have any more data
 479                          * to send then the probe will be the FIN
 480                          * itself.
 481                          */
 482                         if (off < so->so_snd.sb_cc)
 483                                 flags &= ~TH_FIN;
 484                         sendwin = 1;
 485                 } else {
 486                         tp->t_timer[TCPT_PERSIST] = 0;
 487                         tp->t_rxtshift = 0;
 488                 }
 489         }
 490
 491         /*
 492          * If snd_nxt == snd_max and we have transmitted a FIN, the
 493          * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 494          * a negative length.  This can also occur when TCP opens up
 495          * its congestion window while receiving additional duplicate
 496          * acks after fast-retransmit because TCP will reset snd_nxt
 497          * to snd_max after the fast-retransmit.
 498          *
 499          * In the normal retransmit-FIN-only case, however, snd_nxt will
 500          * be set to snd_una, the offset will be 0, and the length may
 501          * wind up 0.
 502          *
 503          * If sack_rxmit is true we are retransmitting from the scoreboard
 504          * in which case len is already set.
 505          */
 506         if (sack_rxmit == 0) {
 507                 if (sack_bytes_rxmt == 0)
 508                         len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 509                 else {
 510                         long cwin;
 511
 512                         /*
 513                          * We are inside of a SACK recovery episode and are
 514                          * sending new data, having retransmitted all the
 515                          * data possible in the scoreboard.
 516                          */
 517                         len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
 518                                - off);
 519                         /*
 520                          * Don't remove this (len > 0) check !
 521                          * We explicitly check for len > 0 here (although it
 522                          * isn't really necessary), to work around a gcc
 523                          * optimization issue - to force gcc to compute
 524                          * len above. Without this check, the computation
 525                          * of len is bungled by the optimizer.
 526                          */
 527                         if (len > 0) {
 528                                 cwin = tp->snd_cwnd -
 529                                         (tp->snd_nxt - tp->sack_newdata) -
 530                                         sack_bytes_rxmt;
 531                                 if (cwin < 0)
 532                                         cwin = 0;
 533                                 len = lmin(len, cwin);
 534                         }
 535                 }
 536         }
 537
 538         /*
 539          * Lop off SYN bit if it has already been sent.  However, if this
 540          * is SYN-SENT state and if segment contains data and if we don't
 541          * know that foreign host supports TAO, suppress sending segment.
 542          */
 543         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 544                 flags &= ~TH_SYN;
 545                 off--, len++;
 546                 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
 547                         while (!(tp->t_flags & TF_SENDINPROG) &&
 548                             tp->t_pktlist_head != NULL) {
 549                                 packetlist = tp->t_pktlist_head;
 550                                 packchain_listadd = tp->t_lastchain;
 551                                 packchain_sent++;
 552                                 TCP_PKTLIST_CLEAR(tp);
 553                                 tp->t_flags |= TF_SENDINPROG;
 554
 555                                 error = tcp_ip_output(so, tp, packetlist,
 556                                     packchain_listadd, tp_inp_options,
 557                                     (so_options & SO_DONTROUTE));
 558
 559                                 tp->t_flags &= ~TF_SENDINPROG;
 560                         }
 561                         /* tcp was closed while we were in ip; resume close */
 562                         if ((tp->t_flags &
 563                             (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 564                                 tp->t_flags &= ~TF_CLOSING;
 565                                 (void) tcp_close(tp);
 566                         }
 567                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
 568                             0,0,0,0,0);
 569                         return 0;
 570                 }
 571         }
 572
 573         /*
 574          * Be careful not to send data and/or FIN on SYN segments.
 575          * This measure is needed to prevent interoperability problems
 576          * with not fully conformant TCP implementations.
 577          */
 578         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 579                 len = 0;
 580                 flags &= ~TH_FIN;
 581         }
 582
 583         if (len < 0) {
 584                 /*
 585                  * If FIN has been sent but not acked,
 586                  * but we haven't been called to retransmit,
 587                  * len will be < 0.  Otherwise, window shrank
 588                  * after we sent into it.  If window shrank to 0,
 589                  * cancel pending retransmit, pull snd_nxt back
 590                  * to (closed) window, and set the persist timer
 591                  * if it isn't already going.  If the window didn't
 592                  * close completely, just wait for an ACK.
 593                  */
 594                 len = 0;
 595                 if (sendwin == 0) {
 596                         tp->t_timer[TCPT_REXMT] = 0;
 597                         tp->t_rxtshift = 0;
 598                         tp->snd_nxt = tp->snd_una;
 599                         if (tp->t_timer[TCPT_PERSIST] == 0)
 600                                 tcp_setpersist(tp);
 601                 }
 602         }
 603
 604         /*
 605          * len will be >= 0 after this point.  Truncate to the maximum
 606          * segment length and ensure that FIN is removed if the length
 607          * no longer contains the last data byte.
 608          */
 609         if (len > tp->t_maxseg) {
 610                 len = tp->t_maxseg;
 611                 sendalot = 1;
 612         }
 613         if (sack_rxmit) {
 614                 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 615                         flags &= ~TH_FIN;
 616         } else {
 617                 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 618                         flags &= ~TH_FIN;
 619         }
 620
 621         recwin = tcp_sbspace(tp);
 622
 623         /*
 624          * Sender silly window avoidance.   We transmit under the following
 625          * conditions when len is non-zero:
 626          *
 627          *      - We have a full segment
 628          *      - This is the last buffer in a write()/send() and we are
 629          *        either idle or running NODELAY
 630          *      - we've timed out (e.g. persist timer)
  631          *      - we have more than 1/2 the maximum send window's worth of
  632          *        data (receiver may have limited the window size)
 633          *      - we need to retransmit
 634          */
 635         if (len) {
 636                 if (len == tp->t_maxseg) {
 637                         tp->t_flags |= TF_MAXSEGSNT;
 638                         goto send;
 639                 }
 640                 if (!(tp->t_flags & TF_MORETOCOME) &&
 641                     (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) &&
 642                     (tp->t_flags & TF_NOPUSH) == 0 &&
 643                     len + off >= so->so_snd.sb_cc) {
 644                         tp->t_flags &= ~TF_MAXSEGSNT;
 645                         goto send;
 646                 }
 647                 if (tp->t_force) {
 648                         tp->t_flags &= ~TF_MAXSEGSNT;
 649                         goto send;
 650                 }
 651                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 652                         tp->t_flags &= ~TF_MAXSEGSNT;
 653                         goto send;
 654                 }
 655                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 656                         tp->t_flags &= ~TF_MAXSEGSNT;
 657                         goto send;
 658                 }
 659                 if (sack_rxmit)
 660                         goto send;
 661         }
 662
 663         /*
 664          * Compare available window to amount of window
 665          * known to peer (as advertised window less
 666          * next expected input).  If the difference is at least two
 667          * max size segments, or at least 50% of the maximum possible
 668          * window, then want to send a window update to peer.
 669          * Skip this if the connection is in T/TCP half-open state.
 670          */
 671         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
 672                 /*
 673                  * "adv" is the amount we can increase the window,
 674                  * taking into account that we are limited by
 675                  * TCP_MAXWIN << tp->rcv_scale.
 676                  */
 677                 long adv = lmin(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
 678                         (tp->rcv_adv - tp->rcv_nxt);
 679
 680                 if (adv >= (long) (2 * tp->t_maxseg))
 681                         goto send;
 682                 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
 683                         goto send;
 684         }
 685
 686         /*
 687          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 688          * is also a catch-all for the retransmit timer timeout case.
 689          */
 690         if (tp->t_flags & TF_ACKNOW)
 691                 goto send;
 692         if ((flags & TH_RST) ||
 693             ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 694                 goto send;
 695         if (SEQ_GT(tp->snd_up, tp->snd_una))
 696                 goto send;
 697         /*
 698          * If our state indicates that FIN should be sent
 699          * and we have not yet done so, then we need to send.
 700          */
 701         if (flags & TH_FIN &&
 702             ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 703                 goto send;
 704         /*
 705          * In SACK, it is possible for tcp_output to fail to send a segment
 706          * after the retransmission timer has been turned off.  Make sure
 707          * that the retransmission timer is set.
 708          */
 709         if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
 710                 tp->t_timer[TCPT_REXMT] == 0 &&
 711             tp->t_timer[TCPT_PERSIST] == 0) {
 712                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 713                         goto just_return;
 714         }
 715         /*
 716          * TCP window updates are not reliable, rather a polling protocol
 717          * using ``persist'' packets is used to ensure receipt of window
 718          * updates.  The three ``states'' for the output side are:
 719          *      idle                    not doing retransmits or persists
 720          *      persisting              to move a small or zero window
 721          *      (re)transmitting        and thereby not persisting
 722          *
 723          * tp->t_timer[TCPT_PERSIST]
 724          *      is set when we are in persist state.
 725          * tp->t_force
 726          *      is set when we are called to send a persist packet.
 727          * tp->t_timer[TCPT_REXMT]
 728          *      is set when we are retransmitting
 729          * The output side is idle when both timers are zero.
 730          *
 731          * If send window is too small, there is data to transmit, and no
 732          * retransmit or persist is pending, then go to persist state.
 733          * If nothing happens soon, send when timer expires:
 734          * if window is nonzero, transmit what we can,
 735          * otherwise force out a byte.
 736          */
 737         if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 738             tp->t_timer[TCPT_PERSIST] == 0) {
 739                 tp->t_rxtshift = 0;
 740                 tcp_setpersist(tp);
 741         }
 742 just_return:
 743         /*
 744          * If there is no reason to send a segment, just return,
 745          * but if there are packets left in the packet list, send them now.
 746          */
 747         while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) {
 748                 packetlist = tp->t_pktlist_head;
 749                 packchain_listadd = tp->t_lastchain;
 750                 packchain_sent++;
 751                 TCP_PKTLIST_CLEAR(tp);
 752                 tp->t_flags |= TF_SENDINPROG;
 753
 754                 error = tcp_ip_output(so, tp, packetlist, packchain_listadd,
 755                     tp_inp_options, (so_options & SO_DONTROUTE));
 756
 757                 tp->t_flags &= ~TF_SENDINPROG;
 758         }
 759         /* tcp was closed while we were in ip; resume close */
 760         if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 761                 tp->t_flags &= ~TF_CLOSING;
 762                 (void) tcp_close(tp);
 763         }
 764         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 765         return (0);
 766
 767 send:
 768         /*
 769          * Before ESTABLISHED, force sending of initial options
 770          * unless TCP set not to do any options.
 771          * NOTE: we assume that the IP/TCP header plus TCP options
 772          * always fit in a single mbuf, leaving room for a maximum
 773          * link header, i.e.
 774          *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 775          */
 776         optlen = 0;
 777 #if INET6
 778         if (isipv6)
 779                 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 780         else
 781 #endif
 782         hdrlen = sizeof (struct tcpiphdr);
 783         if (flags & TH_SYN) {
 784                 tp->snd_nxt = tp->iss;
 785                 if ((tp->t_flags & TF_NOOPT) == 0) {
 786                         u_short mss;
 787
 788                         opt[0] = TCPOPT_MAXSEG;
 789                         opt[1] = TCPOLEN_MAXSEG;
 790                         mss = htons((u_short) tcp_mssopt(tp));
 791                         (void)memcpy(opt + 2, &mss, sizeof(mss));
 792                         optlen = TCPOLEN_MAXSEG;
 793
 794                         if ((tp->t_flags & TF_REQ_SCALE) &&
 795                             ((flags & TH_ACK) == 0 ||
 796                             (tp->t_flags & TF_RCVD_SCALE))) {
 797                                 *((u_int32_t *)(opt + optlen)) = htonl(
 798                                         TCPOPT_NOP << 24 |
 799                                         TCPOPT_WINDOW << 16 |
 800                                         TCPOLEN_WINDOW << 8 |
 801                                         tp->request_r_scale);
 802                                 optlen += 4;
 803                         }
 804                 }
 805
 806         }
 807
 808         /*
 809           RFC 3168 states that:
 810            - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
 811            to handle the TCP ECE flag, even if you also later send a
 812            non-ECN-setup SYN/SYN-ACK.
 813            - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
 814            the ip ECT flag.
 815
 816            It is not clear how the ECE flag would ever be set if you never
 817            set the IP ECT flag on outbound packets. All the same, we use
 818            the TE_SETUPSENT to indicate that we have committed to handling
 819            the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
 820            whether or not we should set the IP ECT flag on outbound packets.
 821          */
 822         /*
 823          * For a SYN-ACK, send an ECN setup SYN-ACK
 824          */
 825         if (tcp_ecn_inbound && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 826                 if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
 827                         if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 828                                 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
 829                                 flags |= TH_ECE;
 830
 831                                 /*
 832                                  * Record that we sent the ECN-setup and default to
 833                                  * setting IP ECT.
 834                                  */
 835                                 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 836                         }
 837                         else {
 838                                 /*
 839                                  * We sent an ECN-setup SYN-ACK but it was dropped.
 840                                  * Fallback to non-ECN-setup SYN-ACK and clear flag
 841                                  * that to indicate we should not send data with IP ECT set.
 842                                  *
 843                                  * Pretend we didn't receive an ECN-setup SYN.
 844                                  */
 845                                 tp->ecn_flags &= ~TE_SETUPRECEIVED;
 846                         }
 847                 }
 848         }
 849         else if (tcp_ecn_outbound && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
 850                 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
 851                         /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
 852                         flags |= (TH_ECE | TH_CWR);
 853
 854                         /*
 855                          * Record that we sent the ECN-setup and default to
 856                          * setting IP ECT.
 857                          */
 858                         tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
 859                 }
 860                 else {
 861                         /*
 862                          * We sent an ECN-setup SYN but it was dropped.
 863                          * Fall back to no ECN and clear flag indicating
 864                          * we should send data with IP ECT set.
 865                          */
 866                         tp->ecn_flags &= ~TE_SENDIPECT;
 867                 }
 868         }
 869
 870         /*
 871          * Check if we should set the TCP CWR flag.
 872          * CWR flag is sent when we reduced the congestion window because
 873          * we received a TCP ECE or we performed a fast retransmit. We
 874          * never set the CWR flag on retransmitted packets. We only set
 875          * the CWR flag on data packets. Pure acks don't have this set.
 876          */
 877         if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
 878                 !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 879                 flags |= TH_CWR;
 880                 tp->ecn_flags &= ~TE_SENDCWR;
 881         }
 882
 883         /*
 884          * Check if we should set the TCP ECE flag.
 885          */
 886         if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
 887                 flags |= TH_ECE;
 888         }
 889
 890         /*
 891          * Send a timestamp and echo-reply if this is a SYN and our side
 892          * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
 893          * and our peer have sent timestamps in our SYN's.
 894          */
 895         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 896             (flags & TH_RST) == 0 &&
 897             ((flags & TH_ACK) == 0 ||
 898              (tp->t_flags & TF_RCVD_TSTMP))) {
 899                 u_int32_t *lp = (u_int32_t *)(opt + optlen);
 900
 901                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 902                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 903                 *lp++ = htonl(tcp_now);
 904                 *lp   = htonl(tp->ts_recent);
 905                 optlen += TCPOLEN_TSTAMP_APPA;
 906         }
 907
 908         if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) {
 909                 /*
 910                  * Tack on the SACK permitted option *last*.
 911                  * And do padding of options after tacking this on.
 912                  * This is because if MSS, TS, WinScale and Signatures are
 913                  * all present, we have just 2 bytes left for the SACK
 914                  * permitted option, which is just enough.
 915                  */
 916                 /*
 917                  * If this is the first SYN of connection (not a SYN
 918                  * ACK), include SACK permitted option.  If this is a
 919                  * SYN ACK, include SACK permitted option if peer has
 920                  * already done so. This is only for active connect,
 921                  * since the syncache takes care of the passive connect.
 922                  */
 923                 if ((flags & TH_SYN) &&
 924                     (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
 925                         u_char *bp;
 926                         bp = (u_char *)opt + optlen;
 927
 928                         *bp++ = TCPOPT_SACK_PERMITTED;
 929                         *bp++ = TCPOLEN_SACK_PERMITTED;
 930                         optlen += TCPOLEN_SACK_PERMITTED;
 931                 }
 932
 933                 /*
 934                  * Send SACKs if necessary.  This should be the last
 935                  * option processed.  Only as many SACKs are sent as
 936                  * are permitted by the maximum options size.
 937                  *
 938                  * In general, SACK blocks consume 8*n+2 bytes.
 939                  * So a full size SACK blocks option is 34 bytes
 940                  * (to generate 4 SACK blocks).  At a minimum,
 941                  * we need 10 bytes (to generate 1 SACK block).
 942                  * If TCP Timestamps (12 bytes) and TCP Signatures
 943                  * (18 bytes) are both present, we'll just have
 944                  * 10 bytes for SACK options 40 - (12 + 18).
 945                  */
 946                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 947                     (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
 948                     MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
 949                         int nsack, sackoptlen, padlen;
 950                         u_char *bp = (u_char *)opt + optlen;
 951                         u_int32_t *lp;
 952
 953                         nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
 954                         nsack = min(nsack, tp->rcv_numsacks);
 955                         sackoptlen = (2 + nsack * TCPOLEN_SACK);
 956
 957                         /*
 958                          * First we need to pad options so that the
 959                          * SACK blocks can start at a 4-byte boundary
 960                          * (sack option and length are at a 2 byte offset).
 961                          */
 962                         padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
 963                         optlen += padlen;
 964                         while (padlen-- > 0)
 965                                 *bp++ = TCPOPT_NOP;
 966
 967                         tcpstat.tcps_sack_send_blocks++;
 968                         *bp++ = TCPOPT_SACK;
 969                         *bp++ = sackoptlen;
 970                         lp = (u_int32_t *)bp;
 971                         for (i = 0; i < nsack; i++) {
 972                                 struct sackblk sack = tp->sackblks[i];
 973                                 *lp++ = htonl(sack.start);
 974                                 *lp++ = htonl(sack.end);
 975                         }
 976                         optlen += sackoptlen;
 977                 }
 978         }
 979
 980         /* Pad TCP options to a 4 byte boundary */
 981         if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
 982                 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
 983                 u_char *bp = (u_char *)opt + optlen;
 984
 985                 optlen += pad;
 986                 while (pad) {
 987                         *bp++ = TCPOPT_EOL;
 988                         pad--;
 989                 }
 990         }
 991
 992         hdrlen += optlen;
 993
 994 #if INET6
 995         if (isipv6)
 996                 ipoptlen = ip6_optlen(tp->t_inpcb);
 997         else
 998 #endif
 999         {
1000                 if (tp_inp_options) {
1001                         ipoptlen = tp_inp_options->m_len -
1002                                 offsetof(struct ipoption, ipopt_list);
1003                 } else
1004                         ipoptlen = 0;
1005         }
1006 #if IPSEC
1007         if (ipsec_bypass == 0)
1008                 ipoptlen += ipsec_hdrsiz_tcp(tp);
1009 #endif
1010
1011         /*
1012          * Adjust data length if insertion of options will
1013          * bump the packet length beyond the t_maxopd length.
1014          * Clear the FIN bit because we cut off the tail of
1015          * the segment.
1016          */
1017         if (len + optlen + ipoptlen > tp->t_maxopd) {
1018                 /*
1019                  * If there is still more to send, don't close the connection.
1020                  */
1021                 flags &= ~TH_FIN;
1022                 len = tp->t_maxopd - optlen - ipoptlen;
1023                 sendalot = 1;
1024         }
1025
1026 /*#ifdef DIAGNOSTIC*/
1027 #if INET6
1028         if (max_linkhdr + hdrlen > MCLBYTES)
1029                 panic("tcphdr too big");
1030 #else
1031         if (max_linkhdr + hdrlen > MHLEN)
1032                 panic("tcphdr too big");
1033 #endif
1034 /*#endif*/
1035
1036         /*
1037          * Grab a header mbuf, attaching a copy of data to
1038          * be transmitted, and initialize the header from
1039          * the template for sends on this connection.
1040          */
1041         if (len) {
1042                 if (tp->t_force && len == 1)
1043                         tcpstat.tcps_sndprobe++;
1044                 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1045                         tcpstat.tcps_sndrexmitpack++;
1046                         tcpstat.tcps_sndrexmitbyte += len;
1047                 } else {
1048                         tcpstat.tcps_sndpack++;
1049                         tcpstat.tcps_sndbyte += len;
1050                 }
1051 #ifdef notyet
1052                 if ((m = m_copypack(so->so_snd.sb_mb, off,
1053                     (int)len, max_linkhdr + hdrlen)) == 0) {
1054                         error = ENOBUFS;
1055                         goto out;
1056                 }
1057                 /*
1058                  * m_copypack left space for our hdr; use it.
1059                  */
1060                 m->m_len += hdrlen;
1061                 m->m_data -= hdrlen;
1062 #else
1063                 /*
1064                  * try to use the new interface that allocates all
1065                  * the necessary mbuf hdrs under 1 mbuf lock and
1066                  * avoids rescanning the socket mbuf list if
1067                  * certain conditions are met.  This routine can't
1068                  * be used in the following cases...
1069                  * 1) the protocol headers exceed the capacity of
1070                  * a single mbuf header's data area (no cluster attached)
1071                  * 2) the length of the data being transmitted plus
1072                  * the protocol headers fits into a single mbuf header's
1073                  * data area (no cluster attached)
1074                  */
1075                 m = NULL;
1076 #if INET6
1077                 if (MHLEN < hdrlen + max_linkhdr) {
1078                         MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1079                         if (m == NULL) {
1080                                 error = ENOBUFS;
1081                                 goto out;
1082                         }
1083                         MCLGET(m, M_DONTWAIT);
1084                         if ((m->m_flags & M_EXT) == 0) {
1085                                 m_freem(m);
1086                                 error = ENOBUFS;
1087                                 goto out;
1088                         }
1089                         m->m_data += max_linkhdr;
1090                         m->m_len = hdrlen;
1091                 }
1092 #endif
1093                 if (len <= MHLEN - hdrlen - max_linkhdr) {
1094                         if (m == NULL) {
1095                                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1096                                 if (m == NULL) {
1097                                         error = ENOBUFS;
1098                                         goto out;
1099                                 }
1100                                 m->m_data += max_linkhdr;
1101                                 m->m_len = hdrlen;
1102                         }
1103                         /* makes sure we still have data left to be sent at this point */
1104                         if (so->so_snd.sb_mb == NULL || off == -1) {
1105                                 if (m != NULL)  m_freem(m);
1106                                 error = 0; /* should we return an error? */
1107                                 goto out;
1108                         }
1109                         m_copydata(so->so_snd.sb_mb, off, (int) len,
1110                             mtod(m, caddr_t) + hdrlen);
1111                         m->m_len += len;
1112                 } else {
1113                         if (m != NULL) {
1114                                 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
1115                                 if (m->m_next == 0) {
1116                                         (void) m_free(m);
1117                                         error = ENOBUFS;
1118                                         goto out;
1119                                 }
1120                         } else {
1121                                 /*
1122                                  * determine whether the mbuf pointer and offset passed back by the 'last' call
1123                                  * to m_copym_with_hdrs are still valid... if the head of the socket chain has
1124                                  * changed (due to an incoming ACK for instance), or the offset into the chain we
1125                                  * just computed is different from the one last returned by m_copym_with_hdrs (perhaps
1126                                  * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and
1127                                  * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow
1128                                  * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list).
1129                                  * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism.
1130                                  */
1131                                 if (m_head != so->so_snd.sb_mb || last_off != off)
1132                                         m_last = NULL;
1133                                 last_off = off + len;
1134                                 m_head = so->so_snd.sb_mb;
1135
1136                                 /* makes sure we still have data left to be sent at this point */
1137                                 if (m_head == NULL) {
1138                                         error = 0; /* should we return an error? */
1139                                         goto out;
1140                                 }
1141
1142                                 /*
1143                                  * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that
1144                                  * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not
1145                                  */
1146                                 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, (int) len, M_DONTWAIT, &m_last, &m_off)) == NULL) {
1147                                         error = ENOBUFS;
1148                                         goto out;
1149                                 }
1150                                 m->m_data += max_linkhdr;
1151                                 m->m_len = hdrlen;
1152                         }
1153                 }
1154 #endif
1155                 /*
1156                  * If we're sending everything we've got, set PUSH.
1157                  * (This will keep happy those implementations which only
1158                  * give data to the user when a buffer fills or
1159                  * a PUSH comes in.)
1160                  */
1161                 if (off + len == so->so_snd.sb_cc)
1162                         flags |= TH_PUSH;
1163         } else {
1164                 if (tp->t_flags & TF_ACKNOW)
1165                         tcpstat.tcps_sndacks++;
1166                 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1167                         tcpstat.tcps_sndctrl++;
1168                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1169                         tcpstat.tcps_sndurg++;
1170                 else
1171                         tcpstat.tcps_sndwinup++;
1172
1173                 MGETHDR(m, M_DONTWAIT, MT_HEADER);      /* MAC-OK */
1174                 if (m == NULL) {
1175                         error = ENOBUFS;
1176                         goto out;
1177                 }
1178 #if INET6
1179                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1180                     MHLEN >= hdrlen) {
1181                         MH_ALIGN(m, hdrlen);
1182                 } else
1183 #endif
1184                 m->m_data += max_linkhdr;
1185                 m->m_len = hdrlen;
1186         }
1187         m->m_pkthdr.rcvif = 0;
1188 #if CONFIG_MACF_NET
1189         mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
1190 #endif
1191 #if INET6
1192         if (isipv6) {
1193                 ip6 = mtod(m, struct ip6_hdr *);
1194                 th = (struct tcphdr *)(ip6 + 1);
1195                 tcp_fillheaders(tp, ip6, th);
1196         } else
1197 #endif /* INET6 */
1198         {
1199                 ip = mtod(m, struct ip *);
1200                 ipov = (struct ipovly *)ip;
1201                 th = (struct tcphdr *)(ip + 1);
1202                 /* this picks up the pseudo header (w/o the length) */
1203                 tcp_fillheaders(tp, ip, th);
1204                 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1205                         !SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1206                         ip->ip_tos = IPTOS_ECN_ECT0;
1207                 }
1208         }
1209
1210         /*
1211          * Fill in fields, remembering maximum advertised
1212          * window for use in delaying messages about window sizes.
1213          * If resending a FIN, be sure not to use a new sequence number.
1214          */
1215         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1216             tp->snd_nxt == tp->snd_max)
1217                 tp->snd_nxt--;
1218         /*
1219          * If we are doing retransmissions, then snd_nxt will
1220          * not reflect the first unsent octet.  For ACK only
1221          * packets, we do not want the sequence number of the
1222          * retransmitted packet, we want the sequence number
1223          * of the next unsent octet.  So, if there is no data
1224          * (and no SYN or FIN), use snd_max instead of snd_nxt
1225          * when filling in ti_seq.  But if we are in persist
1226          * state, snd_max might reflect one byte beyond the
1227          * right edge of the window, so use snd_nxt in that
1228          * case, since we know we aren't doing a retransmission.
1229          * (retransmit and persist are mutually exclusive...)
1230          */
1231         if (sack_rxmit == 0) {
1232                 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
1233                         th->th_seq = htonl(tp->snd_nxt);
1234                 else
1235                         th->th_seq = htonl(tp->snd_max);
1236         } else {
1237                 th->th_seq = htonl(p->rxmit);
1238                 p->rxmit += len;
1239                 tp->sackhint.sack_bytes_rexmit += len;
1240         }
1241         th->th_ack = htonl(tp->rcv_nxt);
1242         if (optlen) {
1243                 bcopy(opt, th + 1, optlen);
1244                 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1245         }
1246         th->th_flags = flags;
1247         /*
1248          * Calculate receive window.  Don't shrink window,
1249          * but avoid silly window syndrome.
1250          */
1251         if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg)
1252                 recwin = 0;
1253         if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
1254                 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
1255         if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
1256                 if (recwin > (long)slowlink_wsize)
1257                         recwin = slowlink_wsize;
1258                         th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1259         }
1260         else {
1261                 if (recwin > (long)(TCP_MAXWIN << tp->rcv_scale))
1262                         recwin = (long)(TCP_MAXWIN << tp->rcv_scale);
1263                 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1264         }
1265
1266         /*
1267          * Adjust the RXWIN0SENT flag - indicate that we have advertised
1268          * a 0 window.  This may cause the remote transmitter to stall.  This
1269          * flag tells soreceive() to disable delayed acknowledgements when
1270          * draining the buffer.  This can occur if the receiver is attempting
1271          * to read more data than can be buffered prior to transmitting on
1272          * the connection.
1273          */
1274         if (recwin == 0)
1275                 tp->t_flags |= TF_RXWIN0SENT;
1276         else
1277                 tp->t_flags &= ~TF_RXWIN0SENT;
1278         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1279                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1280                 th->th_flags |= TH_URG;
1281         } else
1282                 /*
1283                  * If no urgent pointer to send, then we pull
1284                  * the urgent pointer to the left edge of the send window
1285                  * so that it doesn't drift into the send window on sequence
1286                  * number wraparound.
1287                  */
1288                 tp->snd_up = tp->snd_una;               /* drag it along */
1289
1290         /*
1291          * Put TCP length in extended header, and then
1292          * checksum extended header and data.
1293          */
1294         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1295 #if INET6
1296         if (isipv6)
1297                 /*
1298                  * ip6_plen does not need to be filled in now; it will be
1299                  * filled in by ip6_output.
1300                  */
1301                 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1302                                        sizeof(struct tcphdr) + optlen + len);
1303         else
1304 #endif /* INET6 */
1305         {
1306                 m->m_pkthdr.csum_flags = CSUM_TCP;
1307                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1308                 if (len + optlen)
1309                         th->th_sum = in_addword(th->th_sum,
1310                                 htons((u_short)(optlen + len)));
1311         }
1312
1313         /*
1314          * In transmit state, time the transmission and arrange for
1315          * the retransmit.  In persist state, just set snd_max.
1316          */
1317         if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1318                 tcp_seq startseq = tp->snd_nxt;
1319
1320                 /*
1321                  * Advance snd_nxt over sequence space of this segment.
1322                  */
1323                 if (flags & (TH_SYN|TH_FIN)) {
1324                         if (flags & TH_SYN)
1325                                 tp->snd_nxt++;
1326                         if (flags & TH_FIN) {
1327                                 tp->snd_nxt++;
1328                                 tp->t_flags |= TF_SENTFIN;
1329                         }
1330                 }
1331                 if (sack_rxmit)
1332                         goto timer;
1333                 tp->snd_nxt += len;
1334                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1335                         tp->snd_max = tp->snd_nxt;
1336                         /*
1337                          * Time this transmission if not a retransmission and
1338                          * not currently timing anything.
1339                          */
1340                         if (tp->t_rtttime == 0) {
1341                                 tp->t_rtttime = 1;
1342                                 tp->t_rtseq = startseq;
1343                                 tcpstat.tcps_segstimed++;
1344                         }
1345                 }
1346
1347                 /*
1348                  * Set retransmit timer if not currently set,
1349                  * and not doing an ack or a keep-alive probe.
1350                  * Initial value for retransmit timer is smoothed
1351                  * round-trip time + 2 * round-trip time variance.
1352                  * Initialize shift counter which is used for backoff
1353                  * of retransmit time.
1354                  */
1355 timer:
1356                 if (tp->t_timer[TCPT_REXMT] == 0 &&
1357                     ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1358                         tp->snd_nxt != tp->snd_una)) {
1359                         if (tp->t_timer[TCPT_PERSIST]) {
1360                                 tp->t_timer[TCPT_PERSIST] = 0;
1361                                 tp->t_rxtshift = 0;
1362                         }
1363                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1364                 }
1365         } else {
1366                 /*
1367                  * Persist case, update snd_max but since we are in
1368                  * persist mode (no window) we do not update snd_nxt.
1369                  */
1370                 int xlen = len;
1371                 if (flags & TH_SYN)
1372                         ++xlen;
1373                 if (flags & TH_FIN) {
1374                         ++xlen;
1375                         tp->t_flags |= TF_SENTFIN;
1376                 }
1377                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1378                         tp->snd_max = tp->snd_nxt + len;
1379         }
1380
1381 #if TCPDEBUG
1382         /*
1383          * Trace.
1384          */
1385         if (so_options & SO_DEBUG)
1386                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1387 #endif
1388
1389         /*
1390          * Fill in IP length and desired time to live and
1391          * send to IP level.  There should be a better way
1392          * to handle ttl and tos; we could keep them in
1393          * the template, but need a way to checksum without them.
1394          */
1395         /*
1396          * m->m_pkthdr.len should have been set before cksum calculation,
1397          * because in6_cksum() needs it.
1398          */
1399 #if INET6
1400         if (isipv6) {
1401                 /*
1402                  * we separately set hoplimit for every segment, since the
1403                  * user might want to change the value via setsockopt.
1404                  * Also, desired default hop limit might be changed via
1405                  * Neighbor Discovery.
1406                  */
1407                 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
1408                                                tp->t_inpcb->in6p_route.ro_rt ?
1409                                                tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1410                                                : NULL);
1411
1412                 /* TODO: IPv6 IP6TOS_ECT bit on */
1413 #if IPSEC
1414                 if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
1415                         m_freem(m);
1416                         error = ENOBUFS;
1417                         goto out;
1418                 }
1419 #endif /*IPSEC*/
1420                 m->m_pkthdr.socket_id = socket_id;
1421                 error = ip6_output(m,
1422                             inp6_pktopts,
1423                             &tp->t_inpcb->in6p_route,
1424                             (so_options & SO_DONTROUTE), NULL, NULL, 0);
1425         } else
1426 #endif /* INET6 */
1427     {
1428         ip->ip_len = m->m_pkthdr.len;
1429 #if INET6
1430         if (isipv6)
1431                 ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
1432                                             tp->t_inpcb->in6p_route.ro_rt ?
1433                                             tp->t_inpcb->in6p_route.ro_rt->rt_ifp
1434                                             : NULL);
1435         else
1436 #endif /* INET6 */
1437         ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;   /* XXX */
1438         ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK);      /* XXX */
1439
1440
1441 #if INET6
1442         if (isipv6) {
1443                 KERNEL_DEBUG(DBG_LAYER_BEG,
1444                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1445                      (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
1446                       (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
1447                      0,0,0);
1448         }
1449         else
1450 #endif
1451         {
1452                 KERNEL_DEBUG(DBG_LAYER_BEG,
1453                      ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
1454                      (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
1455                       (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
1456                      0,0,0);
1457         }
1458
1459         /*
1460          * See if we should do MTU discovery.
1461          * Look at the flag updated on the following criteria:
1462          *      1) Path MTU discovery is authorized by the sysctl
1463          *      2) The route isn't set yet (unlikely but could happen)
1464          *      3) The route is up
1465          *      4) the MTU is not locked (if it is, then discovery has been
1466          *         disabled for that route)
1467          */
1468
1469         if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
1470                 ip->ip_off |= IP_DF;
1471
1472 #if IPSEC
1473         if (ipsec_bypass == 0)
1474                 ipsec_setsocket(m, so);
1475 #endif /*IPSEC*/
1476
1477         /*
1478          * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
1479          */
1480         lost = 0;
1481         m->m_pkthdr.socket_id = socket_id;
1482         m->m_nextpkt = NULL;
1483         tp->t_pktlist_sentlen += len;
1484         tp->t_lastchain++;
1485         if (tp->t_pktlist_head != NULL) {
1486                 tp->t_pktlist_tail->m_nextpkt = m;
1487                 tp->t_pktlist_tail = m;
1488         } else {
1489                 packchain_newlist++;
1490                 tp->t_pktlist_head = tp->t_pktlist_tail = m;
1491         }
1492
1493         if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
1494               (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
1495               (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 ||
1496               tp->t_lastchain >= tcp_packet_chaining) {
1497                 error = 0;
1498                 while (!(tp->t_flags & TF_SENDINPROG) &&
1499                     tp->t_pktlist_head != NULL) {
1500                         packetlist = tp->t_pktlist_head;
1501                         packchain_listadd = tp->t_lastchain;
1502                         packchain_sent++;
1503                         lost = tp->t_pktlist_sentlen;
1504                         TCP_PKTLIST_CLEAR(tp);
1505                         tp->t_flags |= TF_SENDINPROG;
1506
1507                         error = tcp_ip_output(so, tp, packetlist,
1508                             packchain_listadd, tp_inp_options,
1509                             (so_options & SO_DONTROUTE));
1510
1511                         tp->t_flags &= ~TF_SENDINPROG;
1512                         if (error) {
1513                                 /*
1514                                  * Take into account the rest of unsent
1515                                  * packets in the packet list for this tcp
1516                                  * into "lost", since we're about to free
1517                                  * the whole list below.
1518                                  */
1519                                 lost += tp->t_pktlist_sentlen;
1520                                 break;
1521                         } else {
1522                                 lost = 0;
1523                         }
1524                 }
1525                 /* tcp was closed while we were in ip; resume close */
1526                 if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
1527                         tp->t_flags &= ~TF_CLOSING;
1528                         (void) tcp_close(tp);
1529                         return (0);
1530                 }
1531         }
1532         else {
1533                 error = 0;
1534                 packchain_looped++;
1535                 tcpstat.tcps_sndtotal++;
1536
1537                 if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1538                         tp->rcv_adv = tp->rcv_nxt + recwin;
1539                 tp->last_ack_sent = tp->rcv_nxt;
1540                 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1541                 goto again;
1542         }
1543    }
1544         if (error) {
1545                 /*
1546                  * Assume that the packets were lost, so back out the
1547                  * sequence number advance, if any.  Note that the "lost"
1548                  * variable represents the amount of user data sent during
1549                  * the recent call to ip_output_list() plus the amount of
1550                  * user data in the packet list for this tcp at the moment.
1551                  */
1552                 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
1553                         /*
1554                          * No need to check for TH_FIN here because
1555                          * the TF_SENTFIN flag handles that case.
1556                          */
1557                         if ((flags & TH_SYN) == 0) {
1558                                 if (sack_rxmit) {
1559                                         p->rxmit -= lost;
1560                                         tp->sackhint.sack_bytes_rexmit -= lost;
1561                                 } else
1562                                         tp->snd_nxt -= lost;
1563                         }
1564                 }
1565 out:
1566                 if (tp->t_pktlist_head != NULL)
1567                         m_freem_list(tp->t_pktlist_head);
1568                 TCP_PKTLIST_CLEAR(tp);
1569
1570                 if (error == ENOBUFS) {
1571                         if (!tp->t_timer[TCPT_REXMT] &&
1572                              !tp->t_timer[TCPT_PERSIST])
1573                                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1574                         tcp_quench(tp->t_inpcb, 0);
1575                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1576                         return (0);
1577                 }
1578                 if (error == EMSGSIZE) {
1579                         /*
1580                          * ip_output() will have already fixed the route
1581                          * for us.  tcp_mtudisc() will, as its last action,
1582                          * initiate retransmission, so it is important to
1583                          * not do so here.
1584                          */
1585                         tcp_mtudisc(tp->t_inpcb, 0);
1586                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1587                         return 0;
1588                 }
1589                 if ((error == EHOSTUNREACH || error == ENETDOWN)
1590                     && TCPS_HAVERCVDSYN(tp->t_state)) {
1591                         tp->t_softerror = error;
1592                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1593                         return (0);
1594                 }
1595                 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1596                 return (error);
1597         }
1598
1599         tcpstat.tcps_sndtotal++;
1600
1601         /*
1602          * Data sent (as far as we can tell).
1603          * If this advertises a larger window than any other segment,
1604          * then remember the size of the advertised window.
1605          * Any pending ACK has now been sent.
1606          */
1607         if (recwin > 0 && SEQ_GT(tp->rcv_nxt+recwin, tp->rcv_adv))
1608                 tp->rcv_adv = tp->rcv_nxt + recwin;
1609         tp->last_ack_sent = tp->rcv_nxt;
1610         tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1611
1612         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
1613         if (sendalot && (!tcp_do_newreno || --maxburst))
1614                 goto again;
1615         return (0);
1616 }
1617
1618 static int
1619 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
1620     int cnt, struct mbuf *opt, int flags)
1621 {
1622         int error = 0;
1623         boolean_t chain;
1624         boolean_t unlocked = FALSE;
1625
1626         /*
1627          * If allowed, unlock TCP socket while in IP
1628          * but only if the connection is established and
1629          * if we're not sending from an upcall.
1630          */
1631
1632         if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) &&
1633             (tp->t_state == TCPS_ESTABLISHED)) {
1634                         unlocked = TRUE;
1635                         socket_unlock(so, 0);
1636         }
1637
1638         /*
1639          * Don't send down a chain of packets when:
1640          * - TCP chaining is disabled
1641          * - there is an IPsec rule set
1642          * - there is a non default rule set for the firewall
1643          */
1644
1645         chain = tcp_packet_chaining > 1 &&
1646 #if IPSEC
1647                 ipsec_bypass &&
1648 #endif
1649                 (fw_enable == 0 || fw_bypass);
1650
1651         while (pkt != NULL) {
1652                 struct mbuf *npkt = pkt->m_nextpkt;
1653
1654                 if (!chain) {
1655                         pkt->m_nextpkt = NULL;
1656                         /*
1657                          * If we are not chaining, make sure to set the packet
1658                          * list count to 0 so that IP takes the right path;
1659                          * this is important for cases such as IPSec where a
1660                          * single mbuf might result in multiple mbufs as part
1661                          * of the encapsulation.  If a non-zero count is passed
1662                          * down to IP, the head of the chain might change and
1663                          * we could end up skipping it (thus generating bogus
1664                          * packets).  Fixing it in IP would be desirable, but
1665                          * for now this would do it.
1666                          */
1667                         cnt = 0;
1668                 }
1669 #if CONFIG_FORCE_OUT_IFP
1670                 error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
1671                     flags, 0, tp->t_inpcb->pdp_ifp);
1672 #else
1673                 error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
1674                     flags, 0, NULL);
1675 #endif
1676                 if (chain || error) {
1677                         /*
1678                          * If we sent down a chain then we are done since
1679                          * the callee had taken care of everything; else
1680                          * we need to free the rest of the chain ourselves.
1681                          */
1682                         if (!chain)
1683                                 m_freem_list(npkt);
1684                         break;
1685                 }
1686                 pkt = npkt;
1687         }
1688
1689         if (unlocked)
1690                 socket_lock(so, 0);
1691
1692         return (error);
1693 }
1694
1695 void
1696 tcp_setpersist(tp)
1697         register struct tcpcb *tp;
1698 {
1699         int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1700
1701         if (tp->t_timer[TCPT_REXMT])
1702                 panic("tcp_setpersist: retransmit pending");
1703         /*
1704          * Start/restart persistance timer.
1705          */
1706         TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1707             t * tcp_backoff[tp->t_rxtshift],
1708             TCPTV_PERSMIN, TCPTV_PERSMAX);
1709         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1710                 tp->t_rxtshift++;
1711 }