bsd/netinet/tcp_output.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  24  *      The Regents of the University of California.  All rights reserved.
  25  *
  26  * Redistribution and use in source and binary forms, with or without
  27  * modification, are permitted provided that the following conditions
  28  * are met:
  29  * 1. Redistributions of source code must retain the above copyright
  30  *    notice, this list of conditions and the following disclaimer.
  31  * 2. Redistributions in binary form must reproduce the above copyright
  32  *    notice, this list of conditions and the following disclaimer in the
  33  *    documentation and/or other materials provided with the distribution.
  34  * 3. All advertising materials mentioning features or use of this software
  35  *    must display the following acknowledgement:
  36  *      This product includes software developed by the University of
  37  *      California, Berkeley and its contributors.
  38  * 4. Neither the name of the University nor the names of its contributors
  39  *    may be used to endorse or promote products derived from this software
  40  *    without specific prior written permission.
  41  *
  42  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  52  * SUCH DAMAGE.
  53  *
  54  *      @(#)tcp_output.c        8.4 (Berkeley) 5/24/95
  55  */
  56
  57 #if ISFB31
  58 #include "opt_tcpdebug.h"
  59 #endif
  60 #define _IP_VHL
  61
  62 #include <stddef.h>
  63
  64 #include <sys/param.h>
  65 #include <sys/systm.h>
  66 #include <sys/mbuf.h>
  67 #include <sys/domain.h>
  68 #include <sys/protosw.h>
  69 #include <sys/socket.h>
  70 #include <sys/socketvar.h>
  71
  72 #include <net/route.h>
  73
  74 #include <netinet/in.h>
  75 #include <netinet/in_systm.h>
  76 #include <netinet/ip.h>
  77 #include <netinet/ip_var.h>
  78 #if INET6
  79 #include <netinet/ip6.h>
  80 #include <netinet/ip_var.h>
  81 #include <netinet6/ip6_var.h>
  82 #endif
  83 #include <netinet/in_pcb.h>
  84 #include <netinet/tcp.h>
  85 #define TCPOUTFLAGS
  86 #include <netinet/tcp_fsm.h>
  87 #include <netinet/tcp_seq.h>
  88 #include <netinet/tcp_timer.h>
  89 #include <netinet/tcp_var.h>
  90 #include <netinet/tcpip.h>
  91 #if TCPDEBUG
  92 #include <netinet/tcp_debug.h>
  93 #endif
  94 #include <sys/kdebug.h>
   95 /* Trace codes for KERNEL_DEBUG(); tcp_output() logs DBG_FNC_TCP_OUTPUT and DBG_LAYER_BEG on entry. */
   96 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 1)
   97 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 3)
   98 #define DBG_FNC_TCP_OUTPUT      NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
  99
 100
 101 #ifdef notyet
 102 extern struct mbuf *m_copypack();
 103 #endif
 104
 105
 106 /*
 107  * Tcp output routine: figure out what should be sent and send it.
 108  */
 109 int
 110 tcp_output(tp)
 111         register struct tcpcb *tp;
 112 {
 113         register struct socket *so = tp->t_inpcb->inp_socket;
 114         register long len, win;
 115         int off, flags, error;
 116         register struct mbuf *m;
 117         struct ip *ip = NULL;
 118         struct ipovly *ipov = NULL;
 119 #if INET6
 120         struct ip6_hdr *ip6 = NULL;
 121 #endif /* INET6 */
 122         struct tcphdr *th;
 123         u_char opt[TCP_MAXOLEN];
 124         unsigned ipoptlen, optlen, hdrlen;
 125         int idle, sendalot;
 126         struct rmxp_tao *taop;
 127         struct rmxp_tao tao_noncached;
 128 #if INET6
 129         int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
 130 #endif
 131
 132         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 133         KERNEL_DEBUG(DBG_LAYER_BEG,
 134                      ((tp->t_template->th_dport << 16) | tp->t_template->th_sport),
 135                      (((tp->t_template->th_src.s_addr & 0xffff) << 16) |
 136                       (tp->t_template->th_dst.s_addr & 0xffff)),
 137                      0,0,0);
 138
 139         /*
 140          * Determine length of data that should be transmitted,
 141          * and flags that will be used.
 142          * If there is some data or critical controls (SYN, RST)
 143          * to send, then transmit; otherwise, investigate further.
 144          */
 145         idle = (tp->snd_max == tp->snd_una);
 146         if (idle && tp->t_idle >= tp->t_rxtcur)
 147                 /*
 148                  * We have been idle for "a while" and no acks are
 149                  * expected to clock out any data we send --
 150                  * slow start to get ack "clock" running again.
 151                  */
 152                 tp->snd_cwnd = tp->t_maxseg;
 153 again:
 154         sendalot = 0;
 155         off = tp->snd_nxt - tp->snd_una;
 156         win = min(tp->snd_wnd, tp->snd_cwnd);
 157
 158         flags = tcp_outflags[tp->t_state];
 159         /*
 160          * Get standard flags, and add SYN or FIN if requested by 'hidden'
 161          * state flags.
 162          */
 163         if (tp->t_flags & TF_NEEDFIN)
 164                 flags |= TH_FIN;
 165         if (tp->t_flags & TF_NEEDSYN)
 166                 flags |= TH_SYN;
 167
 168         /*
 169          * If in persist timeout with window of 0, send 1 byte.
 170          * Otherwise, if window is small but nonzero
 171          * and timer expired, we will send what we can
 172          * and go to transmit state.
 173          */
 174         if (tp->t_force) {
 175                 if (win == 0) {
 176                         /*
 177                          * If we still have some data to send, then
 178                          * clear the FIN bit.  Usually this would
 179                          * happen below when it realizes that we
 180                          * aren't sending all the data.  However,
 181                          * if we have exactly 1 byte of unsent data,
 182                          * then it won't clear the FIN bit below,
 183                          * and if we are in persist state, we wind
 184                          * up sending the packet without recording
 185                          * that we sent the FIN bit.
 186                          *
 187                          * We can't just blindly clear the FIN bit,
 188                          * because if we don't have any more data
 189                          * to send then the probe will be the FIN
 190                          * itself.
 191                          */
 192                         if (off < so->so_snd.sb_cc)
 193                                 flags &= ~TH_FIN;
 194                         win = 1;
 195                 } else {
 196                         tp->t_timer[TCPT_PERSIST] = 0;
 197                         tp->t_rxtshift = 0;
 198                 }
 199         }
 200
 201         len = (long)ulmin(so->so_snd.sb_cc, win) - off;
 202
 203         if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
 204                 taop = &tao_noncached;
 205                 bzero(taop, sizeof(*taop));
 206         }
 207
 208         /*
 209          * Lop off SYN bit if it has already been sent.  However, if this
 210          * is SYN-SENT state and if segment contains data and if we don't
 211          * know that foreign host supports TAO, suppress sending segment.
 212          */
 213         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 214                 flags &= ~TH_SYN;
 215                 off--, len++;
 216                 if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
 217                     taop->tao_ccsent == 0) {
 218                   KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 219                   return 0;
 220                 }
 221         }
 222
 223         /*
 224          * Be careful not to send data and/or FIN on SYN segments
 225          * in cases when no CC option will be sent.
 226          * This measure is needed to prevent interoperability problems
 227          * with not fully conformant TCP implementations.
 228          */
 229         if ((flags & TH_SYN) &&
 230             ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
 231              ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
 232                 len = 0;
 233                 flags &= ~TH_FIN;
 234         }
 235
 236         if (len < 0) {
 237                 /*
 238                  * If FIN has been sent but not acked,
 239                  * but we haven't been called to retransmit,
 240                  * len will be -1.  Otherwise, window shrank
 241                  * after we sent into it.  If window shrank to 0,
 242                  * cancel pending retransmit, pull snd_nxt back
 243                  * to (closed) window, and set the persist timer
 244                  * if it isn't already going.  If the window didn't
 245                  * close completely, just wait for an ACK.
 246                  */
 247                 len = 0;
 248                 if (win == 0) {
 249                         tp->t_timer[TCPT_REXMT] = 0;
 250                         tp->t_rxtshift = 0;
 251                         tp->snd_nxt = tp->snd_una;
 252                         if (tp->t_timer[TCPT_PERSIST] == 0)
 253                                 tcp_setpersist(tp);
 254                 }
 255         }
 256         if (len > tp->t_maxseg) {
 257                 len = tp->t_maxseg;
 258                 sendalot = 1;
 259         }
 260         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 261                 flags &= ~TH_FIN;
 262
 263         win = sbspace(&so->so_rcv);
 264
 265         /*
  266          * Sender silly window avoidance.  If we have a full
  267          * maximum-size segment, or the connection is idle (or
  268          * TF_NODELAY is set) and we can send all queued data,
  269          * or we are forced, do it; otherwise don't bother.
 270          * If peer's buffer is tiny, then send
 271          * when window is at least half open.
 272          * If retransmitting (possibly after persist timer forced us
 273          * to send into a small window), then must resend.
 274          */
 275         if (len) {
 276                 if (len == tp->t_maxseg)
 277                         goto send;
 278                 if (!(tp->t_flags & TF_MORETOCOME) &&
 279                     (idle || tp->t_flags & TF_NODELAY) &&
 280                     (tp->t_flags & TF_NOPUSH) == 0 &&
 281                     len + off >= so->so_snd.sb_cc)
 282                         goto send;
 283                 if (tp->t_force)
 284                         goto send;
 285                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 286                         goto send;
 287                 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 288                         goto send;
 289         }
 290
 291         /*
 292          * Compare available window to amount of window
 293          * known to peer (as advertised window less
 294          * next expected input).  If the difference is at least two
 295          * max size segments, or at least 50% of the maximum possible
 296          * window, then want to send a window update to peer.
 297          */
 298         if (win > 0) {
 299                 /*
 300                  * "adv" is the amount we can increase the window,
 301                  * taking into account that we are limited by
 302                  * TCP_MAXWIN << tp->rcv_scale.
 303                  */
 304                 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
 305                         (tp->rcv_adv - tp->rcv_nxt);
 306
 307                 if (adv >= (long) (2 * tp->t_maxseg))
 308                         goto send;
 309                 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
 310                         goto send;
 311         }
 312
 313         /*
 314          * Send if we owe peer an ACK.
 315          */
 316         if (tp->t_flags & TF_ACKNOW)
 317                 goto send;
 318         if ((flags & TH_RST) ||
 319             ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 320                 goto send;
 321         if (SEQ_GT(tp->snd_up, tp->snd_una))
 322                 goto send;
 323         /*
 324          * If our state indicates that FIN should be sent
 325          * and we have not yet done so, or we're retransmitting the FIN,
 326          * then we need to send.
 327          */
 328         if (flags & TH_FIN &&
 329             ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 330                 goto send;
 331
 332         /*
 333          * TCP window updates are not reliable, rather a polling protocol
  334          * using ``persist'' packets is used to ensure receipt of window
 335          * updates.  The three ``states'' for the output side are:
 336          *      idle                    not doing retransmits or persists
 337          *      persisting              to move a small or zero window
 338          *      (re)transmitting        and thereby not persisting
 339          *
 340          * tp->t_timer[TCPT_PERSIST]
 341          *      is set when we are in persist state.
 342          * tp->t_force
 343          *      is set when we are called to send a persist packet.
 344          * tp->t_timer[TCPT_REXMT]
 345          *      is set when we are retransmitting
 346          * The output side is idle when both timers are zero.
 347          *
 348          * If send window is too small, there is data to transmit, and no
 349          * retransmit or persist is pending, then go to persist state.
 350          * If nothing happens soon, send when timer expires:
 351          * if window is nonzero, transmit what we can,
 352          * otherwise force out a byte.
 353          */
 354         if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 355             tp->t_timer[TCPT_PERSIST] == 0) {
 356                 tp->t_rxtshift = 0;
 357                 tcp_setpersist(tp);
 358         }
 359
 360         /*
 361          * No reason to send a segment, just return.
 362          */
 363         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 364         return (0);
 365
 366 send:
 367         /*
 368          * Before ESTABLISHED, force sending of initial options
 369          * unless TCP set not to do any options.
 370          * NOTE: we assume that the IP/TCP header plus TCP options
 371          * always fit in a single mbuf, leaving room for a maximum
 372          * link header, i.e.
 373          *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
 374          */
 375         optlen = 0;
 376 #if INET6
 377         if (isipv6)
 378                 hdrlen = sizeof (struct tcpip6hdr);
 379         else
 380 #endif
 381         hdrlen = sizeof (struct tcpiphdr);
 382         if (flags & TH_SYN) {
 383                 tp->snd_nxt = tp->iss;
 384                 if ((tp->t_flags & TF_NOOPT) == 0) {
 385                         u_short mss;
 386
 387                         opt[0] = TCPOPT_MAXSEG;
 388                         opt[1] = TCPOLEN_MAXSEG;
 389                         mss = htons((u_short) tcp_mssopt(tp, isipv6));
 390                         (void)memcpy(opt + 2, &mss, sizeof(mss));
 391                         optlen = TCPOLEN_MAXSEG;
 392
 393                         if ((tp->t_flags & TF_REQ_SCALE) &&
 394                             ((flags & TH_ACK) == 0 ||
 395                             (tp->t_flags & TF_RCVD_SCALE))) {
 396                                 *((u_int32_t *)(opt + optlen)) = htonl(
 397                                         TCPOPT_NOP << 24 |
 398                                         TCPOPT_WINDOW << 16 |
 399                                         TCPOLEN_WINDOW << 8 |
 400                                         tp->request_r_scale);
 401                                 optlen += 4;
 402                         }
 403                 }
 404         }
 405
 406         /*
 407          * Send a timestamp and echo-reply if this is a SYN and our side
 408          * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
 409          * and our peer have sent timestamps in our SYN's.
 410          */
 411         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 412             (flags & TH_RST) == 0 &&
 413             ((flags & TH_ACK) == 0 ||
 414              (tp->t_flags & TF_RCVD_TSTMP))) {
 415                 u_int32_t *lp = (u_int32_t *)(opt + optlen);
 416
 417                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 418                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 419                 *lp++ = htonl(tcp_now);
 420                 *lp   = htonl(tp->ts_recent);
 421                 optlen += TCPOLEN_TSTAMP_APPA;
 422         }
 423
 424         /*
 425          * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
 426          * options are allowed (!TF_NOOPT) and it's not a RST.
 427          */
 428         if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
 429              (flags & TH_RST) == 0) {
 430                 switch (flags & (TH_SYN|TH_ACK)) {
 431                 /*
 432                  * This is a normal ACK, send CC if we received CC before
 433                  * from our peer.
 434                  */
 435                 case TH_ACK:
 436                         if (!(tp->t_flags & TF_RCVD_CC))
 437                                 break;
 438                         /*FALLTHROUGH*/
 439
 440                 /*
 441                  * We can only get here in T/TCP's SYN_SENT* state, when
  442                  * we're sending a non-SYN segment without waiting for
 443                  * the ACK of our SYN.  A check above assures that we only
 444                  * do this if our peer understands T/TCP.
 445                  */
 446                 case 0:
 447                         opt[optlen++] = TCPOPT_NOP;
 448                         opt[optlen++] = TCPOPT_NOP;
 449                         opt[optlen++] = TCPOPT_CC;
 450                         opt[optlen++] = TCPOLEN_CC;
 451                         *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
 452
 453                         optlen += 4;
 454                         break;
 455
 456                 /*
 457                  * This is our initial SYN, check whether we have to use
 458                  * CC or CC.new.
 459                  */
 460                 case TH_SYN:
 461                         opt[optlen++] = TCPOPT_NOP;
 462                         opt[optlen++] = TCPOPT_NOP;
 463                         opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
 464                                                 TCPOPT_CCNEW : TCPOPT_CC;
 465                         opt[optlen++] = TCPOLEN_CC;
 466                         *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
 467                         optlen += 4;
 468                         break;
 469
 470                 /*
 471                  * This is a SYN,ACK; send CC and CC.echo if we received
 472                  * CC from our peer.
 473                  */
 474                 case (TH_SYN|TH_ACK):
 475                         if (tp->t_flags & TF_RCVD_CC) {
 476                                 opt[optlen++] = TCPOPT_NOP;
 477                                 opt[optlen++] = TCPOPT_NOP;
 478                                 opt[optlen++] = TCPOPT_CC;
 479                                 opt[optlen++] = TCPOLEN_CC;
 480                                 *(u_int32_t *)&opt[optlen] =
 481                                         htonl(tp->cc_send);
 482                                 optlen += 4;
 483                                 opt[optlen++] = TCPOPT_NOP;
 484                                 opt[optlen++] = TCPOPT_NOP;
 485                                 opt[optlen++] = TCPOPT_CCECHO;
 486                                 opt[optlen++] = TCPOLEN_CC;
 487                                 *(u_int32_t *)&opt[optlen] =
 488                                         htonl(tp->cc_recv);
 489                                 optlen += 4;
 490                         }
 491                         break;
 492                 }
 493         }
 494
 495         hdrlen += optlen;
 496 #if INET6
 497         if (isipv6)
 498                 ipoptlen = ip6_optlen(tp->t_inpcb);
 499         else
 500 #endif
 501         if (tp->t_inpcb->inp_options) {
 502                 ipoptlen = tp->t_inpcb->inp_options->m_len -
 503                                 offsetof(struct ipoption, ipopt_list);
 504         } else {
 505                 ipoptlen = 0;
 506         }
 507 #if IPSEC
 508 #if INET6
 509         ipoptlen += ipsec_hdrsiz_tcp(tp, isipv6);
 510 #else
 511         ipoptlen += ipsec_hdrsiz_tcp(tp, 0);
 512 #endif
 513 #endif
 514
 515         /*
 516          * Adjust data length if insertion of options will
 517          * bump the packet length beyond the t_maxopd length.
 518          * Clear the FIN bit because we cut off the tail of
 519          * the segment.
 520          */
 521         if (len + optlen + ipoptlen > tp->t_maxopd) {
 522                 /*
 523                  * If there is still more to send, don't close the connection.
 524                  */
 525                 flags &= ~TH_FIN;
 526                 len = tp->t_maxopd - optlen - ipoptlen;
 527                 sendalot = 1;
 528         }
 529
 530 /*#ifdef DIAGNOSTIC*/
 531         if (max_linkhdr + hdrlen > MHLEN)
 532                 panic("tcphdr too big");
 533 /*#endif*/
 534
 535         /*
 536          * Grab a header mbuf, attaching a copy of data to
 537          * be transmitted, and initialize the header from
 538          * the template for sends on this connection.
 539          */
 540         if (len) {
 541                 if (tp->t_force && len == 1)
 542                         tcpstat.tcps_sndprobe++;
 543                 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 544                         tcpstat.tcps_sndrexmitpack++;
 545                         tcpstat.tcps_sndrexmitbyte += len;
 546                 } else {
 547                         tcpstat.tcps_sndpack++;
 548                         tcpstat.tcps_sndbyte += len;
 549                 }
 550 #ifdef notyet
 551                 if ((m = m_copypack(so->so_snd.sb_mb, off,
 552                     (int)len, max_linkhdr + hdrlen)) == 0) {
 553                         error = ENOBUFS;
 554                         goto out;
 555                 }
 556                 /*
 557                  * m_copypack left space for our hdr; use it.
 558                  */
 559                 m->m_len += hdrlen;
 560                 m->m_data -= hdrlen;
 561 #else
 562                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
 563                 if (m == NULL) {
 564                         error = ENOBUFS;
 565                         goto out;
 566                 }
 567 #if INET6
 568                 if (MHLEN < hdrlen + max_linkhdr) {
 569                         MCLGET(m, M_DONTWAIT);
 570                         if ((m->m_flags & M_EXT) == 0) {
 571                                 m_freem(m);
 572                                 error = ENOBUFS;
 573                                 goto out;
 574                         }
 575                 }
 576 #endif
 577                 m->m_data += max_linkhdr;
 578                 m->m_len = hdrlen;
 579                 if (len <= MHLEN - hdrlen - max_linkhdr) {
 580                         m_copydata(so->so_snd.sb_mb, off, (int) len,
 581                             mtod(m, caddr_t) + hdrlen);
 582                         m->m_len += len;
 583                 } else {
 584                         m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
 585                         if (m->m_next == 0) {
 586                                 (void) m_free(m);
 587                                 error = ENOBUFS;
 588                                 goto out;
 589                         }
 590                 }
 591 #endif
 592                 /*
 593                  * If we're sending everything we've got, set PUSH.
 594                  * (This will keep happy those implementations which only
 595                  * give data to the user when a buffer fills or
 596                  * a PUSH comes in.)
 597                  */
 598                 if (off + len == so->so_snd.sb_cc)
 599                         flags |= TH_PUSH;
 600         } else {
 601                 if (tp->t_flags & TF_ACKNOW)
 602                         tcpstat.tcps_sndacks++;
 603                 else if (flags & (TH_SYN|TH_FIN|TH_RST))
 604                         tcpstat.tcps_sndctrl++;
 605                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
 606                         tcpstat.tcps_sndurg++;
 607                 else
 608                         tcpstat.tcps_sndwinup++;
 609
 610                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
 611                 if (m == NULL) {
 612                         error = ENOBUFS;
 613                         goto out;
 614                 }
 615 #if INET6
 616                 if (isipv6) {
 617                         MH_ALIGN(m, hdrlen);
 618                 } else
 619 #endif
 620                 m->m_data += max_linkhdr;
 621                 m->m_len = hdrlen;
 622         }
 623         m->m_pkthdr.rcvif = (struct ifnet *)0;
 624         if (tp->t_template == 0)
 625                 panic("tcp_output");
 626 #if INET6
 627         if (isipv6) {
 628                 ip6 = mtod(m, struct ip6_hdr *);
 629                 th = (struct tcphdr *)(ip6 + 1);
 630                 bcopy((caddr_t)&tp->t_template->tt_i6, (caddr_t)ip6,
 631                       sizeof(struct ip6_hdr));
 632                 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
 633                       sizeof(struct tcphdr));
 634         } else {
 635 #endif /* INET6 */
 636         ip = mtod(m, struct ip *);
 637         ipov = (struct ipovly *)ip;
 638         th = (struct tcphdr *)(ip + 1);
 639         bcopy((caddr_t)&tp->t_template->tt_i, (caddr_t)ip, sizeof(struct ip));
 640         bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
 641               sizeof(struct tcphdr));
 642 #if INET6
 643         }
 644 #endif /* INET6 */
 645
 646         /*
 647          * Fill in fields, remembering maximum advertised
 648          * window for use in delaying messages about window sizes.
 649          * If resending a FIN, be sure not to use a new sequence number.
 650          */
 651         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 652             tp->snd_nxt == tp->snd_max)
 653                 tp->snd_nxt--;
 654         /*
 655          * If we are doing retransmissions, then snd_nxt will
 656          * not reflect the first unsent octet.  For ACK only
 657          * packets, we do not want the sequence number of the
 658          * retransmitted packet, we want the sequence number
 659          * of the next unsent octet.  So, if there is no data
 660          * (and no SYN or FIN), use snd_max instead of snd_nxt
 661          * when filling in ti_seq.  But if we are in persist
 662          * state, snd_max might reflect one byte beyond the
 663          * right edge of the window, so use snd_nxt in that
 664          * case, since we know we aren't doing a retransmission.
 665          * (retransmit and persist are mutually exclusive...)
 666          */
 667         if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
 668                 th->th_seq = htonl(tp->snd_nxt);
 669         else
 670                 th->th_seq = htonl(tp->snd_max);
 671         th->th_ack = htonl(tp->rcv_nxt);
 672         if (optlen) {
 673                 bcopy(opt, th + 1, optlen);
 674                 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 675         }
 676         th->th_flags = flags;
 677         /*
 678          * Calculate receive window.  Don't shrink window,
 679          * but avoid silly window syndrome.
 680          */
 681         if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
 682                 win = 0;
 683         if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
 684                 win = (long)(tp->rcv_adv - tp->rcv_nxt);
 685         if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 686                 win = (long)TCP_MAXWIN << tp->rcv_scale;
 687         th->th_win = htons((u_short) (win>>tp->rcv_scale));
 688         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 689                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 690                 th->th_flags |= TH_URG;
 691         } else
 692                 /*
 693                  * If no urgent pointer to send, then we pull
 694                  * the urgent pointer to the left edge of the send window
 695                  * so that it doesn't drift into the send window on sequence
 696                  * number wraparound.
 697                  */
 698                 tp->snd_up = tp->snd_una;               /* drag it along */
 699
 700         /*
 701          * Put TCP length in extended header, and then
 702          * checksum extended header and data.
 703          */
 704         m->m_pkthdr.len = hdrlen + len;
 705 #if INET6
 706         if (isipv6) {
 707 #if 0           /* ip6_plen will be filled in ip6_output. */
 708                 ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
 709                                                 optlen + len));
 710 #endif
 711
 712                 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
 713                                        sizeof(struct tcphdr) + optlen + len);
 714         } else {
 715 #endif /* INET6 */
 716         if (len + optlen)
 717                 ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) +
 718                                                optlen + len));
 719         th->th_sum = in_cksum(m, (int)(hdrlen + len));
 720 #if INET6
 721         }
 722 #endif /* INET6 */
 723
 724         /*
 725          * In transmit state, time the transmission and arrange for
 726          * the retransmit.  In persist state, just set snd_max.
 727          */
 728         if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
 729                 tcp_seq startseq = tp->snd_nxt;
 730
 731                 /*
 732                  * Advance snd_nxt over sequence space of this segment.
 733                  */
 734                 if (flags & (TH_SYN|TH_FIN)) {
 735                         if (flags & TH_SYN)
 736                                 tp->snd_nxt++;
 737                         if (flags & TH_FIN) {
 738                                 tp->snd_nxt++;
 739                                 tp->t_flags |= TF_SENTFIN;
 740                         }
 741                 }
 742                 tp->snd_nxt += len;
 743                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 744                         tp->snd_max = tp->snd_nxt;
 745                         /*
 746                          * Time this transmission if not a retransmission and
 747                          * not currently timing anything.
 748                          */
 749                         if (tp->t_rtt == 0) {
 750                                 tp->t_rtt = 1;
 751                                 tp->t_rtseq = startseq;
 752                                 tcpstat.tcps_segstimed++;
 753                         }
 754                 }
 755
 756                 /*
 757                  * Set retransmit timer if not currently set,
 758                  * and not doing an ack or a keep-alive probe.
 759                  * Initial value for retransmit timer is smoothed
 760                  * round-trip time + 2 * round-trip time variance.
 761                  * Initialize shift counter which is used for backoff
 762                  * of retransmit time.
 763                  */
 764                 if (tp->t_timer[TCPT_REXMT] == 0 &&
 765                     tp->snd_nxt != tp->snd_una) {
 766                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 767                         if (tp->t_timer[TCPT_PERSIST]) {
 768                                 tp->t_timer[TCPT_PERSIST] = 0;
 769                                 tp->t_rxtshift = 0;
 770                         }
 771                 }
 772         } else
 773                 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
 774                         tp->snd_max = tp->snd_nxt + len;
 775
 776 #if TCPDEBUG
 777         /*
 778          * Trace.
 779          */
 780         if (so->so_options & SO_DEBUG) {
 781 #if INET6
 782                 if (isipv6)
 783                         ip6->ip6_vfc = IPV6_VERSION;
 784                 else
 785                         ip->ip_vhl = IP_MAKE_VHL(IPVERSION,
 786                                                  IP_VHL_HL(ip->ip_vhl));
 787 #endif /* INET6 */
 788                 tcp_trace(TA_OUTPUT, tp->t_state, tp,
 789 #if INET6
 790                           isipv6 ? (void *)ip6 :
 791 #endif /* INET6 */
 792                           ip,
 793                           th, 0);
 794
 795         }
 796 #endif /* TCPDEBUG */
 797
 798         /*
 799          * Fill in IP length and desired time to live and
 800          * send to IP level.  There should be a better way
 801          * to handle ttl and tos; we could keep them in
 802          * the template, but need a way to checksum without them.
 803          */
 804 #if INET6
 805         if (isipv6) {
 806                 /*
 807                  * we separately set hoplimit for every segment, since the
 808                  * user might want to change the value via setsockopt.
 809                  * Also, desired default hop limit might be changed via
 810                  * Neighbor Discovery.
 811                  */
 812                 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
 813                                                tp->t_inpcb->in6p_route.ro_rt ?
 814                                                tp->t_inpcb->in6p_route.ro_rt->rt_ifp
 815                                                : NULL);
 816
 817                 /* TODO: IPv6 IP6TOS_ECT bit on */
 818 #if IPSEC
 819                 ipsec_setsocket(m, so);
 820 #endif /*IPSEC*/
 821                 error = ip6_output(m,
 822                             tp->t_inpcb->in6p_outputopts,
 823                             &tp->t_inpcb->in6p_route,
 824                             (so->so_options & SO_DONTROUTE) /* | IP6_DONTFRAG */,
 825                             NULL, NULL);
 826         } else
 827 #endif /* INET6 */
 828         {
 829 #if 1
 830         struct rtentry *rt;
 831 #endif
 832         ip->ip_len = m->m_pkthdr.len;
 833 #if INET6
 834         if (INP_CHECK_SOCKAF(so, AF_INET6))
 835                 ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
 836                                             tp->t_inpcb->in6p_route.ro_rt ?
 837                                             tp->t_inpcb->in6p_route.ro_rt->rt_ifp
 838                                             : NULL);
 839         else
 840 #endif /* INET6 */
 841         ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;   /* XXX */
 842         ip->ip_tos = tp->t_inpcb->inp_ip_tos;   /* XXX */
 843
 844         KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
 845                    (((th->th_src.s_addr & 0xffff) << 16) | (th->th_dst.s_addr & 0xffff)),
 846                     th->th_seq, th->th_ack, th->th_win);
 847
 848
 849 #if 1
 850         /*
 851          * See if we should do MTU discovery.  We do it only if the following
 852          * are true:
 853          *      1) we have a valid route to the destination
 854          *      2) the MTU is not locked (if it is, then discovery has been
 855          *         disabled)
 856          */
 857         if ((rt = tp->t_inpcb->inp_route.ro_rt)
 858             && rt->rt_flags & RTF_UP
 859             && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
 860                 ip->ip_off |= IP_DF;
 861         }
 862 #endif
 863
 864 #if IPSEC
 865         ipsec_setsocket(m, so);
 866 #endif /*IPSEC*/
 867
 868         error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 869             so->so_options & SO_DONTROUTE, 0);
 870     }
 871         if (error) {
 872 out:
 873                 if (error == ENOBUFS) {
 874                         tcp_quench(tp->t_inpcb, 0);
 875                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 876                         return (0);
 877                 }
 878 #if 1
 879                 if (error == EMSGSIZE) {
 880                         /*
 881                          * ip_output() will have already fixed the route
 882                          * for us.  tcp_mtudisc() will, as its last action,
 883                          * initiate retransmission, so it is important to
 884                          * not do so here.
 885                          */
 886                         tcp_mtudisc(tp->t_inpcb, 0);
 887                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 888                         return 0;
 889                 }
 890 #endif
 891                 if ((error == EHOSTUNREACH || error == ENETDOWN)
 892                     && TCPS_HAVERCVDSYN(tp->t_state)) {
 893                         tp->t_softerror = error;
 894                         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 895                         return (0);
 896                 }
 897                 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 898                 return (error);
 899         }
 900         tcpstat.tcps_sndtotal++;
 901
 902         /*
 903          * Data sent (as far as we can tell).
 904          * If this advertises a larger window than any other segment,
 905          * then remember the size of the advertised window.
 906          * Any pending ACK has now been sent.
 907          */
 908         if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
 909                 tp->rcv_adv = tp->rcv_nxt + win;
 910         tp->last_ack_sent = tp->rcv_nxt;
 911         tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
 912         if (sendalot)
 913                 goto again;
 914         KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
 915         return (0);
 916 }
 917
 918 void
 919 tcp_setpersist(tp)
 920         register struct tcpcb *tp;
 921 {
 922         register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 923
 924         if (tp->t_timer[TCPT_REXMT])
 925                 panic("tcp_output REXMT");
 926         /*
 927          * Start/restart persistance timer.
 928          */
 929         TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
 930             t * tcp_backoff[tp->t_rxtshift],
 931             TCPTV_PERSMIN, TCPTV_PERSMAX);
 932         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 933                 tp->t_rxtshift++;
 934 }