bsd/netinet/tcp_timer.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
  62  */
  63
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/mbuf.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/socket.h>
  71 #include <sys/socketvar.h>
  72 #include <sys/protosw.h>
  73 #include <kern/locks.h>
  74
  75 #include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */
  76
  77 #include <net/route.h>
  78
  79 #include <netinet/in.h>
  80 #include <netinet/in_systm.h>
  81 #include <netinet/in_pcb.h>
  82 #if INET6
  83 #include <netinet6/in6_pcb.h>
  84 #endif
  85 #include <netinet/ip_var.h>
  86 #include <netinet/tcp.h>
  87 #include <netinet/tcp_fsm.h>
  88 #include <netinet/tcp_seq.h>
  89 #include <netinet/tcp_timer.h>
  90 #include <netinet/tcp_var.h>
  91 #include <netinet/tcpip.h>
  92 #if TCPDEBUG
  93 #include <netinet/tcp_debug.h>
  94 #endif
  95 #include <sys/kdebug.h>
  96
  97 #define DBG_FNC_TCP_FAST        NETDBG_CODE(DBG_NETTCP, (5 << 8))
  98 #define DBG_FNC_TCP_SLOW        NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)
  99
 100 /*
 101  * NOTE - WARNING
 102  *
 103  *
 104  *
 105  *
 106  */
 107 static int
 108 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
 109 {
 110         int error, s, tt;
 111
 112         tt = *(int *)oidp->oid_arg1;
 113         s = tt * 1000 / hz;
 114
 115         error = sysctl_handle_int(oidp, &s, 0, req);
 116         if (error || !req->newptr)
 117                 return (error);
 118
 119         tt = s * hz / 1000;
 120         if (tt < 1)
 121                 return (EINVAL);
 122
 123         *(int *)oidp->oid_arg1 = tt;
 124         return (0);
 125 }
 126
 127 int     tcp_keepinit;
 128 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
 129     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
 130
 131 int     tcp_keepidle;
 132 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
 133     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
 134
 135 int     tcp_keepintvl;
 136 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
 137     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
 138
 139 int     tcp_delacktime;
 140 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
 141     CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
 142     "Time before a delayed ACK is sent");
 143
 144 int     tcp_msl;
 145 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
 146     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
 147
 148 static int      always_keepalive = 0;
 149 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
 150     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
 151
 152 static int      tcp_keepcnt = TCPTV_KEEPCNT;
 153         /* max idle probes */
 154 int     tcp_maxpersistidle;
 155         /* max idle time in persist */
 156 int     tcp_maxidle;
 157
 158 struct  inpcbhead       time_wait_slots[N_TIME_WAIT_SLOTS];
 159 int             cur_tw_slot = 0;
 160
 161 u_long          *delack_bitmask;
 162
 163
 164 void    add_to_time_wait_locked(tp)
 165         struct tcpcb    *tp;
 166 {
 167         int             tw_slot;
 168
 169         /* pcb list should be locked when we get here */
 170 #if 0
 171         lck_mtx_assert(tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
 172 #endif
 173
 174         LIST_REMOVE(tp->t_inpcb, inp_list);
 175
 176         if (tp->t_timer[TCPT_2MSL] == 0)
 177             tp->t_timer[TCPT_2MSL] = 1;
 178
 179         tp->t_rcvtime += tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1);
 180         tw_slot = (tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot;
 181         if (tw_slot >= N_TIME_WAIT_SLOTS)
 182             tw_slot -= N_TIME_WAIT_SLOTS;
 183
 184         LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
 185 }
 186
 187 void    add_to_time_wait(tp)
 188         struct tcpcb    *tp;
 189 {
 190         struct inpcbinfo *pcbinfo               = &tcbinfo;
 191
 192         if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
 193                 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
 194                 lck_rw_lock_exclusive(pcbinfo->mtx);
 195                 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
 196         }
 197         add_to_time_wait_locked(tp);
 198         lck_rw_done(pcbinfo->mtx);
 199 }
 200
 201
 202
 203
 204 /*
 205  * Fast timeout routine for processing delayed acks
 206  */
 207 void
 208 tcp_fasttimo()
 209 {
 210     struct inpcb *inp, *inpnxt;
 211     register struct tcpcb *tp;
 212
 213
 214     struct inpcbinfo *pcbinfo   = &tcbinfo;
 215
 216     int delack_checked = 0, delack_done = 0;
 217
 218     KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0);
 219
 220     if (tcp_delack_enabled == 0)
 221         return;
 222
 223     lck_rw_lock_shared(pcbinfo->mtx);
 224
 225     /* Walk the list of valid tcpcbs and send ACKS on the ones with DELACK bit set */
 226
 227     for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
 228         inpnxt = inp->inp_list.le_next;
 229         /* NOTE: it's OK to check the tp because the pcb can't be removed while we hold pcbinfo->mtx) */
 230         if ((tp = (struct tcpcb *)inp->inp_ppcb) && (tp->t_flags & TF_DELACK)) {
 231                 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 232                         continue;
 233                 tcp_lock(inp->inp_socket, 1, 0);
 234                 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
 235                         tcp_unlock(inp->inp_socket, 1, 0);
 236                         continue;
 237                 }
 238                 if (tp->t_flags & TF_DELACK) {
 239                         delack_done++;
 240                         tp->t_flags &= ~TF_DELACK;
 241                         tp->t_flags |= TF_ACKNOW;
 242                         tcpstat.tcps_delack++;
 243                         (void) tcp_output(tp);
 244                 }
 245                 tcp_unlock(inp->inp_socket, 1, 0);
 246         }
 247     }
 248     KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_checked, delack_done, tcpstat.tcps_delack,0,0);
 249     lck_rw_done(pcbinfo->mtx);
 250 }
 251
 252 /*
 253  * Tcp protocol timeout routine called every 500 ms.
 254  * Updates the timers in all active tcb's and
 255  * causes finite state machine actions if timers expire.
 256  */
 257 void
 258 tcp_slowtimo()
 259 {
 260         struct inpcb *inp, *inpnxt;
 261         struct tcpcb *tp;
 262         struct socket *so;
 263         int i;
 264 #if TCPDEBUG
 265         int ostate;
 266 #endif
 267 #if KDEBUG
 268         static int tws_checked;
 269 #endif
 270         struct inpcbinfo *pcbinfo               = &tcbinfo;
 271
 272         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);
 273
 274         tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
 275
 276         lck_rw_lock_shared(pcbinfo->mtx);
 277
 278         /*
 279          * Search through tcb's and update active timers.
 280          */
 281         for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
 282                 inpnxt = inp->inp_list.le_next;
 283
 284                 so = inp->inp_socket;
 285
 286                 if (so == &tcbinfo.nat_dummy_socket)
 287                                 continue;
 288
 289                 if (in_pcb_checkstate(inp, WNT_ACQUIRE,0) == WNT_STOPUSING)
 290                         continue;
 291
 292                 tcp_lock(so, 1, 0);
 293
 294                 if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING)  && so->so_usecount == 1) {
 295                         tcp_unlock(so, 1, 0);
 296                         continue;
 297                 }
 298                 tp = intotcpcb(inp);
 299                 if (tp == 0 || tp->t_state == TCPS_LISTEN) {
 300                         tcp_unlock(so, 1, 0);
 301                         continue;
 302                 }
 303
 304                 for (i = 0; i < TCPT_NTIMERS; i++) {
 305                         if (tp->t_timer[i] && --tp->t_timer[i] == 0) {
 306 #if TCPDEBUG
 307                                 ostate = tp->t_state;
 308 #endif
 309                                 tp = tcp_timers(tp, i);
 310                                 if (tp == NULL)
 311                                         goto tpgone;
 312 #if TCPDEBUG
 313                                 if (tp->t_inpcb->inp_socket->so_options
 314                                     & SO_DEBUG)
 315                                         tcp_trace(TA_USER, ostate, tp,
 316                                                   (void *)0,
 317                                                   (struct tcphdr *)0,
 318                                                   PRU_SLOWTIMO);
 319 #endif
 320                         }
 321                 }
 322                 tp->t_rcvtime++;
 323                 tp->t_starttime++;
 324                 if (tp->t_rtttime)
 325                         tp->t_rtttime++;
 326 tpgone:
 327                 tcp_unlock(so, 1, 0);
 328         }
 329
 330 #if KDEBUG
 331         tws_checked = 0;
 332 #endif
 333         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);
 334
 335         /*
 336          * Process the items in the current time-wait slot
 337          */
 338
 339         for (inp = time_wait_slots[cur_tw_slot].lh_first; inp; inp = inpnxt)
 340         {
 341                 inpnxt = inp->inp_list.le_next;
 342 #if KDEBUG
 343                 tws_checked++;
 344 #endif
 345
 346                 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 347                         continue;
 348
 349                 tcp_lock(inp->inp_socket, 1, 0);
 350
 351                 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
 352                         goto twunlock;
 353
 354                 tp = intotcpcb(inp);
 355                 if (tp == NULL) { /* tp already closed, remove from list */
 356 #if TEMPDEBUG
 357                         printf("tcp_slowtimo: tp is null in time-wait slot!\n");
 358 #endif
 359                         goto twunlock;
 360                 }
 361                 if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
 362                     tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
 363                     tp->t_rcvtime += N_TIME_WAIT_SLOTS;
 364                 }
 365                 else
 366                     tp->t_timer[TCPT_2MSL] = 0;
 367
 368                 if (tp->t_timer[TCPT_2MSL] == 0)
 369                     tp = tcp_timers(tp, TCPT_2MSL);     /* tp can be returned null if tcp_close is called */
 370 twunlock:
 371                 tcp_unlock(inp->inp_socket, 1, 0);
 372         }
 373
 374         if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) != 0)
 375                 lck_rw_lock_exclusive(pcbinfo->mtx);    /* Upgrade failed, lost lock no take it again exclusive */
 376
 377
 378         for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
 379                 inpnxt = inp->inp_list.le_next;
 380                 /* Ignore nat/SharedIP dummy pcbs */
 381                 if (inp->inp_socket == &tcbinfo.nat_dummy_socket)
 382                                 continue;
 383
 384                 if (inp->inp_wantcnt != WNT_STOPUSING)
 385                         continue;
 386
 387                 so = inp->inp_socket;
 388                 if (!lck_mtx_try_lock(inp->inpcb_mtx)) {/* skip if in use */
 389 #if TEMPDEBUG
 390                         printf("tcp_slowtimo so=%x STOPUSING but locked...\n", so);
 391 #endif
 392                         continue;
 393                 }
 394
 395                 if (so->so_usecount == 0)
 396                         in_pcbdispose(inp);
 397                 else {
 398                         tp = intotcpcb(inp);
 399                         /* Check for embryonic socket stuck on listener queue (4023660) */
 400                         if ((so->so_usecount == 1) && (tp->t_state == TCPS_CLOSED) &&
 401                             (so->so_head != NULL) && (so->so_state & SS_INCOMP)) {
 402                                 so->so_usecount--;
 403                                 in_pcbdispose(inp);
 404                         } else
 405                                 lck_mtx_unlock(inp->inpcb_mtx);
 406                 }
 407         }
 408
 409         /* Now cleanup the time wait ones */
 410         for (inp = time_wait_slots[cur_tw_slot].lh_first; inp; inp = inpnxt)
 411         {
 412                 inpnxt = inp->inp_list.le_next;
 413
 414                 if (inp->inp_wantcnt != WNT_STOPUSING)
 415                         continue;
 416
 417                 so = inp->inp_socket;
 418                 if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if in use */
 419                         continue;
 420                 if (so->so_usecount == 0)
 421                         in_pcbdispose(inp);
 422                 else  {
 423                         tp = intotcpcb(inp);
 424                         /* Check for embryonic socket stuck on listener queue (4023660) */
 425                         if ((so->so_usecount == 1) && (tp->t_state == TCPS_CLOSED) &&
 426                             (so->so_head != NULL) && (so->so_state & SS_INCOMP)) {
 427                                 so->so_usecount--;
 428                                 in_pcbdispose(inp);
 429                         } else
 430                                 lck_mtx_unlock(inp->inpcb_mtx);
 431                 }
 432         }
 433
 434         tcp_now++;
 435         if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
 436                 cur_tw_slot = 0;
 437
 438         lck_rw_done(pcbinfo->mtx);
 439         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
 440 }
 441
 442 /*
 443  * Cancel all timers for TCP tp.
 444  */
 445 void
 446 tcp_canceltimers(tp)
 447         struct tcpcb *tp;
 448 {
 449         register int i;
 450
 451         for (i = 0; i < TCPT_NTIMERS; i++)
 452                 tp->t_timer[i] = 0;
 453 }
 454
 455 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
 456     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
 457
 458 int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
 459     { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
 460
 461 static int tcp_totbackoff = 511;        /* sum of tcp_backoff[] */
 462
 463 /*
 464  * TCP timer processing.
 465  */
 466 struct tcpcb *
 467 tcp_timers(tp, timer)
 468         register struct tcpcb *tp;
 469         int timer;
 470 {
 471         register int rexmt;
 472         struct socket *so_tmp;
 473         struct tcptemp *t_template;
 474
 475 #if TCPDEBUG
 476         int ostate;
 477 #endif
 478
 479 #if INET6
 480         int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
 481 #endif /* INET6 */
 482
 483         so_tmp = tp->t_inpcb->inp_socket;
 484
 485         switch (timer) {
 486
 487         /*
 488          * 2 MSL timeout in shutdown went off.  If we're closed but
 489          * still waiting for peer to close and connection has been idle
 490          * too long, or if 2MSL time is up from TIME_WAIT, delete connection
 491          * control block.  Otherwise, check again in a bit.
 492          */
 493         case TCPT_2MSL:
 494                 tcp_free_sackholes(tp);
 495                 if (tp->t_state != TCPS_TIME_WAIT &&
 496                     tp->t_rcvtime <= tcp_maxidle) {
 497                         tp->t_timer[TCPT_2MSL] = (unsigned long)tcp_keepintvl;
 498                         add_to_time_wait_locked(tp);
 499                 }
 500                 else {
 501                         tp = tcp_close(tp);
 502                         return(tp);
 503                 }
 504                 break;
 505
 506         /*
 507          * Retransmission timer went off.  Message has not
 508          * been acked within retransmit interval.  Back off
 509          * to a longer retransmit interval and retransmit one segment.
 510          */
 511         case TCPT_REXMT:
 512                 tcp_free_sackholes(tp);
 513                 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 514                         tp->t_rxtshift = TCP_MAXRXTSHIFT;
 515                         tcpstat.tcps_timeoutdrop++;
 516                         tp = tcp_drop(tp, tp->t_softerror ?
 517                             tp->t_softerror : ETIMEDOUT);
 518                         postevent(so_tmp, 0, EV_TIMEOUT);
 519                         break;
 520                 }
 521
 522                 if (tp->t_rxtshift == 1) {
 523                         /*
 524                          * first retransmit; record ssthresh and cwnd so they can
 525                          * be recovered if this turns out to be a "bad" retransmit.
 526                          * A retransmit is considered "bad" if an ACK for this
 527                          * segment is received within RTT/2 interval; the assumption
 528                          * here is that the ACK was already in flight.  See
 529                          * "On Estimating End-to-End Network Path Properties" by
 530                          * Allman and Paxson for more details.
 531                          */
 532                         tp->snd_cwnd_prev = tp->snd_cwnd;
 533                         tp->snd_ssthresh_prev = tp->snd_ssthresh;
 534                         tp->snd_recover_prev = tp->snd_recover;
 535                         if (IN_FASTRECOVERY(tp))
 536                                   tp->t_flags |= TF_WASFRECOVERY;
 537                         else
 538                                   tp->t_flags &= ~TF_WASFRECOVERY;
 539                         tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 540                 }
 541                 tcpstat.tcps_rexmttimeo++;
 542                 if (tp->t_state == TCPS_SYN_SENT)
 543                         rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
 544                 else
 545                         rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 546                 TCPT_RANGESET(tp->t_rxtcur, rexmt,
 547                         tp->t_rttmin, TCPTV_REXMTMAX);
 548                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 549
 550                 /*
 551                  * Disable rfc1323 and rfc1644 if we havn't got any response to
 552                  * our third SYN to work-around some broken terminal servers
 553                  * (most of which have hopefully been retired) that have bad VJ
 554                  * header compression code which trashes TCP segments containing
 555                  * unknown-to-them TCP options.
 556                  */
 557                 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
 558                                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
 559                 /*
 560                  * If losing, let the lower level know and try for
 561                  * a better route.  Also, if we backed off this far,
 562                  * our srtt estimate is probably bogus.  Clobber it
 563                  * so we'll take the next rtt measurement as our srtt;
 564                  * move the current srtt into rttvar to keep the current
 565                  * retransmit times until then.
 566                  */
 567                 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 568 #if INET6
 569                         if (isipv6)
 570                                 in6_losing(tp->t_inpcb);
 571                         else
 572 #endif /* INET6 */
 573                         in_losing(tp->t_inpcb);
 574                         tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 575                         tp->t_srtt = 0;
 576                 }
 577                 tp->snd_nxt = tp->snd_una;
 578                 /*
 579                  * Note:  We overload snd_recover to function also as the
 580                  * snd_last variable described in RFC 2582
 581                  */
 582                 tp->snd_recover = tp->snd_max;
 583                 /*
 584                  * Force a segment to be sent.
 585                  */
 586                 tp->t_flags |= TF_ACKNOW;
 587                 /*
 588                  * If timing a segment in this window, stop the timer.
 589                  */
 590                 tp->t_rtttime = 0;
 591                 /*
 592                  * Close the congestion window down to one segment
 593                  * (we'll open it by one segment for each ack we get).
 594                  * Since we probably have a window's worth of unacked
 595                  * data accumulated, this "slow start" keeps us from
 596                  * dumping all that data as back-to-back packets (which
 597                  * might overwhelm an intermediate gateway).
 598                  *
 599                  * There are two phases to the opening: Initially we
 600                  * open by one mss on each ack.  This makes the window
 601                  * size increase exponentially with time.  If the
 602                  * window is larger than the path can handle, this
 603                  * exponential growth results in dropped packet(s)
 604                  * almost immediately.  To get more time between
 605                  * drops but still "push" the network to take advantage
 606                  * of improving conditions, we switch from exponential
 607                  * to linear window opening at some threshhold size.
 608                  * For a threshhold, we use half the current window
 609                  * size, truncated to a multiple of the mss.
 610                  *
 611                  * (the minimum cwnd that will give us exponential
 612                  * growth is 2 mss.  We don't allow the threshhold
 613                  * to go below this.)
 614                  */
 615                 {
 616                 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
 617                 if (win < 2)
 618                         win = 2;
 619                 tp->snd_cwnd = tp->t_maxseg;
 620                 tp->snd_ssthresh = win * tp->t_maxseg;
 621                 tp->t_dupacks = 0;
 622                 }
 623                 EXIT_FASTRECOVERY(tp);
 624                 (void) tcp_output(tp);
 625                 break;
 626
 627         /*
 628          * Persistance timer into zero window.
 629          * Force a byte to be output, if possible.
 630          */
 631         case TCPT_PERSIST:
 632                 tcpstat.tcps_persisttimeo++;
 633                 /*
 634                  * Hack: if the peer is dead/unreachable, we do not
 635                  * time out if the window is closed.  After a full
 636                  * backoff, drop the connection if the idle time
 637                  * (no responses to probes) reaches the maximum
 638                  * backoff that we would use if retransmitting.
 639                  */
 640                 if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 641                     (tp->t_rcvtime >= tcp_maxpersistidle ||
 642                     tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 643                         tcpstat.tcps_persistdrop++;
 644                         so_tmp = tp->t_inpcb->inp_socket;
 645                         tp = tcp_drop(tp, ETIMEDOUT);
 646                         postevent(so_tmp, 0, EV_TIMEOUT);
 647                         break;
 648                 }
 649                 tcp_setpersist(tp);
 650                 tp->t_force = 1;
 651                 (void) tcp_output(tp);
 652                 tp->t_force = 0;
 653                 break;
 654
 655         /*
 656          * Keep-alive timer went off; send something
 657          * or drop connection if idle for too long.
 658          */
 659         case TCPT_KEEP:
 660                 tcpstat.tcps_keeptimeo++;
 661                 if (tp->t_state < TCPS_ESTABLISHED)
 662                         goto dropit;
 663                 if ((always_keepalive ||
 664                     tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
 665                     tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2) {
 666                         if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (unsigned long)tcp_maxidle)
 667                                 goto dropit;
 668                         /*
 669                          * Send a packet designed to force a response
 670                          * if the peer is up and reachable:
 671                          * either an ACK if the connection is still alive,
 672                          * or an RST if the peer has closed the connection
 673                          * due to timeout or reboot.
 674                          * Using sequence number tp->snd_una-1
 675                          * causes the transmitted zero-length segment
 676                          * to lie outside the receive window;
 677                          * by the protocol spec, this requires the
 678                          * correspondent TCP to respond.
 679                          */
 680                         tcpstat.tcps_keepprobe++;
 681                         t_template = tcp_maketemplate(tp);
 682                         if (t_template) {
 683                                 tcp_respond(tp, t_template->tt_ipgen,
 684                                     &t_template->tt_t, (struct mbuf *)NULL,
 685                                     tp->rcv_nxt, tp->snd_una - 1, 0);
 686                                 (void) m_free(dtom(t_template));
 687                         }
 688                         tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
 689                 } else
 690                         tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
 691                 break;
 692
 693 #if TCPDEBUG
 694         if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 695                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 696                           PRU_SLOWTIMO);
 697 #endif
 698         dropit:
 699                 tcpstat.tcps_keepdrops++;
 700                 tp = tcp_drop(tp, ETIMEDOUT);
 701                 postevent(so_tmp, 0, EV_TIMEOUT);
 702                 break;
 703         }
 704         return (tp);
 705 }