xnu-2422.115.4.tar.gz

[apple/xnu.git] / bsd / netinet / tcp_timer.c
diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c

index 706ec823c640ea62fde4e844439b3bd3f499c6fd..b1ac3138b186b3b102ab02419ae4442d0c95eff5 100644 (file)
--- a/bsd/netinet/tcp_timer.c
+++ b/bsd/netinet/tcp_timer.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
   *
   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   * 
@@ -74,10 +74,11 @@
  #include <sys/mcache.h>
  #include <sys/queue.h>
  #include <kern/locks.h>
-
  #include <kern/cpu_number.h>   /* before tcp_seq.h, for tcp_random18() */
+#include <mach/boolean.h>
  
  #include <net/route.h>
+#include <net/if_var.h>
  
  #include <netinet/in.h>
  #include <netinet/in_systm.h>
@@ -101,6 +102,7 @@
  #endif
  #include <sys/kdebug.h>
  #include <mach/sdt.h>
+#include <netinet/mptcp_var.h>
  
  extern void postevent(struct socket *, struct sockbuf *,
                                                 int);
@@ -121,6 +123,12 @@ extern void postevent(struct socket *, struct sockbuf *,
                 panic("Bad link elm %p prev->next != elm", (elm));      \
  } while(0)
  
+/* tcp timer list */
+struct tcptimerlist tcp_timer_list;
+
+/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
+struct tcptailq tcp_tw_tailq;
+
  static int     background_io_trigger = 5;
  SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED,
      &background_io_trigger, 0, "Background IO Trigger Setting");
@@ -158,6 +166,10 @@ int        tcp_keepintvl;
  SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
      &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
  
+int    tcp_keepcnt;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+       &tcp_keepcnt, 0, "number of times to repeat keepalive");
+
  int    tcp_msl;
  SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
      &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
@@ -195,6 +207,12 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CT
      &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
      "TCP disables rfc1323 and rfc1644 during the rest of attempts");
  
+/* A higher threshold on local connections for disabling RFC 1323 options */
+static int tcp_broken_peer_syn_rxmit_thres_local = 10;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local, 
+       CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0,
+       "Number of retransmitted SYNs before disabling RFC 1323 options on local connections");
+
  static int tcp_timer_advanced = 0;
  SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED,
      &tcp_timer_advanced, 0, "Number of times one of the timers was advanced");
@@ -212,12 +230,11 @@ int       tcp_pmtud_black_hole_mss = 1200 ;
  SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED,
      &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");
  
-static int     tcp_keepcnt = TCPTV_KEEPCNT;
-static int     tcp_gc_done = FALSE;    /* perfromed garbage collection of "used" sockets */
+/* performed garbage collection of "used" sockets */
+static boolean_t tcp_gc_done = FALSE;
+
         /* max idle probes */
  int    tcp_maxpersistidle;
-       /* max idle time in persist */
-int    tcp_maxidle;
  
  /* TCP delack timer is set to 100 ms. Since the processing of timer list in fast
   * mode will happen no faster than 100 ms, the delayed ack timer will fire some where 
@@ -225,11 +242,12 @@ int       tcp_maxidle;
   */
  int    tcp_delack = TCP_RETRANSHZ / 10;
  
-struct inpcbhead       time_wait_slots[N_TIME_WAIT_SLOTS];
-int            cur_tw_slot = 0;
-
-/* tcp timer list */
-struct tcptimerlist tcp_timer_list;
+#if MPTCP
+/*
+ * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
+ */
+int    tcp_jack_rxmt = TCP_RETRANSHZ / 2;
+#endif /* MPTCP */
  
  /* The frequency of running through the TCP timer list in 
   * fast and slow mode can be configured.
@@ -247,6 +265,8 @@ static void tcp_sched_timerlist(uint32_t offset);
  static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index);
  static void tcp_sched_timers(struct tcpcb *tp);
  static inline void tcp_set_lotimer_index(struct tcpcb *);
+static void tcp_rexmt_save_state(struct tcpcb *tp);
+void tcp_remove_from_time_wait(struct inpcb *inp);
  
  /* Macro to compare two timers. If there is a reset of the sign bit, it is 
   * safe to assume that the timer has wrapped around. By doing signed comparision, 
@@ -262,60 +282,79 @@ timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
  /* Returns true if the timer is on the timer list */
  #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
  
+/* Run the TCP timerlist at least once every hour */
+#define        TCP_TIMERLIST_MAX_OFFSET        (60 * 60 * TCP_RETRANSHZ)
  
-void   add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
+static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
  void   add_to_time_wait(struct tcpcb *tp, uint32_t delay) ;
  
-static void tcp_garbage_collect(struct inpcb *, int);
+static boolean_t tcp_garbage_collect(struct inpcb *, int);
  
-void   add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) 
+/*
+ * Add to tcp timewait list, delay is given in milliseconds.
+ */
+static void
+add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
  {
-       int             tw_slot;
-       struct inpcbinfo *pcbinfo       = &tcbinfo;
+       struct inpcbinfo *pcbinfo = &tcbinfo;
+       struct inpcb *inp = tp->t_inpcb;
         uint32_t timer;
  
-       /* pcb list should be locked when we get here */        
-       lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE);
-
-       LIST_REMOVE(tp->t_inpcb, inp_list);
-
-       /* if (tp->t_timer[TCPT_2MSL] <= 0) 
-           tp->t_timer[TCPT_2MSL] = 1; */
-
-       /*
-        * Because we're pulling this pcb out of the main TCP pcb list,
-        * we need to recalculate the TCPT_2MSL timer value for tcp_slowtimo
-        * higher timer granularity.
-        */
+       /* pcb list should be locked when we get here */
+       lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
  
-       timer = (delay / TCP_RETRANSHZ) * PR_SLOWHZ;
-       tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ;
+       /* We may get here multiple times, so check */
+       if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
+               pcbinfo->ipi_twcount++;
+               inp->inp_flags2 |= INP2_TIMEWAIT;
+               
+               /* Remove from global inp list */
+               LIST_REMOVE(inp, inp_list);
+       } else {
+               TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
+       }
  
-       tp->t_rcvtime += timer & (N_TIME_WAIT_SLOTS - 1); 
+       /* Compute the time at which this socket can be closed */
+       timer = tcp_now + delay;
+       
+       /* We will use the TCPT_2MSL timer for tracking this delay */
  
-       tw_slot = (timer & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot; 
-       if (tw_slot >= N_TIME_WAIT_SLOTS)
-           tw_slot -= N_TIME_WAIT_SLOTS;
+       if (TIMER_IS_ON_LIST(tp))
+               tcp_remove_timer(tp);
+       tp->t_timer[TCPT_2MSL] = timer;
  
-       LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
+       TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
  }
  
-void   add_to_time_wait(struct tcpcb *tp, uint32_t delay) 
+void
+add_to_time_wait(struct tcpcb *tp, uint32_t delay)
  {
-       struct inpcbinfo *pcbinfo               = &tcbinfo;
-       
-       if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
+       struct inpcbinfo *pcbinfo = &tcbinfo;
+
+       if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
                 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
-               lck_rw_lock_exclusive(pcbinfo->mtx);
+               lck_rw_lock_exclusive(pcbinfo->ipi_lock);
                 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
         }
         add_to_time_wait_locked(tp, delay);
-       lck_rw_done(pcbinfo->mtx);
+       lck_rw_done(pcbinfo->ipi_lock);
+
+       inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
  }
  
-static void
+/* If this is on time wait queue, remove it. */
+void
+tcp_remove_from_time_wait(struct inpcb *inp)
+{
+       struct tcpcb *tp = intotcpcb(inp);
+       if (inp->inp_flags2 & INP2_TIMEWAIT)
+               TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
+}
+
+static boolean_t
  tcp_garbage_collect(struct inpcb *inp, int istimewait)
  {
+       boolean_t active = FALSE;
         struct socket *so;
         struct tcpcb *tp;
  
@@ -329,13 +368,23 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
          * overflow sockets that are eligible for garbage collection have
          * their usecounts set to 1.
          */
-       if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(&inp->inpcb_mtx))
-               return;
+       if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
+               return (TRUE);
  
         /* Check again under the lock */
         if (so->so_usecount > 1) {
+               if (inp->inp_wantcnt == WNT_STOPUSING)
+                       active = TRUE;
                 lck_mtx_unlock(&inp->inpcb_mtx);
-               return;
+               return (active);
+       }
+
+       if (istimewait &&
+               TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
+               tp->t_state != TCPS_CLOSED) {
+               /* Become a regular mutex */
+               lck_mtx_convert_spin(&inp->inpcb_mtx);
+               tcp_close(tp);
         }
  
         /*
@@ -343,42 +392,46 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
          * only if we are called to clean up the time wait slots, since
          * tcp_dropdropablreq() considers a socket to have been fully
          * dropped after add_to_time_wait() is finished.
-        * Also handle the case of connections getting closed by the peer while in the queue as
-        * seen with rdar://6422317
-        * 
+        * Also handle the case of connections getting closed by the peer
+        * while in the queue as seen with rdar://6422317
+        *
          */
-       if (so->so_usecount == 1 && 
+       if (so->so_usecount == 1 &&
             ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
-           ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && (so->so_head != NULL)
-                && ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
-                        (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
+           ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
+           (so->so_head != NULL) &&
+           ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
+           (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
  
                 if (inp->inp_state != INPCB_STATE_DEAD) {
                         /* Become a regular mutex */
                         lck_mtx_convert_spin(&inp->inpcb_mtx);
  #if INET6
-                       if (INP_CHECK_SOCKAF(so, AF_INET6))
+                       if (SOCK_CHECK_DOM(so, PF_INET6))
                                 in6_pcbdetach(inp);
                         else
  #endif /* INET6 */
-                       in_pcbdetach(inp);
+                               in_pcbdetach(inp);
                 }
                 so->so_usecount--;
+               if (inp->inp_wantcnt == WNT_STOPUSING)
+                       active = TRUE;
                 lck_mtx_unlock(&inp->inpcb_mtx);
-               return;
+               return (active);
         } else if (inp->inp_wantcnt != WNT_STOPUSING) {
                 lck_mtx_unlock(&inp->inpcb_mtx);
-               return;
+               return (FALSE);
         }
  
         /*
-        * We get here because the PCB is no longer searchable (WNT_STOPUSING);
-        * detach (if needed) and dispose if it is dead (usecount is 0).  This
-        * covers all cases, including overflow sockets and those that are
-        * considered as "embryonic", i.e. created by sonewconn() in TCP input
-        * path, and have not yet been committed.  For the former, we reduce
-        * the usecount to 0 as done by the code above.  For the latter, the
-        * usecount would have reduced to 0 as part calling soabort() when the
+        * We get here because the PCB is no longer searchable 
+        * (WNT_STOPUSING); detach (if needed) and dispose if it is dead 
+        * (usecount is 0).  This covers all cases, including overflow 
+        * sockets and those that are considered as "embryonic", 
+        * i.e. created by sonewconn() in TCP input path, and have 
+        * not yet been committed.  For the former, we reduce the usecount
+        *  to 0 as done by the code above.  For the latter, the usecount 
+        * would have reduced to 0 as part calling soabort() when the
          * socket is dropped at the end of tcp_input().
          */
         if (so->so_usecount == 0) {
@@ -386,113 +439,114 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
                         struct tcpcb *, tp, int32_t, TCPS_CLOSED);
                 /* Become a regular mutex */
                 lck_mtx_convert_spin(&inp->inpcb_mtx);
+
+               /*
+                * If this tp still happens to be on the timer list, 
+                * take it out
+                */
+               if (TIMER_IS_ON_LIST(tp)) {
+                       tcp_remove_timer(tp);
+               }
+
                 if (inp->inp_state != INPCB_STATE_DEAD) {
  #if INET6
-                       if (INP_CHECK_SOCKAF(so, AF_INET6))
+                       if (SOCK_CHECK_DOM(so, PF_INET6))
                                 in6_pcbdetach(inp);
                         else
  #endif /* INET6 */
-                       in_pcbdetach(inp);
+                               in_pcbdetach(inp);
                 }
                 in_pcbdispose(inp);
-       } else {
-               lck_mtx_unlock(&inp->inpcb_mtx);
+               return (FALSE);
         }
+
+       lck_mtx_unlock(&inp->inpcb_mtx);
+       return (TRUE);
  }
  
+/*
+ * TCP garbage collector callback (inpcb_timer_func_t).
+ *
+ * Returns nothing: pcbs still needing gc are counted in ipi->ipi_gc_req
+ * (intimer_fast/intimer_lazy); presumably a count > 0 keeps timer active.
+ */
  void
-tcp_slowtimo(void)
+tcp_gc(struct inpcbinfo *ipi)
  {
         struct inpcb *inp, *nxt;
-       struct tcpcb *tp;
+       struct tcpcb *tw_tp, *tw_ntp;
  #if TCPDEBUG
         int ostate;
  #endif
-
  #if  KDEBUG
         static int tws_checked = 0;
  #endif
  
-       struct inpcbinfo *pcbinfo               = &tcbinfo;
-
-       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);
-
-       tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
+       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
  
-       /* Update tcp_now here as it may get used while processing the slow timer */
+       /*
+        * Update tcp_now here as it may get used while
+        * processing the slow timer.
+        */
         calculate_tcp_clock();
  
-       /* Garbage collect socket/tcpcb: We need to acquire the list lock 
+       /*
+        * Garbage collect socket/tcpcb: We need to acquire the list lock
          * exclusively to do this
          */
  
-       if (lck_rw_try_lock_exclusive(pcbinfo->mtx) == FALSE) {
-               if (tcp_gc_done == TRUE) {      /* don't sweat it this time. cleanup was done last time */
+       if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
+               /* don't sweat it this time; cleanup was done last time */
+               if (tcp_gc_done == TRUE) {
                         tcp_gc_done = FALSE;
-                       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
-                       return; /* Upgrade failed and lost lock - give up this time. */
+                       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
+                           tws_checked, cur_tw_slot, 0, 0, 0);
+                       /* Lock upgrade failed, give up this round */
+                       atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
+                       return;
                 }
-               lck_rw_lock_exclusive(pcbinfo->mtx);    /* Upgrade failed, lost lock now take it again exclusive */
+               /* Upgrade failed, lost lock now take it again exclusive */
+               lck_rw_lock_exclusive(ipi->ipi_lock);
         }
         tcp_gc_done = TRUE;
  
-       /*
-        * Process the items in the current time-wait slot
-        */
-#if  KDEBUG
-       tws_checked = 0;
-#endif
-       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);
-
-       LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) {
-#if KDEBUG
-               tws_checked++;
-#endif
-
-               if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) 
-                       continue;
-
-               tcp_lock(inp->inp_socket, 1, 0);
-
-               if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) 
-                       goto twunlock;
-
-               tp = intotcpcb(inp);
-               if (tp == NULL)  /* tp already closed, remove from list */
-                       goto twunlock;
-
-               if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
-                   tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
-                   tp->t_rcvtime += N_TIME_WAIT_SLOTS;
-               }
-               else
-                   tp->t_timer[TCPT_2MSL] = 0;
-
-               if (tp->t_timer[TCPT_2MSL] == 0)  {
+       LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
+               if (tcp_garbage_collect(inp, 0))
+                       atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
+       }
  
-                       /* That pcb is ready for a close */     
-                       tcp_free_sackholes(tp);
-                       tp = tcp_close(tp);
+       /* Now cleanup the time wait ones */
+       TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
+               /*
+                * We check the timestamp here without holding the 
+                * socket lock for better performance. If there are
+                * any pcbs in time-wait, the timer will get rescheduled.
+                * Hence some error in this check can be tolerated.
+                *
+                * Sometimes a socket on time-wait queue can be closed if
+                * 2MSL timer expired but the application still has a
+                * usecount on it. 
+                */
+               if (tw_tp->t_state == TCPS_CLOSED ||  
+                   TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
+                       if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
+                               atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
                 }
-twunlock:
-               tcp_unlock(inp->inp_socket, 1, 0);
         }
  
+       /* take into account pcbs that are still on tcp_tw_tailq */
+       atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
  
-       LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
-               tcp_garbage_collect(inp, 0);
-       }
+       lck_rw_done(ipi->ipi_lock);
  
-       /* Now cleanup the time wait ones */
-       LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) {
-               tcp_garbage_collect(inp, 1);
-       }
+       /* Clean up the socache while we are here */
+       if (so_cache_timer())
+               atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
  
-       if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
-               cur_tw_slot = 0;
-       
-       lck_rw_done(pcbinfo->mtx);
-       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
+       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
+           cur_tw_slot, 0, 0, 0);
+
+       return;
  }
  
  /*
@@ -519,6 +573,41 @@ int        tcp_backoff[TCP_MAXRXTSHIFT + 1] =
  
  static int tcp_totbackoff = 511;       /* sum of tcp_backoff[] */
  
+static void tcp_rexmt_save_state(struct tcpcb *tp)
+{
+       u_int32_t fsize;
+       if (TSTMP_SUPPORTED(tp)) {
+               /*
+                * Since timestamps are supported on the connection, 
+                * we can do recovery as described in rfc 4015.
+                */
+               fsize = tp->snd_max - tp->snd_una;
+               tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
+               tp->snd_recover_prev = tp->snd_recover;
+       } else {
+               /*
+                * Timestamp option is not supported on this connection.
+                * Record ssthresh and cwnd so they can
+                * be recovered if this turns out to be a "bad" retransmit.
+                * A retransmit is considered "bad" if an ACK for this 
+                * segment is received within RTT/2 interval; the assumption
+                * here is that the ACK was already in flight.  See 
+                * "On Estimating End-to-End Network Path Properties" by
+                * Allman and Paxson for more details.
+                */
+               tp->snd_cwnd_prev = tp->snd_cwnd;
+               tp->snd_ssthresh_prev = tp->snd_ssthresh;
+               tp->snd_recover_prev = tp->snd_recover;
+               if (IN_FASTRECOVERY(tp))
+                       tp->t_flags |= TF_WASFRECOVERY;
+               else
+                       tp->t_flags &= ~TF_WASFRECOVERY;
+       }
+       tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
+       tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
+       tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
+}
+
  /*
   * TCP timer processing.
   */
@@ -528,7 +617,7 @@ tcp_timers(tp, timer)
         int timer;
  {
         register int rexmt;
-       struct socket *so_tmp;
+       struct socket *so;
         struct tcptemp *t_template;
         int optlen = 0;
         int idle_time = 0;
@@ -541,7 +630,7 @@ tcp_timers(tp, timer)
         int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
  #endif /* INET6 */
  
-       so_tmp = tp->t_inpcb->inp_socket;
+       so = tp->t_inpcb->inp_socket;
         idle_time = tcp_now - tp->t_rcvtime;
  
         switch (timer) {
@@ -558,10 +647,10 @@ tcp_timers(tp, timer)
                 tcp_free_sackholes(tp);
                 if (tp->t_state != TCPS_TIME_WAIT &&
                     tp->t_state != TCPS_FIN_WAIT_2 &&
-                   ((idle_time > 0) && (idle_time < tcp_maxidle))) {
-                       tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)tcp_keepintvl);
-               }
-               else {
+                   ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
+                       tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, 
+                               (u_int32_t)TCP_CONN_KEEPINTVL(tp));
+               } else {
                         tp = tcp_close(tp);
                         return(tp);
                 }
@@ -573,7 +662,6 @@ tcp_timers(tp, timer)
          * to a longer retransmit interval and retransmit one segment.
          */
         case TCPT_REXMT:
-               tcp_free_sackholes(tp);
                 /* Drop a connection in the retransmit timer
                  * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times
                  * 2. If the time spent in this retransmission episode is more than
@@ -582,8 +670,8 @@ tcp_timers(tp, timer)
                  *    retransmitted the FIN 3 times without receiving an ack
                  */
                 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
-                       (tp->rxt_conndroptime > 0 && tp->rxt_start > 0 && 
-                       (tcp_now - tp->rxt_start) >= tp->rxt_conndroptime) ||
+                       (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 && 
+                       (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
                         ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
                         (tp->t_flags & TF_SENTFIN) != 0 &&
                         tp->t_rxtshift >= 4)) {
@@ -594,39 +682,51 @@ tcp_timers(tp, timer)
                                 tcpstat.tcps_timeoutdrop++;
                         }
                         tp->t_rxtshift = TCP_MAXRXTSHIFT;
+                       postevent(so, 0, EV_TIMEOUT);                   
+                       soevent(so, 
+                           (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
                         tp = tcp_drop(tp, tp->t_softerror ?
                             tp->t_softerror : ETIMEDOUT);
-                       postevent(so_tmp, 0, EV_TIMEOUT);                       
+
                         break;
                 }
  
-               if (tp->t_rxtshift == 1) {
-                       /*
-                        * first retransmit; record ssthresh and cwnd so they can
-                        * be recovered if this turns out to be a "bad" retransmit.
-                        * A retransmit is considered "bad" if an ACK for this 
-                        * segment is received within RTT/2 interval; the assumption
-                        * here is that the ACK was already in flight.  See 
-                        * "On Estimating End-to-End Network Path Properties" by
-                        * Allman and Paxson for more details.
-                        */
-                       tp->snd_cwnd_prev = tp->snd_cwnd;
-                       tp->snd_ssthresh_prev = tp->snd_ssthresh;
-                       tp->snd_recover_prev = tp->snd_recover;
-                       if (IN_FASTRECOVERY(tp))
-                                 tp->t_flags |= TF_WASFRECOVERY;
-                       else
-                                 tp->t_flags &= ~TF_WASFRECOVERY;
-                       tp->t_badrxtwin = tcp_now  + (tp->t_srtt >> (TCP_RTT_SHIFT)); 
+               tcpstat.tcps_rexmttimeo++;
  
-                       /* Set the time at which retransmission on this 
-                        * connection started
-                        */
-                       tp->rxt_start = tcp_now;
+               if (tp->t_rxtshift == 1 && 
+                       tp->t_state == TCPS_ESTABLISHED) {
+                       /* Set the time at which retransmission started. */
+                       tp->t_rxtstart = tcp_now;
+
+                       /* 
+                        * if this is the first retransmit timeout, save
+                        * the state so that we can recover if the timeout
+                        * is spurious.
+                        */ 
+                       tcp_rexmt_save_state(tp);
                 }
-               tcpstat.tcps_rexmttimeo++;
-               if (tp->t_state == TCPS_SYN_SENT)
+#if MPTCP
+               if ((tp->t_rxtshift == mptcp_fail_thresh) &&
+                   (tp->t_state == TCPS_ESTABLISHED) &&
+                   (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
+                       mptcp_act_on_txfail(so);
+
+               }
+#endif /* MPTCP */
+
+               if (tp->t_adaptive_wtimo > 0 &&
+                       tp->t_rxtshift > tp->t_adaptive_wtimo &&
+                       TCPS_HAVEESTABLISHED(tp->t_state)) {
+                       /* Send an event to the application */
+                       soevent(so,
+                               (SO_FILT_HINT_LOCKED|
+                               SO_FILT_HINT_ADAPTIVE_WTIMO));
+               }
+
+               if (tp->t_state == TCPS_SYN_SENT) {
                         rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
+                       tp->t_stat.synrxtshift = tp->t_rxtshift;
+               }
                 else
                         rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
                 TCPT_RANGESET(tp->t_rxtcur, rexmt,
@@ -634,25 +734,33 @@ tcp_timers(tp, timer)
                         TCP_ADD_REXMTSLOP(tp));
                 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
  
+               if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
+                       goto fc_output;
+
+               tcp_free_sackholes(tp);
                 /*
                  * Check for potential Path MTU Discovery Black Hole 
                  */
  
                 if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
-                       if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && (tp->t_rxtshift == 2)) {
+                       if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) &&
+                                (tp->t_rxtshift == 2)) {
                                 /* 
                                  * Enter Path MTU Black-hole Detection mechanism:
                                  * - Disable Path MTU Discovery (IP "DF" bit).
                                  * - Reduce MTU to lower value than what we negociated with peer.
                                  */
-
-                               tp->t_flags &= ~TF_PMTUD; /* Disable Path MTU Discovery for now */
-                               tp->t_flags |= TF_BLACKHOLE; /* Record that we may have found a black hole */
+                               /* Disable Path MTU Discovery for now */
+                               tp->t_flags &= ~TF_PMTUD;
+                               /* Record that we may have found a black hole */
+                               tp->t_flags |= TF_BLACKHOLE;
                                 optlen = tp->t_maxopd - tp->t_maxseg;
-                               tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* Keep track of previous MSS */
-                               if (tp->t_maxopd > tcp_pmtud_black_hole_mss)
-                                       tp->t_maxopd = tcp_pmtud_black_hole_mss; /* Reduce the MSS to intermediary value */
-                               else {
+                               /* Keep track of previous MSS */
+                               tp->t_pmtud_saved_maxopd = tp->t_maxopd;
+                               /* Reduce the MSS to intermediary value */
+                               if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
+                                       tp->t_maxopd = tcp_pmtud_black_hole_mss;
+                               } else {
                                         tp->t_maxopd =  /* use the default MSS */
  #if INET6
                                                 isipv6 ? tcp_v6mssdflt :
@@ -662,7 +770,8 @@ tcp_timers(tp, timer)
                                 tp->t_maxseg = tp->t_maxopd - optlen;
  
                                 /*
-                                * Reset the slow-start flight size as it may depends on the new MSS
+                                * Reset the slow-start flight size 
+                                * as it may depend on the new MSS
                                  */
                                 if (CC_ALGO(tp)->cwnd_init != NULL)
                                         CC_ALGO(tp)->cwnd_init(tp);
@@ -681,8 +790,9 @@ tcp_timers(tp, timer)
                                         tp->t_maxopd = tp->t_pmtud_saved_maxopd;
                                         tp->t_maxseg = tp->t_maxopd - optlen;
                                         /*
-                                       * Reset the slow-start flight size as it may depends on the new MSS
-                                       */
+                                        * Reset the slow-start flight size as it 
+                                        * may depend on the new MSS
+                                        */
                                         if (CC_ALGO(tp)->cwnd_init != NULL)
                                                 CC_ALGO(tp)->cwnd_init(tp);
                                 }
@@ -696,10 +806,15 @@ tcp_timers(tp, timer)
                  * broken terminal servers (most of which have hopefully been
                  * retired) that have bad VJ header compression code which
                  * trashes TCP segments containing unknown-to-them TCP options.
+                * Do this only on non-local connections.
                  */
-               if ((tp->t_state == TCPS_SYN_SENT) &&
-                   (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres))
+               if (tp->t_state == TCPS_SYN_SENT &&
+                   ((!(tp->t_flags & TF_LOCAL) &&
+                   tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) ||
+                   ((tp->t_flags & TF_LOCAL) && 
+                   tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local)))
                         tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
+
                 /*
                  * If losing, let the lower level know and try for
                  * a better route.  Also, if we backed off this far,
@@ -733,12 +848,25 @@ tcp_timers(tp, timer)
                  */
                 tp->t_rtttime = 0;
  
-               if (CC_ALGO(tp)->after_timeout != NULL)
+               EXIT_FASTRECOVERY(tp);
+
+               /* RFC 5681 says: when a TCP sender detects segment loss
+                * using retransmit timer and the given segment has already
+                * been retransmitted by way of the retransmission timer at
+                * least once, the value of ssthresh is held constant
+                */
+               if (tp->t_rxtshift == 1 && 
+                       CC_ALGO(tp)->after_timeout != NULL)
                         CC_ALGO(tp)->after_timeout(tp);
  
-               tp->t_dupacks = 0;
-               EXIT_FASTRECOVERY(tp);
  
+               /* CWR notifications are to be sent on new data right after
+                * RTOs, Fast Retransmits and ECE notification receipts.
+                */
+               if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
+                       tp->ecn_flags |= TE_SENDCWR;
+               }
+fc_output:
                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
                         struct tcpcb *, tp, struct tcphdr *, NULL,
                         int32_t, TCP_CC_REXMT_TIMEOUT);
@@ -766,11 +894,13 @@ tcp_timers(tp, timer)
                 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
                     (idle_time >= tcp_maxpersistidle ||
                     idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || 
-                   ((tp->t_persist_stop != 0) && (tp->t_persist_stop <= tcp_now))) {
+                   ((tp->t_persist_stop != 0) && 
+                       TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
                         tcpstat.tcps_persistdrop++;
-                       so_tmp = tp->t_inpcb->inp_socket;
+                       postevent(so, 0, EV_TIMEOUT);
+                       soevent(so,
+                           (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
                         tp = tcp_drop(tp, ETIMEDOUT);
-                       postevent(so_tmp, 0, EV_TIMEOUT);
                         break;
                 }
                 tcp_setpersist(tp);
@@ -785,12 +915,27 @@ tcp_timers(tp, timer)
          */
         case TCPT_KEEP:
                 tcpstat.tcps_keeptimeo++;
+#if MPTCP
+               /*
+                * Regular TCP connections do not send keepalives after closing.
+                * MPTCP must not either, after sending Data FINs.
+                */
+               struct mptcb *mp_tp = tp->t_mptcb;
+               if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && 
+                   (mp_tp == NULL)) {
+                       goto dropit;
+               } else if (mp_tp != NULL) {
+                       if ((mptcp_ok_to_keepalive(mp_tp) == 0))
+                               goto dropit;
+               }
+#endif /* MPTCP */
                 if (tp->t_state < TCPS_ESTABLISHED)
                         goto dropit;
                 if ((always_keepalive ||
-                   tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
+                   (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
+                   (tp->t_flagsext & TF_DETECT_READSTALL)) &&
                     (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
-                       if (idle_time >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle)
+                       if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
                                 goto dropit;
                         /*
                          * Send a packet designed to force a response
@@ -810,7 +955,7 @@ tcp_timers(tp, timer)
                                 unsigned int ifscope, nocell = 0;
  
                                 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
-                                       ifscope = tp->t_inpcb->inp_boundif;
+                                       ifscope = tp->t_inpcb->inp_boundifp->if_index;
                                 else
                                         ifscope = IFSCOPE_NONE;
  
@@ -826,10 +971,34 @@ tcp_timers(tp, timer)
                                     tp->rcv_nxt, tp->snd_una - 1, 0, ifscope,
                                     nocell);
                                 (void) m_free(dtom(t_template));
+                               if (tp->t_flagsext & TF_DETECT_READSTALL)
+                                       tp->t_rtimo_probes++;
                         }
-                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, tcp_keepintvl);
-               } else
-                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
+                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+                               TCP_CONN_KEEPINTVL(tp));
+               } else {
+                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+                               TCP_CONN_KEEPIDLE(tp));
+               }
+               if (tp->t_flagsext & TF_DETECT_READSTALL) {
+                       /* 
+                        * The keep alive packets sent to detect a read
+                        * stall did not get a response from the 
+                        * peer. Generate more keep-alives to confirm this.
+                        * If the number of probes sent reaches the limit,
+                        * generate an event.
+                        */
+                       if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
+                               /* Generate an event */
+                               soevent(so,
+                                       (SO_FILT_HINT_LOCKED|
+                                       SO_FILT_HINT_ADAPTIVE_RTIMO));
+                               tcp_keepalive_reset(tp);
+                       } else {
+                               tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
+                                       tp, TCP_REXMTVAL(tp));
+                       }
+               }
                 break;
         case TCPT_DELACK:
                 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
@@ -837,17 +1006,51 @@ tcp_timers(tp, timer)
                         tp->t_timer[TCPT_DELACK] = 0;
                         tp->t_flags |= TF_ACKNOW;
  
-                       /* If delayed ack timer fired while we are stretching acks, 
+                       /* If delayed ack timer fired while stretching acks
                          * go back to acking every other packet
                          */
                         if ((tp->t_flags & TF_STRETCHACK) != 0)
                                 tcp_reset_stretch_ack(tp);
  
+                       /* If we are measuring inter packet arrival jitter for 
+                        * throttling a connection, this delayed ack might be 
+                        * the reason for accumulating some jitter. So let's
+                        * restart the measurement.
+                        */
+                       CLEAR_IAJ_STATE(tp);
+
                         tcpstat.tcps_delack++;
                         (void) tcp_output(tp);
                 }
                 break;
  
+#if MPTCP
+       case TCPT_JACK_RXMT:
+               if ((tp->t_state == TCPS_ESTABLISHED) &&
+                   (tp->t_mpflags & TMPF_PREESTABLISHED) &&
+                   (tp->t_mpflags & TMPF_JOINED_FLOW)) {
+                       if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
+                               tcpstat.tcps_timeoutdrop++;
+                               postevent(so, 0, EV_TIMEOUT);
+                               soevent(so, 
+                                   (SO_FILT_HINT_LOCKED|
+                                   SO_FILT_HINT_TIMEOUT));
+                               tp = tcp_drop(tp, tp->t_softerror ?
+                                   tp->t_softerror : ETIMEDOUT);
+                               break;
+                       }
+                       tcpstat.tcps_join_rxmts++;
+                       tp->t_flags |= TF_ACKNOW;
+
+                       /*
+                        * No backoff is implemented for simplicity for this 
+                        * corner case.
+                        */
+                       (void) tcp_output(tp);
+               }
+               break;
+#endif /* MPTCP */
+
  #if TCPDEBUG
         if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
@@ -855,8 +1058,10 @@ tcp_timers(tp, timer)
  #endif
         dropit:
                 tcpstat.tcps_keepdrops++;
+               postevent(so, 0, EV_TIMEOUT);
+               soevent(so,
+                   (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
                 tp = tcp_drop(tp, ETIMEDOUT);
-               postevent(so_tmp, 0, EV_TIMEOUT);
                 break;
         }
         return (tp);
@@ -887,10 +1092,10 @@ tcp_remove_timer(struct tcpcb *tp)
         tp->t_flags &= ~(TF_TIMER_ONLIST);
  
         listp->entries--;
-       lck_mtx_unlock(listp->mtx);
  
         tp->tentry.le.le_next = NULL;
         tp->tentry.le.le_prev = NULL;
+       lck_mtx_unlock(listp->mtx);
  }
  
  /* Function to check if the timerlist needs to be rescheduled to run
@@ -904,16 +1109,18 @@ need_to_resched_timerlist(uint32_t runtime, uint16_t index) {
         int32_t diff;
         boolean_t is_fast;
  
-       if (runtime == 0 || index == TCPT_NONE)
+       if (index == TCPT_NONE)
                 return FALSE;
         is_fast = !(IS_TIMER_SLOW(index));
  
         /* If the list is being processed then the state of the list is in flux.
          * In this case always acquire the lock and set the state correctly.
          */
-       if (listp->running) {
+       if (listp->running)
                 return TRUE;
-       }
+
+       if (!listp->scheduled)
+               return (TRUE);
  
         diff = timer_diff(listp->runtime, 0, runtime, 0);
         if (diff <= 0) {
@@ -940,12 +1147,16 @@ tcp_sched_timerlist(uint32_t offset)
  
         lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);
  
+       offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
         listp->runtime = tcp_now + offset;
+       if (listp->runtime == 0)
+               listp->runtime++;
  
         clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ,
                 &deadline);
  
         thread_call_enter_delayed(listp->call, deadline);
+       listp->scheduled = TRUE;
  }
  
  /* Function to run the timers for a connection.
@@ -985,11 +1196,9 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
           * with another thread that can cancel or reschedule the timer that is
           * about to run. Check if we need to run anything.
           */
-       index = tp->tentry.index;
-       timer_val = tp->t_timer[index];
-
-        if (index == TCPT_NONE || tp->tentry.runtime == 0) 
+       if ((index = tp->tentry.index) == TCPT_NONE)
                 goto done;
+       timer_val = tp->t_timer[index];
  
         diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
         if (diff > 0) {
@@ -1032,8 +1241,8 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
         tp->tentry.index = lo_index;
         if (lo_index != TCPT_NONE) {
                 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
-       } else {
-               tp->tentry.runtime = 0;
+               if (tp->tentry.runtime == 0)
+                       tp->tentry.runtime++;
         }
  
         if (count > 0) {
@@ -1042,8 +1251,11 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
                         if (needtorun[i]) {
                                 tp->t_timer[i] = 0;
                                 tp = tcp_timers(tp, i);
-                               if (tp == NULL) 
+                               if (tp == NULL) {
+                                       offset = 0;
+                                       *(next_index) = TCPT_NONE;
                                         goto done;
+                               }
                         }
                 }
                 tcp_set_lotimer_index(tp);
@@ -1057,6 +1269,7 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
  done:
         if (tp != NULL && tp->tentry.index == TCPT_NONE) {
                 tcp_remove_timer(tp);
+               offset = 0;
         }
          tcp_unlock(so, 1, 0);
          return offset;
@@ -1085,7 +1298,7 @@ tcp_run_timerlist(void * arg1, void * arg2) {
         LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
                 uint32_t offset = 0;
                 uint32_t runtime = te->runtime;
-               if (TSTMP_GT(runtime, tcp_now)) {
+               if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
                         offset = timer_diff(runtime, 0, tcp_now, 0);
                         if (next_timer == 0 || offset < next_timer) {
                                 next_timer = offset;
@@ -1181,8 +1394,11 @@ tcp_run_timerlist(void * arg1, void * arg2) {
  
                 tcp_sched_timerlist(next_timer);
         } else {
-               /* No need to reschedule this timer */
-               listp->runtime = 0;
+               /*
+                * No need to reschedule this timer, but always run
+                * periodically at a much coarser granularity.
+                */
+               tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
         }
  
         listp->running = FALSE;
@@ -1199,7 +1415,7 @@ tcp_sched_timers(struct tcpcb *tp)
         struct tcptimerentry *te = &tp->tentry;
         uint16_t index = te->index;
         struct tcptimerlist *listp = &tcp_timer_list;
-       uint32_t offset = 0;
+       int32_t offset = 0;
         boolean_t is_fast;
         int list_locked = 0;
  
@@ -1217,8 +1433,8 @@ tcp_sched_timers(struct tcpcb *tp)
         }
  
         is_fast = !(IS_TIMER_SLOW(index));
-       offset = te->runtime - tcp_now;
-       if (offset == 0) {
+       offset = timer_diff(te->runtime, 0, tcp_now, 0);
+       if (offset <= 0) {
                 offset = 1;
                 tcp_timer_advanced++;
         }
@@ -1239,7 +1455,7 @@ tcp_sched_timers(struct tcpcb *tp)
                         listp->maxentries = listp->entries;
  
                 /* if the list is not scheduled, just schedule it */
-               if (listp->runtime == 0)
+               if (!listp->scheduled)
                         goto schedule;
  
         }
@@ -1261,15 +1477,22 @@ tcp_sched_timers(struct tcpcb *tp)
                         if (is_fast) {
                                 listp->pref_mode = TCP_TIMERLIST_FASTMODE;
                         } else if (listp->pref_offset == 0 ||
-                               ((int)offset) < listp->pref_offset) {
+                               offset < listp->pref_offset) {
                                 listp->pref_offset = offset;
                         }
                 } else {
-                       int32_t diff;
-                       diff = timer_diff(listp->runtime, 0, tcp_now, offset);
-                       if (diff <= 0) {
-                               /* The list is going to run before this timer */
-                               goto done;
+                       /*
+                        * The list could have got scheduled while this
+                        * thread was waiting for the lock
+                        */
+                       if (listp->scheduled) {
+                               int32_t diff;
+                               diff = timer_diff(listp->runtime, 0,
+                                   tcp_now, offset);
+                               if (diff <= 0)
+                                       goto done;
+                               else
+                                       goto schedule;
                         } else {
                                 goto schedule;
                         }
@@ -1305,8 +1528,8 @@ tcp_set_lotimer_index(struct tcpcb *tp) {
         tp->tentry.index = lo_index;
         if (lo_index != TCPT_NONE) {
                 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
-       } else {
-               tp->tentry.runtime = 0;
+               if (tp->tentry.runtime == 0)
+                       tp->tentry.runtime++;
         }
  }
  
@@ -1315,6 +1538,9 @@ tcp_check_timer_state(struct tcpcb *tp) {
  
         lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
  
+       if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
+               return;
+
         tcp_set_lotimer_index(tp);
  
         tcp_sched_timers(tp);