/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mcache.h>
#include <sys/queue.h>
#include <kern/locks.h>
#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */
#include <mach/boolean.h>

#include <net/route.h>
#include <net/if_var.h>
#include <net/ntstat.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#if INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>
#include <mach/sdt.h>
#include <netinet/mptcp_var.h>

#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))

#define VERIFY_NEXT_LINK(elm,field) do {	\
	if (LIST_NEXT((elm),field) != NULL &&	\
	    LIST_NEXT((elm),field)->field.le_prev !=	\
	    &((elm)->field.le_next))	\
		panic("Bad link elm %p next->prev != elm", (elm));	\
} while(0)

#define VERIFY_PREV_LINK(elm,field) do {	\
	if (*(elm)->field.le_prev != (elm))	\
		panic("Bad link elm %p prev->next != elm", (elm));	\
} while(0)

#define TCP_SET_TIMER_MODE(mode, i) do { \
	if (IS_TIMER_HZ_10MS(i)) \
		(mode) |= TCP_TIMERLIST_10MS_MODE; \
	else if (IS_TIMER_HZ_100MS(i)) \
		(mode) |= TCP_TIMERLIST_100MS_MODE; \
	else \
		(mode) |= TCP_TIMERLIST_500MS_MODE; \
} while(0)
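
/*
 * Illustrative note (added, not in the original source): the IS_TIMER_HZ_*
 * macros classify each timer index into a 10 ms, 100 ms or 500 ms service
 * class.  Assuming the delayed-ACK timer belongs to the 100 ms class (its
 * value, tcp_delack below, is 100 ms), TCP_SET_TIMER_MODE(mode, TCPT_DELACK)
 * would OR TCP_TIMERLIST_100MS_MODE into mode, while coarse timers such as
 * keepalive fall through to TCP_TIMERLIST_500MS_MODE.
 */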

/* Max number of times a stretch ack can be delayed on a connection */
#define	TCP_STRETCHACK_DELAY_THRESHOLD	5

/* tcp timer list */
struct tcptimerlist tcp_timer_list;

/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
struct tcptailq tcp_tw_tailq;

static int
sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, s, tt;

	tt = *(int *)oidp->oid_arg1;
	s = tt * 1000 / TCP_RETRANSHZ;

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = s * TCP_RETRANSHZ / 1000;
	if (tt < 1)
		return (EINVAL);

	*(int *)oidp->oid_arg1 = tt;
	return (0);
}
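
/*
 * Worked example (added for clarity; assumes TCP_RETRANSHZ is 1000, which
 * is consistent with tcp_delack below being TCP_RETRANSHZ/10 and documented
 * as 100 ms): the handler exports tick values as milliseconds, so writing
 * 750 through one of the sysctls below stores
 * tt = 750 * TCP_RETRANSHZ / 1000 = 750 ticks of 1 ms each.
 */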

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepcnt;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepcnt, 0, "number of times to repeat keepalive");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/*
 * Avoid DoS via TCP Robustness in Persist Condition
 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
 * by allowing a system wide maximum persistence timeout value when in
 * Zero Window Probe mode.
 *
 * Expressed in milliseconds to be consistent with other timeout-related
 * values; the TCP socket option is in seconds.
 */
u_int32_t tcp_max_persist_timeout = 0;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
	"Maximum persistence timeout for ZWP");

static int	always_keepalive = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

/*
 * This parameter determines how long the timer list will stay in fast or
 * quick mode even though all connections are idle.  In this state, the
 * timer will run more frequently anticipating new data.
 */
int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");

/*
 * See tcp_syn_backoff[] for interval values between SYN retransmits;
 * the value set below defines the number of retransmits before we
 * disable the timestamp and window scaling options during subsequent
 * SYN retransmits.  Setting it to 0 disables the dropping off of those
 * two options.
 */
static int tcp_broken_peer_syn_rxmit_thres = 7;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_broken_peer_syn_rxmit_thres, 0,
	"Number of retransmitted SYNs before "
	"TCP disables rfc1323 and rfc1644 during the rest of attempts");

/* A higher threshold on local connections for disabling RFC 1323 options */
static int tcp_broken_peer_syn_rxmit_thres_local = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0,
	"Number of retransmitted SYNs before disabling RFC 1323 "
	"options on local connections");

static int tcp_timer_advanced = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
	"Number of times one of the timers was advanced");

static int tcp_resched_timerlist = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
	"Number of times timer list was rescheduled as part of processing a packet");

int	tcp_pmtud_black_hole_detect = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0,
	"Path MTU Discovery Black Hole Detection");

int	tcp_pmtud_black_hole_mss = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0,
	"Path MTU Discovery Black Hole Detection lowered MSS");

/* performed garbage collection of "used" sockets */
static boolean_t tcp_gc_done = FALSE;

/* max idle probes */
int	tcp_maxpersistidle;

/*
 * TCP delack timer is set to 100 ms.  Since the processing of timer list
 * in fast mode will happen no faster than 100 ms, the delayed ack timer
 * will fire somewhere between 100 and 200 ms.
 */
int	tcp_delack = TCP_RETRANSHZ / 10;

#if MPTCP
/*
 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
 */
int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
#endif /* MPTCP */

static void tcp_remove_timer(struct tcpcb *tp);
static void tcp_sched_timerlist(uint32_t offset);
static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode);
static void tcp_sched_timers(struct tcpcb *tp);
static inline void tcp_set_lotimer_index(struct tcpcb *);
static void tcp_rexmt_save_state(struct tcpcb *tp);
__private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
__private_extern__ void tcp_report_stats(void);

/*
 * Macro to compare two timers.  If there is a reset of the sign bit,
 * it is safe to assume that the timer has wrapped around.  By doing
 * signed comparison, we take care of wrap around such that the value
 * with the sign bit reset is actually ahead of the other.
 */
inline int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
	return (int32_t)((t1 + toff1) - (t2 + toff2));
}
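
/*
 * Worked example (added for clarity): suppose t2 + toff2 is 0xfffffff0 and
 * t1 + toff1 has wrapped around to 0x00000010.  The unsigned difference is
 * 0x00000020; reinterpreted as int32_t it is +32, so the wrapped value is
 * correctly treated as 32 ticks ahead rather than ~4.29e9 ticks behind.
 */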

static u_int64_t tcp_last_report_time;
#define	TCP_REPORT_STATS_INTERVAL	345600 /* 4 days, in seconds */

/* Returns true if the timer is on the timer list */
#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)

/* Run the TCP timerlist at least once every hour */
#define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)

static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
static boolean_t tcp_garbage_collect(struct inpcb *, int);

/*
 * Add to tcp timewait list, delay is given in milliseconds.
 */
static void
add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	struct inpcb *inp = tp->t_inpcb;
	uint32_t timer;

	/* pcb list should be locked when we get here */
	lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	/* We may get here multiple times, so check */
	if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
		pcbinfo->ipi_twcount++;
		inp->inp_flags2 |= INP2_TIMEWAIT;

		/* Remove from global inp list */
		LIST_REMOVE(inp, inp_list);
	} else {
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
	}

	/* Compute the time at which this socket can be closed */
	timer = tcp_now + delay;

	/* We will use the TCPT_2MSL timer for tracking this delay */

	if (TIMER_IS_ON_LIST(tp))
		tcp_remove_timer(tp);
	tp->t_timer[TCPT_2MSL] = timer;

	TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
}
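
/*
 * Usage note (added; hedged): callers hand in the delay in milliseconds,
 * e.g. something like add_to_time_wait(tp, 2 * tcp_msl) when a connection
 * enters TIME_WAIT, so the close deadline computed above is simply
 * tcp_now + delay on the millisecond tick clock.
 */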

void
add_to_time_wait(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP)
		socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);

	if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
		tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
		lck_rw_lock_exclusive(pcbinfo->ipi_lock);
		tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
	}
	add_to_time_wait_locked(tp, delay);
	lck_rw_done(pcbinfo->ipi_lock);

	inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
}

/* If this is on time wait queue, remove it. */
void
tcp_remove_from_time_wait(struct inpcb *inp)
{
	struct tcpcb *tp = intotcpcb(inp);
	if (inp->inp_flags2 & INP2_TIMEWAIT)
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
}

static boolean_t
tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
	boolean_t active = FALSE;
	struct socket *so;
	struct tcpcb *tp;

	so = inp->inp_socket;
	tp = intotcpcb(inp);

	/*
	 * Skip if still in use or busy; it would have been more efficient
	 * if we were to test so_usecount against 0, but this isn't possible
	 * due to the current implementation of tcp_dropdropablreq() where
	 * overflow sockets that are eligible for garbage collection have
	 * their usecounts set to 1.
	 */
	if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
		return (TRUE);

	/* Check again under the lock */
	if (so->so_usecount > 1) {
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	}

	if (istimewait &&
	    TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
	    tp->t_state != TCPS_CLOSED) {
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);
		tcp_close(tp);
	}

	/*
	 * Overflowed socket dropped from the listening queue? Do this
	 * only if we are called to clean up the time wait slots, since
	 * tcp_dropdropablreq() considers a socket to have been fully
	 * dropped after add_to_time_wait() is finished.
	 * Also handle the case of connections getting closed by the peer
	 * while in the queue as seen with rdar://6422317
	 */
	if (so->so_usecount == 1 &&
	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
	    (so->so_head != NULL) &&
	    ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
	    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {

		if (inp->inp_state != INPCB_STATE_DEAD) {
			/* Become a regular mutex */
			lck_mtx_convert_spin(&inp->inpcb_mtx);
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		so->so_usecount--;
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (FALSE);
	}

	/*
	 * We get here because the PCB is no longer searchable
	 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
	 * (usecount is 0).  This covers all cases, including overflow
	 * sockets and those that are considered as "embryonic",
	 * i.e. created by sonewconn() in TCP input path, and have
	 * not yet been committed.  For the former, we reduce the usecount
	 * to 0 as done by the code above.  For the latter, the usecount
	 * would have reduced to 0 as part of calling soabort() when the
	 * socket is dropped at the end of tcp_input().
	 */
	if (so->so_usecount == 0) {
		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
			struct tcpcb *, tp, int32_t, TCPS_CLOSED);
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);

		/*
		 * If this tp still happens to be on the timer list,
		 * take it out
		 */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		in_pcbdispose(inp);
		return (FALSE);
	}

	lck_mtx_unlock(&inp->inpcb_mtx);
	return (TRUE);
}

/*
 * TCP garbage collector callback (inpcb_timer_func_t).
 *
 * Returns the number of pcbs that will need to be gc-ed soon,
 * returning > 0 will keep timer active.
 */
void
tcp_gc(struct inpcbinfo *ipi)
{
	struct inpcb *inp, *nxt;
	struct tcpcb *tw_tp, *tw_ntp;
#if TCPDEBUG
	int ostate;
#endif
#if KDEBUG
	static int tws_checked = 0;
#endif

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Update tcp_now here as it may get used while
	 * processing the slow timer.
	 */
	calculate_tcp_clock();

	/*
	 * Garbage collect socket/tcpcb: We need to acquire the list lock
	 * exclusively to do this
	 */

	if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
		/* don't sweat it this time; cleanup was done last time */
		if (tcp_gc_done == TRUE) {
			tcp_gc_done = FALSE;
			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
			    tws_checked, cur_tw_slot, 0, 0, 0);
			/* Lock upgrade failed, give up this round */
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
			return;
		}
		/* Upgrade failed, lost lock now take it again exclusive */
		lck_rw_lock_exclusive(ipi->ipi_lock);
	}
	tcp_gc_done = TRUE;

	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		if (tcp_garbage_collect(inp, 0))
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
	}

	/* Now cleanup the time wait ones */
	TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
		/*
		 * We check the timestamp here without holding the
		 * socket lock for better performance.  If there are
		 * any pcbs in time-wait, the timer will get rescheduled.
		 * Hence some error in this check can be tolerated.
		 *
		 * Sometimes a socket on time-wait queue can be closed if
		 * 2MSL timer expired but the application still has a
		 * usecount on it.
		 */
		if (tw_tp->t_state == TCPS_CLOSED ||
		    TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
			if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
				atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
		}
	}

	/* take into account pcbs that are still in time_wait_slots */
	atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);

	lck_rw_done(ipi->ipi_lock);

	/* Clean up the socache while we are here */
	if (so_cache_timer())
		atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
	    cur_tw_slot, 0, 0, 0);

	return;
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(tp)
	struct tcpcb *tp;
{
	register int i;

	tcp_remove_timer(tp);
	for (i = 0; i < TCPT_NTIMERS; i++)
		tp->t_timer[i] = 0;
	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = TCPT_NONE;
}

int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
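
/*
 * Sanity check of the constant above (added note): the sum of tcp_backoff[]
 * is 1 + 2 + 4 + 8 + 16 + 32 + 7 * 64 = 63 + 448 = 511, which matches the
 * tcp_totbackoff value used by the persist-timer drop test below.
 */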

static void tcp_rexmt_save_state(struct tcpcb *tp)
{
	u_int32_t fsize;
	if (TSTMP_SUPPORTED(tp)) {
		/*
		 * Since timestamps are supported on the connection,
		 * we can do recovery as described in rfc 4015.
		 */
		fsize = tp->snd_max - tp->snd_una;
		tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
		tp->snd_recover_prev = tp->snd_recover;
	} else {
		/*
		 * Timestamp option is not supported on this connection.
		 * Record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
	}
	tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
	tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
	tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
}

/*
 * Revert to the older segment size if there is an indication that PMTU
 * blackhole detection was not needed.
 */
void tcp_pmtud_revert_segment_size(struct tcpcb *tp)
{
	int32_t optlen;

	VERIFY(tp->t_pmtud_saved_maxopd > 0);
	tp->t_flags |= TF_PMTUD;
	tp->t_flags &= ~TF_BLACKHOLE;
	optlen = tp->t_maxopd - tp->t_maxseg;
	tp->t_maxopd = tp->t_pmtud_saved_maxopd;
	tp->t_maxseg = tp->t_maxopd - optlen;
	/*
	 * Reset the slow-start flight size as it
	 * may depend on the new MSS
	 */
	if (CC_ALGO(tp)->cwnd_init != NULL)
		CC_ALGO(tp)->cwnd_init(tp);
	tp->t_pmtud_start_ts = 0;
	tcpstat.tcps_pmtudbh_reverted++;
}
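
/*
 * Worked example (added for clarity): if t_pmtud_saved_maxopd is 1460 and
 * the connection carries 12 bytes of TCP options per segment (e.g. the
 * padded timestamp option), optlen = t_maxopd - t_maxseg stays 12 across
 * the revert, so restoring t_maxopd to 1460 yields
 * t_maxseg = 1460 - 12 = 1448.
 */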

/*
 * TCP timer processing.
 */
struct tcpcb *
tcp_timers(tp, timer)
	register struct tcpcb *tp;
	int timer;
{
	int32_t rexmt, optlen = 0, idle_time = 0;
	struct socket *so;
	struct tcptemp *t_template;
#if TCPDEBUG
	int ostate;
#endif

#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */

	so = tp->t_inpcb->inp_socket;
	idle_time = tcp_now - tp->t_rcvtime;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
	 * delete connection control block.
	 * Otherwise (this case shouldn't happen), check again in a bit;
	 * we keep the socket in the main list in that case.
	 */
	case TCPT_2MSL:
		tcp_free_sackholes(tp);
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_state != TCPS_FIN_WAIT_2 &&
		    ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
			    (u_int32_t)TCP_CONN_KEEPINTVL(tp));
		} else {
			tp = tcp_close(tp);
			return(tp);
		}
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		/*
		 * Drop a connection in the retransmit timer
		 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
		 *    times
		 * 2. If the time spent in this retransmission episode is
		 *    more than the time limit set with TCP_RXT_CONNDROPTIME
		 *    socket option
		 * 3. If TCP_RXT_FINDROP socket option was set and
		 *    we have already retransmitted the FIN 3 times without
		 *    receiving an ack
		 */
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
		    (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
		    (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
		    ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
		    (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4)) {
			if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
				tcpstat.tcps_rxtfindrop++;
			} else {
				tcpstat.tcps_timeoutdrop++;
			}
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);

			break;
		}

		tcpstat.tcps_rexmttimeo++;

		if (tp->t_rxtshift == 1 &&
		    tp->t_state == TCPS_ESTABLISHED) {
			/* Set the time at which retransmission started. */
			tp->t_rxtstart = tcp_now;

			/*
			 * if this is the first retransmit timeout, save
			 * the state so that we can recover if the timeout
			 * is spurious.
			 */
			tcp_rexmt_save_state(tp);
		}
#if MPTCP
		if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
		    (tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
			mptcp_act_on_txfail(so);
		}
#endif /* MPTCP */

		if (tp->t_adaptive_wtimo > 0 &&
		    tp->t_rxtshift > tp->t_adaptive_wtimo &&
		    TCPS_HAVEESTABLISHED(tp->t_state)) {
			/* Send an event to the application */
			soevent(so,
			    (SO_FILT_HINT_LOCKED|
			    SO_FILT_HINT_ADAPTIVE_WTIMO));
		}

		/*
		 * If this is a retransmit timeout after PTO, the PTO
		 * was not effective
		 */
		if (tp->t_flagsext & TF_SENT_TLPROBE) {
			tp->t_flagsext &= ~(TF_SENT_TLPROBE);
			tcpstat.tcps_rto_after_pto++;
		}

		if (tp->t_flagsext & TF_DELAY_RECOVERY) {
			/*
			 * Retransmit timer fired before entering recovery
			 * on a connection with packet re-ordering.  This
			 * suggests that the reordering metrics computed
			 * are not accurate.
			 */
			tp->t_reorderwin = 0;
			tp->t_timer[TCPT_DELAYFR] = 0;
			tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
		}

		if (tp->t_state == TCPS_SYN_SENT) {
			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
			tp->t_stat.synrxtshift = tp->t_rxtshift;
		} else {
			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
		}

		TCPT_RANGESET(tp->t_rxtcur, rexmt,
		    tp->t_rttmin, TCPTV_REXMTMAX,
		    TCP_ADD_REXMTSLOP(tp));
		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
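
		/*
		 * Worked example (added for clarity): on the third timeout
		 * of an established connection (t_rxtshift == 3,
		 * tcp_backoff[3] == 8), an RTO estimate of 500 ms is backed
		 * off to rexmt = 500 * 8 = 4000 ms before TCPT_RANGESET
		 * clamps it between t_rttmin and TCPTV_REXMTMAX (plus
		 * TCP_ADD_REXMTSLOP(tp)).
		 */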

		if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
			goto fc_output;

		tcp_free_sackholes(tp);
		/*
		 * Check for potential Path MTU Discovery Black Hole
		 */
		if (tcp_pmtud_black_hole_detect &&
		    !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
		    (tp->t_state == TCPS_ESTABLISHED)) {
			if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT))
			    == (TF_PMTUD|TF_MAXSEGSNT)) &&
			    (tp->t_rxtshift == 2)) {
				/*
				 * Enter Path MTU Black-hole Detection mechanism:
				 * - Disable Path MTU Discovery (IP "DF" bit).
				 * - Reduce MTU to lower value than what we
				 *   negotiated with the peer.
				 */
				/* Disable Path MTU Discovery for now */
				tp->t_flags &= ~TF_PMTUD;
				/* Record that we may have found a black hole */
				tp->t_flags |= TF_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				/* Keep track of previous MSS */
				tp->t_pmtud_saved_maxopd = tp->t_maxopd;
				tp->t_pmtud_start_ts = tcp_now;
				if (tp->t_pmtud_start_ts == 0)
					tp->t_pmtud_start_ts++;
				/* Reduce the MSS to intermediary value */
				if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
					tp->t_maxopd = tcp_pmtud_black_hole_mss;
				} else {
					tp->t_maxopd = /* use the default MSS */
#if INET6
					    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
					    tcp_mssdflt;
				}
				tp->t_maxseg = tp->t_maxopd - optlen;

				/*
				 * Reset the slow-start flight size
				 * as it may depend on the new MSS
				 */
				if (CC_ALGO(tp)->cwnd_init != NULL)
					CC_ALGO(tp)->cwnd_init(tp);
			}
			/*
			 * If further retransmissions are still
			 * unsuccessful with a lowered MTU, maybe this
			 * isn't a Black Hole and we restore the previous
			 * MSS and blackhole detection flags.
			 */
			else {
				if ((tp->t_flags & TF_BLACKHOLE) &&
				    (tp->t_rxtshift > 4)) {
					tcp_pmtud_revert_segment_size(tp);
				}
			}
		}

		/*
		 * Disable rfc1323 and rfc1644 if we haven't got any
		 * response to our SYN (after we reach the threshold)
		 * to work-around some broken terminal servers (most of
		 * which have hopefully been retired) that have bad VJ
		 * header compression code which trashes TCP segments
		 * containing unknown-to-them TCP options.
		 * Do this only on non-local connections.
		 */
		if (tp->t_state == TCPS_SYN_SENT &&
		    ((!(tp->t_flags & TF_LOCAL) &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) ||
		    ((tp->t_flags & TF_LOCAL) &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local)))
			tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);

		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#if INET6
			if (isipv6)
				in6_losing(tp->t_inpcb);
			else
#endif /* INET6 */
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * Note:  We overload snd_recover to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_recover = tp->snd_max;
		/*
		 * Force a segment to be sent.
		 */
		tp->t_flags |= TF_ACKNOW;

		/* If timing a segment in this window, stop the timer */
		tp->t_rtttime = 0;

		if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1)
			tcpstat.tcps_tailloss_rto++;

		/*
		 * RFC 5681 says: when a TCP sender detects segment loss
		 * using retransmit timer and the given segment has already
		 * been retransmitted by way of the retransmission timer at
		 * least once, the value of ssthresh is held constant
		 */
		if (tp->t_rxtshift == 1 &&
		    CC_ALGO(tp)->after_timeout != NULL)
			CC_ALGO(tp)->after_timeout(tp);

		EXIT_FASTRECOVERY(tp);

		/* CWR notifications are to be sent on new data right after
		 * RTOs, Fast Retransmits and ECE notification receipts.
		 */
		if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
			tp->ecn_flags |= TE_SENDCWR;
		}
fc_output:
		tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);

		(void) tcp_output(tp);
		break;

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 *
		 * Drop the connection if we reached the maximum allowed time for
		 * Zero Window Probes without a non-zero update from the peer.
		 * See rdar://5805356
		 */
		if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (idle_time >= tcp_maxpersistidle ||
		    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
		    ((tp->t_persist_stop != 0) &&
		    TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
			tcpstat.tcps_persistdrop++;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		tcp_setpersist(tp);
		tp->t_flagsext |= TF_FORCE;
		(void) tcp_output(tp);
		tp->t_flagsext &= ~TF_FORCE;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
#if MPTCP
		/*
		 * Regular TCP connections do not send keepalives after
		 * closing.  MPTCP must not also, after sending Data FINs.
		 */
		struct mptcb *mp_tp = tp->t_mptcb;
		if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
		    (tp->t_state > TCPS_ESTABLISHED)) {
			goto dropit;
		} else if (mp_tp != NULL) {
			if ((mptcp_ok_to_keepalive(mp_tp) == 0))
				goto dropit;
		}
#endif /* MPTCP */
		if (tp->t_state < TCPS_ESTABLISHED)
			goto dropit;
		if ((always_keepalive ||
		    (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
		    (tp->t_flagsext & TF_DETECT_READSTALL)) &&
		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
			if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
			t_template = tcp_maketemplate(tp);
			if (t_template) {
				struct inpcb *inp = tp->t_inpcb;
				struct tcp_respond_args tra;

				bzero(&tra, sizeof(tra));
				tra.nocell = INP_NO_CELLULAR(inp);
				tra.noexpensive = INP_NO_EXPENSIVE(inp);
				tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
				if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
					tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
				else
					tra.ifscope = IFSCOPE_NONE;
				tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
				(void) m_free(dtom(t_template));
				if (tp->t_flagsext & TF_DETECT_READSTALL)
					tp->t_rtimo_probes++;
			}
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
			    TCP_CONN_KEEPINTVL(tp));
		} else {
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
			    TCP_CONN_KEEPIDLE(tp));
		}
		if (tp->t_flagsext & TF_DETECT_READSTALL) {
			/*
			 * The keep alive packets sent to detect a read
			 * stall did not get a response from the
			 * peer.  Generate more keep-alives to confirm this.
			 * If the number of probes sent reaches the limit,
			 * generate an event.
			 */
			if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
				/* Generate an event */
				soevent(so,
				    (SO_FILT_HINT_LOCKED|
				    SO_FILT_HINT_ADAPTIVE_RTIMO));
				tcp_keepalive_reset(tp);
			} else {
				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
				    tp, TCP_REXMTVAL(tp));
			}
		}
		break;
	case TCPT_DELACK:
		if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_timer[TCPT_DELACK] = 0;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * If delayed ack timer fired while stretching
			 * acks, count the number of times the streaming
			 * detection was not correct.  If this exceeds a
			 * threshold, disable stretch ack on this
			 * connection.
			 *
			 * Also, go back to acking every other packet.
			 */
			if ((tp->t_flags & TF_STRETCHACK)) {
				if (tp->t_unacksegs > 1 &&
				    tp->t_unacksegs < maxseg_unacked)
					tp->t_stretchack_delayed++;

				if (tp->t_stretchack_delayed >
				    TCP_STRETCHACK_DELAY_THRESHOLD) {
					tp->t_flagsext |= TF_DISABLE_STRETCHACK;
					/*
					 * Note the time at which stretch
					 * ack was disabled automatically
					 */
					tp->rcv_nostrack_ts = tcp_now;
					tcpstat.tcps_nostretchack++;
					tp->t_stretchack_delayed = 0;
				}
				tcp_reset_stretch_ack(tp);
			}

			/*
			 * If we are measuring inter packet arrival jitter
			 * for throttling a connection, this delayed ack
			 * might be the reason for accumulating some
			 * jitter.  So let's restart the measurement.
			 */
			CLEAR_IAJ_STATE(tp);

			tcpstat.tcps_delack++;
			(void) tcp_output(tp);
		}
		break;

#if MPTCP
	case TCPT_JACK_RXMT:
		if ((tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
		    (tp->t_mpflags & TMPF_JOINED_FLOW)) {
			if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
				tcpstat.tcps_timeoutdrop++;
				postevent(so, 0, EV_TIMEOUT);
				soevent(so,
				    (SO_FILT_HINT_LOCKED|
				    SO_FILT_HINT_TIMEOUT));
				tp = tcp_drop(tp, tp->t_softerror ?
				    tp->t_softerror : ETIMEDOUT);
				break;
			}
			tcpstat.tcps_join_rxmts++;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * No backoff is implemented for simplicity for this
			 * corner case.
			 */
			(void) tcp_output(tp);
		}
		break;
#endif /* MPTCP */

	case TCPT_PTO:
	{
		tcp_seq old_snd_nxt;
		int32_t snd_len;
		boolean_t rescue_rxt = FALSE;

		tp->t_flagsext &= ~(TF_SENT_TLPROBE);

		/*
		 * Check if the connection is in the right state to
		 * send a probe
		 */
		if (tp->t_state != TCPS_ESTABLISHED ||
		    tp->t_rxtshift > 0 || tp->snd_max == tp->snd_una ||
		    !SACK_ENABLED(tp) || TAILQ_EMPTY(&tp->snd_holes) ||
		    (IN_FASTRECOVERY(tp) &&
		    (SEQ_GEQ(tp->snd_fack, tp->snd_recover) ||
		    SEQ_GT(tp->snd_nxt, tp->sack_newdata))))
			break;

		tcpstat.tcps_pto++;

		/* If timing a segment in this window, stop the timer */
		tp->t_rtttime = 0;

		if (IN_FASTRECOVERY(tp)) {
			/*
			 * Send a probe to detect tail loss in a
			 * recovery window when the connection is in
			 * fast_recovery.
			 */
			old_snd_nxt = tp->snd_nxt;
			rescue_rxt = TRUE;
			VERIFY(SEQ_GEQ(tp->snd_fack, tp->snd_una));
			snd_len = min((tp->snd_recover - tp->snd_fack),
			    tp->t_maxseg);
			tp->snd_nxt = tp->snd_recover - snd_len;
			tcpstat.tcps_pto_in_recovery++;
			tcp_ccdbg_trace(tp, NULL, TCP_CC_TLP_IN_FASTRECOVERY);
		} else {
			/*
			 * If there is no new data to send or if the
			 * connection is limited by receive window then
			 * retransmit the last segment, otherwise send
			 * new data.
			 */
			snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
			    - (tp->snd_max - tp->snd_una);
			if (snd_len > 0) {
				tp->snd_nxt = tp->snd_max;
			} else {
				snd_len = min((tp->snd_max - tp->snd_una),
				    tp->t_maxseg);
				tp->snd_nxt = tp->snd_max - snd_len;
			}
		}

		/* Note that tail loss probe is being sent */
		tp->t_flagsext |= TF_SENT_TLPROBE;
		tp->t_tlpstart = tcp_now;

		tp->snd_cwnd += tp->t_maxseg;
		(void) tcp_output(tp);
		tp->snd_cwnd -= tp->t_maxseg;
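
		/*
		 * Note (added; hedged): the probe is sent by temporarily
		 * inflating snd_cwnd by one t_maxseg around tcp_output()
		 * and then deflating it, which lets exactly one
		 * tail-loss-probe segment out even when the congestion
		 * window is fully used.
		 */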

		tp->t_tlphighrxt = tp->snd_nxt;

		/*
		 * If a tail loss probe was sent after entering recovery,
		 * restore the old snd_nxt value so that other packets
		 * will get retransmitted correctly.
		 */
		if (rescue_rxt)
			tp->snd_nxt = old_snd_nxt;
		break;
	}
	case TCPT_DELAYFR:
		tp->t_flagsext &= ~TF_DELAY_RECOVERY;

		/*
		 * Don't do anything if one of the following is true:
		 * - the connection is already in recovery
		 * - sequence until snd_recover has been acknowledged.
		 * - retransmit timeout has fired
		 */
		if (IN_FASTRECOVERY(tp) ||
		    SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
		    tp->t_rxtshift > 0)
			break;

		VERIFY(SACK_ENABLED(tp));
		if (CC_ALGO(tp)->pre_fr != NULL)
			CC_ALGO(tp)->pre_fr(tp);
		ENTER_FASTRECOVERY(tp);
		if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)
			tp->ecn_flags |= TE_SENDCWR;

		tp->t_timer[TCPT_REXMT] = 0;
		tcpstat.tcps_sack_recovery_episode++;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_maxseg;
		tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
		(void) tcp_output(tp);
		break;
	dropit:
		tcpstat.tcps_keepdrops++;
		postevent(so, 0, EV_TIMEOUT);
		soevent(so,
		    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}
#if TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	return (tp);
}

/* Remove a timer entry from timer list */
void
tcp_remove_timer(struct tcpcb *tp)
{
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	if (!(TIMER_IS_ON_LIST(tp))) {
		return;
	}
	lck_mtx_lock(listp->mtx);

	/* Check if pcb is on timer list again after acquiring the lock */
	if (!(TIMER_IS_ON_LIST(tp))) {
		lck_mtx_unlock(listp->mtx);
		return;
	}

	if (listp->next_te != NULL && listp->next_te == &tp->tentry)
		listp->next_te = LIST_NEXT(&tp->tentry, le);

	LIST_REMOVE(&tp->tentry, le);
	tp->t_flags &= ~(TF_TIMER_ONLIST);

	listp->entries--;

	tp->tentry.le.le_next = NULL;
	tp->tentry.le.le_prev = NULL;
	lck_mtx_unlock(listp->mtx);
}

/*
 * Function to check if the timerlist needs to be rescheduled to run
 * the timer entry correctly.  Basically, this is to check if we can avoid
 * taking the list lock.
 */
static boolean_t
need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
{
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t diff;

	/*
	 * If the list is being processed then the state of the list is
	 * in flux.  In this case always acquire the lock and set the state
	 * correctly.
	 */
	if (listp->running)
		return (TRUE);

	if (!listp->scheduled)
		return (TRUE);

	diff = timer_diff(listp->runtime, 0, runtime, 0);
	if (diff <= 0) {
		/* The list is going to run before this timer */
		return (FALSE);
	} else {
		if (mode & TCP_TIMERLIST_10MS_MODE) {
			if (diff <= TCP_TIMER_10MS_QUANTUM)
				return (FALSE);
		} else if (mode & TCP_TIMERLIST_100MS_MODE) {
			if (diff <= TCP_TIMER_100MS_QUANTUM)
				return (FALSE);
		} else {
			if (diff <= TCP_TIMER_500MS_QUANTUM)
				return (FALSE);
		}
	}
	return (TRUE);
}
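
/*
 * Worked example (added for clarity): if the list is already scheduled to
 * fire 80 ms after the new timer's runtime and the timer runs in 100 ms
 * mode, diff (80) is within the 100 ms quantum, so the existing schedule
 * is close enough and no reschedule is needed.
 */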

void
tcp_sched_timerlist(uint32_t offset)
{
	uint64_t deadline = 0;
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);

	offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
	listp->runtime = tcp_now + offset;
	if (listp->runtime == 0) {
		listp->runtime++;
		offset++;
	}

	clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);
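
	/*
	 * Note (added; hedged): offset is in TCP_RETRANSHZ (millisecond)
	 * ticks; passing USEC_PER_SEC (1,000,000) as the nanoseconds-per-unit
	 * scale factor makes each unit 1,000,000 ns == 1 ms, so the absolute
	 * deadline matches listp->runtime computed above.
	 */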

	thread_call_enter_delayed(listp->call, deadline);
	listp->scheduled = TRUE;
}

/*
 * Function to run the timers for a connection.
 *
 * Returns the offset of next timer to be run for this connection which
 * can be used to reschedule the timerlist.
 *
 * te_mode is an out parameter that indicates the modes of active
 * timers for this connection.
 */
u_int32_t
tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode) {
	struct socket *so;
	u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
	u_int32_t timer_val, offset = 0, lo_timer = 0;
	int32_t diff;
	boolean_t needtorun[TCPT_NTIMERS];
	int count = 0;

	VERIFY(tp != NULL);
	bzero(needtorun, sizeof(needtorun));
	*te_mode = 0;

	tcp_lock(tp->t_inpcb->inp_socket, 1, 0);

	so = tp->t_inpcb->inp_socket;
	/* Release the want count on inp */
	if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
	    == WNT_STOPUSING) {
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		/* Looks like the TCP connection got closed while we
		 * were waiting for the lock.  Done.
		 */
		goto done;
	}

	/*
	 * Since the timer thread needs to wait for tcp lock, it may race
	 * with another thread that can cancel or reschedule the timer
	 * that is about to run.  Check if we need to run anything.
	 */
	if ((index = tp->tentry.index) == TCPT_NONE)
		goto done;

	timer_val = tp->t_timer[index];

	diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
	if (diff > 0) {
		if (tp->tentry.index != TCPT_NONE) {
			offset = diff;
			*(te_mode) = tp->tentry.mode;
		}
		goto done;
	}

	tp->t_timer[index] = 0;
	if (timer_val > 0) {
		tp = tcp_timers(tp, index);
		if (tp == NULL)
			goto done;
	}

	/*
	 * Check if there are any other timers that need to be run.
	 * While doing it, adjust the timer values wrt tcp_now.
	 */
	tp->tentry.mode = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			diff = timer_diff(tp->tentry.timer_start,
			    tp->t_timer[i], tcp_now, 0);
			if (diff <= 0) {
				needtorun[i] = TRUE;
				count++;
			} else {
				tp->t_timer[i] = diff;
				needtorun[i] = FALSE;
				if (lo_timer == 0 || diff < lo_timer) {
					lo_timer = diff;
					lo_index = i;
				}
				TCP_SET_TIMER_MODE(tp->tentry.mode, i);
			}
		}
	}

	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = lo_index;
	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);

	if (tp->tentry.index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start +
		    tp->t_timer[tp->tentry.index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}

	if (count > 0) {
		/* run any other timers outstanding at this time. */
		for (i = 0; i < TCPT_NTIMERS; ++i) {
			if (needtorun[i]) {
				tp->t_timer[i] = 0;
				tp = tcp_timers(tp, i);
				if (tp == NULL) {
					offset = 0;
					*(te_mode) = 0;
					goto done;
				}
			}
		}
		tcp_set_lotimer_index(tp);
	}

	if (tp->tentry.index < TCPT_NONE) {
		offset = tp->t_timer[tp->tentry.index];
		*(te_mode) = tp->tentry.mode;
	}

done:
	if (tp != NULL && tp->tentry.index == TCPT_NONE) {
		tcp_remove_timer(tp);
		offset = 0;
	}

	tcp_unlock(so, 1, 0);
	return (offset);
}

void
tcp_run_timerlist(void * arg1, void * arg2) {
#pragma unused(arg1, arg2)
	struct tcptimerentry *te, *next_te;
	struct tcptimerlist *listp = &tcp_timer_list;
	struct tcpcb *tp;
	uint32_t next_timer = 0; /* offset of the next timer on the list */
	u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */
	u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
	uint32_t active_count = 0;

	calculate_tcp_clock();

	lck_mtx_lock(listp->mtx);

	listp->running = TRUE;

	LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
		uint32_t offset = 0;
		uint32_t runtime = te->runtime;
		if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
			offset = timer_diff(runtime, 0, tcp_now, 0);
			if (next_timer == 0 || offset < next_timer) {
				next_timer = offset;
			}
			list_mode |= te->mode;
			continue;
		}

		tp = TIMERENTRY_TO_TP(te);

		/*
		 * Acquire an inp wantcnt on the inpcb so that the socket
		 * won't get detached even if tcp_close is called
		 */
		if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
		    == WNT_STOPUSING) {
			/*
			 * Somehow this pcb went into dead state while
			 * on the timer list, just take it off the list.
			 * Since the timer list entry pointers are
			 * protected by the timer list lock, we can
			 * do it here without the socket lock.
			 */
			if (TIMER_IS_ON_LIST(tp)) {
				tp->t_flags &= ~(TF_TIMER_ONLIST);
				LIST_REMOVE(&tp->tentry, le);
				listp->entries--;

				tp->tentry.le.le_next = NULL;
				tp->tentry.le.le_prev = NULL;
			}
			continue;
		}
		active_count++;

		/*
		 * Store the next timerentry pointer before releasing the
		 * list lock.  If that entry has to be removed when we
		 * release the lock, this pointer will be updated to the
		 * element after that.
		 */
		listp->next_te = next_te;

		VERIFY_NEXT_LINK(&tp->tentry, le);
		VERIFY_PREV_LINK(&tp->tentry, le);

		lck_mtx_unlock(listp->mtx);

		offset = tcp_run_conn_timer(tp, &te_mode);

		lck_mtx_lock(listp->mtx);

		next_te = listp->next_te;
		listp->next_te = NULL;

		if (offset > 0 && te_mode != 0) {
			list_mode |= te_mode;

			if (next_timer == 0 || offset < next_timer)
				next_timer = offset;
		}
	}

	if (!LIST_EMPTY(&listp->lhead)) {
		u_int16_t next_mode = 0;
		if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
		    (listp->pref_mode & TCP_TIMERLIST_10MS_MODE))
			next_mode = TCP_TIMERLIST_10MS_MODE;
		else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
		    (listp->pref_mode & TCP_TIMERLIST_100MS_MODE))
			next_mode = TCP_TIMERLIST_100MS_MODE;
		else
			next_mode = TCP_TIMERLIST_500MS_MODE;

		if (next_mode != TCP_TIMERLIST_500MS_MODE) {
			listp->idleruns = 0;
		} else {
			/*
			 * the next required mode is slow mode, but if
			 * the last one was a faster mode and we did not
			 * have enough idle runs, repeat the last mode.
			 *
			 * We try to keep the timer list in fast mode for
			 * some idle time in expectation of new data.
			 */
			if (listp->mode != next_mode &&
			    listp->idleruns < timer_fastmode_idlemax) {
				listp->idleruns++;
				next_mode = listp->mode;
				next_timer = TCP_TIMER_100MS_QUANTUM;
			} else {
				listp->idleruns = 0;
			}
		}
		listp->mode = next_mode;
		if (listp->pref_offset != 0)
			next_timer = min(listp->pref_offset, next_timer);

		if (listp->mode == TCP_TIMERLIST_500MS_MODE)
			next_timer = max(next_timer,
			    TCP_TIMER_500MS_QUANTUM);

		tcp_sched_timerlist(next_timer);
	} else {
		/*
		 * No need to reschedule this timer, but always run
		 * periodically at a much higher granularity.
		 */
		tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
	}

	listp->running = FALSE;
	listp->pref_mode = 0;
	listp->pref_offset = 0;

	lck_mtx_unlock(listp->mtx);
}

/*
 * Function to check if the timerlist needs to be rescheduled to run this
 * connection's timers correctly.
 */
void
tcp_sched_timers(struct tcpcb *tp)
{
	struct tcptimerentry *te = &tp->tentry;
	u_int16_t index = te->index;
	u_int16_t mode = te->mode;
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t offset = 0;
	boolean_t list_locked = FALSE;

	if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
		/* Just return without adding the dead pcb to the list */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}
		return;
	}

	if (index == TCPT_NONE) {
		/* Nothing to run */
		tcp_remove_timer(tp);
		return;
	}

	/*
	 * compute the offset at which the next timer for this connection
	 * has to run.
	 */
	offset = timer_diff(te->runtime, 0, tcp_now, 0);
	if (offset <= 0) {
		offset = 1;
		tcp_timer_advanced++;
	}

	if (!TIMER_IS_ON_LIST(tp)) {
		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = TRUE;
		}

		LIST_INSERT_HEAD(&listp->lhead, te, le);
		tp->t_flags |= TF_TIMER_ONLIST;

		listp->entries++;
		if (listp->entries > listp->maxentries)
			listp->maxentries = listp->entries;

		/* if the list is not scheduled, just schedule it */
		if (!listp->scheduled)
			goto schedule;
	}

	/*
	 * Timer entry is currently on the list, check if the list needs
	 * to be rescheduled.
	 */
	if (need_to_resched_timerlist(te->runtime, mode)) {
		tcp_resched_timerlist++;

		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = TRUE;
		}

		VERIFY_NEXT_LINK(te, le);
		VERIFY_PREV_LINK(te, le);

		if (listp->running) {
			listp->pref_mode |= mode;
			if (listp->pref_offset == 0 ||
			    offset < listp->pref_offset) {
				listp->pref_offset = offset;
			}
		} else {
			/*
			 * The list could have got rescheduled while
			 * this thread was waiting for the lock
			 */
			if (listp->scheduled) {
				int32_t diff;
				diff = timer_diff(listp->runtime, 0,
				    tcp_now, offset);
				if (diff <= 0)
					goto done;
				else
					goto schedule;
			} else {
				goto schedule;
			}
		}
	}
	goto done;

schedule:
	/*
	 * Since a connection with timers is getting scheduled, the timer
	 * list moves from idle to active state and that is why idleruns is
	 * reset
	 */
	if (mode & TCP_TIMERLIST_10MS_MODE) {
		listp->mode = TCP_TIMERLIST_10MS_MODE;
		listp->idleruns = 0;
		offset = min(offset, TCP_TIMER_10MS_QUANTUM);
	} else if (mode & TCP_TIMERLIST_100MS_MODE) {
		if (listp->mode > TCP_TIMERLIST_100MS_MODE)
			listp->mode = TCP_TIMERLIST_100MS_MODE;
		listp->idleruns = 0;
		offset = min(offset, TCP_TIMER_100MS_QUANTUM);
	}
	tcp_sched_timerlist(offset);

done:
	if (list_locked)
		lck_mtx_unlock(listp->mtx);

	return;
}

static inline void
tcp_set_lotimer_index(struct tcpcb *tp) {
	uint16_t i, lo_index = TCPT_NONE, mode = 0;
	uint32_t lo_timer = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			TCP_SET_TIMER_MODE(mode, i);
			if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
				lo_timer = tp->t_timer[i];
				lo_index = i;
			}
		}
	}
	tp->tentry.index = lo_index;
	tp->tentry.mode = mode;
	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);

	if (tp->tentry.index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start
		    + tp->t_timer[tp->tentry.index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}
}

void
tcp_check_timer_state(struct tcpcb *tp) {

	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
		return;

	tcp_set_lotimer_index(tp);

	tcp_sched_timers(tp);
	return;
}

__private_extern__ void
tcp_report_stats(void)
{
	struct nstat_sysinfo_data data;
	struct sockaddr_in dst;
	struct sockaddr_in6 dst6;
	struct rtentry *rt = NULL;
	u_int64_t var, uptime;

#define	stat	data.u.tcp_stats
	if (((uptime = net_uptime()) - tcp_last_report_time) <
	    TCP_REPORT_STATS_INTERVAL)
		return;

	tcp_last_report_time = uptime;

	bzero(&data, sizeof(data));
	data.flags = NSTAT_SYSINFO_TCP_STATS;

	bzero(&dst, sizeof(dst));
	dst.sin_len = sizeof(dst);
	dst.sin_family = AF_INET;

	/* ipv4 avg rtt */
	lck_mtx_lock(rnh_lock);
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
	    rt_tables[AF_INET], IFSCOPE_NONE);
	lck_mtx_unlock(rnh_lock);
	if (rt != NULL) {
		RT_LOCK(rt);
		if (rt_primary_default(rt, rt_key(rt)) &&
		    rt->rt_stats != NULL) {
			stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
		}
		RT_UNLOCK(rt);
		rtfree(rt);
		rt = NULL;
	}

	/* ipv6 avg rtt */
	bzero(&dst6, sizeof(dst6));
	dst6.sin6_len = sizeof(dst6);
	dst6.sin6_family = AF_INET6;

	lck_mtx_lock(rnh_lock);
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
	    rt_tables[AF_INET6], IFSCOPE_NONE);
	lck_mtx_unlock(rnh_lock);
	if (rt != NULL) {
		RT_LOCK(rt);
		if (rt_primary_default(rt, rt_key(rt)) &&
		    rt->rt_stats != NULL) {
			stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
		}
		RT_UNLOCK(rt);
		rtfree(rt);
		rt = NULL;
	}

	/* send packet loss rate, shift by 10 for precision */
	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
		var = tcpstat.tcps_sndrexmitpack << 10;
		stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
	}

	/* recv packet loss rate, shift by 10 for precision */
	if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
		var = tcpstat.tcps_recovered_pkts << 10;
		stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
	}

	/* RTO after tail loss, shift by 10 for precision */
	if (tcpstat.tcps_sndrexmitpack > 0 &&
	    tcpstat.tcps_tailloss_rto > 0) {
		var = tcpstat.tcps_tailloss_rto << 10;
		stat.send_tlrto_rate =
		    (var * 100) / tcpstat.tcps_sndrexmitpack;
	}

	/* packet reordering */
	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
		var = tcpstat.tcps_reordered_pkts << 10;
		stat.send_reorder_rate =
		    (var * 100) / tcpstat.tcps_sndpack;
	}

	nstat_sysinfo_send_data(&data);

#undef	stat
}