/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mcache.h>
#include <sys/queue.h>
#include <kern/locks.h>
#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */
#include <mach/boolean.h>

#include <net/route.h>
#include <net/if_var.h>
#include <net/ntstat.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#if INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_cache.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>
#include <mach/sdt.h>
#include <netinet/mptcp_var.h>

/* Max number of times a stretch ack can be delayed on a connection */
#define	TCP_STRETCHACK_DELAY_THRESHOLD	5

/*
 * If the host processor has been sleeping for too long, this is the threshold
 * used to avoid sending stale retransmissions.
 */
#define	TCP_SLEEP_TOO_LONG	(10 * 60 * 1000) /* 10 minutes in ms */

/* tcp timer list */
struct tcptimerlist tcp_timer_list;

/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
struct tcptailq tcp_tw_tailq;

static int
sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, s, tt;

	tt = *(int *)oidp->oid_arg1;
	s = tt * 1000 / TCP_RETRANSHZ;

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = s * TCP_RETRANSHZ / 1000;
	if (tt < 1)
		return (EINVAL);

	*(int *)oidp->oid_arg1 = tt;
	return (0);
}
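
/*
 * Added note: the handler above stores timeouts internally in
 * TCP_RETRANSHZ ticks but exposes them to sysctl in milliseconds.
 * With TCP_RETRANSHZ at its usual value of 1000 (one tick per
 * millisecond, per tcp_timer.h) the conversion is an identity, e.g.
 * writing 7200000 to net.inet.tcp.keepidle stores 7200000 ticks
 * (two hours); the EINVAL check rejects values below one tick.
 */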

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepcnt;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepcnt, 0, "number of times to repeat keepalive");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/*
 * Avoid DoS via TCP Robustness in Persist Condition
 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
 * by allowing a system wide maximum persistence timeout value when in
 * Zero Window Probe mode.
 *
 * Expressed in milliseconds to be consistent with other timeout-related
 * values; the TCP socket option is in seconds.
 */
u_int32_t tcp_max_persist_timeout = 0;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
	"Maximum persistence timeout for ZWP");

static int	always_keepalive = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

/*
 * This parameter determines how long the timer list will stay in fast or
 * quick mode even though all connections are idle. In this state, the
 * timer will run more frequently anticipating new data.
 */
int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");

/*
 * See tcp_syn_backoff[] for interval values between SYN retransmits;
 * the value set below defines the number of retransmits, before we
 * disable the timestamp and window scaling options during subsequent
 * SYN retransmits. Setting it to 0 disables the dropping off of those
 * two options.
 */
static int tcp_broken_peer_syn_rxmit_thres = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres, 0,
	"Number of retransmitted SYNs before disabling RFC 1323 "
	"options on local connections");

static int tcp_timer_advanced = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
	"Number of times one of the timers was advanced");

static int tcp_resched_timerlist = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
	"Number of times timer list was rescheduled as part of processing a packet");

int	tcp_pmtud_black_hole_detect = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0,
	"Path MTU Discovery Black Hole Detection");

int	tcp_pmtud_black_hole_mss = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0,
	"Path MTU Discovery Black Hole Detection lowered MSS");

static u_int32_t tcp_mss_rec_medium = 1200;
static u_int32_t tcp_mss_rec_low = 512;

#define	TCP_REPORT_STATS_INTERVAL	43200 /* 12 hours, in seconds */
int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL;

/* performed garbage collection of "used" sockets */
static boolean_t tcp_gc_done = FALSE;

/* max idle probes */
int	tcp_maxpersistidle;

/*
 * TCP delack timer is set to 100 ms. Since the processing of timer list
 * in fast mode will happen no faster than 100 ms, the delayed ack timer
 * will fire somewhere between 100 and 200 ms.
 */
int	tcp_delack = TCP_RETRANSHZ / 10;

#if MPTCP
/*
 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
 */
int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
#endif /* MPTCP */

static boolean_t tcp_itimer_done = FALSE;

static void tcp_remove_timer(struct tcpcb *tp);
static void tcp_sched_timerlist(uint32_t offset);
static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode,
    u_int16_t probe_if_index);
static void tcp_sched_timers(struct tcpcb *tp);
static inline void tcp_set_lotimer_index(struct tcpcb *);
__private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
static inline void tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp);
__private_extern__ void tcp_report_stats(void);

static u_int64_t tcp_last_report_time;

/*
 * Structure to store previously reported stats so that we can send
 * incremental changes in each report interval.
 */
struct tcp_last_report_stats {
	u_int32_t	tcps_connattempt;
	u_int32_t	tcps_accepts;
	u_int32_t	tcps_ecn_client_setup;
	u_int32_t	tcps_ecn_server_setup;
	u_int32_t	tcps_ecn_client_success;
	u_int32_t	tcps_ecn_server_success;
	u_int32_t	tcps_ecn_not_supported;
	u_int32_t	tcps_ecn_lost_syn;
	u_int32_t	tcps_ecn_lost_synack;
	u_int32_t	tcps_ecn_recv_ce;
	u_int32_t	tcps_ecn_recv_ece;
	u_int32_t	tcps_ecn_sent_ece;
	u_int32_t	tcps_ecn_conn_recv_ce;
	u_int32_t	tcps_ecn_conn_recv_ece;
	u_int32_t	tcps_ecn_conn_plnoce;
	u_int32_t	tcps_ecn_conn_pl_ce;
	u_int32_t	tcps_ecn_conn_nopl_ce;
	u_int32_t	tcps_ecn_fallback_synloss;
	u_int32_t	tcps_ecn_fallback_reorder;
	u_int32_t	tcps_ecn_fallback_ce;

	/* TFO-related statistics */
	u_int32_t	tcps_tfo_syn_data_rcv;
	u_int32_t	tcps_tfo_cookie_req_rcv;
	u_int32_t	tcps_tfo_cookie_sent;
	u_int32_t	tcps_tfo_cookie_invalid;
	u_int32_t	tcps_tfo_cookie_req;
	u_int32_t	tcps_tfo_cookie_rcv;
	u_int32_t	tcps_tfo_syn_data_sent;
	u_int32_t	tcps_tfo_syn_data_acked;
	u_int32_t	tcps_tfo_syn_loss;
	u_int32_t	tcps_tfo_blackhole;
	u_int32_t	tcps_tfo_cookie_wrong;
	u_int32_t	tcps_tfo_no_cookie_rcv;
	u_int32_t	tcps_tfo_heuristics_disable;
	u_int32_t	tcps_tfo_sndblackhole;
};

/* Returns true if the timer is on the timer list */
#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)

/* Run the TCP timerlist at least once every hour */
#define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)

static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
static boolean_t tcp_garbage_collect(struct inpcb *, int);

#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))

#define VERIFY_NEXT_LINK(elm, field) do {				\
	if (LIST_NEXT((elm), field) != NULL &&				\
	    LIST_NEXT((elm), field)->field.le_prev !=			\
	    &((elm)->field.le_next))					\
		panic("Bad link elm %p next->prev != elm", (elm));	\
} while (0)

#define VERIFY_PREV_LINK(elm, field) do {				\
	if (*(elm)->field.le_prev != (elm))				\
		panic("Bad link elm %p prev->next != elm", (elm));	\
} while (0)

#define TCP_SET_TIMER_MODE(mode, i) do {			\
	if (IS_TIMER_HZ_10MS(i))				\
		(mode) |= TCP_TIMERLIST_10MS_MODE;		\
	else if (IS_TIMER_HZ_100MS(i))				\
		(mode) |= TCP_TIMERLIST_100MS_MODE;		\
	else							\
		(mode) |= TCP_TIMERLIST_500MS_MODE;		\
} while (0)

#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_medium,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_medium, 0,
	"Medium MSS based on recommendation in link status report");
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_low,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_low, 0,
	"Low MSS based on recommendation in link status report");

static int32_t tcp_change_mss_recommended = 0;
static int
sysctl_change_mss_recommended SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i, err = 0, changed = 0;
	struct ifnet *ifp;
	struct if_link_status ifsr;
	struct if_cellular_status_v1 *new_cell_sr;
	err = sysctl_io_number(req, tcp_change_mss_recommended,
	    sizeof (int32_t), &i, &changed);
	if (changed) {
		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			if (IFNET_IS_CELLULAR(ifp)) {
				bzero(&ifsr, sizeof (ifsr));
				new_cell_sr = &ifsr.ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
				ifsr.ifsr_version = IF_CELLULAR_STATUS_REPORT_CURRENT_VERSION;
				ifsr.ifsr_len = sizeof(*new_cell_sr);

				/* Set MSS recommended */
				new_cell_sr->valid_bitmask |= IF_CELL_UL_MSS_RECOMMENDED_VALID;
				new_cell_sr->mss_recommended = i;
				/* Pass the size of the status struct, not of the pointer */
				err = ifnet_link_status_report(ifp, new_cell_sr, sizeof(*new_cell_sr));
				if (err == 0) {
					tcp_change_mss_recommended = i;
				} else {
					break;
				}
			}
		}
		ifnet_head_done();
	}
	return (err);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, change_mss_recommended,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_change_mss_recommended,
	0, sysctl_change_mss_recommended, "IU", "Change MSS recommended");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0,
	"Report stats interval");
#endif /* (DEVELOPMENT || DEBUG) */

/*
 * Macro to compare two timers. If there is a reset of the sign bit,
 * it is safe to assume that the timer has wrapped around. By doing
 * signed comparison, we take care of wraparound such that the value
 * with the sign bit reset is actually ahead of the other.
 */
inline int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
	return (int32_t)((t1 + toff1) - (t2 + toff2));
};
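
/*
 * Added note: a worked example of the wraparound handling above.
 * With t1 = 0xfffffff0 and t2 = 0x10 (offsets zero), the unsigned
 * subtraction yields 0xffffffe0, which the cast to int32_t turns
 * into -32: t1 is correctly reported as 32 ticks behind t2 across
 * the 32-bit wrap.
 */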

/*
 * Add to tcp timewait list, delay is given in milliseconds.
 */
static void
add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	struct inpcb *inp = tp->t_inpcb;
	uint32_t timer;

	/* pcb list should be locked when we get here */
	lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	/* We may get here multiple times, so check */
	if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
		pcbinfo->ipi_twcount++;
		inp->inp_flags2 |= INP2_TIMEWAIT;

		/* Remove from global inp list */
		LIST_REMOVE(inp, inp_list);
	} else {
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
	}

	/* Compute the time at which this socket can be closed */
	timer = tcp_now + delay;

	/* We will use the TCPT_2MSL timer for tracking this delay */

	if (TIMER_IS_ON_LIST(tp))
		tcp_remove_timer(tp);
	tp->t_timer[TCPT_2MSL] = timer;

	TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
}

void
add_to_time_wait(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP)
		socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);

	/* 19182803: Notify nstat that connection is closing before waiting. */
	nstat_pcb_detach(tp->t_inpcb);

	if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
		tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
		lck_rw_lock_exclusive(pcbinfo->ipi_lock);
		tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
	}
	add_to_time_wait_locked(tp, delay);
	lck_rw_done(pcbinfo->ipi_lock);

	inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
}
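
/*
 * Added note: the try-lock dance above appears to preserve lock
 * ordering. The pcbinfo list lock is normally taken before a
 * per-socket lock (as tcp_gc() does), so when the try-lock fails
 * while the socket lock is held, the socket lock is dropped first
 * and both locks are then reacquired in the canonical order.
 */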

/* If this is on time wait queue, remove it. */
void
tcp_remove_from_time_wait(struct inpcb *inp)
{
	struct tcpcb *tp = intotcpcb(inp);
	if (inp->inp_flags2 & INP2_TIMEWAIT)
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
}

static boolean_t
tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
	boolean_t active = FALSE;
	struct socket *so;
	struct tcpcb *tp;

	so = inp->inp_socket;
	tp = intotcpcb(inp);

	/*
	 * Skip if still in use or busy; it would have been more efficient
	 * if we were to test so_usecount against 0, but this isn't possible
	 * due to the current implementation of tcp_dropdropablreq() where
	 * overflow sockets that are eligible for garbage collection have
	 * their usecounts set to 1.
	 */
	if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
		return (TRUE);

	/* Check again under the lock */
	if (so->so_usecount > 1) {
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	}

	if (istimewait &&
	    TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
	    tp->t_state != TCPS_CLOSED) {
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);
		tcp_close(tp);
	}

	/*
	 * Overflowed socket dropped from the listening queue? Do this
	 * only if we are called to clean up the time wait slots, since
	 * tcp_dropdropablreq() considers a socket to have been fully
	 * dropped after add_to_time_wait() is finished.
	 * Also handle the case of connections getting closed by the peer
	 * while in the queue, as seen with rdar://6422317
	 */
	if (so->so_usecount == 1 &&
	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
	    (so->so_head != NULL) &&
	    ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
	    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {

		if (inp->inp_state != INPCB_STATE_DEAD) {
			/* Become a regular mutex */
			lck_mtx_convert_spin(&inp->inpcb_mtx);
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		so->so_usecount--;
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (FALSE);
	}

	/*
	 * We get here because the PCB is no longer searchable
	 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
	 * (usecount is 0). This covers all cases, including overflow
	 * sockets and those that are considered as "embryonic",
	 * i.e. created by sonewconn() in TCP input path, and have
	 * not yet been committed. For the former, we reduce the usecount
	 * to 0 as done by the code above. For the latter, the usecount
	 * would have reduced to 0 as part of calling soabort() when the
	 * socket is dropped at the end of tcp_input().
	 */
	if (so->so_usecount == 0) {
		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
		    struct tcpcb *, tp, int32_t, TCPS_CLOSED);
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);

		/*
		 * If this tp still happens to be on the timer list,
		 * take it out
		 */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		in_pcbdispose(inp);
		return (FALSE);
	}

	lck_mtx_unlock(&inp->inpcb_mtx);
	return (TRUE);
}

/*
 * TCP garbage collector callback (inpcb_timer_func_t).
 *
 * Returns the number of pcbs that will need to be gc-ed soon;
 * returning > 0 will keep the timer active.
 */
void
tcp_gc(struct inpcbinfo *ipi)
{
	struct inpcb *inp, *nxt;
	struct tcpcb *tw_tp, *tw_ntp;
#if TCPDEBUG
	int ostate;
#endif
#if KDEBUG
	static int tws_checked = 0;
#endif

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Update tcp_now here as it may get used while
	 * processing the slow timer.
	 */
	calculate_tcp_clock();

	/*
	 * Garbage collect socket/tcpcb: we need to acquire the list lock
	 * exclusively to do this
	 */

	if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
		/* don't sweat it this time; cleanup was done last time */
		if (tcp_gc_done == TRUE) {
			tcp_gc_done = FALSE;
			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
			    tws_checked, cur_tw_slot, 0, 0, 0);
			/* Lock upgrade failed, give up this round */
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
			return;
		}
		/* Upgrade failed, lost lock now take it again exclusive */
		lck_rw_lock_exclusive(ipi->ipi_lock);
	}
	tcp_gc_done = TRUE;

	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		if (tcp_garbage_collect(inp, 0))
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
	}

	/* Now cleanup the time wait ones */
	TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
		/*
		 * We check the timestamp here without holding the
		 * socket lock for better performance. If there are
		 * any pcbs in time-wait, the timer will get rescheduled.
		 * Hence some error in this check can be tolerated.
		 *
		 * Sometimes a socket on time-wait queue can be closed if
		 * 2MSL timer expired but the application still has a
		 * usecount on it.
		 */
		if (tw_tp->t_state == TCPS_CLOSED ||
		    TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
			if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
				atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
		}
	}

	/* take into account pcbs that are still in time_wait_slots */
	atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);

	lck_rw_done(ipi->ipi_lock);

	/* Clean up the socache while we are here */
	if (so_cache_timer())
		atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
	    cur_tw_slot, 0, 0, 0);

	return;
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(struct tcpcb *tp)
{
	int i;

	tcp_remove_timer(tp);
	for (i = 0; i < TCPT_NTIMERS; i++)
		tp->t_timer[i] = 0;
	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = TCPT_NONE;
}

int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
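
/*
 * Added note: tcp_backoff[] doubles the base RTO up to a cap of 64x,
 * so with a base RTO of, say, 500 ms the retransmit intervals run
 * 0.5s, 1s, 2s, ... up to 32s (each further clamped by TCPTV_REXMTMAX).
 * The sum 1+2+4+8+16+32 + 7*64 = 511 is what tcp_totbackoff records;
 * the persist timer uses it to bound total zero-window probing time.
 */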

void
tcp_rexmt_save_state(struct tcpcb *tp)
{
	u_int32_t fsize;
	if (TSTMP_SUPPORTED(tp)) {
		/*
		 * Since timestamps are supported on the connection,
		 * we can do recovery as described in rfc 4015.
		 */
		fsize = tp->snd_max - tp->snd_una;
		tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
		tp->snd_recover_prev = tp->snd_recover;
	} else {
		/*
		 * Timestamp option is not supported on this connection.
		 * Record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight. See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
	}
	tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
	tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
	tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
}

/*
 * Revert to the older segment size if there is an indication that PMTU
 * blackhole detection was not needed.
 */
void
tcp_pmtud_revert_segment_size(struct tcpcb *tp)
{
	int32_t optlen;

	VERIFY(tp->t_pmtud_saved_maxopd > 0);
	tp->t_flags |= TF_PMTUD;
	tp->t_flags &= ~TF_BLACKHOLE;
	optlen = tp->t_maxopd - tp->t_maxseg;
	tp->t_maxopd = tp->t_pmtud_saved_maxopd;
	tp->t_maxseg = tp->t_maxopd - optlen;
	/*
	 * Reset the slow-start flight size as it
	 * may depend on the new MSS
	 */
	if (CC_ALGO(tp)->cwnd_init != NULL)
		CC_ALGO(tp)->cwnd_init(tp);
	tp->t_pmtud_start_ts = 0;
	tcpstat.tcps_pmtudbh_reverted++;
}
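
/*
 * Added note: t_maxopd counts payload plus TCP options while t_maxseg
 * is payload only. As an example, if the saved t_maxopd was 1460 and
 * the connection carries 12 bytes of timestamp options (optlen == 12),
 * the revert above restores t_maxseg to 1460 - 12 = 1448.
 */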

/*
 * TCP timer processing.
 */
struct tcpcb *
tcp_timers(struct tcpcb *tp, int timer)
{
	int32_t rexmt, optlen = 0, idle_time = 0;
	struct socket *so;
	struct tcptemp *t_template;
#if TCPDEBUG
	int ostate;
#endif

#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */
	u_int64_t accsleep_ms;
	u_int32_t last_sleep_ms = 0;

	so = tp->t_inpcb->inp_socket;
	idle_time = tcp_now - tp->t_rcvtime;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
	 * delete connection control block.
	 * Otherwise (this case shouldn't happen) check again in a bit;
	 * we keep the socket in the main list in that case.
	 */
	case TCPT_2MSL:
		tcp_free_sackholes(tp);
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_state != TCPS_FIN_WAIT_2 &&
		    ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
			    (u_int32_t)TCP_CONN_KEEPINTVL(tp));
		} else {
			tp = tcp_close(tp);
			return (tp);
		}
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		absolutetime_to_nanoseconds(mach_absolutetime_asleep,
		    &accsleep_ms);
		accsleep_ms = accsleep_ms / 1000000UL;
		if (accsleep_ms > tp->t_accsleep_ms)
			last_sleep_ms = accsleep_ms - tp->t_accsleep_ms;
		/*
		 * Drop a connection in the retransmit timer
		 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
		 *    times
		 * 2. If the time spent in this retransmission episode is
		 *    more than the time limit set with TCP_RXT_CONNDROPTIME
		 *    socket option
		 * 3. If TCP_RXT_FINDROP socket option was set and
		 *    we have already retransmitted the FIN 3 times without
		 *    receiving an ack
		 */
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
		    (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
		    (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
		    ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
		    (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4) ||
		    (tp->t_rxtshift > 4 && last_sleep_ms >= TCP_SLEEP_TOO_LONG)) {
			if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
				tcpstat.tcps_rxtfindrop++;
			} else if (last_sleep_ms >= TCP_SLEEP_TOO_LONG) {
				tcpstat.tcps_drop_after_sleep++;
			} else {
				tcpstat.tcps_timeoutdrop++;
			}
			if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) {
				if (TCP_ECN_ENABLED(tp)) {
					INP_INC_IFNET_STAT(tp->t_inpcb,
					    ecn_on.rxmit_drop);
				} else {
					INP_INC_IFNET_STAT(tp->t_inpcb,
					    ecn_off.rxmit_drop);
				}
			}
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));

			if (TCP_ECN_ENABLED(tp) &&
			    tp->t_state == TCPS_ESTABLISHED)
				tcp_heuristic_ecn_droprxmt(tp);

			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);

			break;
		}

		tcpstat.tcps_rexmttimeo++;
		tp->t_accsleep_ms = accsleep_ms;

		if (tp->t_rxtshift == 1 &&
		    tp->t_state == TCPS_ESTABLISHED) {
			/* Set the time at which retransmission started. */
			tp->t_rxtstart = tcp_now;

			/*
			 * if this is the first retransmit timeout, save
			 * the state so that we can recover if the timeout
			 * is spurious.
			 */
			tcp_rexmt_save_state(tp);
		}
#if MPTCP
		if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
		    (tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
			mptcp_act_on_txfail(so);
		}
#endif /* MPTCP */

		if (tp->t_adaptive_wtimo > 0 &&
		    tp->t_rxtshift > tp->t_adaptive_wtimo &&
		    TCPS_HAVEESTABLISHED(tp->t_state)) {
			/* Send an event to the application */
			soevent(so,
			    (SO_FILT_HINT_LOCKED|
			    SO_FILT_HINT_ADAPTIVE_WTIMO));
		}

		/*
		 * If this is a retransmit timeout after PTO, the PTO
		 * was not effective
		 */
		if (tp->t_flagsext & TF_SENT_TLPROBE) {
			tp->t_flagsext &= ~(TF_SENT_TLPROBE);
			tcpstat.tcps_rto_after_pto++;
		}

		if (tp->t_flagsext & TF_DELAY_RECOVERY) {
			/*
			 * Retransmit timer fired before entering recovery
			 * on a connection with packet re-ordering. This
			 * suggests that the reordering metrics computed
			 * are not accurate.
			 */
			tp->t_reorderwin = 0;
			tp->t_timer[TCPT_DELAYFR] = 0;
			tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
		}

		if (tp->t_state == TCPS_SYN_RECEIVED)
			tcp_disable_tfo(tp);

		if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
		    !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
		    ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
		    tp->t_rxtshift > 2)) {
			/*
			 * For regular retransmissions, a first one is being
			 * done for tail-loss probe.
			 * Thus, if rxtshift > 1, this means we have sent the segment
			 * a total of 3 times.
			 *
			 * If we are in SYN-SENT state, then there is no tail-loss
			 * probe thus we have to let rxtshift go up to 3.
			 */
			tcp_heuristic_tfo_middlebox(tp);

			so->so_error = ENODATA;
			sorwakeup(so);
			sowwakeup(so);

			tp->t_tfo_stats |= TFO_S_SEND_BLACKHOLE;
			tcpstat.tcps_tfo_sndblackhole++;
		}

		if (tp->t_state == TCPS_SYN_SENT) {
			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
			tp->t_stat.synrxtshift = tp->t_rxtshift;

			/* When retransmitting, disable TFO */
			if (tfo_enabled(tp)) {
				tp->t_flagsext &= ~TF_FASTOPEN;
				tp->t_tfo_flags |= TFO_F_SYN_LOSS;
			}
		} else {
			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
		}

		TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX,
		    TCP_ADD_REXMTSLOP(tp));
		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);

		if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
			goto fc_output;

		tcp_free_sackholes(tp);
		/*
		 * Check for potential Path MTU Discovery Black Hole
		 */
		if (tcp_pmtud_black_hole_detect &&
		    !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
		    (tp->t_state == TCPS_ESTABLISHED)) {
			if ((tp->t_flags & TF_PMTUD) &&
			    ((tp->t_flags & TF_MAXSEGSNT)
			    || tp->t_pmtud_lastseg_size > tcp_pmtud_black_hole_mss) &&
			    tp->t_rxtshift == 2) {
				/*
				 * Enter Path MTU Black-hole Detection mechanism:
				 * - Disable Path MTU Discovery (IP "DF" bit).
				 * - Reduce MTU to lower value than what we
				 *   negotiated with the peer.
				 */
				/* Disable Path MTU Discovery for now */
				tp->t_flags &= ~TF_PMTUD;
				/* Record that we may have found a black hole */
				tp->t_flags |= TF_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				/* Keep track of previous MSS */
				tp->t_pmtud_saved_maxopd = tp->t_maxopd;
				tp->t_pmtud_start_ts = tcp_now;
				if (tp->t_pmtud_start_ts == 0)
					tp->t_pmtud_start_ts++;
				/* Reduce the MSS to intermediary value */
				if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
					tp->t_maxopd = tcp_pmtud_black_hole_mss;
				} else {
					tp->t_maxopd = /* use the default MSS */
#if INET6
					    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
					    tcp_mssdflt;
				}
				tp->t_maxseg = tp->t_maxopd - optlen;

				/*
				 * Reset the slow-start flight size
				 * as it may depend on the new MSS
				 */
				if (CC_ALGO(tp)->cwnd_init != NULL)
					CC_ALGO(tp)->cwnd_init(tp);
				tp->snd_cwnd = tp->t_maxseg;
			}
			/*
			 * If further retransmissions are still
			 * unsuccessful with a lowered MTU, maybe this
			 * isn't a Black Hole and we restore the previous
			 * MSS and blackhole detection flags.
			 */
			else {
				if ((tp->t_flags & TF_BLACKHOLE) &&
				    (tp->t_rxtshift > 4)) {
					tcp_pmtud_revert_segment_size(tp);
					tp->snd_cwnd = tp->t_maxseg;
				}
			}
		}

		/*
		 * Disable rfc1323 and rfc1644 if we haven't got any
		 * response to our SYN (after we reach the threshold)
		 * to work-around some broken terminal servers (most of
		 * which have hopefully been retired) that have bad VJ
		 * header compression code which trashes TCP segments
		 * containing unknown-to-them TCP options.
		 * Do this only on non-local connections.
		 */
		if (tp->t_state == TCPS_SYN_SENT &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres)
			tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);

		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#if INET6
			if (isipv6)
				in6_losing(tp->t_inpcb);
			else
#endif /* INET6 */
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * Note:  We overload snd_recover to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_recover = tp->snd_max;
		/*
		 * Force a segment to be sent.
		 */
		tp->t_flags |= TF_ACKNOW;

		/* If timing a segment in this window, stop the timer */
		tp->t_rtttime = 0;

		if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1)
			tcpstat.tcps_tailloss_rto++;

		/*
		 * RFC 5681 says: when a TCP sender detects segment loss
		 * using retransmit timer and the given segment has already
		 * been retransmitted by way of the retransmission timer at
		 * least once, the value of ssthresh is held constant
		 */
		if (tp->t_rxtshift == 1 &&
		    CC_ALGO(tp)->after_timeout != NULL) {
			CC_ALGO(tp)->after_timeout(tp);
			/*
			 * CWR notifications are to be sent on new data
			 * right after Fast Retransmits and ECE
			 * notification receipts.
			 */
			if (TCP_ECN_ENABLED(tp))
				tp->ecn_flags |= TE_SENDCWR;
		}

		EXIT_FASTRECOVERY(tp);

		/* Exit cwnd non validated phase */
		tp->t_flagsext &= ~TF_CWND_NONVALIDATED;

fc_output:
		tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);

		(void) tcp_output(tp);
		break;

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 *
		 * Drop the connection if we reached the maximum allowed time for
		 * Zero Window Probes without a non-zero update from the peer.
		 * See rdar://5805356
		 */
		if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (idle_time >= tcp_maxpersistidle ||
		    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
		    ((tp->t_persist_stop != 0) &&
		    TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
			tcpstat.tcps_persistdrop++;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		tcp_setpersist(tp);
		tp->t_flagsext |= TF_FORCE;
		(void) tcp_output(tp);
		tp->t_flagsext &= ~TF_FORCE;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
#if MPTCP
		/*
		 * Regular TCP connections do not send keepalives after
		 * closing. Neither must MPTCP, after sending Data FINs.
		 */
		struct mptcb *mp_tp = tp->t_mptcb;
		if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
		    (tp->t_state > TCPS_ESTABLISHED)) {
			goto dropit;
		} else if (mp_tp != NULL) {
			if ((mptcp_ok_to_keepalive(mp_tp) == 0))
				goto dropit;
		}
#endif /* MPTCP */
		if (tp->t_state < TCPS_ESTABLISHED)
			goto dropit;
		if ((always_keepalive ||
		    (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
		    (tp->t_flagsext & TF_DETECT_READSTALL) ||
		    (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
			if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
			t_template = tcp_maketemplate(tp);
			if (t_template) {
				struct inpcb *inp = tp->t_inpcb;
				struct tcp_respond_args tra;

				bzero(&tra, sizeof(tra));
				tra.nocell = INP_NO_CELLULAR(inp);
				tra.noexpensive = INP_NO_EXPENSIVE(inp);
				tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
				tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
				if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
					tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
				else
					tra.ifscope = IFSCOPE_NONE;
				tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
				(void) m_free(dtom(t_template));
				if (tp->t_flagsext & TF_DETECT_READSTALL)
					tp->t_rtimo_probes++;
			}
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
			    TCP_CONN_KEEPINTVL(tp));
		} else {
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
			    TCP_CONN_KEEPIDLE(tp));
		}
		if (tp->t_flagsext & TF_DETECT_READSTALL) {
			struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
			bool reenable_probe = false;
			/*
			 * The keep alive packets sent to detect a read
			 * stall did not get a response from the
			 * peer. Generate more keep-alives to confirm this.
			 * If the number of probes sent reaches the limit,
			 * generate an event.
			 */
			if (tp->t_adaptive_rtimo > 0) {
				if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
					/* Generate an event */
					soevent(so,
					    (SO_FILT_HINT_LOCKED |
					    SO_FILT_HINT_ADAPTIVE_RTIMO));
					tcp_keepalive_reset(tp);
				} else {
					reenable_probe = true;
				}
			} else if (outifp != NULL &&
			    (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY) &&
			    tp->t_rtimo_probes <= TCP_CONNECTIVITY_PROBES_MAX) {
				reenable_probe = true;
			} else {
				tp->t_flagsext &= ~TF_DETECT_READSTALL;
			}
			if (reenable_probe) {
				int ind = min(tp->t_rtimo_probes,
				    TCP_MAXRXTSHIFT);
				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
				    tp, tcp_backoff[ind] * TCP_REXMTVAL(tp));
			}
		}
		if (tp->t_tfo_probe_state == TFO_PROBE_PROBING) {
			int ind;

			tp->t_tfo_probes++;
			ind = min(tp->t_tfo_probes, TCP_MAXRXTSHIFT);

			/*
			 * We take the minimum among the time set by true
			 * keepalive (see above) and the backoff'd RTO. That
			 * way we backoff in case of packet-loss but will never
			 * timeout slower than regular keepalive due to the
			 * backing off.
			 */
			tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
			    tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
			    tp->t_timer[TCPT_KEEP]);
		} else if (tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
			/* Still no data! Let's assume a TFO-error and err out... */
			tcp_heuristic_tfo_middlebox(tp);

			so->so_error = ENODATA;
			sorwakeup(so);
			tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE;
			tcpstat.tcps_tfo_blackhole++;
		}
		break;
	case TCPT_DELACK:
		if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_timer[TCPT_DELACK] = 0;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * If delayed ack timer fired while stretching
			 * acks, count the number of times the streaming
			 * detection was not correct. If this exceeds a
			 * threshold, disable stretch ack on this
			 * connection
			 *
			 * Also, go back to acking every other packet.
			 */
			if ((tp->t_flags & TF_STRETCHACK)) {
				if (tp->t_unacksegs > 1 &&
				    tp->t_unacksegs < maxseg_unacked)
					tp->t_stretchack_delayed++;

				if (tp->t_stretchack_delayed >
				    TCP_STRETCHACK_DELAY_THRESHOLD) {
					tp->t_flagsext |= TF_DISABLE_STRETCHACK;
					/*
					 * Note the time at which stretch
					 * ack was disabled automatically
					 */
					tp->rcv_nostrack_ts = tcp_now;
					tcpstat.tcps_nostretchack++;
					tp->t_stretchack_delayed = 0;
					tp->rcv_nostrack_pkts = 0;
				}
				tcp_reset_stretch_ack(tp);
			}

			/*
			 * If we are measuring inter packet arrival jitter
			 * for throttling a connection, this delayed ack
			 * might be the reason for accumulating some
			 * jitter. So let's restart the measurement.
			 */
			CLEAR_IAJ_STATE(tp);

			tcpstat.tcps_delack++;
			(void) tcp_output(tp);
		}
		break;

#if MPTCP
	case TCPT_JACK_RXMT:
		if ((tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
		    (tp->t_mpflags & TMPF_JOINED_FLOW)) {
			if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
				tcpstat.tcps_timeoutdrop++;
				postevent(so, 0, EV_TIMEOUT);
				soevent(so,
				    (SO_FILT_HINT_LOCKED|
				    SO_FILT_HINT_TIMEOUT));
				tp = tcp_drop(tp, tp->t_softerror ?
				    tp->t_softerror : ETIMEDOUT);
				break;
			}
			tcpstat.tcps_join_rxmts++;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * No backoff is implemented for simplicity for this
			 * corner case.
			 */
			(void) tcp_output(tp);
		}
		break;
#endif /* MPTCP */

	case TCPT_PTO:
	{
		int32_t snd_len;
		tp->t_flagsext &= ~(TF_SENT_TLPROBE);

		/*
		 * Check if the connection is in the right state to
		 * send a probe
		 */
		if (tp->t_state != TCPS_ESTABLISHED ||
		    (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING))
		    || tp->snd_max == tp->snd_una ||
		    !SACK_ENABLED(tp) || !TAILQ_EMPTY(&tp->snd_holes) ||
		    IN_FASTRECOVERY(tp))
			break;

		/*
		 * If there is no new data to send or if the
		 * connection is limited by receive window then
		 * retransmit the last segment, otherwise send
		 * new data.
		 */
		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
		    - (tp->snd_max - tp->snd_una);
		if (snd_len > 0) {
			tp->snd_nxt = tp->snd_max;
		} else {
			snd_len = min((tp->snd_max - tp->snd_una),
			    tp->t_maxseg);
			tp->snd_nxt = tp->snd_max - snd_len;
		}

		tcpstat.tcps_pto++;
		if (tp->t_flagsext & TF_PROBING)
			tcpstat.tcps_probe_if++;

		/* If timing a segment in this window, stop the timer */
		tp->t_rtttime = 0;
		/* Note that tail loss probe is being sent */
		tp->t_flagsext |= TF_SENT_TLPROBE;
		tp->t_tlpstart = tcp_now;

		tp->snd_cwnd += tp->t_maxseg;
		(void) tcp_output(tp);
		tp->snd_cwnd -= tp->t_maxseg;

		tp->t_tlphighrxt = tp->snd_nxt;
		break;
	}
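
	/*
	 * Added note: the probe above is forced out by temporarily
	 * inflating snd_cwnd by one segment around the tcp_output()
	 * call. The point of a tail loss probe is to elicit an ACK or
	 * SACK from the peer so that a loss at the tail of a burst can
	 * be repaired via fast recovery instead of waiting for the
	 * full retransmission timeout.
	 */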
	case TCPT_DELAYFR:
		tp->t_flagsext &= ~TF_DELAY_RECOVERY;

		/*
		 * Don't do anything if one of the following is true:
		 * - the connection is already in recovery
		 * - sequence until snd_recover has been acknowledged.
		 * - retransmit timeout has fired
		 */
		if (IN_FASTRECOVERY(tp) ||
		    SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
		    tp->t_rxtshift > 0)
			break;

		VERIFY(SACK_ENABLED(tp));
		tcp_rexmt_save_state(tp);
		if (CC_ALGO(tp)->pre_fr != NULL) {
			CC_ALGO(tp)->pre_fr(tp);
			if (TCP_ECN_ENABLED(tp))
				tp->ecn_flags |= TE_SENDCWR;
		}
		ENTER_FASTRECOVERY(tp);

		tp->t_timer[TCPT_REXMT] = 0;
		tcpstat.tcps_sack_recovery_episode++;
		tp->t_sack_recovery_episode++;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_maxseg;
		tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
		(void) tcp_output(tp);
		break;
	dropit:
		tcpstat.tcps_keepdrops++;
		postevent(so, 0, EV_TIMEOUT);
		soevent(so,
		    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}
#if TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	return (tp);
}

/* Remove a timer entry from timer list */
void
tcp_remove_timer(struct tcpcb *tp)
{
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	if (!(TIMER_IS_ON_LIST(tp))) {
		return;
	}
	lck_mtx_lock(listp->mtx);

	/* Check if pcb is on timer list again after acquiring the lock */
	if (!(TIMER_IS_ON_LIST(tp))) {
		lck_mtx_unlock(listp->mtx);
		return;
	}

	if (listp->next_te != NULL && listp->next_te == &tp->tentry)
		listp->next_te = LIST_NEXT(&tp->tentry, le);

	LIST_REMOVE(&tp->tentry, le);
	tp->t_flags &= ~(TF_TIMER_ONLIST);

	listp->entries--;

	tp->tentry.le.le_next = NULL;
	tp->tentry.le.le_prev = NULL;
	lck_mtx_unlock(listp->mtx);
}

/*
 * Function to check if the timerlist needs to be rescheduled to run
 * the timer entry correctly. Basically, this is to check if we can avoid
 * taking the list lock.
 */
static boolean_t
need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
{
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t diff;

	/*
	 * If the list is being processed then the state of the list is
	 * in flux. In this case always acquire the lock and set the state
	 * correctly.
	 */
	if (listp->running)
		return (TRUE);

	if (!listp->scheduled)
		return (TRUE);

	diff = timer_diff(listp->runtime, 0, runtime, 0);
	if (diff <= 0) {
		/* The list is going to run before this timer */
		return (FALSE);
	} else {
		if (mode & TCP_TIMERLIST_10MS_MODE) {
			if (diff <= TCP_TIMER_10MS_QUANTUM)
				return (FALSE);
		} else if (mode & TCP_TIMERLIST_100MS_MODE) {
			if (diff <= TCP_TIMER_100MS_QUANTUM)
				return (FALSE);
		} else {
			if (diff <= TCP_TIMER_500MS_QUANTUM)
				return (FALSE);
		}
	}
	return (TRUE);
}
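
/*
 * Added note: the quantum checks above tolerate slack equal to the
 * granularity the timer asked for. For example, a timer in 100 ms
 * mode whose requested runtime precedes the list's next scheduled
 * run by no more than TCP_TIMER_100MS_QUANTUM does not force a
 * reschedule; firing within one quantum of the requested time is
 * considered close enough.
 */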

void
tcp_sched_timerlist(uint32_t offset)
{
	uint64_t deadline = 0;
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);

	offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
	listp->runtime = tcp_now + offset;
	if (listp->runtime == 0) {
		listp->runtime++;
		offset++;
	}

	clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);

	thread_call_enter_delayed(listp->call, deadline);
	listp->scheduled = TRUE;
}
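
/*
 * Added note: offset is in TCP_RETRANSHZ ticks (milliseconds), and
 * clock_interval_to_deadline() takes a scale factor expressed in
 * nanoseconds per unit; USEC_PER_SEC (1000000) equals the number of
 * nanoseconds in a millisecond, so the call converts the millisecond
 * offset into an absolute deadline. A runtime of 0 is presumably
 * avoided above because 0 is treated as "unset" elsewhere in this
 * file (t_timer[] slots use 0 for "off").
 */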
1537
fe8ab488
A
1538/*
1539 * Function to run the timers for a connection.
6d2010ae 1540 *
39037602 1541 * Returns the offset of next timer to be run for this connection which
6d2010ae 1542 * can be used to reschedule the timerlist.
fe8ab488
A
1543 *
1544 * te_mode is an out parameter that indicates the modes of active
1545 * timers for this connection.
6d2010ae 1546 */
u_int32_t
tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
    u_int16_t probe_if_index)
{
	struct socket *so;
	u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
	u_int32_t timer_val, offset = 0, lo_timer = 0;
	int32_t diff;
	boolean_t needtorun[TCPT_NTIMERS];
	int count = 0;

	VERIFY(tp != NULL);
	bzero(needtorun, sizeof(needtorun));
	*te_mode = 0;

	tcp_lock(tp->t_inpcb->inp_socket, 1, 0);

	so = tp->t_inpcb->inp_socket;
	/* Release the want count on inp */
	if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
	    == WNT_STOPUSING) {
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		/*
		 * Looks like the TCP connection got closed while we
		 * were waiting for the lock. Done.
		 */
		goto done;
	}

	/*
	 * If this connection is over an interface that needs to
	 * be probed, send probe packets to reinitiate communication.
	 */
	if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&
	    tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) {
		tp->t_flagsext |= TF_PROBING;
		tcp_timers(tp, TCPT_PTO);
		tp->t_timer[TCPT_PTO] = 0;
		tp->t_flagsext &= ~TF_PROBING;
	}

	/*
	 * Since the timer thread needs to wait for tcp lock, it may race
	 * with another thread that can cancel or reschedule the timer
	 * that is about to run. Check if we need to run anything.
	 */
	if ((index = tp->tentry.index) == TCPT_NONE)
		goto done;

	timer_val = tp->t_timer[index];

	diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
	if (diff > 0) {
		if (tp->tentry.index != TCPT_NONE) {
			offset = diff;
			*te_mode = tp->tentry.mode;
		}
		goto done;
	}

	tp->t_timer[index] = 0;
	if (timer_val > 0) {
		tp = tcp_timers(tp, index);
		if (tp == NULL)
			goto done;
	}

	/*
	 * Check if there are any other timers that need to be run.
	 * While doing it, adjust the timer values relative to tcp_now.
	 */
	tp->tentry.mode = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			diff = timer_diff(tp->tentry.timer_start,
			    tp->t_timer[i], tcp_now, 0);
			if (diff <= 0) {
				needtorun[i] = TRUE;
				count++;
			} else {
				tp->t_timer[i] = diff;
				needtorun[i] = FALSE;
				if (lo_timer == 0 || diff < lo_timer) {
					lo_timer = diff;
					lo_index = i;
				}
				TCP_SET_TIMER_MODE(tp->tentry.mode, i);
			}
		}
	}

	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = lo_index;
	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);

	if (tp->tentry.index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start +
		    tp->t_timer[tp->tentry.index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}

	if (count > 0) {
		/* Run any other timers outstanding at this time. */
		for (i = 0; i < TCPT_NTIMERS; ++i) {
			if (needtorun[i]) {
				tp->t_timer[i] = 0;
				tp = tcp_timers(tp, i);
				if (tp == NULL) {
					offset = 0;
					*te_mode = 0;
					goto done;
				}
			}
		}
		tcp_set_lotimer_index(tp);
	}

	if (tp->tentry.index < TCPT_NONE) {
		offset = tp->t_timer[tp->tentry.index];
		*te_mode = tp->tentry.mode;
	}

done:
	if (tp != NULL && tp->tentry.index == TCPT_NONE) {
		tcp_remove_timer(tp);
		offset = 0;
	}

	tcp_unlock(so, 1, 0);
	return (offset);
}
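
/*
 * Usage sketch (illustrative, mirroring the call site in
 * tcp_run_timerlist() below):
 *
 *	u_int16_t te_mode = 0;
 *	u_int32_t offset;
 *
 *	offset = tcp_run_conn_timer(tp, &te_mode, 0);
 *
 * A zero return means no timer is left armed for this connection (or
 * the pcb is going away); a non-zero offset, together with te_mode,
 * feeds the next reschedule of the timer list.
 */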

void
tcp_run_timerlist(void * arg1, void * arg2)
{
#pragma unused(arg1, arg2)
	struct tcptimerentry *te, *next_te;
	struct tcptimerlist *listp = &tcp_timer_list;
	struct tcpcb *tp;
	uint32_t next_timer = 0; /* offset of the next timer on the list */
	u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */
	u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
	uint32_t active_count = 0;

	calculate_tcp_clock();

	lck_mtx_lock(listp->mtx);

	listp->running = TRUE;

	LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
		uint32_t offset = 0;
		uint32_t runtime = te->runtime;
		if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
			offset = timer_diff(runtime, 0, tcp_now, 0);
			if (next_timer == 0 || offset < next_timer) {
				next_timer = offset;
			}
			list_mode |= te->mode;
			continue;
		}

		tp = TIMERENTRY_TO_TP(te);

		/*
		 * Acquire an inp wantcnt on the inpcb so that the socket
		 * won't get detached even if tcp_close is called.
		 */
		if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
		    == WNT_STOPUSING) {
			/*
			 * Somehow this pcb went into dead state while
			 * on the timer list, just take it off the list.
			 * Since the timer list entry pointers are
			 * protected by the timer list lock, we can
			 * do it here without the socket lock.
			 */
			if (TIMER_IS_ON_LIST(tp)) {
				tp->t_flags &= ~(TF_TIMER_ONLIST);
				LIST_REMOVE(&tp->tentry, le);
				listp->entries--;

				tp->tentry.le.le_next = NULL;
				tp->tentry.le.le_prev = NULL;
			}
			continue;
		}
		active_count++;

		/*
		 * Store the next timerentry pointer before releasing the
		 * list lock. If that entry has to be removed when we
		 * release the lock, this pointer will be updated to the
		 * element after that.
		 */
		listp->next_te = next_te;

		VERIFY_NEXT_LINK(&tp->tentry, le);
		VERIFY_PREV_LINK(&tp->tentry, le);

		lck_mtx_unlock(listp->mtx);

		offset = tcp_run_conn_timer(tp, &te_mode,
		    listp->probe_if_index);

		lck_mtx_lock(listp->mtx);

		next_te = listp->next_te;
		listp->next_te = NULL;

		if (offset > 0 && te_mode != 0) {
			list_mode |= te_mode;

			if (next_timer == 0 || offset < next_timer)
				next_timer = offset;
		}
	}

	if (!LIST_EMPTY(&listp->lhead)) {
		u_int16_t next_mode = 0;
		if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
		    (listp->pref_mode & TCP_TIMERLIST_10MS_MODE))
			next_mode = TCP_TIMERLIST_10MS_MODE;
		else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
		    (listp->pref_mode & TCP_TIMERLIST_100MS_MODE))
			next_mode = TCP_TIMERLIST_100MS_MODE;
		else
			next_mode = TCP_TIMERLIST_500MS_MODE;

		if (next_mode != TCP_TIMERLIST_500MS_MODE) {
			listp->idleruns = 0;
		} else {
			/*
			 * The next required mode is slow mode, but if
			 * the last one was a faster mode and we did not
			 * have enough idle runs, repeat the last mode.
			 *
			 * We try to keep the timer list in fast mode for
			 * some idle time in expectation of new data.
			 */
			if (listp->mode != next_mode &&
			    listp->idleruns < timer_fastmode_idlemax) {
				listp->idleruns++;
				next_mode = listp->mode;
				next_timer = TCP_TIMER_100MS_QUANTUM;
			} else {
				listp->idleruns = 0;
			}
		}
		listp->mode = next_mode;
		if (listp->pref_offset != 0)
			next_timer = min(listp->pref_offset, next_timer);

		if (listp->mode == TCP_TIMERLIST_500MS_MODE)
			next_timer = max(next_timer,
			    TCP_TIMER_500MS_QUANTUM);

		tcp_sched_timerlist(next_timer);
	} else {
		/*
		 * No need to reschedule this timer, but always run
		 * periodically at a much higher granularity.
		 */
		tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
	}

	listp->running = FALSE;
	listp->pref_mode = 0;
	listp->pref_offset = 0;
	listp->probe_if_index = 0;

	lck_mtx_unlock(listp->mtx);
}
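
/*
 * Rough example of the fast-mode hysteresis above (assuming
 * timer_fastmode_idlemax == 10): once the list has been running in
 * 100ms mode, up to ten consecutive sweeps that would otherwise drop
 * it to 500ms mode repeat the faster mode instead, so a connection
 * that resumes within roughly a second still sees low-latency timers.
 */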

/*
 * Function to check if the timerlist needs to be rescheduled to run this
 * connection's timers correctly.
 */
void
tcp_sched_timers(struct tcpcb *tp)
{
	struct tcptimerentry *te = &tp->tentry;
	u_int16_t index = te->index;
	u_int16_t mode = te->mode;
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t offset = 0;
	boolean_t list_locked = FALSE;

	if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
		/* Just return without adding the dead pcb to the list */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}
		return;
	}

	if (index == TCPT_NONE) {
		/* Nothing to run */
		tcp_remove_timer(tp);
		return;
	}

	/*
	 * Compute the offset at which the next timer for this connection
	 * has to run.
	 */
	offset = timer_diff(te->runtime, 0, tcp_now, 0);
	if (offset <= 0) {
		offset = 1;
		tcp_timer_advanced++;
	}

	if (!TIMER_IS_ON_LIST(tp)) {
		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = TRUE;
		}

		LIST_INSERT_HEAD(&listp->lhead, te, le);
		tp->t_flags |= TF_TIMER_ONLIST;

		listp->entries++;
		if (listp->entries > listp->maxentries)
			listp->maxentries = listp->entries;

		/* if the list is not scheduled, just schedule it */
		if (!listp->scheduled)
			goto schedule;
	}

	/*
	 * Timer entry is currently on the list, check if the list needs
	 * to be rescheduled.
	 */
	if (need_to_resched_timerlist(te->runtime, mode)) {
		tcp_resched_timerlist++;

		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = TRUE;
		}

		VERIFY_NEXT_LINK(te, le);
		VERIFY_PREV_LINK(te, le);

		if (listp->running) {
			listp->pref_mode |= mode;
			if (listp->pref_offset == 0 ||
			    offset < listp->pref_offset) {
				listp->pref_offset = offset;
			}
		} else {
			/*
			 * The list could have gotten rescheduled while
			 * this thread was waiting for the lock.
			 */
			if (listp->scheduled) {
				int32_t diff;
				diff = timer_diff(listp->runtime, 0,
				    tcp_now, offset);
				if (diff <= 0)
					goto done;
				else
					goto schedule;
			} else {
				goto schedule;
			}
		}
	}
	goto done;

schedule:
	/*
	 * Since a connection with timers is getting scheduled, the
	 * timer list moves from idle to active state, which is why the
	 * idle-run count is reset.
	 */
	if (mode & TCP_TIMERLIST_10MS_MODE) {
		listp->mode = TCP_TIMERLIST_10MS_MODE;
		listp->idleruns = 0;
		offset = min(offset, TCP_TIMER_10MS_QUANTUM);
	} else if (mode & TCP_TIMERLIST_100MS_MODE) {
		if (listp->mode > TCP_TIMERLIST_100MS_MODE)
			listp->mode = TCP_TIMERLIST_100MS_MODE;
		listp->idleruns = 0;
		offset = min(offset, TCP_TIMER_100MS_QUANTUM);
	}
	tcp_sched_timerlist(offset);

done:
	if (list_locked)
		lck_mtx_unlock(listp->mtx);

	return;
}
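
/*
 * Note: when the timer list worker is mid-sweep (listp->running), the
 * entry cannot be rescheduled directly; the requested mode and offset
 * are parked in listp->pref_mode and listp->pref_offset, and
 * tcp_run_timerlist() folds them into the next reschedule when it
 * finishes.
 */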

static inline void
tcp_set_lotimer_index(struct tcpcb *tp)
{
	uint16_t i, lo_index = TCPT_NONE, mode = 0;
	uint32_t lo_timer = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			TCP_SET_TIMER_MODE(mode, i);
			if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
				lo_timer = tp->t_timer[i];
				lo_index = i;
			}
		}
	}
	tp->tentry.index = lo_index;
	tp->tentry.mode = mode;
	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);

	if (tp->tentry.index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start
		    + tp->t_timer[tp->tentry.index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}
}
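
/*
 * Worked example (values assumed): with t_timer[TCPT_REXMT] == 300
 * and t_timer[TCPT_KEEP] == 7200000, both relative to
 * tentry.timer_start, the loop selects lo_index == TCPT_REXMT, sets
 * the corresponding mode bit, and arms tentry.runtime 300 ticks after
 * timer_start.
 */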

void
tcp_check_timer_state(struct tcpcb *tp)
{
	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
		return;

	tcp_set_lotimer_index(tp);

	tcp_sched_timers(tp);
	return;
}

static inline void
tcp_cumulative_stat(u_int32_t cur, u_int32_t *prev, u_int32_t *dest)
{
	/* handle wrap around */
	int32_t diff = (int32_t) (cur - *prev);
	if (diff > 0)
		*dest = diff;
	else
		*dest = 0;
	*prev = cur;
	return;
}
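
/*
 * Worked example of the wraparound handling: if the 32-bit counter
 * last read 0xfffffff0 and now reads 5, the unsigned subtraction
 * 5 - 0xfffffff0 wraps to 21, so the reported delta is 21 rather
 * than a huge bogus value. A delta that still comes out negative
 * (e.g. after a counter reset) is clamped to 0.
 */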

__private_extern__ void
tcp_report_stats(void)
{
	struct nstat_sysinfo_data data;
	struct sockaddr_in dst;
	struct sockaddr_in6 dst6;
	struct rtentry *rt = NULL;
	static struct tcp_last_report_stats prev;
	u_int64_t var, uptime;

#define	stat	data.u.tcp_stats
	if (((uptime = net_uptime()) - tcp_last_report_time) <
	    tcp_report_stats_interval)
		return;

	tcp_last_report_time = uptime;

	bzero(&data, sizeof(data));
	data.flags = NSTAT_SYSINFO_TCP_STATS;

	bzero(&dst, sizeof(dst));
	dst.sin_len = sizeof(dst);
	dst.sin_family = AF_INET;

	/* ipv4 avg rtt */
	lck_mtx_lock(rnh_lock);
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
	    rt_tables[AF_INET], IFSCOPE_NONE);
	lck_mtx_unlock(rnh_lock);
	if (rt != NULL) {
		RT_LOCK(rt);
		if (rt_primary_default(rt, rt_key(rt)) &&
		    rt->rt_stats != NULL) {
			stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
		}
		RT_UNLOCK(rt);
		rtfree(rt);
		rt = NULL;
	}

	/* ipv6 avg rtt */
	bzero(&dst6, sizeof(dst6));
	dst6.sin6_len = sizeof(dst6);
	dst6.sin6_family = AF_INET6;

	lck_mtx_lock(rnh_lock);
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
	    rt_tables[AF_INET6], IFSCOPE_NONE);
	lck_mtx_unlock(rnh_lock);
	if (rt != NULL) {
		RT_LOCK(rt);
		if (rt_primary_default(rt, rt_key(rt)) &&
		    rt->rt_stats != NULL) {
			stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
		}
		RT_UNLOCK(rt);
		rtfree(rt);
		rt = NULL;
	}

	/* send packet loss rate, shift by 10 for precision */
	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
		var = tcpstat.tcps_sndrexmitpack << 10;
		stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
	}

	/* recv packet loss rate, shift by 10 for precision */
	if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
		var = tcpstat.tcps_recovered_pkts << 10;
		stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
	}
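
	/*
	 * Worked example of the fixed-point encoding above (assumed
	 * numbers): with 5 retransmits out of 1000 packets sent,
	 * send_plr = (5 << 10) * 100 / 1000 = 512, i.e. 512/1024 = 0.5%,
	 * so the rate keeps ~10 bits of fractional precision without
	 * floating point.
	 */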

	/* RTO after tail loss, shift by 10 for precision */
	if (tcpstat.tcps_sndrexmitpack > 0
	    && tcpstat.tcps_tailloss_rto > 0) {
		var = tcpstat.tcps_tailloss_rto << 10;
		stat.send_tlrto_rate =
		    (var * 100) / tcpstat.tcps_sndrexmitpack;
	}

	/* packet reordering */
	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
		var = tcpstat.tcps_reordered_pkts << 10;
		stat.send_reorder_rate =
		    (var * 100) / tcpstat.tcps_sndpack;
	}

	if (tcp_ecn_outbound == 1)
		stat.ecn_client_enabled = 1;
	if (tcp_ecn_inbound == 1)
		stat.ecn_server_enabled = 1;
	tcp_cumulative_stat(tcpstat.tcps_connattempt,
	    &prev.tcps_connattempt, &stat.connection_attempts);
	tcp_cumulative_stat(tcpstat.tcps_accepts,
	    &prev.tcps_accepts, &stat.connection_accepts);
	tcp_cumulative_stat(tcpstat.tcps_ecn_client_setup,
	    &prev.tcps_ecn_client_setup, &stat.ecn_client_setup);
	tcp_cumulative_stat(tcpstat.tcps_ecn_server_setup,
	    &prev.tcps_ecn_server_setup, &stat.ecn_server_setup);
	tcp_cumulative_stat(tcpstat.tcps_ecn_client_success,
	    &prev.tcps_ecn_client_success, &stat.ecn_client_success);
	tcp_cumulative_stat(tcpstat.tcps_ecn_server_success,
	    &prev.tcps_ecn_server_success, &stat.ecn_server_success);
	tcp_cumulative_stat(tcpstat.tcps_ecn_not_supported,
	    &prev.tcps_ecn_not_supported, &stat.ecn_not_supported);
	tcp_cumulative_stat(tcpstat.tcps_ecn_lost_syn,
	    &prev.tcps_ecn_lost_syn, &stat.ecn_lost_syn);
	tcp_cumulative_stat(tcpstat.tcps_ecn_lost_synack,
	    &prev.tcps_ecn_lost_synack, &stat.ecn_lost_synack);
	tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ce,
	    &prev.tcps_ecn_recv_ce, &stat.ecn_recv_ce);
	tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ece,
	    &prev.tcps_ecn_recv_ece, &stat.ecn_recv_ece);
	tcp_cumulative_stat(tcpstat.tcps_ecn_sent_ece,
	    &prev.tcps_ecn_sent_ece, &stat.ecn_sent_ece);
	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ce,
	    &prev.tcps_ecn_conn_recv_ce, &stat.ecn_conn_recv_ce);
	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ece,
	    &prev.tcps_ecn_conn_recv_ece, &stat.ecn_conn_recv_ece);
	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_plnoce,
	    &prev.tcps_ecn_conn_plnoce, &stat.ecn_conn_plnoce);
	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_pl_ce,
	    &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce);
	tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce,
	    &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce);
	tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_synloss,
	    &prev.tcps_ecn_fallback_synloss, &stat.ecn_fallback_synloss);
	tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_reorder,
	    &prev.tcps_ecn_fallback_reorder, &stat.ecn_fallback_reorder);
	tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_ce,
	    &prev.tcps_ecn_fallback_ce, &stat.ecn_fallback_ce);
	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv,
	    &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv);
	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv,
	    &prev.tcps_tfo_cookie_req_rcv, &stat.tfo_cookie_req_rcv);
	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_sent,
	    &prev.tcps_tfo_cookie_sent, &stat.tfo_cookie_sent);
	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_invalid,
	    &prev.tcps_tfo_cookie_invalid, &stat.tfo_cookie_invalid);
	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req,
	    &prev.tcps_tfo_cookie_req, &stat.tfo_cookie_req);
	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_rcv,
	    &prev.tcps_tfo_cookie_rcv, &stat.tfo_cookie_rcv);
	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_sent,
	    &prev.tcps_tfo_syn_data_sent, &stat.tfo_syn_data_sent);
	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_acked,
	    &prev.tcps_tfo_syn_data_acked, &stat.tfo_syn_data_acked);
	tcp_cumulative_stat(tcpstat.tcps_tfo_syn_loss,
	    &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss);
	tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole,
	    &prev.tcps_tfo_blackhole, &stat.tfo_blackhole);
	tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_wrong,
	    &prev.tcps_tfo_cookie_wrong, &stat.tfo_cookie_wrong);
	tcp_cumulative_stat(tcpstat.tcps_tfo_no_cookie_rcv,
	    &prev.tcps_tfo_no_cookie_rcv, &stat.tfo_no_cookie_rcv);
	tcp_cumulative_stat(tcpstat.tcps_tfo_heuristics_disable,
	    &prev.tcps_tfo_heuristics_disable, &stat.tfo_heuristics_disable);
	tcp_cumulative_stat(tcpstat.tcps_tfo_sndblackhole,
	    &prev.tcps_tfo_sndblackhole, &stat.tfo_sndblackhole);

	nstat_sysinfo_send_data(&data);

#undef	stat
}

void
tcp_interface_send_probe(u_int16_t probe_if_index)
{
	int32_t offset = 0;
	struct tcptimerlist *listp = &tcp_timer_list;

	/* Make sure TCP clock is up to date */
	calculate_tcp_clock();

	lck_mtx_lock(listp->mtx);
	if (listp->probe_if_index > 0) {
		tcpstat.tcps_probe_if_conflict++;
		goto done;
	}

	listp->probe_if_index = probe_if_index;
	if (listp->running)
		goto done;

	/*
	 * Reschedule the timerlist to run within the next 10ms, which is
	 * the fastest that we can do.
	 */
	offset = TCP_TIMER_10MS_QUANTUM;
	if (listp->scheduled) {
		int32_t diff;
		diff = timer_diff(listp->runtime, 0, tcp_now, offset);
		if (diff <= 0) {
			/* The timer will fire sooner than what's needed */
			goto done;
		}
	}
	listp->mode = TCP_TIMERLIST_10MS_MODE;
	listp->idleruns = 0;

	tcp_sched_timerlist(offset);

done:
	lck_mtx_unlock(listp->mtx);
	return;
}

/*
 * Enable read probes on this connection, if:
 * - it is in the established state
 * - it doesn't have any data outstanding
 * - the outgoing ifp matches
 * - we have not already sent any read probes
 */
static void
tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp)
{
	if (tp->t_state == TCPS_ESTABLISHED &&
	    tp->snd_max == tp->snd_una &&
	    tp->t_inpcb->inp_last_outifp == ifp &&
	    !(tp->t_flagsext & TF_DETECT_READSTALL) &&
	    tp->t_rtimo_probes == 0) {
		tp->t_flagsext |= TF_DETECT_READSTALL;
		tp->t_rtimo_probes = 0;
		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
		    TCP_TIMER_10MS_QUANTUM);
		if (tp->tentry.index == TCPT_NONE) {
			tp->tentry.index = TCPT_KEEP;
			tp->tentry.runtime = tcp_now +
			    TCP_TIMER_10MS_QUANTUM;
		} else {
			int32_t diff = 0;

			/* Reset runtime to be within the next 10ms */
			diff = timer_diff(tp->tentry.runtime, 0,
			    tcp_now, TCP_TIMER_10MS_QUANTUM);
			if (diff > 0) {
				tp->tentry.index = TCPT_KEEP;
				tp->tentry.runtime = tcp_now +
				    TCP_TIMER_10MS_QUANTUM;
				if (tp->tentry.runtime == 0)
					tp->tentry.runtime++;
			}
		}
	}
}
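
/*
 * Note: arming TCPT_KEEP with a 10ms offset repurposes the keepalive
 * timer as a read-stall probe; tcp_disable_read_probe() below undoes
 * this by resetting the keepalive timer once probing is no longer
 * needed.
 */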

/*
 * Disable read probe and reset the keep alive timer
 */
static void
tcp_disable_read_probe(struct tcpcb *tp)
{
	if (tp->t_adaptive_rtimo == 0 &&
	    ((tp->t_flagsext & TF_DETECT_READSTALL) ||
	    tp->t_rtimo_probes > 0)) {
		tcp_keepalive_reset(tp);
	}
}

/*
 * Reschedule the tcp timerlist in the next 10ms to re-enable read/write
 * probes on connections going over a particular interface.
 */
void
tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable)
{
	int32_t offset;
	struct tcptimerlist *listp = &tcp_timer_list;
	struct inpcbinfo *pcbinfo = &tcbinfo;
	struct inpcb *inp, *nxt;

	if (ifp == NULL)
		return;

	/* update clock */
	calculate_tcp_clock();

	/*
	 * Enable the keepalive timer on all connections that are
	 * active/established on this interface.
	 */
	lck_rw_lock_shared(pcbinfo->ipi_lock);

	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, nxt) {
		struct tcpcb *tp = NULL;
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) ==
		    WNT_STOPUSING)
			continue;

		/* Acquire lock to look at the state of the connection */
		tcp_lock(inp->inp_socket, 1, 0);

		/* Release the want count */
		if (inp->inp_ppcb == NULL ||
		    (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
			tcp_unlock(inp->inp_socket, 1, 0);
			continue;
		}
		tp = intotcpcb(inp);
		if (enable)
			tcp_enable_read_probe(tp, ifp);
		else
			tcp_disable_read_probe(tp);

		tcp_unlock(inp->inp_socket, 1, 0);
	}
	lck_rw_done(pcbinfo->ipi_lock);

	lck_mtx_lock(listp->mtx);
	if (listp->running) {
		listp->pref_mode |= TCP_TIMERLIST_10MS_MODE;
		goto done;
	}

	/* Reschedule within the next 10ms */
	offset = TCP_TIMER_10MS_QUANTUM;
	if (listp->scheduled) {
		int32_t diff;
		diff = timer_diff(listp->runtime, 0, tcp_now, offset);
		if (diff <= 0) {
			/* The timer will fire sooner than what's needed */
			goto done;
		}
	}
	listp->mode = TCP_TIMERLIST_10MS_MODE;
	listp->idleruns = 0;

	tcp_sched_timerlist(offset);
done:
	lck_mtx_unlock(listp->mtx);
	return;
}

inline void
tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp)
{
	struct if_cellular_status_v1 *ifsr;
	u_int32_t optlen;

	ifsr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
	if (ifsr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) {
		optlen = tp->t_maxopd - tp->t_maxseg;

		if (ifsr->mss_recommended ==
		    IF_CELL_UL_MSS_RECOMMENDED_NONE &&
		    tp->t_cached_maxopd > 0 &&
		    tp->t_maxopd < tp->t_cached_maxopd) {
			tp->t_maxopd = tp->t_cached_maxopd;
			tcpstat.tcps_mss_to_default++;
		} else if (ifsr->mss_recommended ==
		    IF_CELL_UL_MSS_RECOMMENDED_MEDIUM &&
		    tp->t_maxopd > tcp_mss_rec_medium) {
			tp->t_cached_maxopd = tp->t_maxopd;
			tp->t_maxopd = tcp_mss_rec_medium;
			tcpstat.tcps_mss_to_medium++;
		} else if (ifsr->mss_recommended ==
		    IF_CELL_UL_MSS_RECOMMENDED_LOW &&
		    tp->t_maxopd > tcp_mss_rec_low) {
			tp->t_cached_maxopd = tp->t_maxopd;
			tp->t_maxopd = tcp_mss_rec_low;
			tcpstat.tcps_mss_to_low++;
		}
		tp->t_maxseg = tp->t_maxopd - optlen;

		/*
		 * Clear the cached value if it is the same as the
		 * current value.
		 */
		if (tp->t_maxopd == tp->t_cached_maxopd)
			tp->t_cached_maxopd = 0;
	}
}
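
/*
 * Worked example (assumed values): with t_maxopd == 1440 and
 * t_maxseg == 1428, optlen == 12 bytes of TCP options. If the
 * cellular driver recommends the medium MSS and tcp_mss_rec_medium
 * is 1280, t_maxopd drops to 1280 and t_maxseg to 1268, preserving
 * the option overhead; the old t_maxopd is cached so it can be
 * restored when the recommendation is lifted.
 */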

void
tcp_update_mss_locked(struct socket *so, struct ifnet *ifp)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	if (ifp == NULL && inp->inp_last_outifp == NULL)
		return;

	if (ifp == NULL)
		ifp = inp->inp_last_outifp;

	if (!IFNET_IS_CELLULAR(ifp)) {
		/*
		 * This optimization is implemented for cellular
		 * networks only.
		 */
		return;
	}
	if (tp->t_state <= TCPS_CLOSE_WAIT) {
		/*
		 * If the connection is currently doing or has done PMTU
		 * blackhole detection, do not change the MSS
		 */
		if (tp->t_flags & TF_BLACKHOLE)
			return;
		if (ifp->if_link_status == NULL)
			return;
		tcp_update_mss_core(tp, ifp);
	}
}

void
tcp_itimer(struct inpcbinfo *ipi)
{
	struct inpcb *inp, *nxt;

	if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
		if (tcp_itimer_done == TRUE) {
			tcp_itimer_done = FALSE;
			atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
			return;
		}
		/* The try-lock failed; acquire the lock exclusively, blocking if necessary */
		lck_rw_lock_exclusive(ipi->ipi_lock);
	}
	tcp_itimer_done = TRUE;

	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		struct socket *so;

		if (inp->inp_ppcb == NULL ||
		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;
		so = inp->inp_socket;
		tcp_lock(so, 1, 0);
		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
			tcp_unlock(so, 1, 0);
			continue;
		}
		so_check_extended_bk_idle_time(so);
		if (ipi->ipi_flags & INPCBINFO_UPDATE_MSS) {
			tcp_update_mss_locked(so, NULL);
		}
		tcp_unlock(so, 1, 0);
	}

	ipi->ipi_flags &= ~INPCBINFO_UPDATE_MSS;
	lck_rw_done(ipi->ipi_lock);
}