1 /*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/kernel.h>
68 #include <sys/mbuf.h>
69 #include <sys/sysctl.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/protosw.h>
73 #include <sys/domain.h>
74 #include <sys/mcache.h>
75 #include <sys/queue.h>
76 #include <kern/locks.h>
77 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
78 #include <mach/boolean.h>
79
80 #include <net/route.h>
81 #include <net/if_var.h>
82 #include <net/ntstat.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/in_pcb.h>
87 #if INET6
88 #include <netinet6/in6_pcb.h>
89 #endif
90 #include <netinet/ip_var.h>
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_cache.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/tcp_cc.h>
98 #if INET6
99 #include <netinet6/tcp6_var.h>
100 #endif
101 #include <netinet/tcpip.h>
102 #if TCPDEBUG
103 #include <netinet/tcp_debug.h>
104 #endif
105 #include <sys/kdebug.h>
106 #include <mach/sdt.h>
107 #include <netinet/mptcp_var.h>
108
109 /* Max number of times a stretch ack can be delayed on a connection */
110 #define TCP_STRETCHACK_DELAY_THRESHOLD 5
111
112 /*
113 * If the host processor has been sleeping for too long, this is the threshold
114 * used to avoid sending stale retransmissions.
115 */
116 #define TCP_SLEEP_TOO_LONG (10 * 60 * 1000) /* 10 minutes in ms */
117
118 /* tcp timer list */
119 struct tcptimerlist tcp_timer_list;
120
121 /* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
122 struct tcptailq tcp_tw_tailq;
123
124 static int
125 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
126 {
127 #pragma unused(arg2)
128 int error, s, tt;
129
130 tt = *(int *)arg1;
131 s = tt * 1000 / TCP_RETRANSHZ;
132
133 error = sysctl_handle_int(oidp, &s, 0, req);
134 if (error || !req->newptr) {
135 return error;
136 }
137
138 tt = s * TCP_RETRANSHZ / 1000;
139 if (tt < 1) {
140 return EINVAL;
141 }
142
143 *(int *)arg1 = tt;
144 SYSCTL_SKMEM_UPDATE_AT_OFFSET(arg2, *(int*)arg1);
145 return 0;
146 }
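/*
 * Worked example for the handler above (a sketch, assuming
 * TCP_RETRANSHZ == 1000, i.e. one timer tick per millisecond):
 * reading a value stored internally as 7200000 ticks reports
 * 7200000 ms via sysctl; writing 75000 ms stores 75000 ticks.
 * A write that converts to less than one tick is rejected with EINVAL.
 */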
147
148 #if SYSCTL_SKMEM
149 int tcp_keepinit = TCPTV_KEEP_INIT;
150 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
151 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
152 &tcp_keepinit, offsetof(skmem_sysctl, tcp.keepinit),
153 sysctl_msec_to_ticks, "I", "");
154
155 int tcp_keepidle = TCPTV_KEEP_IDLE;
156 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
157 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
158 &tcp_keepidle, offsetof(skmem_sysctl, tcp.keepidle),
159 sysctl_msec_to_ticks, "I", "");
160
161 int tcp_keepintvl = TCPTV_KEEPINTVL;
162 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
163 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
164 &tcp_keepintvl, offsetof(skmem_sysctl, tcp.keepintvl),
165 sysctl_msec_to_ticks, "I", "");
166
167 SYSCTL_SKMEM_TCP_INT(OID_AUTO, keepcnt,
168 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
169 int, tcp_keepcnt, TCPTV_KEEPCNT, "number of times to repeat keepalive");
170
171 int tcp_msl = TCPTV_MSL;
172 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
173 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
174 &tcp_msl, offsetof(skmem_sysctl, tcp.msl),
175 sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
176 #else /* SYSCTL_SKMEM */
177 int tcp_keepinit;
178 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
179 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
180 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
181
182 int tcp_keepidle;
183 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
184 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
185 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
186
187 int tcp_keepintvl;
188 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
189 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
190 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
191
192 int tcp_keepcnt;
193 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
194 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
195 &tcp_keepcnt, 0, "number of times to repeat keepalive");
196
197 int tcp_msl;
198 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
199 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
200 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
201 #endif /* SYSCTL_SKMEM */
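/*
 * The knobs above are exposed under net.inet.tcp.* and, through
 * sysctl_msec_to_ticks(), are read and written in milliseconds.
 * Illustrative user-space usage (not part of this file):
 *
 * sysctl net.inet.tcp.keepidle
 * sysctl -w net.inet.tcp.keepintvl=75000
 */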
202
203 /*
204 * Avoid DoS via TCP Robustness in Persist Condition
205 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
206 * by allowing a system wide maximum persistence timeout value when in
207 * Zero Window Probe mode.
208 *
209 * Expressed in milliseconds to be consistent with other timeout-related
210 * values; the TCP socket option is in seconds.
211 */
212 #if SYSCTL_SKMEM
213 u_int32_t tcp_max_persist_timeout = 0;
214 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
215 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
216 &tcp_max_persist_timeout, offsetof(skmem_sysctl, tcp.max_persist_timeout),
217 sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP");
218 #else /* SYSCTL_SKMEM */
219 u_int32_t tcp_max_persist_timeout = 0;
220 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
221 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
222 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
223 "Maximum persistence timeout for ZWP");
224 #endif /* SYSCTL_SKMEM */
225
226 SYSCTL_SKMEM_TCP_INT(OID_AUTO, always_keepalive,
227 CTLFLAG_RW | CTLFLAG_LOCKED, static int, always_keepalive, 0,
228 "Assume SO_KEEPALIVE on all TCP connections");
229
230 /*
231 * This parameter determines how long the timer list will stay in fast or
232 * quick mode even when all connections are idle. In this state, the
233 * timer will run more frequently, anticipating new data.
234 */
235 SYSCTL_SKMEM_TCP_INT(OID_AUTO, timer_fastmode_idlemax,
236 CTLFLAG_RW | CTLFLAG_LOCKED, int, timer_fastmode_idlemax,
237 TCP_FASTMODE_IDLERUN_MAX, "Maximum idle generations in fast mode");
238
239 /*
240 * See tcp_syn_backoff[] for interval values between SYN retransmits;
241 * the value set below defines the number of retransmits before we
242 * disable the timestamp and window scaling options during subsequent
243 * SYN retransmits. Setting it to 0 disables the dropping of those
244 * two options.
245 */
246 SYSCTL_SKMEM_TCP_INT(OID_AUTO, broken_peer_syn_rexmit_thres,
247 CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_broken_peer_syn_rxmit_thres,
248 10, "Number of retransmitted SYNs before disabling RFC 1323 "
249 "options on local connections");
250
251 static int tcp_timer_advanced = 0;
252 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
253 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
254 "Number of times one of the timers was advanced");
255
256 static int tcp_resched_timerlist = 0;
257 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
258 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
259 "Number of times timer list was rescheduled as part of processing a packet");
260
261 SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_detection,
262 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_detect, 1,
263 "Path MTU Discovery Black Hole Detection");
264
265 SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_mss,
266 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_mss, 1200,
267 "Path MTU Discovery Black Hole Detection lowered MSS");
268
269 static u_int32_t tcp_mss_rec_medium = 1200;
270 static u_int32_t tcp_mss_rec_low = 512;
271
272 #define TCP_REPORT_STATS_INTERVAL 43200 /* 12 hours, in seconds */
273 int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL;
274
275 /* performed garbage collection of "used" sockets */
276 static boolean_t tcp_gc_done = FALSE;
277
278 /* max idle probes */
279 int tcp_maxpersistidle = TCPTV_KEEP_IDLE;
280
281 /*
282 * The TCP delack timer is set to 100 ms. Since the timer list in fast
283 * mode is processed no faster than every 100 ms, the delayed ack timer
284 * will fire somewhere between 100 and 200 ms.
285 */
286 int tcp_delack = TCP_RETRANSHZ / 10;
287
288 #if MPTCP
289 /*
290 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
291 */
292 int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
293 #endif /* MPTCP */
294
295 static boolean_t tcp_itimer_done = FALSE;
296
297 static void tcp_remove_timer(struct tcpcb *tp);
298 static void tcp_sched_timerlist(uint32_t offset);
299 static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode,
300 u_int16_t probe_if_index);
301 static void tcp_sched_timers(struct tcpcb *tp);
302 static inline void tcp_set_lotimer_index(struct tcpcb *);
303 __private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
304 static inline void tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp);
305 __private_extern__ void tcp_report_stats(void);
306
307 static u_int64_t tcp_last_report_time;
308
309 /*
310 * Structure to store previously reported stats so that we can send
311 * incremental changes in each report interval.
312 */
313 struct tcp_last_report_stats {
314 u_int32_t tcps_connattempt;
315 u_int32_t tcps_accepts;
316 u_int32_t tcps_ecn_client_setup;
317 u_int32_t tcps_ecn_server_setup;
318 u_int32_t tcps_ecn_client_success;
319 u_int32_t tcps_ecn_server_success;
320 u_int32_t tcps_ecn_not_supported;
321 u_int32_t tcps_ecn_lost_syn;
322 u_int32_t tcps_ecn_lost_synack;
323 u_int32_t tcps_ecn_recv_ce;
324 u_int32_t tcps_ecn_recv_ece;
325 u_int32_t tcps_ecn_sent_ece;
326 u_int32_t tcps_ecn_conn_recv_ce;
327 u_int32_t tcps_ecn_conn_recv_ece;
328 u_int32_t tcps_ecn_conn_plnoce;
329 u_int32_t tcps_ecn_conn_pl_ce;
330 u_int32_t tcps_ecn_conn_nopl_ce;
331 u_int32_t tcps_ecn_fallback_synloss;
332 u_int32_t tcps_ecn_fallback_reorder;
333 u_int32_t tcps_ecn_fallback_ce;
334
335 /* TFO-related statistics */
336 u_int32_t tcps_tfo_syn_data_rcv;
337 u_int32_t tcps_tfo_cookie_req_rcv;
338 u_int32_t tcps_tfo_cookie_sent;
339 u_int32_t tcps_tfo_cookie_invalid;
340 u_int32_t tcps_tfo_cookie_req;
341 u_int32_t tcps_tfo_cookie_rcv;
342 u_int32_t tcps_tfo_syn_data_sent;
343 u_int32_t tcps_tfo_syn_data_acked;
344 u_int32_t tcps_tfo_syn_loss;
345 u_int32_t tcps_tfo_blackhole;
346 u_int32_t tcps_tfo_cookie_wrong;
347 u_int32_t tcps_tfo_no_cookie_rcv;
348 u_int32_t tcps_tfo_heuristics_disable;
349 u_int32_t tcps_tfo_sndblackhole;
350
351 /* MPTCP-related statistics */
352 u_int32_t tcps_mptcp_handover_attempt;
353 u_int32_t tcps_mptcp_interactive_attempt;
354 u_int32_t tcps_mptcp_aggregate_attempt;
355 u_int32_t tcps_mptcp_fp_handover_attempt;
356 u_int32_t tcps_mptcp_fp_interactive_attempt;
357 u_int32_t tcps_mptcp_fp_aggregate_attempt;
358 u_int32_t tcps_mptcp_heuristic_fallback;
359 u_int32_t tcps_mptcp_fp_heuristic_fallback;
360 u_int32_t tcps_mptcp_handover_success_wifi;
361 u_int32_t tcps_mptcp_handover_success_cell;
362 u_int32_t tcps_mptcp_interactive_success;
363 u_int32_t tcps_mptcp_aggregate_success;
364 u_int32_t tcps_mptcp_fp_handover_success_wifi;
365 u_int32_t tcps_mptcp_fp_handover_success_cell;
366 u_int32_t tcps_mptcp_fp_interactive_success;
367 u_int32_t tcps_mptcp_fp_aggregate_success;
368 u_int32_t tcps_mptcp_handover_cell_from_wifi;
369 u_int32_t tcps_mptcp_handover_wifi_from_cell;
370 u_int32_t tcps_mptcp_interactive_cell_from_wifi;
371 u_int64_t tcps_mptcp_handover_cell_bytes;
372 u_int64_t tcps_mptcp_interactive_cell_bytes;
373 u_int64_t tcps_mptcp_aggregate_cell_bytes;
374 u_int64_t tcps_mptcp_handover_all_bytes;
375 u_int64_t tcps_mptcp_interactive_all_bytes;
376 u_int64_t tcps_mptcp_aggregate_all_bytes;
377 u_int32_t tcps_mptcp_back_to_wifi;
378 u_int32_t tcps_mptcp_wifi_proxy;
379 u_int32_t tcps_mptcp_cell_proxy;
380 u_int32_t tcps_mptcp_triggered_cell;
381 };
382
383
384 /* Returns true if the timer is on the timer list */
385 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
386
387 /* Run the TCP timer list at least once every hour */
388 #define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)
389
390
391 static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
392 static boolean_t tcp_garbage_collect(struct inpcb *, int);
393
394 #define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
395
396 #define VERIFY_NEXT_LINK(elm, field) do { \
397 if (LIST_NEXT((elm),field) != NULL && \
398 LIST_NEXT((elm),field)->field.le_prev != \
399 &((elm)->field.le_next)) \
400 panic("Bad link elm %p next->prev != elm", (elm)); \
401 } while(0)
402
403 #define VERIFY_PREV_LINK(elm, field) do { \
404 if (*(elm)->field.le_prev != (elm)) \
405 panic("Bad link elm %p prev->next != elm", (elm)); \
406 } while(0)
407
408 #define TCP_SET_TIMER_MODE(mode, i) do { \
409 if (IS_TIMER_HZ_10MS(i)) \
410 (mode) |= TCP_TIMERLIST_10MS_MODE; \
411 else if (IS_TIMER_HZ_100MS(i)) \
412 (mode) |= TCP_TIMERLIST_100MS_MODE; \
413 else \
414 (mode) |= TCP_TIMERLIST_500MS_MODE; \
415 } while(0)
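/*
 * Note (added for clarity): each timer index maps to one of three scan
 * granularities (10 ms, 100 ms or 500 ms). The mode bits accumulated by
 * TCP_SET_TIMER_MODE() let the timer list run at the finest granularity
 * required by any timer currently queued on it.
 */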
416
417 #if (DEVELOPMENT || DEBUG)
418 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_medium,
419 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_medium, 0,
420 "Medium MSS based on recommendation in link status report");
421 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_low,
422 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_low, 0,
423 "Low MSS based on recommendation in link status report");
424
425 static int32_t tcp_change_mss_recommended = 0;
426 static int
427 sysctl_change_mss_recommended SYSCTL_HANDLER_ARGS
428 {
429 #pragma unused(oidp, arg1, arg2)
430 int i, err = 0, changed = 0;
431 struct ifnet *ifp;
432 struct if_link_status ifsr;
433 struct if_cellular_status_v1 *new_cell_sr;
434 err = sysctl_io_number(req, tcp_change_mss_recommended,
435 sizeof(int32_t), &i, &changed);
436 if (changed) {
437 ifnet_head_lock_shared();
438 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
439 if (IFNET_IS_CELLULAR(ifp)) {
440 bzero(&ifsr, sizeof(ifsr));
441 new_cell_sr = &ifsr.ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
442 ifsr.ifsr_version = IF_CELLULAR_STATUS_REPORT_CURRENT_VERSION;
443 ifsr.ifsr_len = sizeof(*new_cell_sr);
444
445 /* Set MSS recommended */
446 new_cell_sr->valid_bitmask |= IF_CELL_UL_MSS_RECOMMENDED_VALID;
447 new_cell_sr->mss_recommended = i;
448 err = ifnet_link_status_report(ifp, new_cell_sr, sizeof(new_cell_sr));
449 if (err == 0) {
450 tcp_change_mss_recommended = i;
451 } else {
452 break;
453 }
454 }
455 }
456 ifnet_head_done();
457 }
458 return err;
459 }
460
461 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, change_mss_recommended,
462 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_change_mss_recommended,
463 0, sysctl_change_mss_recommended, "IU", "Change MSS recommended");
464
465 SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval,
466 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0,
467 "Report stats interval");
468 #endif /* (DEVELOPMENT || DEBUG) */
469
470 /*
471 * Function to compare two timers. If there is a reset of the sign bit,
472 * it is safe to assume that the timer has wrapped around. By doing a
473 * signed comparison, we handle wraparound such that the value
474 * with the sign bit reset is actually ahead of the other.
475 */
476 inline int32_t
477 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
478 {
479 return (int32_t)((t1 + toff1) - (t2 + toff2));
480 }
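/*
 * Wraparound example (added for clarity): timer_diff(0x00000001, 0,
 * 0xfffffffe, 0) evaluates to (int32_t)(1 - 0xfffffffe) == 3 > 0, so a
 * timestamp just past the 32-bit wrap is correctly treated as being
 * ahead of one taken just before the wrap.
 */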
481
482 /*
483 * Add to the TCP time-wait list; the delay is given in milliseconds.
484 */
485 static void
486 add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
487 {
488 struct inpcbinfo *pcbinfo = &tcbinfo;
489 struct inpcb *inp = tp->t_inpcb;
490 uint32_t timer;
491
492 /* pcb list should be locked when we get here */
493 LCK_RW_ASSERT(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
494
495 /* We may get here multiple times, so check */
496 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
497 pcbinfo->ipi_twcount++;
498 inp->inp_flags2 |= INP2_TIMEWAIT;
499
500 /* Remove from global inp list */
501 LIST_REMOVE(inp, inp_list);
502 } else {
503 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
504 }
505
506 /* Compute the time at which this socket can be closed */
507 timer = tcp_now + delay;
508
509 /* We will use the TCPT_2MSL timer for tracking this delay */
510
511 if (TIMER_IS_ON_LIST(tp)) {
512 tcp_remove_timer(tp);
513 }
514 tp->t_timer[TCPT_2MSL] = timer;
515
516 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
517 }
518
519 void
520 add_to_time_wait(struct tcpcb *tp, uint32_t delay)
521 {
522 struct inpcbinfo *pcbinfo = &tcbinfo;
523 if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP) {
524 socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);
525 }
526
527 /* 19182803: Notify nstat that connection is closing before waiting. */
528 nstat_pcb_detach(tp->t_inpcb);
529
530 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
531 socket_unlock(tp->t_inpcb->inp_socket, 0);
532 lck_rw_lock_exclusive(pcbinfo->ipi_lock);
533 socket_lock(tp->t_inpcb->inp_socket, 0);
534 }
535 add_to_time_wait_locked(tp, delay);
536 lck_rw_done(pcbinfo->ipi_lock);
537
538 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
539 }
540
541 /* If this is on time wait queue, remove it. */
542 void
543 tcp_remove_from_time_wait(struct inpcb *inp)
544 {
545 struct tcpcb *tp = intotcpcb(inp);
546 if (inp->inp_flags2 & INP2_TIMEWAIT) {
547 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
548 }
549 }
550
551 static boolean_t
552 tcp_garbage_collect(struct inpcb *inp, int istimewait)
553 {
554 boolean_t active = FALSE;
555 struct socket *so, *mp_so = NULL;
556 struct tcpcb *tp;
557
558 so = inp->inp_socket;
559 tp = intotcpcb(inp);
560
561 if (so->so_flags & SOF_MP_SUBFLOW) {
562 mp_so = mptetoso(tptomptp(tp)->mpt_mpte);
563 if (!socket_try_lock(mp_so)) {
564 mp_so = NULL;
565 active = TRUE;
566 goto out;
567 }
568 mp_so->so_usecount++;
569 }
570
571 /*
572 * Skip if still in use or busy; it would have been more efficient
573 * if we were to test so_usecount against 0, but this isn't possible
574 * due to the current implementation of tcp_dropdropablreq() where
575 * overflow sockets that are eligible for garbage collection have
576 * their usecounts set to 1.
577 */
578 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx)) {
579 active = TRUE;
580 goto out;
581 }
582
583 /* Check again under the lock */
584 if (so->so_usecount > 1) {
585 if (inp->inp_wantcnt == WNT_STOPUSING) {
586 active = TRUE;
587 }
588 lck_mtx_unlock(&inp->inpcb_mtx);
589 goto out;
590 }
591
592 if (istimewait && TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
593 tp->t_state != TCPS_CLOSED) {
594 /* Become a regular mutex */
595 lck_mtx_convert_spin(&inp->inpcb_mtx);
596 tcp_close(tp);
597 }
598
599 /*
600 * Overflowed socket dropped from the listening queue? Do this
601 * only if we are called to clean up the time wait slots, since
602 * tcp_dropdropablreq() considers a socket to have been fully
603 * dropped after add_to_time_wait() is finished.
604 * Also handle the case of connections getting closed by the peer
605 * while in the queue as seen with rdar://6422317
606 *
607 */
608 if (so->so_usecount == 1 &&
609 ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
610 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
611 (so->so_head != NULL) &&
612 ((so->so_state & (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) ==
613 (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE))))) {
614 if (inp->inp_state != INPCB_STATE_DEAD) {
615 /* Become a regular mutex */
616 lck_mtx_convert_spin(&inp->inpcb_mtx);
617 #if INET6
618 if (SOCK_CHECK_DOM(so, PF_INET6)) {
619 in6_pcbdetach(inp);
620 } else
621 #endif /* INET6 */
622 in_pcbdetach(inp);
623 }
624 VERIFY(so->so_usecount > 0);
625 so->so_usecount--;
626 if (inp->inp_wantcnt == WNT_STOPUSING) {
627 active = TRUE;
628 }
629 lck_mtx_unlock(&inp->inpcb_mtx);
630 goto out;
631 } else if (inp->inp_wantcnt != WNT_STOPUSING) {
632 lck_mtx_unlock(&inp->inpcb_mtx);
633 active = FALSE;
634 goto out;
635 }
636
637 /*
638 * We get here because the PCB is no longer searchable
639 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
640 * (usecount is 0). This covers all cases, including overflow
641 * sockets and those that are considered as "embryonic",
642 * i.e. created by sonewconn() in TCP input path, and have
643 * not yet been committed. For the former, we reduce the usecount
644 * to 0 as done by the code above. For the latter, the usecount
645 * would have been reduced to 0 as part of calling soabort() when the
646 * socket is dropped at the end of tcp_input().
647 */
648 if (so->so_usecount == 0) {
649 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
650 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
651 /* Become a regular mutex */
652 lck_mtx_convert_spin(&inp->inpcb_mtx);
653
654 /*
655 * If this tp still happens to be on the timer list,
656 * take it out
657 */
658 if (TIMER_IS_ON_LIST(tp)) {
659 tcp_remove_timer(tp);
660 }
661
662 if (inp->inp_state != INPCB_STATE_DEAD) {
663 #if INET6
664 if (SOCK_CHECK_DOM(so, PF_INET6)) {
665 in6_pcbdetach(inp);
666 } else
667 #endif /* INET6 */
668 in_pcbdetach(inp);
669 }
670
671 if (mp_so) {
672 mptcp_subflow_del(tptomptp(tp)->mpt_mpte, tp->t_mpsub);
673
674 /* so is now unlinked from mp_so - let's drop the lock */
675 socket_unlock(mp_so, 1);
676 mp_so = NULL;
677 }
678
679 in_pcbdispose(inp);
680 active = FALSE;
681 goto out;
682 }
683
684 lck_mtx_unlock(&inp->inpcb_mtx);
685 active = TRUE;
686
687 out:
688 if (mp_so) {
689 socket_unlock(mp_so, 1);
690 }
691
692 return active;
693 }
694
695 /*
696 * TCP garbage collector callback (inpcb_timer_func_t).
697 *
698 * Reports the number of pcbs that will need to be gc-ed soon;
699 * a count > 0 will keep the timer active.
700 */
701 void
702 tcp_gc(struct inpcbinfo *ipi)
703 {
704 struct inpcb *inp, *nxt;
705 struct tcpcb *tw_tp, *tw_ntp;
706 #if TCPDEBUG
707 int ostate;
708 #endif
709 #if KDEBUG
710 static int tws_checked = 0;
711 #endif
712
713 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
714
715 /*
716 * Update tcp_now here as it may get used while
717 * processing the slow timer.
718 */
719 calculate_tcp_clock();
720
721 /*
722 * Garbage collect socket/tcpcb: We need to acquire the list lock
723 * exclusively to do this
724 */
725
726 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
727 /* don't sweat it this time; cleanup was done last time */
728 if (tcp_gc_done == TRUE) {
729 tcp_gc_done = FALSE;
730 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
731 tws_checked, cur_tw_slot, 0, 0, 0);
732 /* Lock upgrade failed, give up this round */
733 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
734 return;
735 }
736 /* Upgrade failed; take the lock exclusively now */
737 lck_rw_lock_exclusive(ipi->ipi_lock);
738 }
739 tcp_gc_done = TRUE;
740
741 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
742 if (tcp_garbage_collect(inp, 0)) {
743 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
744 }
745 }
746
747 /* Now cleanup the time wait ones */
748 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
749 /*
750 * We check the timestamp here without holding the
751 * socket lock for better performance. If there are
752 * any pcbs in time-wait, the timer will get rescheduled.
753 * Hence some error in this check can be tolerated.
754 *
755 * Sometimes a socket on time-wait queue can be closed if
756 * 2MSL timer expired but the application still has a
757 * usecount on it.
758 */
759 if (tw_tp->t_state == TCPS_CLOSED ||
760 TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
761 if (tcp_garbage_collect(tw_tp->t_inpcb, 1)) {
762 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
763 }
764 }
765 }
766
767 /* take into account pcbs that are still in time_wait_slots */
768 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
769
770 lck_rw_done(ipi->ipi_lock);
771
772 /* Clean up the socache while we are here */
773 if (so_cache_timer()) {
774 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
775 }
776
777 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
778 cur_tw_slot, 0, 0, 0);
779
780 return;
781 }
782
783 /*
784 * Cancel all timers for TCP tp.
785 */
786 void
787 tcp_canceltimers(struct tcpcb *tp)
788 {
789 int i;
790
791 tcp_remove_timer(tp);
792 for (i = 0; i < TCPT_NTIMERS; i++) {
793 tp->t_timer[i] = 0;
794 }
795 tp->tentry.timer_start = tcp_now;
796 tp->tentry.index = TCPT_NONE;
797 }
798
799 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
800 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
801
802 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
803 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
804
805 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
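/* Check: 1 + 2 + 4 + 8 + 16 + 32 + (7 * 64) = 63 + 448 = 511. */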
806
807 void
808 tcp_rexmt_save_state(struct tcpcb *tp)
809 {
810 u_int32_t fsize;
811 if (TSTMP_SUPPORTED(tp)) {
812 /*
813 * Since timestamps are supported on the connection,
814 * we can do recovery as described in rfc 4015.
815 */
816 fsize = tp->snd_max - tp->snd_una;
817 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
818 tp->snd_recover_prev = tp->snd_recover;
819 } else {
820 /*
821 * Timestamp option is not supported on this connection.
822 * Record ssthresh and cwnd so they can
823 * be recovered if this turns out to be a "bad" retransmit.
824 * A retransmit is considered "bad" if an ACK for this
825 * segment is received within RTT/2 interval; the assumption
826 * here is that the ACK was already in flight. See
827 * "On Estimating End-to-End Network Path Properties" by
828 * Allman and Paxson for more details.
829 */
830 tp->snd_cwnd_prev = tp->snd_cwnd;
831 tp->snd_ssthresh_prev = tp->snd_ssthresh;
832 tp->snd_recover_prev = tp->snd_recover;
833 if (IN_FASTRECOVERY(tp)) {
834 tp->t_flags |= TF_WASFRECOVERY;
835 } else {
836 tp->t_flags &= ~TF_WASFRECOVERY;
837 }
838 }
839 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
840 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
841 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
842 }
843
844 /*
845 * Revert to the older segment size if there is an indication that PMTU
846 * blackhole detection was not needed.
847 */
848 void
849 tcp_pmtud_revert_segment_size(struct tcpcb *tp)
850 {
851 int32_t optlen;
852
853 VERIFY(tp->t_pmtud_saved_maxopd > 0);
854 tp->t_flags |= TF_PMTUD;
855 tp->t_flags &= ~TF_BLACKHOLE;
856 optlen = tp->t_maxopd - tp->t_maxseg;
857 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
858 tp->t_maxseg = tp->t_maxopd - optlen;
859
860 /*
861 * Reset the slow-start flight size as it
862 * may depend on the new MSS
863 */
864 if (CC_ALGO(tp)->cwnd_init != NULL) {
865 CC_ALGO(tp)->cwnd_init(tp);
866 }
867 tp->t_pmtud_start_ts = 0;
868 tcpstat.tcps_pmtudbh_reverted++;
869
870 /* change MSS according to recommendation, if there was one */
871 tcp_update_mss_locked(tp->t_inpcb->inp_socket, NULL);
872 }
873
874 /*
875 * TCP timer processing.
876 */
877 struct tcpcb *
878 tcp_timers(struct tcpcb *tp, int timer)
879 {
880 int32_t rexmt, optlen = 0, idle_time = 0;
881 struct socket *so;
882 struct tcptemp *t_template;
883 #if TCPDEBUG
884 int ostate;
885 #endif
886
887 #if INET6
888 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
889 #endif /* INET6 */
890 u_int64_t accsleep_ms;
891 u_int32_t last_sleep_ms = 0;
892
893 so = tp->t_inpcb->inp_socket;
894 idle_time = tcp_now - tp->t_rcvtime;
895
896 switch (timer) {
897 /*
898 * 2 MSL timeout in shutdown went off. If we're closed but
899 * still waiting for peer to close and connection has been idle
900 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
901 * delete connection control block.
902 * Otherwise (this case shouldn't happen), check again in a bit;
903 * we keep the socket in the main list in that case.
904 */
905 case TCPT_2MSL:
906 tcp_free_sackholes(tp);
907 if (tp->t_state != TCPS_TIME_WAIT &&
908 tp->t_state != TCPS_FIN_WAIT_2 &&
909 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
910 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
911 (u_int32_t)TCP_CONN_KEEPINTVL(tp));
912 } else {
913 tp = tcp_close(tp);
914 return tp;
915 }
916 break;
917
918 /*
919 * Retransmission timer went off. Message has not
920 * been acked within retransmit interval. Back off
921 * to a longer retransmit interval and retransmit one segment.
922 */
923 case TCPT_REXMT:
924 absolutetime_to_nanoseconds(mach_absolutetime_asleep,
925 &accsleep_ms);
926 accsleep_ms = accsleep_ms / 1000000UL;
927 if (accsleep_ms > tp->t_accsleep_ms) {
928 last_sleep_ms = accsleep_ms - tp->t_accsleep_ms;
929 }
930 /*
931 * Drop a connection in the retransmit timer
932 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
933 * times
934 * 2. If the time spent in this retransmission episode is
935 * more than the time limit set with TCP_RXT_CONNDROPTIME
936 * socket option
937 * 3. If TCP_RXT_FINDROP socket option was set and
938 * we have already retransmitted the FIN 3 times without
939 * receiving an ack
940 */
941 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
942 (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
943 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
944 ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
945 (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4) ||
946 (tp->t_rxtshift > 4 && last_sleep_ms >= TCP_SLEEP_TOO_LONG)) {
947 if (tp->t_state == TCPS_ESTABLISHED &&
948 tp->t_rxt_minimum_timeout > 0) {
949 /*
950 * Avoid dropping a connection if minimum
951 * timeout is set and that time did not
952 * pass. We will retry sending
953 * retransmissions at the maximum interval
954 */
955 if (TSTMP_LT(tcp_now, (tp->t_rxtstart +
956 tp->t_rxt_minimum_timeout))) {
957 tp->t_rxtshift = TCP_MAXRXTSHIFT - 1;
958 goto retransmit_packet;
959 }
960 }
961 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
962 tcpstat.tcps_rxtfindrop++;
963 } else if (last_sleep_ms >= TCP_SLEEP_TOO_LONG) {
964 tcpstat.tcps_drop_after_sleep++;
965 } else {
966 tcpstat.tcps_timeoutdrop++;
967 }
968 if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) {
969 if (TCP_ECN_ENABLED(tp)) {
970 INP_INC_IFNET_STAT(tp->t_inpcb,
971 ecn_on.rxmit_drop);
972 } else {
973 INP_INC_IFNET_STAT(tp->t_inpcb,
974 ecn_off.rxmit_drop);
975 }
976 }
977 tp->t_rxtshift = TCP_MAXRXTSHIFT;
978 postevent(so, 0, EV_TIMEOUT);
979 soevent(so,
980 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
981
982 if (TCP_ECN_ENABLED(tp) &&
983 tp->t_state == TCPS_ESTABLISHED) {
984 tcp_heuristic_ecn_droprxmt(tp);
985 }
986
987 tp = tcp_drop(tp, tp->t_softerror ?
988 tp->t_softerror : ETIMEDOUT);
989
990 break;
991 }
992 retransmit_packet:
993 tcpstat.tcps_rexmttimeo++;
994 tp->t_accsleep_ms = accsleep_ms;
995
996 if (tp->t_rxtshift == 1 &&
997 tp->t_state == TCPS_ESTABLISHED) {
998 /* Set the time at which retransmission started. */
999 tp->t_rxtstart = tcp_now;
1000
1001 /*
1002 * if this is the first retransmit timeout, save
1003 * the state so that we can recover if the timeout
1004 * is spurious.
1005 */
1006 tcp_rexmt_save_state(tp);
1007 }
1008 #if MPTCP
1009 if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
1010 (tp->t_state == TCPS_ESTABLISHED) &&
1011 (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
1012 mptcp_act_on_txfail(so);
1013 }
1014
1015 if (so->so_flags & SOF_MP_SUBFLOW) {
1016 struct mptses *mpte = tptomptp(tp)->mpt_mpte;
1017
1018 mptcp_check_subflows_and_add(mpte);
1019 }
1020 #endif /* MPTCP */
1021
1022 if (tp->t_adaptive_wtimo > 0 &&
1023 tp->t_rxtshift > tp->t_adaptive_wtimo &&
1024 TCPS_HAVEESTABLISHED(tp->t_state)) {
1025 /* Send an event to the application */
1026 soevent(so,
1027 (SO_FILT_HINT_LOCKED |
1028 SO_FILT_HINT_ADAPTIVE_WTIMO));
1029 }
1030
1031 /*
1032 * If this is a retransmit timeout after PTO, the PTO
1033 * was not effective
1034 */
1035 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1036 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1037 tcpstat.tcps_rto_after_pto++;
1038 }
1039
1040 if (tp->t_flagsext & TF_DELAY_RECOVERY) {
1041 /*
1042 * Retransmit timer fired before entering recovery
1043 * on a connection with packet re-ordering. This
1044 * suggests that the reordering metrics computed
1045 * are not accurate.
1046 */
1047 tp->t_reorderwin = 0;
1048 tp->t_timer[TCPT_DELAYFR] = 0;
1049 tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
1050 }
1051
1052 if (tp->t_state == TCPS_SYN_RECEIVED) {
1053 tcp_disable_tfo(tp);
1054 }
1055
1056 if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1057 (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
1058 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
1059 ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
1060 tp->t_rxtshift > 4)) {
1061 /*
1062 * For regular retransmissions, a first one is being
1063 * done for tail-loss probe.
1064 * Thus, if rxtshift > 1, this means we have sent the segment
1065 * a total of 3 times.
1066 *
1067 * If we are in SYN-SENT state, then there is no tail-loss
1068 * probe thus we have to let rxtshift go up to 3.
1069 */
1070 tcp_heuristic_tfo_middlebox(tp);
1071
1072 so->so_error = ENODATA;
1073 sorwakeup(so);
1074 sowwakeup(so);
1075
1076 tp->t_tfo_stats |= TFO_S_SEND_BLACKHOLE;
1077 tcpstat.tcps_tfo_sndblackhole++;
1078 }
1079
1080 if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1081 (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) &&
1082 tp->t_rxtshift > 3) {
1083 if (TSTMP_GT(tp->t_sndtime - 10 * TCP_RETRANSHZ, tp->t_rcvtime)) {
1084 tcp_heuristic_tfo_middlebox(tp);
1085
1086 so->so_error = ENODATA;
1087 sorwakeup(so);
1088 sowwakeup(so);
1089 }
1090 }
1091
1092 if (tp->t_state == TCPS_SYN_SENT) {
1093 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
1094 tp->t_stat.synrxtshift = tp->t_rxtshift;
1095
1096 /* When retransmitting, disable TFO */
1097 if (tfo_enabled(tp) &&
1098 (!(so->so_flags1 & SOF1_DATA_AUTHENTICATED) ||
1099 (tp->t_flagsext & TF_FASTOPEN_HEUR))) {
1100 tp->t_flagsext &= ~TF_FASTOPEN;
1101 tp->t_tfo_flags |= TFO_F_SYN_LOSS;
1102 }
1103 } else {
1104 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
1105 }
1106
1107 TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX,
1108 TCP_ADD_REXMTSLOP(tp));
1109 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1110
1111 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) {
1112 goto fc_output;
1113 }
1114
1115 tcp_free_sackholes(tp);
1116 /*
1117 * Check for potential Path MTU Discovery Black Hole
1118 */
1119 if (tcp_pmtud_black_hole_detect &&
1120 !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
1121 (tp->t_state == TCPS_ESTABLISHED)) {
1122 if ((tp->t_flags & TF_PMTUD) &&
1123 ((tp->t_flags & TF_MAXSEGSNT)
1124 || tp->t_pmtud_lastseg_size > tcp_pmtud_black_hole_mss) &&
1125 tp->t_rxtshift == 2) {
1126 /*
1127 * Enter Path MTU Black-hole Detection mechanism:
1128 * - Disable Path MTU Discovery (IP "DF" bit).
1129 * - Reduce MTU to lower value than what we
1130 * negotiated with the peer.
1131 */
1132 /* Disable Path MTU Discovery for now */
1133 tp->t_flags &= ~TF_PMTUD;
1134 /* Record that we may have found a black hole */
1135 tp->t_flags |= TF_BLACKHOLE;
1136 optlen = tp->t_maxopd - tp->t_maxseg;
1137 /* Keep track of previous MSS */
1138 tp->t_pmtud_saved_maxopd = tp->t_maxopd;
1139 tp->t_pmtud_start_ts = tcp_now;
1140 if (tp->t_pmtud_start_ts == 0) {
1141 tp->t_pmtud_start_ts++;
1142 }
1143 /* Reduce the MSS to intermediary value */
1144 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
1145 tp->t_maxopd = tcp_pmtud_black_hole_mss;
1146 } else {
1147 tp->t_maxopd = /* use the default MSS */
1148 #if INET6
1149 isipv6 ? tcp_v6mssdflt :
1150 #endif /* INET6 */
1151 tcp_mssdflt;
1152 }
1153 tp->t_maxseg = tp->t_maxopd - optlen;
1154
1155 /*
1156 * Reset the slow-start flight size
1157 * as it may depend on the new MSS
1158 */
1159 if (CC_ALGO(tp)->cwnd_init != NULL) {
1160 CC_ALGO(tp)->cwnd_init(tp);
1161 }
1162 tp->snd_cwnd = tp->t_maxseg;
1163 }
1164 /*
1165 * If further retransmissions are still
1166 * unsuccessful with a lowered MTU, maybe this
1167 * isn't a Black Hole and we restore the previous
1168 * MSS and blackhole detection flags.
1169 */
1170 else {
1171 if ((tp->t_flags & TF_BLACKHOLE) &&
1172 (tp->t_rxtshift > 4)) {
1173 tcp_pmtud_revert_segment_size(tp);
1174 tp->snd_cwnd = tp->t_maxseg;
1175 }
1176 }
1177 }
1178
1179
1180 /*
1181 * Disable rfc1323 and rfc1644 if we haven't got any
1182 * response to our SYN (after we reach the threshold)
1183 * to work-around some broken terminal servers (most of
1184 * which have hopefully been retired) that have bad VJ
1185 * header compression code which trashes TCP segments
1186 * containing unknown-to-them TCP options.
1187 * Do this only on non-local connections.
1188 */
1189 if (tp->t_state == TCPS_SYN_SENT &&
1190 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) {
1191 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_REQ_CC);
1192 }
1193
1194 /*
1195 * If losing, let the lower level know and try for
1196 * a better route. Also, if we backed off this far,
1197 * our srtt estimate is probably bogus. Clobber it
1198 * so we'll take the next rtt measurement as our srtt;
1199 * move the current srtt into rttvar to keep the current
1200 * retransmit times until then.
1201 */
1202 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
1203 #if INET6
1204 if (isipv6) {
1205 in6_losing(tp->t_inpcb);
1206 } else
1207 #endif /* INET6 */
1208 in_losing(tp->t_inpcb);
1209 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
1210 tp->t_srtt = 0;
1211 }
1212 tp->snd_nxt = tp->snd_una;
1213 /*
1214 * Note: We overload snd_recover to function also as the
1215 * snd_last variable described in RFC 2582
1216 */
1217 tp->snd_recover = tp->snd_max;
1218 /*
1219 * Force a segment to be sent.
1220 */
1221 tp->t_flags |= TF_ACKNOW;
1222
1223 /* If timing a segment in this window, stop the timer */
1224 tp->t_rtttime = 0;
1225
1226 if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1) {
1227 tcpstat.tcps_tailloss_rto++;
1228 }
1229
1230
1231 /*
1232 * RFC 5681 says: when a TCP sender detects segment loss
1233 * using retransmit timer and the given segment has already
1234 * been retransmitted by way of the retransmission timer at
1235 * least once, the value of ssthresh is held constant
1236 */
1237 if (tp->t_rxtshift == 1 &&
1238 CC_ALGO(tp)->after_timeout != NULL) {
1239 CC_ALGO(tp)->after_timeout(tp);
1240 /*
1241 * CWR notifications are to be sent on new data
1242 * right after Fast Retransmits and ECE
1243 * notification receipts.
1244 */
1245 if (TCP_ECN_ENABLED(tp)) {
1246 tp->ecn_flags |= TE_SENDCWR;
1247 }
1248 }
1249
1250 EXIT_FASTRECOVERY(tp);
1251
1252 /* Exit cwnd non validated phase */
1253 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
1254
1255
1256 fc_output:
1257 tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);
1258
1259 (void) tcp_output(tp);
1260 break;
1261
1262 /*
1263 * Persistence timer into zero window.
1264 * Force a byte to be output, if possible.
1265 */
1266 case TCPT_PERSIST:
1267 tcpstat.tcps_persisttimeo++;
1268 /*
1269 * Hack: if the peer is dead/unreachable, we do not
1270 * time out if the window is closed. After a full
1271 * backoff, drop the connection if the idle time
1272 * (no responses to probes) reaches the maximum
1273 * backoff that we would use if retransmitting.
1274 *
1275 * Drop the connection if we reached the maximum allowed time for
1276 * Zero Window Probes without a non-zero update from the peer.
1277 * See rdar://5805356
1278 */
1279 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
1280 (idle_time >= tcp_maxpersistidle ||
1281 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
1282 ((tp->t_persist_stop != 0) &&
1283 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
1284 tcpstat.tcps_persistdrop++;
1285 postevent(so, 0, EV_TIMEOUT);
1286 soevent(so,
1287 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
1288 tp = tcp_drop(tp, ETIMEDOUT);
1289 break;
1290 }
1291 tcp_setpersist(tp);
1292 tp->t_flagsext |= TF_FORCE;
1293 (void) tcp_output(tp);
1294 tp->t_flagsext &= ~TF_FORCE;
1295 break;
1296
1297 /*
1298 * Keep-alive timer went off; send something
1299 * or drop connection if idle for too long.
1300 */
1301 case TCPT_KEEP:
1302 tcpstat.tcps_keeptimeo++;
1303 #if MPTCP
1304 /*
1305 * Regular TCP connections do not send keepalives after closing.
1306 * MPTCP must not either, after sending Data FINs.
1307 */
1308 struct mptcb *mp_tp = tptomptp(tp);
1309 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
1310 (tp->t_state > TCPS_ESTABLISHED)) {
1311 goto dropit;
1312 } else if (mp_tp != NULL) {
1313 if ((mptcp_ok_to_keepalive(mp_tp) == 0)) {
1314 goto dropit;
1315 }
1316 }
1317 #endif /* MPTCP */
1318 if (tp->t_state < TCPS_ESTABLISHED) {
1319 goto dropit;
1320 }
1321 if ((always_keepalive ||
1322 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
1323 (tp->t_flagsext & TF_DETECT_READSTALL) ||
1324 (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
1325 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
1326 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp)) {
1327 goto dropit;
1328 }
1329 /*
1330 * Send a packet designed to force a response
1331 * if the peer is up and reachable:
1332 * either an ACK if the connection is still alive,
1333 * or an RST if the peer has closed the connection
1334 * due to timeout or reboot.
1335 * Using sequence number tp->snd_una-1
1336 * causes the transmitted zero-length segment
1337 * to lie outside the receive window;
1338 * by the protocol spec, this requires the
1339 * correspondent TCP to respond.
1340 */
1341 tcpstat.tcps_keepprobe++;
1342 t_template = tcp_maketemplate(tp);
1343 if (t_template) {
1344 struct inpcb *inp = tp->t_inpcb;
1345 struct tcp_respond_args tra;
1346
1347 bzero(&tra, sizeof(tra));
1348 tra.nocell = INP_NO_CELLULAR(inp);
1349 tra.noexpensive = INP_NO_EXPENSIVE(inp);
1350 tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
1351 tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
1352 if (tp->t_inpcb->inp_flags & INP_BOUND_IF) {
1353 tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
1354 } else {
1355 tra.ifscope = IFSCOPE_NONE;
1356 }
1357 tcp_respond(tp, t_template->tt_ipgen,
1358 &t_template->tt_t, (struct mbuf *)NULL,
1359 tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
1360 (void) m_free(dtom(t_template));
1361 if (tp->t_flagsext & TF_DETECT_READSTALL) {
1362 tp->t_rtimo_probes++;
1363 }
1364 }
1365 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1366 TCP_CONN_KEEPINTVL(tp));
1367 } else {
1368 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1369 TCP_CONN_KEEPIDLE(tp));
1370 }
1371 if (tp->t_flagsext & TF_DETECT_READSTALL) {
1372 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
1373 bool reenable_probe = false;
1374 /*
1375 * The keep alive packets sent to detect a read
1376 * stall did not get a response from the
1377 * peer. Generate more keep-alives to confirm this.
1378 * If the number of probes sent reaches the limit,
1379 * generate an event.
1380 */
1381 if (tp->t_adaptive_rtimo > 0) {
1382 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
1383 /* Generate an event */
1384 soevent(so,
1385 (SO_FILT_HINT_LOCKED |
1386 SO_FILT_HINT_ADAPTIVE_RTIMO));
1387 tcp_keepalive_reset(tp);
1388 } else {
1389 reenable_probe = true;
1390 }
1391 } else if (outifp != NULL &&
1392 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY) &&
1393 tp->t_rtimo_probes <= TCP_CONNECTIVITY_PROBES_MAX) {
1394 reenable_probe = true;
1395 } else {
1396 tp->t_flagsext &= ~TF_DETECT_READSTALL;
1397 }
1398 if (reenable_probe) {
1399 int ind = min(tp->t_rtimo_probes,
1400 TCP_MAXRXTSHIFT);
1401 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
1402 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp));
1403 }
1404 }
1405 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING) {
1406 int ind;
1407
1408 tp->t_tfo_probes++;
1409 ind = min(tp->t_tfo_probes, TCP_MAXRXTSHIFT);
1410
1411 /*
1412 * We take the minimum of the time set by true
1413 * keepalive (see above) and the backed-off RTO. That
1414 * way we back off in case of packet loss but will never
1415 * time out more slowly than regular keepalive due to
1416 * the backing off.
1417 */
1418 tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
1419 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
1420 tp->t_timer[TCPT_KEEP]);
1421 } else if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1422 tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
1423 /* Still no data! Let's assume a TFO-error and err out... */
1424 tcp_heuristic_tfo_middlebox(tp);
1425
1426 so->so_error = ENODATA;
1427 sorwakeup(so);
1428 tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE;
1429 tcpstat.tcps_tfo_blackhole++;
1430 }
1431 break;
1432 case TCPT_DELACK:
1433 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
1434 tp->t_flags &= ~TF_DELACK;
1435 tp->t_timer[TCPT_DELACK] = 0;
1436 tp->t_flags |= TF_ACKNOW;
1437
1438 /*
1439 * If delayed ack timer fired while stretching
1440 * acks, count the number of times the streaming
1441 * detection was not correct. If this exceeds a
1442 * threshold, disable stretch ack on this
1443 * connection.
1444 *
1445 * Also, go back to acking every other packet.
1446 */
1447 if ((tp->t_flags & TF_STRETCHACK)) {
1448 if (tp->t_unacksegs > 1 &&
1449 tp->t_unacksegs < maxseg_unacked) {
1450 tp->t_stretchack_delayed++;
1451 }
1452
1453 if (tp->t_stretchack_delayed >
1454 TCP_STRETCHACK_DELAY_THRESHOLD) {
1455 tp->t_flagsext |= TF_DISABLE_STRETCHACK;
1456 /*
1457 * Note the time at which stretch
1458 * ack was disabled automatically
1459 */
1460 tp->rcv_nostrack_ts = tcp_now;
1461 tcpstat.tcps_nostretchack++;
1462 tp->t_stretchack_delayed = 0;
1463 tp->rcv_nostrack_pkts = 0;
1464 }
1465 tcp_reset_stretch_ack(tp);
1466 }
1467
1468 /*
1469 * If we are measuring inter-packet arrival jitter
1470 * for throttling a connection, this delayed ack
1471 * might be the reason for accumulating some
1472 * jitter. So let's restart the measurement.
1473 */
1474 CLEAR_IAJ_STATE(tp);
1475
1476 tcpstat.tcps_delack++;
1477 (void) tcp_output(tp);
1478 }
1479 break;
1480
1481 #if MPTCP
1482 case TCPT_JACK_RXMT:
1483 if ((tp->t_state == TCPS_ESTABLISHED) &&
1484 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
1485 (tp->t_mpflags & TMPF_JOINED_FLOW)) {
1486 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
1487 tcpstat.tcps_timeoutdrop++;
1488 postevent(so, 0, EV_TIMEOUT);
1489 soevent(so,
1490 (SO_FILT_HINT_LOCKED |
1491 SO_FILT_HINT_TIMEOUT));
1492 tp = tcp_drop(tp, tp->t_softerror ?
1493 tp->t_softerror : ETIMEDOUT);
1494 break;
1495 }
1496 tcpstat.tcps_join_rxmts++;
1497 tp->t_mpflags |= TMPF_SND_JACK;
1498 tp->t_flags |= TF_ACKNOW;
1499
1500 /*
1501 * For simplicity, no backoff is implemented for this
1502 * corner case.
1503 */
1504 (void) tcp_output(tp);
1505 }
1506 break;
1507 #endif /* MPTCP */
1508
1509 case TCPT_PTO:
1510 {
1511 int32_t snd_len;
1512 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1513
1514 /*
1515 * Check if the connection is in the right state to
1516 * send a probe
1517 */
1518 if (tp->t_state != TCPS_ESTABLISHED ||
1519 (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) ||
1520 tp->snd_max == tp->snd_una ||
1521 !SACK_ENABLED(tp) ||
1522 !TAILQ_EMPTY(&tp->snd_holes) ||
1523 IN_FASTRECOVERY(tp)) {
1524 break;
1525 }
1526
1527 /*
1528 * If there is no new data to send or if the
1529 * connection is limited by receive window then
1530 * retransmit the last segment, otherwise send
1531 * new data.
1532 */
1533 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
1534 - (tp->snd_max - tp->snd_una);
1535 if (snd_len > 0) {
1536 tp->snd_nxt = tp->snd_max;
1537 } else {
1538 snd_len = min((tp->snd_max - tp->snd_una),
1539 tp->t_maxseg);
1540 tp->snd_nxt = tp->snd_max - snd_len;
1541 }
1542
1543 tcpstat.tcps_pto++;
1544 if (tp->t_flagsext & TF_PROBING) {
1545 tcpstat.tcps_probe_if++;
1546 }
1547
1548 /* If timing a segment in this window, stop the timer */
1549 tp->t_rtttime = 0;
1550 /* Note that tail loss probe is being sent */
1551 tp->t_flagsext |= TF_SENT_TLPROBE;
1552 tp->t_tlpstart = tcp_now;
1553
1554 tp->snd_cwnd += tp->t_maxseg;
1555
1556 /*
1557 * When the tail-loss probe fires, we reset the RTO timer, because
1558 * a probe just got sent, so we are good to push out the timer.
1559 *
1560 * Set to 0 to ensure that tcp_output() will reschedule it.
1561 */
1562 tp->t_timer[TCPT_REXMT] = 0;
1563
1564 (void)tcp_output(tp);
1565 tp->snd_cwnd -= tp->t_maxseg;
1566
1567 tp->t_tlphighrxt = tp->snd_nxt;
1568 break;
1569 }
1570 case TCPT_DELAYFR:
1571 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
1572
1573 /*
1574 * Don't do anything if one of the following is true:
1575 * - the connection is already in recovery
1576 * - the sequence up to snd_recover has been acknowledged
1577 * - retransmit timeout has fired
1578 */
1579 if (IN_FASTRECOVERY(tp) ||
1580 SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
1581 tp->t_rxtshift > 0) {
1582 break;
1583 }
1584
1585 VERIFY(SACK_ENABLED(tp));
1586 tcp_rexmt_save_state(tp);
1587 if (CC_ALGO(tp)->pre_fr != NULL) {
1588 CC_ALGO(tp)->pre_fr(tp);
1589 if (TCP_ECN_ENABLED(tp)) {
1590 tp->ecn_flags |= TE_SENDCWR;
1591 }
1592 }
1593 ENTER_FASTRECOVERY(tp);
1594
1595 tp->t_timer[TCPT_REXMT] = 0;
1596 tcpstat.tcps_sack_recovery_episode++;
1597 tp->t_sack_recovery_episode++;
1598 tp->sack_newdata = tp->snd_nxt;
1599 tp->snd_cwnd = tp->t_maxseg;
1600 tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
1601 (void) tcp_output(tp);
1602 break;
1603 dropit:
1604 tcpstat.tcps_keepdrops++;
1605 postevent(so, 0, EV_TIMEOUT);
1606 soevent(so,
1607 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
1608 tp = tcp_drop(tp, ETIMEDOUT);
1609 break;
1610 }
1611 #if TCPDEBUG
1612 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
1613 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
1614 PRU_SLOWTIMO);
1615 }
1616 #endif
1617 return tp;
1618 }
1619
1620 /* Remove a timer entry from timer list */
1621 void
1622 tcp_remove_timer(struct tcpcb *tp)
1623 {
1624 struct tcptimerlist *listp = &tcp_timer_list;
1625
1626 socket_lock_assert_owned(tp->t_inpcb->inp_socket);
1627 if (!(TIMER_IS_ON_LIST(tp))) {
1628 return;
1629 }
1630 lck_mtx_lock(listp->mtx);
1631
1632 /* Check if pcb is on timer list again after acquiring the lock */
1633 if (!(TIMER_IS_ON_LIST(tp))) {
1634 lck_mtx_unlock(listp->mtx);
1635 return;
1636 }
1637
1638 if (listp->next_te != NULL && listp->next_te == &tp->tentry) {
1639 listp->next_te = LIST_NEXT(&tp->tentry, le);
1640 }
1641
1642 LIST_REMOVE(&tp->tentry, le);
1643 tp->t_flags &= ~(TF_TIMER_ONLIST);
1644
1645 listp->entries--;
1646
1647 tp->tentry.le.le_next = NULL;
1648 tp->tentry.le.le_prev = NULL;
1649 lck_mtx_unlock(listp->mtx);
1650 }
1651
1652 /*
1653 * Function to check if the timerlist needs to be rescheduled to run
1654 * the timer entry correctly. Basically, this is to check if we can avoid
1655 * taking the list lock.
1656 */
1657
1658 static boolean_t
1659 need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
1660 {
1661 struct tcptimerlist *listp = &tcp_timer_list;
1662 int32_t diff;
1663
1664 /*
1665 * If the list is being processed then the state of the list is
1666 * in flux. In this case always acquire the lock and set the state
1667 * correctly.
1668 */
1669 if (listp->running) {
1670 return TRUE;
1671 }
1672
1673 if (!listp->scheduled) {
1674 return TRUE;
1675 }
1676
1677 diff = timer_diff(listp->runtime, 0, runtime, 0);
1678 if (diff <= 0) {
1679 /* The list is going to run before this timer */
1680 return FALSE;
1681 } else {
1682 if (mode & TCP_TIMERLIST_10MS_MODE) {
1683 if (diff <= TCP_TIMER_10MS_QUANTUM) {
1684 return FALSE;
1685 }
1686 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
1687 if (diff <= TCP_TIMER_100MS_QUANTUM) {
1688 return FALSE;
1689 }
1690 } else {
1691 if (diff <= TCP_TIMER_500MS_QUANTUM) {
1692 return FALSE;
1693 }
1694 }
1695 }
1696 return TRUE;
1697 }
1698
1699 void
1700 tcp_sched_timerlist(uint32_t offset)
1701 {
1702 uint64_t deadline = 0;
1703 struct tcptimerlist *listp = &tcp_timer_list;
1704
1705 LCK_MTX_ASSERT(listp->mtx, LCK_MTX_ASSERT_OWNED);
1706
1707 offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
1708 listp->runtime = tcp_now + offset;
1709 listp->schedtime = tcp_now;
1710 if (listp->runtime == 0) {
1711 listp->runtime++;
1712 offset++;
1713 }
1714
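/*
 * Note (added for clarity): offset is in TCP timer ticks, i.e.
 * milliseconds when TCP_RETRANSHZ == 1000. USEC_PER_SEC (1,000,000)
 * is numerically the same as NSEC_PER_MSEC, so the deadline computed
 * below works out to 'offset' milliseconds from now.
 */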
1715 clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);
1716
1717 thread_call_enter_delayed(listp->call, deadline);
1718 listp->scheduled = TRUE;
1719 }
1720
1721 /*
1722 * Function to run the timers for a connection.
1723 *
1724 * Returns the offset of the next timer to be run for this connection, which
1725 * can be used to reschedule the timerlist.
1726 *
1727 * te_mode is an out parameter that indicates the modes of active
1728 * timers for this connection.
1729 */
1730 u_int32_t
1731 tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
1732 u_int16_t probe_if_index)
1733 {
1734 struct socket *so;
1735 u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
1736 u_int32_t timer_val, offset = 0, lo_timer = 0;
1737 int32_t diff;
1738 boolean_t needtorun[TCPT_NTIMERS];
1739 int count = 0;
1740
1741 VERIFY(tp != NULL);
1742 bzero(needtorun, sizeof(needtorun));
1743 *te_mode = 0;
1744
1745 socket_lock(tp->t_inpcb->inp_socket, 1);
1746
1747 so = tp->t_inpcb->inp_socket;
1748 /* Release the want count on inp */
1749 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
1750 == WNT_STOPUSING) {
1751 if (TIMER_IS_ON_LIST(tp)) {
1752 tcp_remove_timer(tp);
1753 }
1754
1755 /* Looks like the TCP connection got closed while we
1756 * were waiting for the lock. Done.
1757 */
1758 goto done;
1759 }
1760
1761 /*
1762 * If this connection is over an interface that needs to
1763 * be probed, send probe packets to reinitiate communication.
1764 */
1765 if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&
1766 tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) {
1767 tp->t_flagsext |= TF_PROBING;
1768 tcp_timers(tp, TCPT_PTO);
1769 tp->t_timer[TCPT_PTO] = 0;
1770 tp->t_flagsext &= ~TF_PROBING;
1771 }
1772
1773 /*
1774 * Since the timer thread needs to wait for tcp lock, it may race
1775 * with another thread that can cancel or reschedule the timer
1776 * that is about to run. Check if we need to run anything.
1777 */
1778 if ((index = tp->tentry.index) == TCPT_NONE) {
1779 goto done;
1780 }
1781
1782 timer_val = tp->t_timer[index];
1783
1784 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
1785 if (diff > 0) {
1786 if (tp->tentry.index != TCPT_NONE) {
1787 offset = diff;
1788 *(te_mode) = tp->tentry.mode;
1789 }
1790 goto done;
1791 }
1792
1793 tp->t_timer[index] = 0;
1794 if (timer_val > 0) {
1795 tp = tcp_timers(tp, index);
1796 if (tp == NULL) {
1797 goto done;
1798 }
1799 }
1800
1801 /*
1802 * Check if there are any other timers that need to be run.
1803 * While doing so, adjust the timer values relative to tcp_now.
1804 */
1805 tp->tentry.mode = 0;
1806 for (i = 0; i < TCPT_NTIMERS; ++i) {
1807 if (tp->t_timer[i] != 0) {
1808 diff = timer_diff(tp->tentry.timer_start,
1809 tp->t_timer[i], tcp_now, 0);
1810 if (diff <= 0) {
1811 needtorun[i] = TRUE;
1812 count++;
1813 } else {
1814 tp->t_timer[i] = diff;
1815 needtorun[i] = FALSE;
1816 if (lo_timer == 0 || diff < lo_timer) {
1817 lo_timer = diff;
1818 lo_index = i;
1819 }
1820 TCP_SET_TIMER_MODE(tp->tentry.mode, i);
1821 }
1822 }
1823 }
1824
1825 tp->tentry.timer_start = tcp_now;
1826 tp->tentry.index = lo_index;
1827 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
1828
1829 if (tp->tentry.index != TCPT_NONE) {
1830 tp->tentry.runtime = tp->tentry.timer_start +
1831 tp->t_timer[tp->tentry.index];
1832 if (tp->tentry.runtime == 0) {
1833 tp->tentry.runtime++;
1834 }
1835 }
1836
1837 if (count > 0) {
1838 /* run any other timers outstanding at this time. */
1839 for (i = 0; i < TCPT_NTIMERS; ++i) {
1840 if (needtorun[i]) {
1841 tp->t_timer[i] = 0;
1842 tp = tcp_timers(tp, i);
1843 if (tp == NULL) {
1844 offset = 0;
1845 *(te_mode) = 0;
1846 goto done;
1847 }
1848 }
1849 }
1850 tcp_set_lotimer_index(tp);
1851 }
1852
1853 if (tp->tentry.index < TCPT_NONE) {
1854 offset = tp->t_timer[tp->tentry.index];
1855 *(te_mode) = tp->tentry.mode;
1856 }
1857
1858 done:
1859 if (tp != NULL && tp->tentry.index == TCPT_NONE) {
1860 tcp_remove_timer(tp);
1861 offset = 0;
1862 }
1863
1864 socket_unlock(so, 1);
1865 return offset;
1866 }
1867
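/*
 * Timer list work function, invoked from the thread call scheduled by
 * tcp_sched_timerlist(). Walks the timer list, runs the connection
 * timers that are due, and reschedules the list based on the modes of
 * the timers that remain active.
 */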
1868 void
1869 tcp_run_timerlist(void * arg1, void * arg2)
1870 {
1871 #pragma unused(arg1, arg2)
1872 struct tcptimerentry *te, *next_te;
1873 struct tcptimerlist *listp = &tcp_timer_list;
1874 struct tcpcb *tp;
1875 uint32_t next_timer = 0; /* offset of the next timer on the list */
1876 u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */
1877 u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
1878 uint32_t active_count = 0;
1879
1880 calculate_tcp_clock();
1881
1882 lck_mtx_lock(listp->mtx);
1883
1884 int32_t drift = tcp_now - listp->runtime;
1885 if (drift <= 1) {
1886 tcpstat.tcps_timer_drift_le_1_ms++;
1887 } else if (drift <= 10) {
1888 tcpstat.tcps_timer_drift_le_10_ms++;
1889 } else if (drift <= 20) {
1890 tcpstat.tcps_timer_drift_le_20_ms++;
1891 } else if (drift <= 50) {
1892 tcpstat.tcps_timer_drift_le_50_ms++;
1893 } else if (drift <= 100) {
1894 tcpstat.tcps_timer_drift_le_100_ms++;
1895 } else if (drift <= 200) {
1896 tcpstat.tcps_timer_drift_le_200_ms++;
1897 } else if (drift <= 500) {
1898 tcpstat.tcps_timer_drift_le_500_ms++;
1899 } else if (drift <= 1000) {
1900 tcpstat.tcps_timer_drift_le_1000_ms++;
1901 } else {
1902 tcpstat.tcps_timer_drift_gt_1000_ms++;
1903 }
1904
1905 listp->running = TRUE;
1906
1907 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
1908 uint32_t offset = 0;
1909 uint32_t runtime = te->runtime;
1910 if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
1911 offset = timer_diff(runtime, 0, tcp_now, 0);
1912 if (next_timer == 0 || offset < next_timer) {
1913 next_timer = offset;
1914 }
1915 list_mode |= te->mode;
1916 continue;
1917 }
1918
1919 tp = TIMERENTRY_TO_TP(te);
1920
1921 /*
1922 * Acquire an inp wantcnt on the inpcb so that the socket
1923 * won't get detached even if tcp_close is called
1924 */
1925 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
1926 == WNT_STOPUSING) {
1927 /*
1928 * Somehow this pcb went into a dead state while
1929 * on the timer list, just take it off the list.
1930 * Since the timer list entry pointers are
1931 * protected by the timer list lock, we can
1932 * do it here without the socket lock.
1933 */
1934 if (TIMER_IS_ON_LIST(tp)) {
1935 tp->t_flags &= ~(TF_TIMER_ONLIST);
1936 LIST_REMOVE(&tp->tentry, le);
1937 listp->entries--;
1938
1939 tp->tentry.le.le_next = NULL;
1940 tp->tentry.le.le_prev = NULL;
1941 }
1942 continue;
1943 }
1944 active_count++;
1945
1946 /*
1947 * Store the next timerentry pointer before releasing the
1948 * list lock. If that entry has to be removed while the lock is
1949 * released, this pointer will be advanced to the element that
1950 * follows it.
1951 */
1952 listp->next_te = next_te;
1953
1954 VERIFY_NEXT_LINK(&tp->tentry, le);
1955 VERIFY_PREV_LINK(&tp->tentry, le);
1956
1957 lck_mtx_unlock(listp->mtx);
1958
1959 offset = tcp_run_conn_timer(tp, &te_mode,
1960 listp->probe_if_index);
1961
1962 lck_mtx_lock(listp->mtx);
1963
1964 next_te = listp->next_te;
1965 listp->next_te = NULL;
1966
1967 if (offset > 0 && te_mode != 0) {
1968 list_mode |= te_mode;
1969
1970 if (next_timer == 0 || offset < next_timer) {
1971 next_timer = offset;
1972 }
1973 }
1974 }
1975
1976 if (!LIST_EMPTY(&listp->lhead)) {
1977 u_int16_t next_mode = 0;
1978 if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
1979 (listp->pref_mode & TCP_TIMERLIST_10MS_MODE)) {
1980 next_mode = TCP_TIMERLIST_10MS_MODE;
1981 } else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
1982 (listp->pref_mode & TCP_TIMERLIST_100MS_MODE)) {
1983 next_mode = TCP_TIMERLIST_100MS_MODE;
1984 } else {
1985 next_mode = TCP_TIMERLIST_500MS_MODE;
1986 }
1987
1988 if (next_mode != TCP_TIMERLIST_500MS_MODE) {
1989 listp->idleruns = 0;
1990 } else {
1991 /*
1992 * the next required mode is slow mode, but if
1993 * the last one was a faster mode and we did not
1994 * have enough idle runs, repeat the last mode.
1995 *
1996 * We try to keep the timer list in fast mode for
1997 * some idle time in expectation of new data.
1998 */
1999 if (listp->mode != next_mode &&
2000 listp->idleruns < timer_fastmode_idlemax) {
2001 listp->idleruns++;
2002 next_mode = listp->mode;
2003 next_timer = TCP_TIMER_100MS_QUANTUM;
2004 } else {
2005 listp->idleruns = 0;
2006 }
2007 }
2008 listp->mode = next_mode;
2009 if (listp->pref_offset != 0) {
2010 next_timer = min(listp->pref_offset, next_timer);
2011 }
2012
2013 if (listp->mode == TCP_TIMERLIST_500MS_MODE) {
2014 next_timer = max(next_timer,
2015 TCP_TIMER_500MS_QUANTUM);
2016 }
2017
2018 tcp_sched_timerlist(next_timer);
2019 } else {
2020 /*
2021 * No need to reschedule this timer, but always run it
2022 * periodically, at a much coarser granularity.
2023 */
2024 tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
2025 }
2026
2027 listp->running = FALSE;
2028 listp->pref_mode = 0;
2029 listp->pref_offset = 0;
2030 listp->probe_if_index = 0;
2031
2032 lck_mtx_unlock(listp->mtx);
2033 }
2034
2035 /*
2036 * Function to check if the timerlist needs to be rescheduled to run this
2037 * connection's timers correctly.
2038 */
2039 void
2040 tcp_sched_timers(struct tcpcb *tp)
2041 {
2042 struct tcptimerentry *te = &tp->tentry;
2043 u_int16_t index = te->index;
2044 u_int16_t mode = te->mode;
2045 struct tcptimerlist *listp = &tcp_timer_list;
2046 int32_t offset = 0;
2047 boolean_t list_locked = FALSE;
2048
2049 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
2050 /* Just return without adding the dead pcb to the list */
2051 if (TIMER_IS_ON_LIST(tp)) {
2052 tcp_remove_timer(tp);
2053 }
2054 return;
2055 }
2056
2057 if (index == TCPT_NONE) {
2058 /* Nothing to run */
2059 tcp_remove_timer(tp);
2060 return;
2061 }
2062
2063 /*
2064 * compute the offset at which the next timer for this connection
2065 * has to run.
2066 */
2067 offset = timer_diff(te->runtime, 0, tcp_now, 0);
2068 if (offset <= 0) {
2069 offset = 1;
2070 tcp_timer_advanced++;
2071 }
2072
2073 if (!TIMER_IS_ON_LIST(tp)) {
2074 if (!list_locked) {
2075 lck_mtx_lock(listp->mtx);
2076 list_locked = TRUE;
2077 }
2078
2079 if (!TIMER_IS_ON_LIST(tp)) {
2080 LIST_INSERT_HEAD(&listp->lhead, te, le);
2081 tp->t_flags |= TF_TIMER_ONLIST;
2082
2083 listp->entries++;
2084 if (listp->entries > listp->maxentries) {
2085 listp->maxentries = listp->entries;
2086 }
2087
2088 /* if the list is not scheduled, just schedule it */
2089 if (!listp->scheduled) {
2090 goto schedule;
2091 }
2092 }
2093 }
2094
2095 /*
2096 * Timer entry is currently on the list, check if the list needs
2097 * to be rescheduled.
2098 */
2099 if (need_to_resched_timerlist(te->runtime, mode)) {
2100 tcp_resched_timerlist++;
2101
2102 if (!list_locked) {
2103 lck_mtx_lock(listp->mtx);
2104 list_locked = TRUE;
2105 }
2106
2107 VERIFY_NEXT_LINK(te, le);
2108 VERIFY_PREV_LINK(te, le);
2109
2110 if (listp->running) {
2111 listp->pref_mode |= mode;
2112 if (listp->pref_offset == 0 ||
2113 offset < listp->pref_offset) {
2114 listp->pref_offset = offset;
2115 }
2116 } else {
2117 /*
2118 * The list could have been rescheduled while
2119 * this thread was waiting for the lock
2120 */
2121 if (listp->scheduled) {
2122 int32_t diff;
2123 diff = timer_diff(listp->runtime, 0,
2124 tcp_now, offset);
2125 if (diff <= 0) {
2126 goto done;
2127 } else {
2128 goto schedule;
2129 }
2130 } else {
2131 goto schedule;
2132 }
2133 }
2134 }
2135 goto done;
2136
2137 schedule:
2138 /*
2139 * Since a connection with timers is getting scheduled, the timer
2140 * list moves from the idle to the active state, which is why
2141 * idleruns is reset.
2142 */
2143 if (mode & TCP_TIMERLIST_10MS_MODE) {
2144 listp->mode = TCP_TIMERLIST_10MS_MODE;
2145 listp->idleruns = 0;
2146 offset = min(offset, TCP_TIMER_10MS_QUANTUM);
2147 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
2148 if (listp->mode > TCP_TIMERLIST_100MS_MODE) {
2149 listp->mode = TCP_TIMERLIST_100MS_MODE;
2150 }
2151 listp->idleruns = 0;
2152 offset = min(offset, TCP_TIMER_100MS_QUANTUM);
2153 }
2154 tcp_sched_timerlist(offset);
2155
2156 done:
2157 if (list_locked) {
2158 lck_mtx_unlock(listp->mtx);
2159 }
2160
2161 return;
2162 }
2163
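/*
 * Recompute the earliest timer for this connection: find the smallest
 * non-zero t_timer[] value, record its index and the aggregate timer
 * mode, and refresh tentry.runtime accordingly.
 */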
2164 static inline void
2165 tcp_set_lotimer_index(struct tcpcb *tp)
2166 {
2167 uint16_t i, lo_index = TCPT_NONE, mode = 0;
2168 uint32_t lo_timer = 0;
2169 for (i = 0; i < TCPT_NTIMERS; ++i) {
2170 if (tp->t_timer[i] != 0) {
2171 TCP_SET_TIMER_MODE(mode, i);
2172 if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
2173 lo_timer = tp->t_timer[i];
2174 lo_index = i;
2175 }
2176 }
2177 }
2178 tp->tentry.index = lo_index;
2179 tp->tentry.mode = mode;
2180 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
2181
2182 if (tp->tentry.index != TCPT_NONE) {
2183 tp->tentry.runtime = tp->tentry.timer_start
2184 + tp->t_timer[tp->tentry.index];
2185 if (tp->tentry.runtime == 0) {
2186 tp->tentry.runtime++;
2187 }
2188 }
2189 }
2190
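/*
 * Re-evaluate the timer state for this connection after its timers may
 * have changed: recompute the lowest timer and reschedule the timer
 * list if necessary. Connections in TIME_WAIT are skipped.
 */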
2191 void
2192 tcp_check_timer_state(struct tcpcb *tp)
2193 {
2194 socket_lock_assert_owned(tp->t_inpcb->inp_socket);
2195
2196 if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT) {
2197 return;
2198 }
2199
2200 tcp_set_lotimer_index(tp);
2201
2202 tcp_sched_timers(tp);
2203 return;
2204 }
2205
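/*
 * Store in *dest the increase of counter 'cur' since the previous
 * snapshot '*prev', then update the snapshot. A non-positive difference
 * (for example after a counter wrap) is reported as zero. For example,
 * if *prev is 10 and cur is 25, *dest becomes 15 and *prev becomes 25.
 */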
2206 static inline void
2207 tcp_cumulative_stat(u_int32_t cur, u_int32_t *prev, u_int32_t *dest)
2208 {
2209 /* handle wrap around */
2210 int32_t diff = (int32_t) (cur - *prev);
2211 if (diff > 0) {
2212 *dest = diff;
2213 } else {
2214 *dest = 0;
2215 }
2216 *prev = cur;
2217 return;
2218 }
2219
2220 static inline void
2221 tcp_cumulative_stat64(u_int64_t cur, u_int64_t *prev, u_int64_t *dest)
2222 {
2223 /* handle wrap around */
2224 int64_t diff = (int64_t) (cur - *prev);
2225 if (diff > 0) {
2226 *dest = diff;
2227 } else {
2228 *dest = 0;
2229 }
2230 *prev = cur;
2231 return;
2232 }
2233
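/*
 * Periodically push TCP statistics to nstat as deltas since the
 * previous report, along with the average RTT of the primary IPv4 and
 * IPv6 default routes. Rate-limited by tcp_report_stats_interval.
 */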
2234 __private_extern__ void
2235 tcp_report_stats(void)
2236 {
2237 struct nstat_sysinfo_data data;
2238 struct sockaddr_in dst;
2239 struct sockaddr_in6 dst6;
2240 struct rtentry *rt = NULL;
2241 static struct tcp_last_report_stats prev;
2242 u_int64_t var, uptime;
2243
2244 #define stat data.u.tcp_stats
2245 if (((uptime = net_uptime()) - tcp_last_report_time) <
2246 tcp_report_stats_interval) {
2247 return;
2248 }
2249
2250 tcp_last_report_time = uptime;
2251
2252 bzero(&data, sizeof(data));
2253 data.flags = NSTAT_SYSINFO_TCP_STATS;
2254
2255 bzero(&dst, sizeof(dst));
2256 dst.sin_len = sizeof(dst);
2257 dst.sin_family = AF_INET;
2258
2259 /* ipv4 avg rtt */
2260 lck_mtx_lock(rnh_lock);
2261 rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
2262 rt_tables[AF_INET], IFSCOPE_NONE);
2263 lck_mtx_unlock(rnh_lock);
2264 if (rt != NULL) {
2265 RT_LOCK(rt);
2266 if (rt_primary_default(rt, rt_key(rt)) &&
2267 rt->rt_stats != NULL) {
2268 stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
2269 }
2270 RT_UNLOCK(rt);
2271 rtfree(rt);
2272 rt = NULL;
2273 }
2274
2275 /* ipv6 avg rtt */
2276 bzero(&dst6, sizeof(dst6));
2277 dst6.sin6_len = sizeof(dst6);
2278 dst6.sin6_family = AF_INET6;
2279
2280 lck_mtx_lock(rnh_lock);
2281 rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
2282 rt_tables[AF_INET6], IFSCOPE_NONE);
2283 lck_mtx_unlock(rnh_lock);
2284 if (rt != NULL) {
2285 RT_LOCK(rt);
2286 if (rt_primary_default(rt, rt_key(rt)) &&
2287 rt->rt_stats != NULL) {
2288 stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
2289 }
2290 RT_UNLOCK(rt);
2291 rtfree(rt);
2292 rt = NULL;
2293 }
2294
2295 /* send packet loss rate, shift by 10 for precision */
2296 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
2297 var = tcpstat.tcps_sndrexmitpack << 10;
2298 stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
2299 }
2300
2301 /* recv packet loss rate, shift by 10 for precision */
2302 if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
2303 var = tcpstat.tcps_recovered_pkts << 10;
2304 stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
2305 }
2306
2307 /* RTO after tail loss, shift by 10 for precision */
2308 if (tcpstat.tcps_sndrexmitpack > 0
2309 && tcpstat.tcps_tailloss_rto > 0) {
2310 var = tcpstat.tcps_tailloss_rto << 10;
2311 stat.send_tlrto_rate =
2312 (var * 100) / tcpstat.tcps_sndrexmitpack;
2313 }
2314
2315 /* packet reordering */
2316 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
2317 var = tcpstat.tcps_reordered_pkts << 10;
2318 stat.send_reorder_rate =
2319 (var * 100) / tcpstat.tcps_sndpack;
2320 }
2321
2322 if (tcp_ecn_outbound == 1) {
2323 stat.ecn_client_enabled = 1;
2324 }
2325 if (tcp_ecn_inbound == 1) {
2326 stat.ecn_server_enabled = 1;
2327 }
2328 tcp_cumulative_stat(tcpstat.tcps_connattempt,
2329 &prev.tcps_connattempt, &stat.connection_attempts);
2330 tcp_cumulative_stat(tcpstat.tcps_accepts,
2331 &prev.tcps_accepts, &stat.connection_accepts);
2332 tcp_cumulative_stat(tcpstat.tcps_ecn_client_setup,
2333 &prev.tcps_ecn_client_setup, &stat.ecn_client_setup);
2334 tcp_cumulative_stat(tcpstat.tcps_ecn_server_setup,
2335 &prev.tcps_ecn_server_setup, &stat.ecn_server_setup);
2336 tcp_cumulative_stat(tcpstat.tcps_ecn_client_success,
2337 &prev.tcps_ecn_client_success, &stat.ecn_client_success);
2338 tcp_cumulative_stat(tcpstat.tcps_ecn_server_success,
2339 &prev.tcps_ecn_server_success, &stat.ecn_server_success);
2340 tcp_cumulative_stat(tcpstat.tcps_ecn_not_supported,
2341 &prev.tcps_ecn_not_supported, &stat.ecn_not_supported);
2342 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_syn,
2343 &prev.tcps_ecn_lost_syn, &stat.ecn_lost_syn);
2344 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_synack,
2345 &prev.tcps_ecn_lost_synack, &stat.ecn_lost_synack);
2346 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ce,
2347 &prev.tcps_ecn_recv_ce, &stat.ecn_recv_ce);
2348 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ece,
2349 &prev.tcps_ecn_recv_ece, &stat.ecn_recv_ece);
2352 tcp_cumulative_stat(tcpstat.tcps_ecn_sent_ece,
2353 &prev.tcps_ecn_sent_ece, &stat.ecn_sent_ece);
2356 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ce,
2357 &prev.tcps_ecn_conn_recv_ce, &stat.ecn_conn_recv_ce);
2358 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ece,
2359 &prev.tcps_ecn_conn_recv_ece, &stat.ecn_conn_recv_ece);
2360 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_plnoce,
2361 &prev.tcps_ecn_conn_plnoce, &stat.ecn_conn_plnoce);
2362 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_pl_ce,
2363 &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce);
2364 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce,
2365 &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce);
2366 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_synloss,
2367 &prev.tcps_ecn_fallback_synloss, &stat.ecn_fallback_synloss);
2368 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_reorder,
2369 &prev.tcps_ecn_fallback_reorder, &stat.ecn_fallback_reorder);
2370 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_ce,
2371 &prev.tcps_ecn_fallback_ce, &stat.ecn_fallback_ce);
2372 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv,
2373 &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv);
2374 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv,
2375 &prev.tcps_tfo_cookie_req_rcv, &stat.tfo_cookie_req_rcv);
2376 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_sent,
2377 &prev.tcps_tfo_cookie_sent, &stat.tfo_cookie_sent);
2378 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_invalid,
2379 &prev.tcps_tfo_cookie_invalid, &stat.tfo_cookie_invalid);
2380 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req,
2381 &prev.tcps_tfo_cookie_req, &stat.tfo_cookie_req);
2382 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_rcv,
2383 &prev.tcps_tfo_cookie_rcv, &stat.tfo_cookie_rcv);
2384 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_sent,
2385 &prev.tcps_tfo_syn_data_sent, &stat.tfo_syn_data_sent);
2386 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_acked,
2387 &prev.tcps_tfo_syn_data_acked, &stat.tfo_syn_data_acked);
2388 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_loss,
2389 &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss);
2390 tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole,
2391 &prev.tcps_tfo_blackhole, &stat.tfo_blackhole);
2392 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_wrong,
2393 &prev.tcps_tfo_cookie_wrong, &stat.tfo_cookie_wrong);
2394 tcp_cumulative_stat(tcpstat.tcps_tfo_no_cookie_rcv,
2395 &prev.tcps_tfo_no_cookie_rcv, &stat.tfo_no_cookie_rcv);
2396 tcp_cumulative_stat(tcpstat.tcps_tfo_heuristics_disable,
2397 &prev.tcps_tfo_heuristics_disable, &stat.tfo_heuristics_disable);
2398 tcp_cumulative_stat(tcpstat.tcps_tfo_sndblackhole,
2399 &prev.tcps_tfo_sndblackhole, &stat.tfo_sndblackhole);
2400
2401
2402 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_attempt,
2403 &prev.tcps_mptcp_handover_attempt, &stat.mptcp_handover_attempt);
2404 tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_attempt,
2405 &prev.tcps_mptcp_interactive_attempt, &stat.mptcp_interactive_attempt);
2406 tcp_cumulative_stat(tcpstat.tcps_mptcp_aggregate_attempt,
2407 &prev.tcps_mptcp_aggregate_attempt, &stat.mptcp_aggregate_attempt);
2408 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_attempt,
2409 &prev.tcps_mptcp_fp_handover_attempt, &stat.mptcp_fp_handover_attempt);
2410 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_interactive_attempt,
2411 &prev.tcps_mptcp_fp_interactive_attempt, &stat.mptcp_fp_interactive_attempt);
2412 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_aggregate_attempt,
2413 &prev.tcps_mptcp_fp_aggregate_attempt, &stat.mptcp_fp_aggregate_attempt);
2414 tcp_cumulative_stat(tcpstat.tcps_mptcp_heuristic_fallback,
2415 &prev.tcps_mptcp_heuristic_fallback, &stat.mptcp_heuristic_fallback);
2416 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_heuristic_fallback,
2417 &prev.tcps_mptcp_fp_heuristic_fallback, &stat.mptcp_fp_heuristic_fallback);
2418 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_success_wifi,
2419 &prev.tcps_mptcp_handover_success_wifi, &stat.mptcp_handover_success_wifi);
2420 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_success_cell,
2421 &prev.tcps_mptcp_handover_success_cell, &stat.mptcp_handover_success_cell);
2422 tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_success,
2423 &prev.tcps_mptcp_interactive_success, &stat.mptcp_interactive_success);
2424 tcp_cumulative_stat(tcpstat.tcps_mptcp_aggregate_success,
2425 &prev.tcps_mptcp_aggregate_success, &stat.mptcp_aggregate_success);
2426 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_success_wifi,
2427 &prev.tcps_mptcp_fp_handover_success_wifi, &stat.mptcp_fp_handover_success_wifi);
2428 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_success_cell,
2429 &prev.tcps_mptcp_fp_handover_success_cell, &stat.mptcp_fp_handover_success_cell);
2430 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_interactive_success,
2431 &prev.tcps_mptcp_fp_interactive_success, &stat.mptcp_fp_interactive_success);
2432 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_aggregate_success,
2433 &prev.tcps_mptcp_fp_aggregate_success, &stat.mptcp_fp_aggregate_success);
2434 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_cell_from_wifi,
2435 &prev.tcps_mptcp_handover_cell_from_wifi, &stat.mptcp_handover_cell_from_wifi);
2436 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_wifi_from_cell,
2437 &prev.tcps_mptcp_handover_wifi_from_cell, &stat.mptcp_handover_wifi_from_cell);
2438 tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_cell_from_wifi,
2439 &prev.tcps_mptcp_interactive_cell_from_wifi, &stat.mptcp_interactive_cell_from_wifi);
2440 tcp_cumulative_stat64(tcpstat.tcps_mptcp_handover_cell_bytes,
2441 &prev.tcps_mptcp_handover_cell_bytes, &stat.mptcp_handover_cell_bytes);
2442 tcp_cumulative_stat64(tcpstat.tcps_mptcp_interactive_cell_bytes,
2443 &prev.tcps_mptcp_interactive_cell_bytes, &stat.mptcp_interactive_cell_bytes);
2444 tcp_cumulative_stat64(tcpstat.tcps_mptcp_aggregate_cell_bytes,
2445 &prev.tcps_mptcp_aggregate_cell_bytes, &stat.mptcp_aggregate_cell_bytes);
2446 tcp_cumulative_stat64(tcpstat.tcps_mptcp_handover_all_bytes,
2447 &prev.tcps_mptcp_handover_all_bytes, &stat.mptcp_handover_all_bytes);
2448 tcp_cumulative_stat64(tcpstat.tcps_mptcp_interactive_all_bytes,
2449 &prev.tcps_mptcp_interactive_all_bytes, &stat.mptcp_interactive_all_bytes);
2450 tcp_cumulative_stat64(tcpstat.tcps_mptcp_aggregate_all_bytes,
2451 &prev.tcps_mptcp_aggregate_all_bytes, &stat.mptcp_aggregate_all_bytes);
2452 tcp_cumulative_stat(tcpstat.tcps_mptcp_back_to_wifi,
2453 &prev.tcps_mptcp_back_to_wifi, &stat.mptcp_back_to_wifi);
2454 tcp_cumulative_stat(tcpstat.tcps_mptcp_wifi_proxy,
2455 &prev.tcps_mptcp_wifi_proxy, &stat.mptcp_wifi_proxy);
2456 tcp_cumulative_stat(tcpstat.tcps_mptcp_cell_proxy,
2457 &prev.tcps_mptcp_cell_proxy, &stat.mptcp_cell_proxy);
2458 tcp_cumulative_stat(tcpstat.tcps_mptcp_triggered_cell,
2459 &prev.tcps_mptcp_triggered_cell, &stat.mptcp_triggered_cell);
2460
2461 nstat_sysinfo_send_data(&data);
2462
2463 #undef stat
2464 }
2465
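/*
 * Arrange for probe timers to run on connections using the given
 * interface by recording its index on the timer list and rescheduling
 * the list to run within the next 10ms, unless it is already about to
 * fire sooner.
 */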
2466 void
2467 tcp_interface_send_probe(u_int16_t probe_if_index)
2468 {
2469 int32_t offset = 0;
2470 struct tcptimerlist *listp = &tcp_timer_list;
2471
2472 /* Make sure TCP clock is up to date */
2473 calculate_tcp_clock();
2474
2475 lck_mtx_lock(listp->mtx);
2476 if (listp->probe_if_index > 0) {
2477 tcpstat.tcps_probe_if_conflict++;
2478 goto done;
2479 }
2480
2481 listp->probe_if_index = probe_if_index;
2482 if (listp->running) {
2483 goto done;
2484 }
2485
2486 /*
2487 * Reschedule the timerlist to run within the next 10ms, which is
2488 * the fastest that we can do.
2489 */
2490 offset = TCP_TIMER_10MS_QUANTUM;
2491 if (listp->scheduled) {
2492 int32_t diff;
2493 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2494 if (diff <= 0) {
2495 /* The timer will fire sooner than what's needed */
2496 goto done;
2497 }
2498 }
2499 listp->mode = TCP_TIMERLIST_10MS_MODE;
2500 listp->idleruns = 0;
2501
2502 tcp_sched_timerlist(offset);
2503
2504 done:
2505 lck_mtx_unlock(listp->mtx);
2506 return;
2507 }
2508
2509 /*
2510 * Enable read probes on this connection, if:
2511 * - it is in the established state
2512 * - it doesn't have any data outstanding
2513 * - the outgoing ifp matches
2514 * - we have not already sent any read probes
2515 */
2516 static void
2517 tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp)
2518 {
2519 if (tp->t_state == TCPS_ESTABLISHED &&
2520 tp->snd_max == tp->snd_una &&
2521 tp->t_inpcb->inp_last_outifp == ifp &&
2522 !(tp->t_flagsext & TF_DETECT_READSTALL) &&
2523 tp->t_rtimo_probes == 0) {
2524 tp->t_flagsext |= TF_DETECT_READSTALL;
2525 tp->t_rtimo_probes = 0;
2526 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2527 TCP_TIMER_10MS_QUANTUM);
2528 if (tp->tentry.index == TCPT_NONE) {
2529 tp->tentry.index = TCPT_KEEP;
2530 tp->tentry.runtime = tcp_now +
2531 TCP_TIMER_10MS_QUANTUM;
2532 } else {
2533 int32_t diff = 0;
2534
2535 /* Reset runtime to be in next 10ms */
2536 diff = timer_diff(tp->tentry.runtime, 0,
2537 tcp_now, TCP_TIMER_10MS_QUANTUM);
2538 if (diff > 0) {
2539 tp->tentry.index = TCPT_KEEP;
2540 tp->tentry.runtime = tcp_now +
2541 TCP_TIMER_10MS_QUANTUM;
2542 if (tp->tentry.runtime == 0) {
2543 tp->tentry.runtime++;
2544 }
2545 }
2546 }
2547 }
2548 }
2549
2550 /*
2551 * Disable read probe and reset the keep alive timer
2552 */
2553 static void
2554 tcp_disable_read_probe(struct tcpcb *tp)
2555 {
2556 if (tp->t_adaptive_rtimo == 0 &&
2557 ((tp->t_flagsext & TF_DETECT_READSTALL) ||
2558 tp->t_rtimo_probes > 0)) {
2559 tcp_keepalive_reset(tp);
2560
2561 if (tp->t_mpsub) {
2562 mptcp_reset_keepalive(tp);
2563 }
2564 }
2565 }
2566
2567 /*
2568 * Reschedule the tcp timerlist in the next 10ms to re-enable read/write
2569 * probes on connections going over a particular interface.
2570 */
2571 void
2572 tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable)
2573 {
2574 int32_t offset;
2575 struct tcptimerlist *listp = &tcp_timer_list;
2576 struct inpcbinfo *pcbinfo = &tcbinfo;
2577 struct inpcb *inp, *nxt;
2578
2579 if (ifp == NULL) {
2580 return;
2581 }
2582
2583 /* update clock */
2584 calculate_tcp_clock();
2585
2586 /*
2587 * Enable or disable keep-alive read probes on all connections that
2588 * are active/established on this interface.
2589 */
2590 lck_rw_lock_shared(pcbinfo->ipi_lock);
2591
2592 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, nxt) {
2593 struct tcpcb *tp = NULL;
2594 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) ==
2595 WNT_STOPUSING) {
2596 continue;
2597 }
2598
2599 /* Acquire lock to look at the state of the connection */
2600 socket_lock(inp->inp_socket, 1);
2601
2602 /* Release the want count */
2603 if (inp->inp_ppcb == NULL ||
2604 (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
2605 socket_unlock(inp->inp_socket, 1);
2606 continue;
2607 }
2608 tp = intotcpcb(inp);
2609 if (enable) {
2610 tcp_enable_read_probe(tp, ifp);
2611 } else {
2612 tcp_disable_read_probe(tp);
2613 }
2614
2615 socket_unlock(inp->inp_socket, 1);
2616 }
2617 lck_rw_done(pcbinfo->ipi_lock);
2618
2619 lck_mtx_lock(listp->mtx);
2620 if (listp->running) {
2621 listp->pref_mode |= TCP_TIMERLIST_10MS_MODE;
2622 goto done;
2623 }
2624
2625 /* Reschedule within the next 10ms */
2626 offset = TCP_TIMER_10MS_QUANTUM;
2627 if (listp->scheduled) {
2628 int32_t diff;
2629 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2630 if (diff <= 0) {
2631 /* The timer will fire sooner than what's needed */
2632 goto done;
2633 }
2634 }
2635 listp->mode = TCP_TIMERLIST_10MS_MODE;
2636 listp->idleruns = 0;
2637
2638 tcp_sched_timerlist(offset);
2639 done:
2640 lck_mtx_unlock(listp->mtx);
2641 return;
2642 }
2643
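/*
 * Apply the MSS recommendation advertised in the cellular interface's
 * link status, caching the previous value in t_cached_maxopd so it can
 * be restored once the recommendation is lifted.
 */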
2644 inline void
2645 tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp)
2646 {
2647 struct if_cellular_status_v1 *ifsr;
2648 u_int32_t optlen;
2649 ifsr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
2650 if (ifsr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) {
2651 optlen = tp->t_maxopd - tp->t_maxseg;
2652
2653 if (ifsr->mss_recommended ==
2654 IF_CELL_UL_MSS_RECOMMENDED_NONE &&
2655 tp->t_cached_maxopd > 0 &&
2656 tp->t_maxopd < tp->t_cached_maxopd) {
2657 tp->t_maxopd = tp->t_cached_maxopd;
2658 tcpstat.tcps_mss_to_default++;
2659 } else if (ifsr->mss_recommended ==
2660 IF_CELL_UL_MSS_RECOMMENDED_MEDIUM &&
2661 tp->t_maxopd > tcp_mss_rec_medium) {
2662 tp->t_cached_maxopd = tp->t_maxopd;
2663 tp->t_maxopd = tcp_mss_rec_medium;
2664 tcpstat.tcps_mss_to_medium++;
2665 } else if (ifsr->mss_recommended ==
2666 IF_CELL_UL_MSS_RECOMMENDED_LOW &&
2667 tp->t_maxopd > tcp_mss_rec_low) {
2668 tp->t_cached_maxopd = tp->t_maxopd;
2669 tp->t_maxopd = tcp_mss_rec_low;
2670 tcpstat.tcps_mss_to_low++;
2671 }
2672 tp->t_maxseg = tp->t_maxopd - optlen;
2673
2674 /*
2675 * Clear the cached value if it is the same as the current one.
2676 */
2677 if (tp->t_maxopd == tp->t_cached_maxopd) {
2678 tp->t_cached_maxopd = 0;
2679 }
2680 }
2681 }
2682
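/*
 * Called with the socket locked; applies the cellular MSS
 * recommendation when the outgoing interface is cellular and the
 * connection has not progressed past CLOSE_WAIT, skipping connections
 * doing PMTU blackhole detection.
 */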
2683 void
2684 tcp_update_mss_locked(struct socket *so, struct ifnet *ifp)
2685 {
2686 struct inpcb *inp = sotoinpcb(so);
2687 struct tcpcb *tp = intotcpcb(inp);
2688
2689 if (ifp == NULL && (ifp = inp->inp_last_outifp) == NULL) {
2690 return;
2691 }
2692
2693 if (!IFNET_IS_CELLULAR(ifp)) {
2694 /*
2695 * This optimization is implemented for cellular
2696 * networks only
2697 */
2698 return;
2699 }
2700 if (tp->t_state <= TCPS_CLOSE_WAIT) {
2701 /*
2702 * If the connection is currently doing or has done PMTU
2703 * blackhole detection, do not change the MSS
2704 */
2705 if (tp->t_flags & TF_BLACKHOLE) {
2706 return;
2707 }
2708 if (ifp->if_link_status == NULL) {
2709 return;
2710 }
2711 tcp_update_mss_core(tp, ifp);
2712 }
2713 }
2714
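/*
 * Interface timer for the TCP pcb list: checks extended background
 * idle time on each socket, updates the MSS when requested, and
 * defuncts system background sockets on cellular interfaces whose link
 * quality has dropped to the abort threshold.
 */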
2715 void
2716 tcp_itimer(struct inpcbinfo *ipi)
2717 {
2718 struct inpcb *inp, *nxt;
2719
2720 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
2721 if (tcp_itimer_done == TRUE) {
2722 tcp_itimer_done = FALSE;
2723 atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
2724 return;
2725 }
2726 /* Could not get the lock right away; block and take it exclusively */
2727 lck_rw_lock_exclusive(ipi->ipi_lock);
2728 }
2729 tcp_itimer_done = TRUE;
2730
2731 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
2732 struct socket *so;
2733 struct ifnet *ifp;
2734
2735 if (inp->inp_ppcb == NULL ||
2736 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
2737 continue;
2738 }
2739 so = inp->inp_socket;
2740 ifp = inp->inp_last_outifp;
2741 socket_lock(so, 1);
2742 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2743 socket_unlock(so, 1);
2744 continue;
2745 }
2746 so_check_extended_bk_idle_time(so);
2747 if (ipi->ipi_flags & INPCBINFO_UPDATE_MSS) {
2748 tcp_update_mss_locked(so, NULL);
2749 }
2750 socket_unlock(so, 1);
2751
2752 /*
2753 * Defunct all system-initiated background sockets if the
2754 * socket is using the cellular interface and the interface
2755 * has its LQM set to abort.
2756 */
2757 if ((ipi->ipi_flags & INPCBINFO_HANDLE_LQM_ABORT) &&
2758 IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class) &&
2759 ifp != NULL && IFNET_IS_CELLULAR(ifp) &&
2760 (ifp->if_interface_state.valid_bitmask &
2761 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
2762 ifp->if_interface_state.lqm_state ==
2763 IFNET_LQM_THRESH_ABORT) {
2764 socket_defunct(current_proc(), so,
2765 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2766 }
2767 }
2768
2769 ipi->ipi_flags &= ~(INPCBINFO_UPDATE_MSS | INPCBINFO_HANDLE_LQM_ABORT);
2770 lck_rw_done(ipi->ipi_lock);
2771 }