apple/xnu xnu-3789.51.2: bsd/netinet/tcp_timer.c
1 /*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/kernel.h>
68 #include <sys/mbuf.h>
69 #include <sys/sysctl.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/protosw.h>
73 #include <sys/domain.h>
74 #include <sys/mcache.h>
75 #include <sys/queue.h>
76 #include <kern/locks.h>
77 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
78 #include <mach/boolean.h>
79
80 #include <net/route.h>
81 #include <net/if_var.h>
82 #include <net/ntstat.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/in_pcb.h>
87 #if INET6
88 #include <netinet6/in6_pcb.h>
89 #endif
90 #include <netinet/ip_var.h>
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_cache.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/tcp_cc.h>
98 #if INET6
99 #include <netinet6/tcp6_var.h>
100 #endif
101 #include <netinet/tcpip.h>
102 #if TCPDEBUG
103 #include <netinet/tcp_debug.h>
104 #endif
105 #include <sys/kdebug.h>
106 #include <mach/sdt.h>
107 #include <netinet/mptcp_var.h>
108
109 /* Max number of times a stretch ack can be delayed on a connection */
110 #define TCP_STRETCHACK_DELAY_THRESHOLD 5
111
112 /*
113 * If the host processor has been sleeping for too long, this is the threshold
114 * used to avoid sending stale retransmissions.
115 */
116 #define TCP_SLEEP_TOO_LONG (10 * 60 * 1000) /* 10 minutes in ms */
117
118 /* tcp timer list */
119 struct tcptimerlist tcp_timer_list;
120
121 /* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
122 struct tcptailq tcp_tw_tailq;
123
124 static int
125 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
126 {
127 #pragma unused(arg1, arg2)
128 int error, s, tt;
129
130 tt = *(int *)oidp->oid_arg1;
131 s = tt * 1000 / TCP_RETRANSHZ;
132
133 error = sysctl_handle_int(oidp, &s, 0, req);
134 if (error || !req->newptr)
135 return (error);
136
137 tt = s * TCP_RETRANSHZ / 1000;
138 if (tt < 1)
139 return (EINVAL);
140
141 *(int *)oidp->oid_arg1 = tt;
142 return (0);
143 }
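/*
 * Worked example of the conversion above, assuming TCP_RETRANSHZ is
 * 1000 ticks per second (its value in tcp_timer.h on current builds):
 *
 *   read:  tt = 7200000 ticks -> exported s = 7200000 * 1000 / 1000 = 7200000 ms
 *   write: s  = 30000 ms      -> stored  tt = 30000 * 1000 / 1000  = 30000 ticks
 *
 * With a 1000 Hz retransmit clock the millisecond and tick values happen to
 * coincide; a different TCP_RETRANSHZ would simply rescale them. A value
 * that would round down to less than one tick is rejected with EINVAL.
 */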
144
145 int tcp_keepinit;
146 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
147 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
148 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
149
150 int tcp_keepidle;
151 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
152 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
153 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
154
155 int tcp_keepintvl;
156 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
157 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
158 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
159
160 int tcp_keepcnt;
161 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
162 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
163 &tcp_keepcnt, 0, "number of times to repeat keepalive");
164
165 int tcp_msl;
166 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
167 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
168 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
169
170 /*
171 * Avoid DoS via TCP Robustness in Persist Condition
172 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
173 * by allowing a system wide maximum persistence timeout value when in
174 * Zero Window Probe mode.
175 *
176  * Expressed in milliseconds to be consistent with other timeout related
177  * values; the TCP socket option is in seconds.
178 */
179 u_int32_t tcp_max_persist_timeout = 0;
180 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
181 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
182 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
183 "Maximum persistence timeout for ZWP");
184
185 static int always_keepalive = 0;
186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive,
187 CTLFLAG_RW | CTLFLAG_LOCKED,
188 &always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");
189
190 /*
191 * This parameter determines how long the timer list will stay in fast or
192 * quick mode even though all connections are idle. In this state, the
193 * timer will run more frequently anticipating new data.
194 */
195 int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX;
196 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax,
197 CTLFLAG_RW | CTLFLAG_LOCKED,
198 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");
199
200 /*
201 * See tcp_syn_backoff[] for interval values between SYN retransmits;
202  * the value set below defines the number of retransmits before we
203  * disable the timestamp and window scaling options during subsequent
204  * SYN retransmits. Setting it to 0 disables dropping those
205  * two options.
206 */
207 static int tcp_broken_peer_syn_rxmit_thres = 10;
208 SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres,
209 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres, 0,
210 "Number of retransmitted SYNs before disabling RFC 1323 "
211 "options on local connections");
212
213 static int tcp_timer_advanced = 0;
214 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
215 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
216 "Number of times one of the timers was advanced");
217
218 static int tcp_resched_timerlist = 0;
219 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
220 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
221 "Number of times timer list was rescheduled as part of processing a packet");
222
223 int tcp_pmtud_black_hole_detect = 1;
224 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0,
226 "Path MTU Discovery Black Hole Detection");
227
228 int tcp_pmtud_black_hole_mss = 1200;
229 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
230 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0,
231 "Path MTU Discovery Black Hole Detection lowered MSS");
232
233 static u_int32_t tcp_mss_rec_medium = 1200;
234 static u_int32_t tcp_mss_rec_low = 512;
235
236 #define TCP_REPORT_STATS_INTERVAL 43200 /* 12 hours, in seconds */
237 int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL;
238
239 /* TRUE when garbage collection of "used" sockets was performed in the previous run */
240 static boolean_t tcp_gc_done = FALSE;
241
242 /* max idle probes */
243 int tcp_maxpersistidle;
244
245 /*
246  * TCP delack timer is set to 100 ms. Since the timer list in fast
247  * mode is processed no more often than every 100 ms, the delayed ack timer
248  * will fire somewhere between 100 and 200 ms.
249 */
250 int tcp_delack = TCP_RETRANSHZ / 10;
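/*
 * For example, assuming TCP_RETRANSHZ == 1000, tcp_delack is 1000 / 10 =
 * 100 ticks (100 ms); with the 100 ms fast-mode quantum adding up to one
 * more interval, a delayed ACK goes out roughly 100 to 200 ms after the
 * data arrived, as noted above.
 */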
251
252 #if MPTCP
253 /*
254 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
255 */
256 int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
257 #endif /* MPTCP */
258
259 static boolean_t tcp_itimer_done = FALSE;
260
261 static void tcp_remove_timer(struct tcpcb *tp);
262 static void tcp_sched_timerlist(uint32_t offset);
263 static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode,
264 u_int16_t probe_if_index);
265 static void tcp_sched_timers(struct tcpcb *tp);
266 static inline void tcp_set_lotimer_index(struct tcpcb *);
267 __private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
268 static inline void tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp);
269 __private_extern__ void tcp_report_stats(void);
270
271 static u_int64_t tcp_last_report_time;
272
273 /*
274 * Structure to store previously reported stats so that we can send
275 * incremental changes in each report interval.
276 */
277 struct tcp_last_report_stats {
278 u_int32_t tcps_connattempt;
279 u_int32_t tcps_accepts;
280 u_int32_t tcps_ecn_client_setup;
281 u_int32_t tcps_ecn_server_setup;
282 u_int32_t tcps_ecn_client_success;
283 u_int32_t tcps_ecn_server_success;
284 u_int32_t tcps_ecn_not_supported;
285 u_int32_t tcps_ecn_lost_syn;
286 u_int32_t tcps_ecn_lost_synack;
287 u_int32_t tcps_ecn_recv_ce;
288 u_int32_t tcps_ecn_recv_ece;
289 u_int32_t tcps_ecn_sent_ece;
290 u_int32_t tcps_ecn_conn_recv_ce;
291 u_int32_t tcps_ecn_conn_recv_ece;
292 u_int32_t tcps_ecn_conn_plnoce;
293 u_int32_t tcps_ecn_conn_pl_ce;
294 u_int32_t tcps_ecn_conn_nopl_ce;
295 u_int32_t tcps_ecn_fallback_synloss;
296 u_int32_t tcps_ecn_fallback_reorder;
297 u_int32_t tcps_ecn_fallback_ce;
298
299 /* TFO-related statistics */
300 u_int32_t tcps_tfo_syn_data_rcv;
301 u_int32_t tcps_tfo_cookie_req_rcv;
302 u_int32_t tcps_tfo_cookie_sent;
303 u_int32_t tcps_tfo_cookie_invalid;
304 u_int32_t tcps_tfo_cookie_req;
305 u_int32_t tcps_tfo_cookie_rcv;
306 u_int32_t tcps_tfo_syn_data_sent;
307 u_int32_t tcps_tfo_syn_data_acked;
308 u_int32_t tcps_tfo_syn_loss;
309 u_int32_t tcps_tfo_blackhole;
310 u_int32_t tcps_tfo_cookie_wrong;
311 u_int32_t tcps_tfo_no_cookie_rcv;
312 u_int32_t tcps_tfo_heuristics_disable;
313 u_int32_t tcps_tfo_sndblackhole;
314 };
315
316
317 /* Returns true if the timer is on the timer list */
318 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
319
320 /* Run the TCP timerlist at least once every hour */
321 #define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)
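/*
 * For illustration, with TCP_RETRANSHZ == 1000 this works out to
 * 60 * 60 * 1000 = 3,600,000 ticks, i.e. the timer list is still kicked
 * at least once an hour even when no connection has a pending timer.
 */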
322
323
324 static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
325 static boolean_t tcp_garbage_collect(struct inpcb *, int);
326
327 #define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
328
329 #define VERIFY_NEXT_LINK(elm,field) do { \
330 if (LIST_NEXT((elm),field) != NULL && \
331 LIST_NEXT((elm),field)->field.le_prev != \
332 &((elm)->field.le_next)) \
333 panic("Bad link elm %p next->prev != elm", (elm)); \
334 } while(0)
335
336 #define VERIFY_PREV_LINK(elm,field) do { \
337 if (*(elm)->field.le_prev != (elm)) \
338 panic("Bad link elm %p prev->next != elm", (elm)); \
339 } while(0)
340
341 #define TCP_SET_TIMER_MODE(mode, i) do { \
342 if (IS_TIMER_HZ_10MS(i)) \
343 (mode) |= TCP_TIMERLIST_10MS_MODE; \
344 else if (IS_TIMER_HZ_100MS(i)) \
345 (mode) |= TCP_TIMERLIST_100MS_MODE; \
346 else \
347 (mode) |= TCP_TIMERLIST_500MS_MODE; \
348 } while(0)
349
350 #if (DEVELOPMENT || DEBUG)
351 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_medium,
352 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_medium, 0,
353 "Medium MSS based on recommendation in link status report");
354 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_low,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_low, 0,
356 "Low MSS based on recommendation in link status report");
357
358 static int32_t tcp_change_mss_recommended = 0;
359 static int
360 sysctl_change_mss_recommended SYSCTL_HANDLER_ARGS
361 {
362 #pragma unused(oidp, arg1, arg2)
363 int i, err = 0, changed = 0;
364 struct ifnet *ifp;
365 struct if_link_status ifsr;
366 struct if_cellular_status_v1 *new_cell_sr;
367 err = sysctl_io_number(req, tcp_change_mss_recommended,
368 sizeof (int32_t), &i, &changed);
369 if (changed) {
370 ifnet_head_lock_shared();
371 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
372 if (IFNET_IS_CELLULAR(ifp)) {
373 bzero(&ifsr, sizeof (ifsr));
374 new_cell_sr = &ifsr.ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
375 ifsr.ifsr_version = IF_CELLULAR_STATUS_REPORT_CURRENT_VERSION;
376 ifsr.ifsr_len = sizeof(*new_cell_sr);
377
378 /* Set MSS recommended */
379 new_cell_sr->valid_bitmask |= IF_CELL_UL_MSS_RECOMMENDED_VALID;
380 new_cell_sr->mss_recommended = i;
381 err = ifnet_link_status_report(ifp, new_cell_sr, sizeof (*new_cell_sr));
382 if (err == 0) {
383 tcp_change_mss_recommended = i;
384 } else {
385 break;
386 }
387 }
388 }
389 ifnet_head_done();
390 }
391 return (err);
392 }
393
394 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, change_mss_recommended,
395 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_change_mss_recommended,
396 0, sysctl_change_mss_recommended, "IU", "Change MSS recommended");
397
398 SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval,
399 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0,
400 "Report stats interval");
401 #endif /* (DEVELOPMENT || DEBUG) */
402
403 /*
404  * Function to compare two timers. If there is a reset of the sign bit,
405  * it is safe to assume that the timer has wrapped around. By doing a
406  * signed comparison, we take care of wraparound such that the value
407  * with the sign bit reset is actually ahead of the other.
408 */
409 inline int32_t
410 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
411 return (int32_t)((t1 + toff1) - (t2 + toff2));
412 }
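/*
 * Worked example of the wraparound handling, assuming a 32-bit tick
 * counter that has recently wrapped:
 *
 *   timer_diff(0x00000005, 0, 0xFFFFFFF0, 0)
 *       = (int32_t)(0x00000005 - 0xFFFFFFF0) = 21 > 0
 *
 * so t1 is treated as 21 ticks ahead of t2 even though its unsigned value
 * is smaller. The result is only meaningful while the two timestamps are
 * within 2^31 ticks of each other.
 */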
413
414 /*
415 * Add to tcp timewait list, delay is given in milliseconds.
416 */
417 static void
418 add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
419 {
420 struct inpcbinfo *pcbinfo = &tcbinfo;
421 struct inpcb *inp = tp->t_inpcb;
422 uint32_t timer;
423
424 /* pcb list should be locked when we get here */
425 lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
426
427 /* We may get here multiple times, so check */
428 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
429 pcbinfo->ipi_twcount++;
430 inp->inp_flags2 |= INP2_TIMEWAIT;
431
432 /* Remove from global inp list */
433 LIST_REMOVE(inp, inp_list);
434 } else {
435 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
436 }
437
438 /* Compute the time at which this socket can be closed */
439 timer = tcp_now + delay;
440
441 /* We will use the TCPT_2MSL timer for tracking this delay */
442
443 if (TIMER_IS_ON_LIST(tp))
444 tcp_remove_timer(tp);
445 tp->t_timer[TCPT_2MSL] = timer;
446
447 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
448 }
449
450 void
451 add_to_time_wait(struct tcpcb *tp, uint32_t delay)
452 {
453 struct inpcbinfo *pcbinfo = &tcbinfo;
454 if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP)
455 socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);
456
457 /* 19182803: Notify nstat that connection is closing before waiting. */
458 nstat_pcb_detach(tp->t_inpcb);
459
460 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
461 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
462 lck_rw_lock_exclusive(pcbinfo->ipi_lock);
463 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
464 }
465 add_to_time_wait_locked(tp, delay);
466 lck_rw_done(pcbinfo->ipi_lock);
467
468 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
469 }
470
471 /* If this is on time wait queue, remove it. */
472 void
473 tcp_remove_from_time_wait(struct inpcb *inp)
474 {
475 struct tcpcb *tp = intotcpcb(inp);
476 if (inp->inp_flags2 & INP2_TIMEWAIT)
477 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
478 }
479
480 static boolean_t
481 tcp_garbage_collect(struct inpcb *inp, int istimewait)
482 {
483 boolean_t active = FALSE;
484 struct socket *so;
485 struct tcpcb *tp;
486
487 so = inp->inp_socket;
488 tp = intotcpcb(inp);
489
490 /*
491 * Skip if still in use or busy; it would have been more efficient
492 * if we were to test so_usecount against 0, but this isn't possible
493 * due to the current implementation of tcp_dropdropablreq() where
494 * overflow sockets that are eligible for garbage collection have
495 * their usecounts set to 1.
496 */
497 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
498 return (TRUE);
499
500 /* Check again under the lock */
501 if (so->so_usecount > 1) {
502 if (inp->inp_wantcnt == WNT_STOPUSING)
503 active = TRUE;
504 lck_mtx_unlock(&inp->inpcb_mtx);
505 return (active);
506 }
507
508 if (istimewait &&
509 TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
510 tp->t_state != TCPS_CLOSED) {
511 /* Become a regular mutex */
512 lck_mtx_convert_spin(&inp->inpcb_mtx);
513 tcp_close(tp);
514 }
515
516 /*
517 * Overflowed socket dropped from the listening queue? Do this
518 * only if we are called to clean up the time wait slots, since
519 * tcp_dropdropablreq() considers a socket to have been fully
520 * dropped after add_to_time_wait() is finished.
521 * Also handle the case of connections getting closed by the peer
522 * while in the queue as seen with rdar://6422317
523 *
524 */
525 if (so->so_usecount == 1 &&
526 ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
527 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
528 (so->so_head != NULL) &&
529 ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
530 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
531
532 if (inp->inp_state != INPCB_STATE_DEAD) {
533 /* Become a regular mutex */
534 lck_mtx_convert_spin(&inp->inpcb_mtx);
535 #if INET6
536 if (SOCK_CHECK_DOM(so, PF_INET6))
537 in6_pcbdetach(inp);
538 else
539 #endif /* INET6 */
540 in_pcbdetach(inp);
541 }
542 VERIFY(so->so_usecount > 0);
543 so->so_usecount--;
544 if (inp->inp_wantcnt == WNT_STOPUSING)
545 active = TRUE;
546 lck_mtx_unlock(&inp->inpcb_mtx);
547 return (active);
548 } else if (inp->inp_wantcnt != WNT_STOPUSING) {
549 lck_mtx_unlock(&inp->inpcb_mtx);
550 return (FALSE);
551 }
552
553 /*
554 * We get here because the PCB is no longer searchable
555 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
556 * (usecount is 0). This covers all cases, including overflow
557 * sockets and those that are considered as "embryonic",
558 * i.e. created by sonewconn() in TCP input path, and have
559 * not yet been committed. For the former, we reduce the usecount
560 * to 0 as done by the code above. For the latter, the usecount
561  * would have reduced to 0 as part of calling soabort() when the
562 * socket is dropped at the end of tcp_input().
563 */
564 if (so->so_usecount == 0) {
565 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
566 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
567 /* Become a regular mutex */
568 lck_mtx_convert_spin(&inp->inpcb_mtx);
569
570 /*
571 * If this tp still happens to be on the timer list,
572 * take it out
573 */
574 if (TIMER_IS_ON_LIST(tp)) {
575 tcp_remove_timer(tp);
576 }
577
578 if (inp->inp_state != INPCB_STATE_DEAD) {
579 #if INET6
580 if (SOCK_CHECK_DOM(so, PF_INET6))
581 in6_pcbdetach(inp);
582 else
583 #endif /* INET6 */
584 in_pcbdetach(inp);
585 }
586 in_pcbdispose(inp);
587 return (FALSE);
588 }
589
590 lck_mtx_unlock(&inp->inpcb_mtx);
591 return (TRUE);
592 }
593
594 /*
595 * TCP garbage collector callback (inpcb_timer_func_t).
596 *
597 * Returns the number of pcbs that will need to be gc-ed soon,
598  * returning > 0 will keep the timer active.
599 */
600 void
601 tcp_gc(struct inpcbinfo *ipi)
602 {
603 struct inpcb *inp, *nxt;
604 struct tcpcb *tw_tp, *tw_ntp;
605 #if TCPDEBUG
606 int ostate;
607 #endif
608 #if KDEBUG
609 static int tws_checked = 0;
610 #endif
611
612 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
613
614 /*
615 * Update tcp_now here as it may get used while
616 * processing the slow timer.
617 */
618 calculate_tcp_clock();
619
620 /*
621 * Garbage collect socket/tcpcb: We need to acquire the list lock
622 * exclusively to do this
623 */
624
625 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
626 /* don't sweat it this time; cleanup was done last time */
627 if (tcp_gc_done == TRUE) {
628 tcp_gc_done = FALSE;
629 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
630 tws_checked, cur_tw_slot, 0, 0, 0);
631 /* Lock upgrade failed, give up this round */
632 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
633 return;
634 }
635 /* Upgrade failed; we lost the lock, so take it again exclusively */
636 lck_rw_lock_exclusive(ipi->ipi_lock);
637 }
638 tcp_gc_done = TRUE;
639
640 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
641 if (tcp_garbage_collect(inp, 0))
642 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
643 }
644
645 /* Now clean up the time-wait ones */
646 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
647 /*
648 * We check the timestamp here without holding the
649 * socket lock for better performance. If there are
650 * any pcbs in time-wait, the timer will get rescheduled.
651 * Hence some error in this check can be tolerated.
652 *
653 * Sometimes a socket on time-wait queue can be closed if
654 * 2MSL timer expired but the application still has a
655 * usecount on it.
656 */
657 if (tw_tp->t_state == TCPS_CLOSED ||
658 TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
659 if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
660 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
661 }
662 }
663
664 /* take into account pcbs that are still in time_wait_slots */
665 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
666
667 lck_rw_done(ipi->ipi_lock);
668
669 /* Clean up the socache while we are here */
670 if (so_cache_timer())
671 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
672
673 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
674 cur_tw_slot, 0, 0, 0);
675
676 return;
677 }
678
679 /*
680 * Cancel all timers for TCP tp.
681 */
682 void
683 tcp_canceltimers(struct tcpcb *tp)
684 {
685 int i;
686
687 tcp_remove_timer(tp);
688 for (i = 0; i < TCPT_NTIMERS; i++)
689 tp->t_timer[i] = 0;
690 tp->tentry.timer_start = tcp_now;
691 tp->tentry.index = TCPT_NONE;
692 }
693
694 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
695 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
696
697 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
698 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
699
700 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
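/*
 * Sketch of how these tables are applied in the TCPT_REXMT case below: the
 * retransmission timeout after n consecutive timeouts is roughly
 *
 *   rexmt = TCP_REXMTVAL(tp) * tcp_backoff[n]
 *
 * clamped by TCPT_RANGESET() between t_rttmin and TCPTV_REXMTMAX (plus the
 * retransmit slop). For example, with a 500 ms base RTO the third timeout
 * (n == 3) waits about 500 * 8 = 4000 ms before retransmitting again.
 * SYN retransmits use tcp_syn_backoff[] instead, which backs off more
 * gently at first. tcp_totbackoff (1+2+4+8+16+32 + 7*64 = 511) is the sum
 * of tcp_backoff[] and bounds the total time spent probing in persist state.
 */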
701
702 void
703 tcp_rexmt_save_state(struct tcpcb *tp)
704 {
705 u_int32_t fsize;
706 if (TSTMP_SUPPORTED(tp)) {
707 /*
708 * Since timestamps are supported on the connection,
709 * we can do recovery as described in rfc 4015.
710 */
711 fsize = tp->snd_max - tp->snd_una;
712 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
713 tp->snd_recover_prev = tp->snd_recover;
714 } else {
715 /*
716 * Timestamp option is not supported on this connection.
717 * Record ssthresh and cwnd so they can
718 * be recovered if this turns out to be a "bad" retransmit.
719 * A retransmit is considered "bad" if an ACK for this
720 * segment is received within RTT/2 interval; the assumption
721 * here is that the ACK was already in flight. See
722 * "On Estimating End-to-End Network Path Properties" by
723 * Allman and Paxson for more details.
724 */
725 tp->snd_cwnd_prev = tp->snd_cwnd;
726 tp->snd_ssthresh_prev = tp->snd_ssthresh;
727 tp->snd_recover_prev = tp->snd_recover;
728 if (IN_FASTRECOVERY(tp))
729 tp->t_flags |= TF_WASFRECOVERY;
730 else
731 tp->t_flags &= ~TF_WASFRECOVERY;
732 }
733 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
734 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
735 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
736 }
737
738 /*
739 * Revert to the older segment size if there is an indication that PMTU
740 * blackhole detection was not needed.
741 */
742 void
743 tcp_pmtud_revert_segment_size(struct tcpcb *tp)
744 {
745 int32_t optlen;
746
747 VERIFY(tp->t_pmtud_saved_maxopd > 0);
748 tp->t_flags |= TF_PMTUD;
749 tp->t_flags &= ~TF_BLACKHOLE;
750 optlen = tp->t_maxopd - tp->t_maxseg;
751 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
752 tp->t_maxseg = tp->t_maxopd - optlen;
753 /*
754 * Reset the slow-start flight size as it
755 * may depend on the new MSS
756 */
757 if (CC_ALGO(tp)->cwnd_init != NULL)
758 CC_ALGO(tp)->cwnd_init(tp);
759 tp->t_pmtud_start_ts = 0;
760 tcpstat.tcps_pmtudbh_reverted++;
761 }
762
763 /*
764 * TCP timer processing.
765 */
766 struct tcpcb *
767 tcp_timers(struct tcpcb *tp, int timer)
768 {
769 int32_t rexmt, optlen = 0, idle_time = 0;
770 struct socket *so;
771 struct tcptemp *t_template;
772 #if TCPDEBUG
773 int ostate;
774 #endif
775
776 #if INET6
777 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
778 #endif /* INET6 */
779 u_int64_t accsleep_ms;
780 u_int32_t last_sleep_ms = 0;
781
782 so = tp->t_inpcb->inp_socket;
783 idle_time = tcp_now - tp->t_rcvtime;
784
785 switch (timer) {
786
787 /*
788 * 2 MSL timeout in shutdown went off. If we're closed but
789 * still waiting for peer to close and connection has been idle
790 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
791 * delete connection control block.
792  * Otherwise (this case shouldn't happen), check again in a bit;
793  * we keep the socket in the main list in that case.
794 */
795 case TCPT_2MSL:
796 tcp_free_sackholes(tp);
797 if (tp->t_state != TCPS_TIME_WAIT &&
798 tp->t_state != TCPS_FIN_WAIT_2 &&
799 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
800 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
801 (u_int32_t)TCP_CONN_KEEPINTVL(tp));
802 } else {
803 tp = tcp_close(tp);
804 return(tp);
805 }
806 break;
807
808 /*
809 * Retransmission timer went off. Message has not
810 * been acked within retransmit interval. Back off
811 * to a longer retransmit interval and retransmit one segment.
812 */
813 case TCPT_REXMT:
814 absolutetime_to_nanoseconds(mach_absolutetime_asleep,
815 &accsleep_ms);
816 accsleep_ms = accsleep_ms / 1000000UL;
817 if (accsleep_ms > tp->t_accsleep_ms)
818 last_sleep_ms = accsleep_ms - tp->t_accsleep_ms;
819 /*
820 * Drop a connection in the retransmit timer
821 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
822 * times
823 * 2. If the time spent in this retransmission episode is
824 * more than the time limit set with TCP_RXT_CONNDROPTIME
825 * socket option
826 * 3. If TCP_RXT_FINDROP socket option was set and
827 * we have already retransmitted the FIN 3 times without
828 * receiving an ack
829 */
830 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
831 (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
832 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
833 ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
834 (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4) ||
835 (tp->t_rxtshift > 4 && last_sleep_ms >= TCP_SLEEP_TOO_LONG)) {
836 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
837 tcpstat.tcps_rxtfindrop++;
838 } else if (last_sleep_ms >= TCP_SLEEP_TOO_LONG) {
839 tcpstat.tcps_drop_after_sleep++;
840 } else {
841 tcpstat.tcps_timeoutdrop++;
842 }
843 if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) {
844 if (TCP_ECN_ENABLED(tp)) {
845 INP_INC_IFNET_STAT(tp->t_inpcb,
846 ecn_on.rxmit_drop);
847 } else {
848 INP_INC_IFNET_STAT(tp->t_inpcb,
849 ecn_off.rxmit_drop);
850 }
851 }
852 tp->t_rxtshift = TCP_MAXRXTSHIFT;
853 postevent(so, 0, EV_TIMEOUT);
854 soevent(so,
855 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
856
857 if (TCP_ECN_ENABLED(tp) &&
858 tp->t_state == TCPS_ESTABLISHED)
859 tcp_heuristic_ecn_droprxmt(tp);
860
861 tp = tcp_drop(tp, tp->t_softerror ?
862 tp->t_softerror : ETIMEDOUT);
863
864 break;
865 }
866
867 tcpstat.tcps_rexmttimeo++;
868 tp->t_accsleep_ms = accsleep_ms;
869
870 if (tp->t_rxtshift == 1 &&
871 tp->t_state == TCPS_ESTABLISHED) {
872 /* Set the time at which retransmission started. */
873 tp->t_rxtstart = tcp_now;
874
875 /*
876 * if this is the first retransmit timeout, save
877 * the state so that we can recover if the timeout
878 * is spurious.
879 */
880 tcp_rexmt_save_state(tp);
881 }
882 #if MPTCP
883 if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
884 (tp->t_state == TCPS_ESTABLISHED) &&
885 (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
886 mptcp_act_on_txfail(so);
887
888 }
889 #endif /* MPTCP */
890
891 if (tp->t_adaptive_wtimo > 0 &&
892 tp->t_rxtshift > tp->t_adaptive_wtimo &&
893 TCPS_HAVEESTABLISHED(tp->t_state)) {
894 /* Send an event to the application */
895 soevent(so,
896 (SO_FILT_HINT_LOCKED|
897 SO_FILT_HINT_ADAPTIVE_WTIMO));
898 }
899
900 /*
901 * If this is a retransmit timeout after PTO, the PTO
902 * was not effective
903 */
904 if (tp->t_flagsext & TF_SENT_TLPROBE) {
905 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
906 tcpstat.tcps_rto_after_pto++;
907 }
908
909 if (tp->t_flagsext & TF_DELAY_RECOVERY) {
910 /*
911 * Retransmit timer fired before entering recovery
912 * on a connection with packet re-ordering. This
913 * suggests that the reordering metrics computed
914 * are not accurate.
915 */
916 tp->t_reorderwin = 0;
917 tp->t_timer[TCPT_DELAYFR] = 0;
918 tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
919 }
920
921 if (tp->t_state == TCPS_SYN_RECEIVED)
922 tcp_disable_tfo(tp);
923
924 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
925 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
926 ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
927 tp->t_rxtshift > 2)) {
928 /*
929  * For regular retransmissions, a first one has already been
930  * done as a tail-loss probe.
931 * Thus, if rxtshift > 1, this means we have sent the segment
932 * a total of 3 times.
933 *
934 * If we are in SYN-SENT state, then there is no tail-loss
935 * probe thus we have to let rxtshift go up to 3.
936 */
937 tcp_heuristic_tfo_middlebox(tp);
938
939 so->so_error = ENODATA;
940 sorwakeup(so);
941 sowwakeup(so);
942
943 tp->t_tfo_stats |= TFO_S_SEND_BLACKHOLE;
944 tcpstat.tcps_tfo_sndblackhole++;
945 }
946
947 if (tp->t_state == TCPS_SYN_SENT) {
948 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
949 tp->t_stat.synrxtshift = tp->t_rxtshift;
950
951 /* When retransmitting, disable TFO */
952 if (tfo_enabled(tp) && !(so->so_flags & SOF1_DATA_AUTHENTICATED)) {
953 tp->t_flagsext &= ~TF_FASTOPEN;
954 tp->t_tfo_flags |= TFO_F_SYN_LOSS;
955 }
956 } else {
957 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
958 }
959
960 TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX,
961 TCP_ADD_REXMTSLOP(tp));
962 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
963
964 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
965 goto fc_output;
966
967 tcp_free_sackholes(tp);
968 /*
969 * Check for potential Path MTU Discovery Black Hole
970 */
971 if (tcp_pmtud_black_hole_detect &&
972 !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
973 (tp->t_state == TCPS_ESTABLISHED)) {
974 if ((tp->t_flags & TF_PMTUD) &&
975 ((tp->t_flags & TF_MAXSEGSNT)
976 || tp->t_pmtud_lastseg_size > tcp_pmtud_black_hole_mss) &&
977 tp->t_rxtshift == 2) {
978 /*
979 * Enter Path MTU Black-hole Detection mechanism:
980 * - Disable Path MTU Discovery (IP "DF" bit).
981 * - Reduce MTU to lower value than what we
982 * negotiated with the peer.
983 */
984 /* Disable Path MTU Discovery for now */
985 tp->t_flags &= ~TF_PMTUD;
986 /* Record that we may have found a black hole */
987 tp->t_flags |= TF_BLACKHOLE;
988 optlen = tp->t_maxopd - tp->t_maxseg;
989 /* Keep track of previous MSS */
990 tp->t_pmtud_saved_maxopd = tp->t_maxopd;
991 tp->t_pmtud_start_ts = tcp_now;
992 if (tp->t_pmtud_start_ts == 0)
993 tp->t_pmtud_start_ts++;
994 /* Reduce the MSS to intermediary value */
995 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
996 tp->t_maxopd = tcp_pmtud_black_hole_mss;
997 } else {
998 tp->t_maxopd = /* use the default MSS */
999 #if INET6
1000 isipv6 ? tcp_v6mssdflt :
1001 #endif /* INET6 */
1002 tcp_mssdflt;
1003 }
1004 tp->t_maxseg = tp->t_maxopd - optlen;
1005
1006 /*
1007 * Reset the slow-start flight size
1008 * as it may depend on the new MSS
1009 */
1010 if (CC_ALGO(tp)->cwnd_init != NULL)
1011 CC_ALGO(tp)->cwnd_init(tp);
1012 tp->snd_cwnd = tp->t_maxseg;
1013 }
1014 /*
1015 * If further retransmissions are still
1016 * unsuccessful with a lowered MTU, maybe this
1017 * isn't a Black Hole and we restore the previous
1018 * MSS and blackhole detection flags.
1019 */
1020 else {
1021
1022 if ((tp->t_flags & TF_BLACKHOLE) &&
1023 (tp->t_rxtshift > 4)) {
1024 tcp_pmtud_revert_segment_size(tp);
1025 tp->snd_cwnd = tp->t_maxseg;
1026 }
1027 }
1028 }
1029
1030
1031 /*
1032 * Disable rfc1323 and rfc1644 if we haven't got any
1033 * response to our SYN (after we reach the threshold)
1034 * to work-around some broken terminal servers (most of
1035 * which have hopefully been retired) that have bad VJ
1036 * header compression code which trashes TCP segments
1037 * containing unknown-to-them TCP options.
1038 * Do this only on non-local connections.
1039 */
1040 if (tp->t_state == TCPS_SYN_SENT &&
1041 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres)
1042 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
1043
1044 /*
1045 * If losing, let the lower level know and try for
1046 * a better route. Also, if we backed off this far,
1047 * our srtt estimate is probably bogus. Clobber it
1048 * so we'll take the next rtt measurement as our srtt;
1049 * move the current srtt into rttvar to keep the current
1050 * retransmit times until then.
1051 */
1052 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
1053 #if INET6
1054 if (isipv6)
1055 in6_losing(tp->t_inpcb);
1056 else
1057 #endif /* INET6 */
1058 in_losing(tp->t_inpcb);
1059 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
1060 tp->t_srtt = 0;
1061 }
1062 tp->snd_nxt = tp->snd_una;
1063 /*
1064 * Note: We overload snd_recover to function also as the
1065 * snd_last variable described in RFC 2582
1066 */
1067 tp->snd_recover = tp->snd_max;
1068 /*
1069 * Force a segment to be sent.
1070 */
1071 tp->t_flags |= TF_ACKNOW;
1072
1073 /* If timing a segment in this window, stop the timer */
1074 tp->t_rtttime = 0;
1075
1076 if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1)
1077 tcpstat.tcps_tailloss_rto++;
1078
1079
1080 /*
1081 * RFC 5681 says: when a TCP sender detects segment loss
1082 * using retransmit timer and the given segment has already
1083 * been retransmitted by way of the retransmission timer at
1084 * least once, the value of ssthresh is held constant
1085 */
1086 if (tp->t_rxtshift == 1 &&
1087 CC_ALGO(tp)->after_timeout != NULL) {
1088 CC_ALGO(tp)->after_timeout(tp);
1089 /*
1090 * CWR notifications are to be sent on new data
1091 * right after Fast Retransmits and ECE
1092 * notification receipts.
1093 */
1094 if (TCP_ECN_ENABLED(tp))
1095 tp->ecn_flags |= TE_SENDCWR;
1096 }
1097
1098 EXIT_FASTRECOVERY(tp);
1099
1100 /* Exit cwnd non validated phase */
1101 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
1102
1103
1104 fc_output:
1105 tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);
1106
1107 (void) tcp_output(tp);
1108 break;
1109
1110 /*
1111  * Persistence timer into zero window.
1112 * Force a byte to be output, if possible.
1113 */
1114 case TCPT_PERSIST:
1115 tcpstat.tcps_persisttimeo++;
1116 /*
1117 * Hack: if the peer is dead/unreachable, we do not
1118 * time out if the window is closed. After a full
1119 * backoff, drop the connection if the idle time
1120 * (no responses to probes) reaches the maximum
1121 * backoff that we would use if retransmitting.
1122 *
1123 * Drop the connection if we reached the maximum allowed time for
1124 * Zero Window Probes without a non-zero update from the peer.
1125 * See rdar://5805356
1126 */
1127 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
1128 (idle_time >= tcp_maxpersistidle ||
1129 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
1130 ((tp->t_persist_stop != 0) &&
1131 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
1132 tcpstat.tcps_persistdrop++;
1133 postevent(so, 0, EV_TIMEOUT);
1134 soevent(so,
1135 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1136 tp = tcp_drop(tp, ETIMEDOUT);
1137 break;
1138 }
1139 tcp_setpersist(tp);
1140 tp->t_flagsext |= TF_FORCE;
1141 (void) tcp_output(tp);
1142 tp->t_flagsext &= ~TF_FORCE;
1143 break;
1144
1145 /*
1146 * Keep-alive timer went off; send something
1147 * or drop connection if idle for too long.
1148 */
1149 case TCPT_KEEP:
1150 tcpstat.tcps_keeptimeo++;
1151 #if MPTCP
1152 /*
1153  * Regular TCP connections do not send keepalives after closing.
1154  * MPTCP must not either, after sending Data FINs.
1155 */
1156 struct mptcb *mp_tp = tp->t_mptcb;
1157 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
1158 (tp->t_state > TCPS_ESTABLISHED)) {
1159 goto dropit;
1160 } else if (mp_tp != NULL) {
1161 if ((mptcp_ok_to_keepalive(mp_tp) == 0))
1162 goto dropit;
1163 }
1164 #endif /* MPTCP */
1165 if (tp->t_state < TCPS_ESTABLISHED)
1166 goto dropit;
1167 if ((always_keepalive ||
1168 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
1169 (tp->t_flagsext & TF_DETECT_READSTALL) ||
1170 (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
1171 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
1172 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
1173 goto dropit;
1174 /*
1175 * Send a packet designed to force a response
1176 * if the peer is up and reachable:
1177 * either an ACK if the connection is still alive,
1178 * or an RST if the peer has closed the connection
1179 * due to timeout or reboot.
1180 * Using sequence number tp->snd_una-1
1181 * causes the transmitted zero-length segment
1182 * to lie outside the receive window;
1183 * by the protocol spec, this requires the
1184 * correspondent TCP to respond.
1185 */
1186 tcpstat.tcps_keepprobe++;
1187 t_template = tcp_maketemplate(tp);
1188 if (t_template) {
1189 struct inpcb *inp = tp->t_inpcb;
1190 struct tcp_respond_args tra;
1191
1192 bzero(&tra, sizeof(tra));
1193 tra.nocell = INP_NO_CELLULAR(inp);
1194 tra.noexpensive = INP_NO_EXPENSIVE(inp);
1195 tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
1196 tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
1197 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
1198 tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
1199 else
1200 tra.ifscope = IFSCOPE_NONE;
1201 tcp_respond(tp, t_template->tt_ipgen,
1202 &t_template->tt_t, (struct mbuf *)NULL,
1203 tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
1204 (void) m_free(dtom(t_template));
1205 if (tp->t_flagsext & TF_DETECT_READSTALL)
1206 tp->t_rtimo_probes++;
1207 }
1208 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1209 TCP_CONN_KEEPINTVL(tp));
1210 } else {
1211 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1212 TCP_CONN_KEEPIDLE(tp));
1213 }
1214 if (tp->t_flagsext & TF_DETECT_READSTALL) {
1215 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
1216 bool reenable_probe = false;
1217 /*
1218 * The keep alive packets sent to detect a read
1219 * stall did not get a response from the
1220 * peer. Generate more keep-alives to confirm this.
1221 * If the number of probes sent reaches the limit,
1222 * generate an event.
1223 */
1224 if (tp->t_adaptive_rtimo > 0) {
1225 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
1226 /* Generate an event */
1227 soevent(so,
1228 (SO_FILT_HINT_LOCKED |
1229 SO_FILT_HINT_ADAPTIVE_RTIMO));
1230 tcp_keepalive_reset(tp);
1231 } else {
1232 reenable_probe = true;
1233 }
1234 } else if (outifp != NULL &&
1235 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY) &&
1236 tp->t_rtimo_probes <= TCP_CONNECTIVITY_PROBES_MAX) {
1237 reenable_probe = true;
1238 } else {
1239 tp->t_flagsext &= ~TF_DETECT_READSTALL;
1240 }
1241 if (reenable_probe) {
1242 int ind = min(tp->t_rtimo_probes,
1243 TCP_MAXRXTSHIFT);
1244 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
1245 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp));
1246 }
1247 }
1248 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING) {
1249 int ind;
1250
1251 tp->t_tfo_probes++;
1252 ind = min(tp->t_tfo_probes, TCP_MAXRXTSHIFT);
1253
1254 /*
1255 * We take the minimum among the time set by true
1256 * keepalive (see above) and the backoff'd RTO. That
1257  * way we back off in case of packet loss but will never
1258  * time out more slowly than regular keepalive due to the
1259 * backing off.
1260 */
1261 tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
1262 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
1263 tp->t_timer[TCPT_KEEP]);
1264 } else if (tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
1265 /* Still no data! Let's assume a TFO-error and err out... */
1266 tcp_heuristic_tfo_middlebox(tp);
1267
1268 so->so_error = ENODATA;
1269 sorwakeup(so);
1270 tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE;
1271 tcpstat.tcps_tfo_blackhole++;
1272 }
1273 break;
1274 case TCPT_DELACK:
1275 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
1276 tp->t_flags &= ~TF_DELACK;
1277 tp->t_timer[TCPT_DELACK] = 0;
1278 tp->t_flags |= TF_ACKNOW;
1279
1280 /*
1281 * If delayed ack timer fired while stretching
1282 * acks, count the number of times the streaming
1283 * detection was not correct. If this exceeds a
1284  * threshold, disable stretch ack on this
1285  * connection.
1286 *
1287 * Also, go back to acking every other packet.
1288 */
1289 if ((tp->t_flags & TF_STRETCHACK)) {
1290 if (tp->t_unacksegs > 1 &&
1291 tp->t_unacksegs < maxseg_unacked)
1292 tp->t_stretchack_delayed++;
1293
1294 if (tp->t_stretchack_delayed >
1295 TCP_STRETCHACK_DELAY_THRESHOLD) {
1296 tp->t_flagsext |= TF_DISABLE_STRETCHACK;
1297 /*
1298 * Note the time at which stretch
1299 * ack was disabled automatically
1300 */
1301 tp->rcv_nostrack_ts = tcp_now;
1302 tcpstat.tcps_nostretchack++;
1303 tp->t_stretchack_delayed = 0;
1304 tp->rcv_nostrack_pkts = 0;
1305 }
1306 tcp_reset_stretch_ack(tp);
1307 }
1308
1309 /*
1310 * If we are measuring inter packet arrival jitter
1311 * for throttling a connection, this delayed ack
1312 * might be the reason for accumulating some
1313 * jitter. So let's restart the measurement.
1314 */
1315 CLEAR_IAJ_STATE(tp);
1316
1317 tcpstat.tcps_delack++;
1318 (void) tcp_output(tp);
1319 }
1320 break;
1321
1322 #if MPTCP
1323 case TCPT_JACK_RXMT:
1324 if ((tp->t_state == TCPS_ESTABLISHED) &&
1325 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
1326 (tp->t_mpflags & TMPF_JOINED_FLOW)) {
1327 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
1328 tcpstat.tcps_timeoutdrop++;
1329 postevent(so, 0, EV_TIMEOUT);
1330 soevent(so,
1331 (SO_FILT_HINT_LOCKED|
1332 SO_FILT_HINT_TIMEOUT));
1333 tp = tcp_drop(tp, tp->t_softerror ?
1334 tp->t_softerror : ETIMEDOUT);
1335 break;
1336 }
1337 tcpstat.tcps_join_rxmts++;
1338 tp->t_flags |= TF_ACKNOW;
1339
1340 /*
1341  * For simplicity, no backoff is implemented for this
1342  * corner case.
1343 */
1344 (void) tcp_output(tp);
1345 }
1346 break;
1347 #endif /* MPTCP */
1348
1349 case TCPT_PTO:
1350 {
1351 int32_t snd_len;
1352 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1353
1354 /*
1355 * Check if the connection is in the right state to
1356 * send a probe
1357 */
1358 if (tp->t_state != TCPS_ESTABLISHED ||
1359 (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING))
1360 || tp->snd_max == tp->snd_una ||
1361 !SACK_ENABLED(tp) || !TAILQ_EMPTY(&tp->snd_holes) ||
1362 IN_FASTRECOVERY(tp))
1363 break;
1364
1365 /*
1366 * If there is no new data to send or if the
1367 * connection is limited by receive window then
1368 * retransmit the last segment, otherwise send
1369 * new data.
1370 */
1371 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
1372 - (tp->snd_max - tp->snd_una);
1373 if (snd_len > 0) {
1374 tp->snd_nxt = tp->snd_max;
1375 } else {
1376 snd_len = min((tp->snd_max - tp->snd_una),
1377 tp->t_maxseg);
1378 tp->snd_nxt = tp->snd_max - snd_len;
1379 }
1380
1381 tcpstat.tcps_pto++;
1382 if (tp->t_flagsext & TF_PROBING)
1383 tcpstat.tcps_probe_if++;
1384
1385 /* If timing a segment in this window, stop the timer */
1386 tp->t_rtttime = 0;
1387 /* Note that tail loss probe is being sent */
1388 tp->t_flagsext |= TF_SENT_TLPROBE;
1389 tp->t_tlpstart = tcp_now;
1390
1391 tp->snd_cwnd += tp->t_maxseg;
1392 (void )tcp_output(tp);
1393 tp->snd_cwnd -= tp->t_maxseg;
1394
1395 tp->t_tlphighrxt = tp->snd_nxt;
1396 break;
1397 }
1398 case TCPT_DELAYFR:
1399 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
1400
1401 /*
1402 * Don't do anything if one of the following is true:
1403 * - the connection is already in recovery
1404  * - the sequence space up to snd_recover has been acknowledged
1405 * - retransmit timeout has fired
1406 */
1407 if (IN_FASTRECOVERY(tp) ||
1408 SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
1409 tp->t_rxtshift > 0)
1410 break;
1411
1412 VERIFY(SACK_ENABLED(tp));
1413 tcp_rexmt_save_state(tp);
1414 if (CC_ALGO(tp)->pre_fr != NULL) {
1415 CC_ALGO(tp)->pre_fr(tp);
1416 if (TCP_ECN_ENABLED(tp))
1417 tp->ecn_flags |= TE_SENDCWR;
1418 }
1419 ENTER_FASTRECOVERY(tp);
1420
1421 tp->t_timer[TCPT_REXMT] = 0;
1422 tcpstat.tcps_sack_recovery_episode++;
1423 tp->t_sack_recovery_episode++;
1424 tp->sack_newdata = tp->snd_nxt;
1425 tp->snd_cwnd = tp->t_maxseg;
1426 tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
1427 (void) tcp_output(tp);
1428 break;
1429 dropit:
1430 tcpstat.tcps_keepdrops++;
1431 postevent(so, 0, EV_TIMEOUT);
1432 soevent(so,
1433 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1434 tp = tcp_drop(tp, ETIMEDOUT);
1435 break;
1436 }
1437 #if TCPDEBUG
1438 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
1439 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
1440 PRU_SLOWTIMO);
1441 #endif
1442 return (tp);
1443 }
1444
1445 /* Remove a timer entry from timer list */
1446 void
1447 tcp_remove_timer(struct tcpcb *tp)
1448 {
1449 struct tcptimerlist *listp = &tcp_timer_list;
1450
1451 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1452 if (!(TIMER_IS_ON_LIST(tp))) {
1453 return;
1454 }
1455 lck_mtx_lock(listp->mtx);
1456
1457 /* Check if pcb is on timer list again after acquiring the lock */
1458 if (!(TIMER_IS_ON_LIST(tp))) {
1459 lck_mtx_unlock(listp->mtx);
1460 return;
1461 }
1462
1463 if (listp->next_te != NULL && listp->next_te == &tp->tentry)
1464 listp->next_te = LIST_NEXT(&tp->tentry, le);
1465
1466 LIST_REMOVE(&tp->tentry, le);
1467 tp->t_flags &= ~(TF_TIMER_ONLIST);
1468
1469 listp->entries--;
1470
1471 tp->tentry.le.le_next = NULL;
1472 tp->tentry.le.le_prev = NULL;
1473 lck_mtx_unlock(listp->mtx);
1474 }
1475
1476 /*
1477 * Function to check if the timerlist needs to be rescheduled to run
1478 * the timer entry correctly. Basically, this is to check if we can avoid
1479 * taking the list lock.
1480 */
1481
1482 static boolean_t
1483 need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
1484 {
1485 struct tcptimerlist *listp = &tcp_timer_list;
1486 int32_t diff;
1487
1488 /*
1489 * If the list is being processed then the state of the list is
1490 * in flux. In this case always acquire the lock and set the state
1491 * correctly.
1492 */
1493 if (listp->running)
1494 return (TRUE);
1495
1496 if (!listp->scheduled)
1497 return (TRUE);
1498
1499 diff = timer_diff(listp->runtime, 0, runtime, 0);
1500 if (diff <= 0) {
1501 /* The list is going to run before this timer */
1502 return (FALSE);
1503 } else {
1504 if (mode & TCP_TIMERLIST_10MS_MODE) {
1505 if (diff <= TCP_TIMER_10MS_QUANTUM)
1506 return (FALSE);
1507 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
1508 if (diff <= TCP_TIMER_100MS_QUANTUM)
1509 return (FALSE);
1510 } else {
1511 if (diff <= TCP_TIMER_500MS_QUANTUM)
1512 return (FALSE);
1513 }
1514 }
1515 return (TRUE);
1516 }
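/*
 * Illustrative example, assuming the usual 10/100/500 ms quantum values:
 * if the list is already scheduled to run 80 ms from now and a connection
 * asks for a timer 30 ms from now, diff is 50. In 10 ms mode (50 > 10) the
 * list must be rescheduled; in 500 ms mode (50 <= 500) the pending run is
 * close enough and taking the list lock can be avoided.
 */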
1517
1518 void
1519 tcp_sched_timerlist(uint32_t offset)
1520 {
1521 uint64_t deadline = 0;
1522 struct tcptimerlist *listp = &tcp_timer_list;
1523
1524 lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);
1525
1526 offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
1527 listp->runtime = tcp_now + offset;
1528 if (listp->runtime == 0) {
1529 listp->runtime++;
1530 offset++;
1531 }
1532
1533 clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);
1534
1535 thread_call_enter_delayed(listp->call, deadline);
1536 listp->scheduled = TRUE;
1537 }
1538
1539 /*
1540 * Function to run the timers for a connection.
1541 *
1542 * Returns the offset of next timer to be run for this connection which
1543 * can be used to reschedule the timerlist.
1544 *
1545 * te_mode is an out parameter that indicates the modes of active
1546 * timers for this connection.
1547 */
1548 u_int32_t
1549 tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
1550 u_int16_t probe_if_index)
1551 {
1552 struct socket *so;
1553 u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
1554 u_int32_t timer_val, offset = 0, lo_timer = 0;
1555 int32_t diff;
1556 boolean_t needtorun[TCPT_NTIMERS];
1557 int count = 0;
1558
1559 VERIFY(tp != NULL);
1560 bzero(needtorun, sizeof(needtorun));
1561 *te_mode = 0;
1562
1563 tcp_lock(tp->t_inpcb->inp_socket, 1, 0);
1564
1565 so = tp->t_inpcb->inp_socket;
1566 /* Release the want count on inp */
1567 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
1568 == WNT_STOPUSING) {
1569 if (TIMER_IS_ON_LIST(tp)) {
1570 tcp_remove_timer(tp);
1571 }
1572
1573 /* Looks like the TCP connection got closed while we
1574  * were waiting for the lock. Done.
1575 */
1576 goto done;
1577 }
1578
1579 /*
1580 * If this connection is over an interface that needs to
1581 * be probed, send probe packets to reinitiate communication.
1582 */
1583 if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&
1584 tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) {
1585 tp->t_flagsext |= TF_PROBING;
1586 tcp_timers(tp, TCPT_PTO);
1587 tp->t_timer[TCPT_PTO] = 0;
1588 tp->t_flagsext &= ~TF_PROBING;
1589 }
1590
1591 /*
1592 * Since the timer thread needs to wait for tcp lock, it may race
1593 * with another thread that can cancel or reschedule the timer
1594 * that is about to run. Check if we need to run anything.
1595 */
1596 if ((index = tp->tentry.index) == TCPT_NONE)
1597 goto done;
1598
1599 timer_val = tp->t_timer[index];
1600
1601 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
1602 if (diff > 0) {
1603 if (tp->tentry.index != TCPT_NONE) {
1604 offset = diff;
1605 *(te_mode) = tp->tentry.mode;
1606 }
1607 goto done;
1608 }
1609
1610 tp->t_timer[index] = 0;
1611 if (timer_val > 0) {
1612 tp = tcp_timers(tp, index);
1613 if (tp == NULL)
1614 goto done;
1615 }
1616
1617 /*
1618 * Check if there are any other timers that need to be run.
1619 * While doing it, adjust the timer values wrt tcp_now.
1620 */
1621 tp->tentry.mode = 0;
1622 for (i = 0; i < TCPT_NTIMERS; ++i) {
1623 if (tp->t_timer[i] != 0) {
1624 diff = timer_diff(tp->tentry.timer_start,
1625 tp->t_timer[i], tcp_now, 0);
1626 if (diff <= 0) {
1627 needtorun[i] = TRUE;
1628 count++;
1629 } else {
1630 tp->t_timer[i] = diff;
1631 needtorun[i] = FALSE;
1632 if (lo_timer == 0 || diff < lo_timer) {
1633 lo_timer = diff;
1634 lo_index = i;
1635 }
1636 TCP_SET_TIMER_MODE(tp->tentry.mode, i);
1637 }
1638 }
1639 }
1640
1641 tp->tentry.timer_start = tcp_now;
1642 tp->tentry.index = lo_index;
1643 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
1644
1645 if (tp->tentry.index != TCPT_NONE) {
1646 tp->tentry.runtime = tp->tentry.timer_start +
1647 tp->t_timer[tp->tentry.index];
1648 if (tp->tentry.runtime == 0)
1649 tp->tentry.runtime++;
1650 }
1651
1652 if (count > 0) {
1653 /* run any other timers outstanding at this time. */
1654 for (i = 0; i < TCPT_NTIMERS; ++i) {
1655 if (needtorun[i]) {
1656 tp->t_timer[i] = 0;
1657 tp = tcp_timers(tp, i);
1658 if (tp == NULL) {
1659 offset = 0;
1660 *(te_mode) = 0;
1661 goto done;
1662 }
1663 }
1664 }
1665 tcp_set_lotimer_index(tp);
1666 }
1667
1668 if (tp->tentry.index < TCPT_NONE) {
1669 offset = tp->t_timer[tp->tentry.index];
1670 *(te_mode) = tp->tentry.mode;
1671 }
1672
1673 done:
1674 if (tp != NULL && tp->tentry.index == TCPT_NONE) {
1675 tcp_remove_timer(tp);
1676 offset = 0;
1677 }
1678
1679 tcp_unlock(so, 1, 0);
1680 return(offset);
1681 }
1682
1683 void
1684 tcp_run_timerlist(void * arg1, void * arg2)
1685 {
1686 #pragma unused(arg1, arg2)
1687 struct tcptimerentry *te, *next_te;
1688 struct tcptimerlist *listp = &tcp_timer_list;
1689 struct tcpcb *tp;
1690 uint32_t next_timer = 0; /* offset of the next timer on the list */
1691 u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */
1692 u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
1693 uint32_t active_count = 0;
1694
1695 calculate_tcp_clock();
1696
1697 lck_mtx_lock(listp->mtx);
1698
1699 listp->running = TRUE;
1700
1701 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
1702 uint32_t offset = 0;
1703 uint32_t runtime = te->runtime;
1704 if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
1705 offset = timer_diff(runtime, 0, tcp_now, 0);
1706 if (next_timer == 0 || offset < next_timer) {
1707 next_timer = offset;
1708 }
1709 list_mode |= te->mode;
1710 continue;
1711 }
1712
1713 tp = TIMERENTRY_TO_TP(te);
1714
1715 /*
1716 * Acquire an inp wantcnt on the inpcb so that the socket
1717 * won't get detached even if tcp_close is called
1718 */
1719 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
1720 == WNT_STOPUSING) {
1721 /*
1722  * Somehow this pcb went into dead state while
1723 * on the timer list, just take it off the list.
1724 * Since the timer list entry pointers are
1725 * protected by the timer list lock, we can
1726 * do it here without the socket lock.
1727 */
1728 if (TIMER_IS_ON_LIST(tp)) {
1729 tp->t_flags &= ~(TF_TIMER_ONLIST);
1730 LIST_REMOVE(&tp->tentry, le);
1731 listp->entries--;
1732
1733 tp->tentry.le.le_next = NULL;
1734 tp->tentry.le.le_prev = NULL;
1735 }
1736 continue;
1737 }
1738 active_count++;
1739
1740 /*
1741 * Store the next timerentry pointer before releasing the
1742 * list lock. If that entry has to be removed when we
1743 * release the lock, this pointer will be updated to the
1744 * element after that.
1745 */
1746 listp->next_te = next_te;
1747
1748 VERIFY_NEXT_LINK(&tp->tentry, le);
1749 VERIFY_PREV_LINK(&tp->tentry, le);
1750
1751 lck_mtx_unlock(listp->mtx);
1752
1753 offset = tcp_run_conn_timer(tp, &te_mode,
1754 listp->probe_if_index);
1755
1756 lck_mtx_lock(listp->mtx);
1757
1758 next_te = listp->next_te;
1759 listp->next_te = NULL;
1760
1761 if (offset > 0 && te_mode != 0) {
1762 list_mode |= te_mode;
1763
1764 if (next_timer == 0 || offset < next_timer)
1765 next_timer = offset;
1766 }
1767 }
1768
1769 if (!LIST_EMPTY(&listp->lhead)) {
1770 u_int16_t next_mode = 0;
1771 if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
1772 (listp->pref_mode & TCP_TIMERLIST_10MS_MODE))
1773 next_mode = TCP_TIMERLIST_10MS_MODE;
1774 else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
1775 (listp->pref_mode & TCP_TIMERLIST_100MS_MODE))
1776 next_mode = TCP_TIMERLIST_100MS_MODE;
1777 else
1778 next_mode = TCP_TIMERLIST_500MS_MODE;
1779
1780 if (next_mode != TCP_TIMERLIST_500MS_MODE) {
1781 listp->idleruns = 0;
1782 } else {
1783 /*
1784 * the next required mode is slow mode, but if
1785 * the last one was a faster mode and we did not
1786 * have enough idle runs, repeat the last mode.
1787 *
1788 * We try to keep the timer list in fast mode for
1789 * some idle time in expectation of new data.
1790 */
1791 if (listp->mode != next_mode &&
1792 listp->idleruns < timer_fastmode_idlemax) {
1793 listp->idleruns++;
1794 next_mode = listp->mode;
1795 next_timer = TCP_TIMER_100MS_QUANTUM;
1796 } else {
1797 listp->idleruns = 0;
1798 }
1799 }
1800 listp->mode = next_mode;
1801 if (listp->pref_offset != 0)
1802 next_timer = min(listp->pref_offset, next_timer);
1803
1804 if (listp->mode == TCP_TIMERLIST_500MS_MODE)
1805 next_timer = max(next_timer,
1806 TCP_TIMER_500MS_QUANTUM);
1807
1808 tcp_sched_timerlist(next_timer);
1809 } else {
1810 /*
1811 * No connection needs a timer right now, but keep the list
1812 * running periodically at a much coarser granularity.
1813 */
1814 tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
1815 }
1816
1817 listp->running = FALSE;
1818 listp->pref_mode = 0;
1819 listp->pref_offset = 0;
1820 listp->probe_if_index = 0;
1821
1822 lck_mtx_unlock(listp->mtx);
1823 }
1824
1825 /*
1826 * Function to check if the timerlist needs to be rescheduled to run this
1827 * connection's timers correctly.
1828 */
1829 void
1830 tcp_sched_timers(struct tcpcb *tp)
1831 {
1832 struct tcptimerentry *te = &tp->tentry;
1833 u_int16_t index = te->index;
1834 u_int16_t mode = te->mode;
1835 struct tcptimerlist *listp = &tcp_timer_list;
1836 int32_t offset = 0;
1837 boolean_t list_locked = FALSE;
1838
1839 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
1840 /* Just return without adding the dead pcb to the list */
1841 if (TIMER_IS_ON_LIST(tp)) {
1842 tcp_remove_timer(tp);
1843 }
1844 return;
1845 }
1846
1847 if (index == TCPT_NONE) {
1848 /* Nothing to run */
1849 tcp_remove_timer(tp);
1850 return;
1851 }
1852
1853 /*
1854 * compute the offset at which the next timer for this connection
1855 * has to run.
1856 */
1857 offset = timer_diff(te->runtime, 0, tcp_now, 0);
1858 if (offset <= 0) {
1859 offset = 1;
1860 tcp_timer_advanced++;
1861 }
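/*
 * Example (hypothetical values): with te->runtime == 1105 and
 * tcp_now == 1100, offset == 5 ticks; if the runtime has already
 * passed, offset is clamped to 1 so the timer fires on the next run.
 */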
1862
1863 if (!TIMER_IS_ON_LIST(tp)) {
1864 if (!list_locked) {
1865 lck_mtx_lock(listp->mtx);
1866 list_locked = TRUE;
1867 }
1868
1869 if (!TIMER_IS_ON_LIST(tp)) {
1870 LIST_INSERT_HEAD(&listp->lhead, te, le);
1871 tp->t_flags |= TF_TIMER_ONLIST;
1872
1873 listp->entries++;
1874 if (listp->entries > listp->maxentries)
1875 listp->maxentries = listp->entries;
1876
1877 /* if the list is not scheduled, just schedule it */
1878 if (!listp->scheduled)
1879 goto schedule;
1880 }
1881 }
1882
1883 /*
1884 * Timer entry is currently on the list, check if the list needs
1885 * to be rescheduled.
1886 */
1887 if (need_to_resched_timerlist(te->runtime, mode)) {
1888 tcp_resched_timerlist++;
1889
1890 if (!list_locked) {
1891 lck_mtx_lock(listp->mtx);
1892 list_locked = TRUE;
1893 }
1894
1895 VERIFY_NEXT_LINK(te, le);
1896 VERIFY_PREV_LINK(te, le);
1897
1898 if (listp->running) {
1899 listp->pref_mode |= mode;
1900 if (listp->pref_offset == 0 ||
1901 offset < listp->pref_offset) {
1902 listp->pref_offset = offset;
1903 }
1904 } else {
1905 /*
1906 * The list could have been rescheduled while
1907 * this thread was waiting for the lock.
1908 */
1909 if (listp->scheduled) {
1910 int32_t diff;
1911 diff = timer_diff(listp->runtime, 0,
1912 tcp_now, offset);
1913 if (diff <= 0)
1914 goto done;
1915 else
1916 goto schedule;
1917 } else {
1918 goto schedule;
1919 }
1920 }
1921 }
1922 goto done;
1923
1924 schedule:
1925 /*
1926 * Since a connection with timers is getting scheduled, the
1927 * timer list moves from the idle to the active state, which is
1928 * why idleruns is reset.
1929 */
1930 if (mode & TCP_TIMERLIST_10MS_MODE) {
1931 listp->mode = TCP_TIMERLIST_10MS_MODE;
1932 listp->idleruns = 0;
1933 offset = min(offset, TCP_TIMER_10MS_QUANTUM);
1934 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
1935 if (listp->mode > TCP_TIMERLIST_100MS_MODE)
1936 listp->mode = TCP_TIMERLIST_100MS_MODE;
1937 listp->idleruns = 0;
1938 offset = min(offset, TCP_TIMER_100MS_QUANTUM);
1939 }
1940 tcp_sched_timerlist(offset);
1941
1942 done:
1943 if (list_locked)
1944 lck_mtx_unlock(listp->mtx);
1945
1946 return;
1947 }
1948
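/*
 * Pick the connection timer that will fire first. Scan t_timer[] for the
 * smallest non-zero value, record its index and the union of the modes of
 * all armed timers, and derive tentry.runtime from the connection's
 * timer_start. A runtime that ends up as 0 is bumped to 1 (0 appears to be
 * reserved).
 */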
1949 static inline void
1950 tcp_set_lotimer_index(struct tcpcb *tp)
1951 {
1952 uint16_t i, lo_index = TCPT_NONE, mode = 0;
1953 uint32_t lo_timer = 0;
1954 for (i = 0; i < TCPT_NTIMERS; ++i) {
1955 if (tp->t_timer[i] != 0) {
1956 TCP_SET_TIMER_MODE(mode, i);
1957 if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
1958 lo_timer = tp->t_timer[i];
1959 lo_index = i;
1960 }
1961 }
1962 }
1963 tp->tentry.index = lo_index;
1964 tp->tentry.mode = mode;
1965 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
1966
1967 if (tp->tentry.index != TCPT_NONE) {
1968 tp->tentry.runtime = tp->tentry.timer_start
1969 + tp->t_timer[tp->tentry.index];
1970 if (tp->tentry.runtime == 0)
1971 tp->tentry.runtime++;
1972 }
1973 }
1974
1975 void
1976 tcp_check_timer_state(struct tcpcb *tp)
1977 {
1978 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1979
1980 if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
1981 return;
1982
1983 tcp_set_lotimer_index(tp);
1984
1985 tcp_sched_timers(tp);
1986 return;
1987 }
1988
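/*
 * Report the increase of a tcpstat counter since the previous report and
 * remember the current value for the next round. The subtraction is done in
 * 32-bit arithmetic, so a counter that wrapped still yields the right delta;
 * a non-positive result is reported as 0.
 *
 * Example (hypothetical values): cur == 5 and *prev == 0xfffffffe gives
 * diff == 7 after the unsigned subtraction wraps, so *dest == 7.
 */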
1989 static inline void
1990 tcp_cumulative_stat(u_int32_t cur, u_int32_t *prev, u_int32_t *dest)
1991 {
1992 /* handle wrap around */
1993 int32_t diff = (int32_t) (cur - *prev);
1994 if (diff > 0)
1995 *dest = diff;
1996 else
1997 *dest = 0;
1998 *prev = cur;
1999 return;
2000 }
2001
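/*
 * Push a snapshot of TCP statistics to nstat, at most once per
 * tcp_report_stats_interval (measured on net_uptime()). Average RTTs are
 * read from the primary default routes for IPv4 and IPv6, loss and
 * reordering rates are computed as fixed-point percentages (scaled by
 * 1024), and the remaining fields are deltas of tcpstat counters since the
 * previous report.
 */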
2002 __private_extern__ void
2003 tcp_report_stats(void)
2004 {
2005 struct nstat_sysinfo_data data;
2006 struct sockaddr_in dst;
2007 struct sockaddr_in6 dst6;
2008 struct rtentry *rt = NULL;
2009 static struct tcp_last_report_stats prev;
2010 u_int64_t var, uptime;
2011
2012 #define stat data.u.tcp_stats
2013 if (((uptime = net_uptime()) - tcp_last_report_time) <
2014 tcp_report_stats_interval)
2015 return;
2016
2017 tcp_last_report_time = uptime;
2018
2019 bzero(&data, sizeof(data));
2020 data.flags = NSTAT_SYSINFO_TCP_STATS;
2021
2022 bzero(&dst, sizeof(dst));
2023 dst.sin_len = sizeof(dst);
2024 dst.sin_family = AF_INET;
2025
2026 /* ipv4 avg rtt */
2027 lck_mtx_lock(rnh_lock);
2028 rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
2029 rt_tables[AF_INET], IFSCOPE_NONE);
2030 lck_mtx_unlock(rnh_lock);
2031 if (rt != NULL) {
2032 RT_LOCK(rt);
2033 if (rt_primary_default(rt, rt_key(rt)) &&
2034 rt->rt_stats != NULL) {
2035 stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
2036 }
2037 RT_UNLOCK(rt);
2038 rtfree(rt);
2039 rt = NULL;
2040 }
2041
2042 /* ipv6 avg rtt */
2043 bzero(&dst6, sizeof(dst6));
2044 dst6.sin6_len = sizeof(dst6);
2045 dst6.sin6_family = AF_INET6;
2046
2047 lck_mtx_lock(rnh_lock);
2048 rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
2049 rt_tables[AF_INET6], IFSCOPE_NONE);
2050 lck_mtx_unlock(rnh_lock);
2051 if (rt != NULL) {
2052 RT_LOCK(rt);
2053 if (rt_primary_default(rt, rt_key(rt)) &&
2054 rt->rt_stats != NULL) {
2055 stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
2056 }
2057 RT_UNLOCK(rt);
2058 rtfree(rt);
2059 rt = NULL;
2060 }
2061
2062 /* send packet loss rate, shift by 10 for precision */
2063 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
2064 var = tcpstat.tcps_sndrexmitpack << 10;
2065 stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
2066 }
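/*
 * Example (hypothetical values): tcps_sndrexmitpack == 5 and
 * tcps_sndpack == 1000 gives send_plr == ((5 << 10) * 100) / 1000 == 512,
 * i.e. a 0.5% retransmission rate scaled by 1024.
 */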
2067
2068 /* recv packet loss rate, shift by 10 for precision */
2069 if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
2070 var = tcpstat.tcps_recovered_pkts << 10;
2071 stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
2072 }
2073
2074 /* RTO after tail loss, shift by 10 for precision */
2075 if (tcpstat.tcps_sndrexmitpack > 0
2076 && tcpstat.tcps_tailloss_rto > 0) {
2077 var = tcpstat.tcps_tailloss_rto << 10;
2078 stat.send_tlrto_rate =
2079 (var * 100) / tcpstat.tcps_sndrexmitpack;
2080 }
2081
2082 /* packet reordering */
2083 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
2084 var = tcpstat.tcps_reordered_pkts << 10;
2085 stat.send_reorder_rate =
2086 (var * 100) / tcpstat.tcps_sndpack;
2087 }
2088
2089 if (tcp_ecn_outbound == 1)
2090 stat.ecn_client_enabled = 1;
2091 if (tcp_ecn_inbound == 1)
2092 stat.ecn_server_enabled = 1;
2093 tcp_cumulative_stat(tcpstat.tcps_connattempt,
2094 &prev.tcps_connattempt, &stat.connection_attempts);
2095 tcp_cumulative_stat(tcpstat.tcps_accepts,
2096 &prev.tcps_accepts, &stat.connection_accepts);
2097 tcp_cumulative_stat(tcpstat.tcps_ecn_client_setup,
2098 &prev.tcps_ecn_client_setup, &stat.ecn_client_setup);
2099 tcp_cumulative_stat(tcpstat.tcps_ecn_server_setup,
2100 &prev.tcps_ecn_server_setup, &stat.ecn_server_setup);
2101 tcp_cumulative_stat(tcpstat.tcps_ecn_client_success,
2102 &prev.tcps_ecn_client_success, &stat.ecn_client_success);
2103 tcp_cumulative_stat(tcpstat.tcps_ecn_server_success,
2104 &prev.tcps_ecn_server_success, &stat.ecn_server_success);
2105 tcp_cumulative_stat(tcpstat.tcps_ecn_not_supported,
2106 &prev.tcps_ecn_not_supported, &stat.ecn_not_supported);
2107 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_syn,
2108 &prev.tcps_ecn_lost_syn, &stat.ecn_lost_syn);
2109 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_synack,
2110 &prev.tcps_ecn_lost_synack, &stat.ecn_lost_synack);
2111 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ce,
2112 &prev.tcps_ecn_recv_ce, &stat.ecn_recv_ce);
2113 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ece,
2114 &prev.tcps_ecn_recv_ece, &stat.ecn_recv_ece);
2117 tcp_cumulative_stat(tcpstat.tcps_ecn_sent_ece,
2118 &prev.tcps_ecn_sent_ece, &stat.ecn_sent_ece);
2121 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ce,
2122 &prev.tcps_ecn_conn_recv_ce, &stat.ecn_conn_recv_ce);
2123 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ece,
2124 &prev.tcps_ecn_conn_recv_ece, &stat.ecn_conn_recv_ece);
2125 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_plnoce,
2126 &prev.tcps_ecn_conn_plnoce, &stat.ecn_conn_plnoce);
2127 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_pl_ce,
2128 &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce);
2129 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce,
2130 &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce);
2131 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_synloss,
2132 &prev.tcps_ecn_fallback_synloss, &stat.ecn_fallback_synloss);
2133 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_reorder,
2134 &prev.tcps_ecn_fallback_reorder, &stat.ecn_fallback_reorder);
2135 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_ce,
2136 &prev.tcps_ecn_fallback_ce, &stat.ecn_fallback_ce);
2137 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv,
2138 &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv);
2139 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv,
2140 &prev.tcps_tfo_cookie_req_rcv, &stat.tfo_cookie_req_rcv);
2141 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_sent,
2142 &prev.tcps_tfo_cookie_sent, &stat.tfo_cookie_sent);
2143 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_invalid,
2144 &prev.tcps_tfo_cookie_invalid, &stat.tfo_cookie_invalid);
2145 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req,
2146 &prev.tcps_tfo_cookie_req, &stat.tfo_cookie_req);
2147 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_rcv,
2148 &prev.tcps_tfo_cookie_rcv, &stat.tfo_cookie_rcv);
2149 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_sent,
2150 &prev.tcps_tfo_syn_data_sent, &stat.tfo_syn_data_sent);
2151 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_acked,
2152 &prev.tcps_tfo_syn_data_acked, &stat.tfo_syn_data_acked);
2153 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_loss,
2154 &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss);
2155 tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole,
2156 &prev.tcps_tfo_blackhole, &stat.tfo_blackhole);
2157 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_wrong,
2158 &prev.tcps_tfo_cookie_wrong, &stat.tfo_cookie_wrong);
2159 tcp_cumulative_stat(tcpstat.tcps_tfo_no_cookie_rcv,
2160 &prev.tcps_tfo_no_cookie_rcv, &stat.tfo_no_cookie_rcv);
2161 tcp_cumulative_stat(tcpstat.tcps_tfo_heuristics_disable,
2162 &prev.tcps_tfo_heuristics_disable, &stat.tfo_heuristics_disable);
2163 tcp_cumulative_stat(tcpstat.tcps_tfo_sndblackhole,
2164 &prev.tcps_tfo_sndblackhole, &stat.tfo_sndblackhole);
2165
2169 nstat_sysinfo_send_data(&data);
2170
2171 #undef stat
2172 }
2173
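/*
 * Ask the timer list to send connectivity probes on the given interface.
 * Only one interface can be probed at a time; a request while
 * probe_if_index is still set is counted as a conflict. Otherwise the list
 * is forced into 10ms mode and rescheduled so the probes go out on the
 * next run.
 */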
2174 void
2175 tcp_interface_send_probe(u_int16_t probe_if_index)
2176 {
2177 int32_t offset = 0;
2178 struct tcptimerlist *listp = &tcp_timer_list;
2179
2180 /* Make sure TCP clock is up to date */
2181 calculate_tcp_clock();
2182
2183 lck_mtx_lock(listp->mtx);
2184 if (listp->probe_if_index > 0) {
2185 tcpstat.tcps_probe_if_conflict++;
2186 goto done;
2187 }
2188
2189 listp->probe_if_index = probe_if_index;
2190 if (listp->running)
2191 goto done;
2192
2193 /*
2194 * Reschedule the timerlist to run within the next 10ms, which is
2195 * the fastest that we can do.
2196 */
2197 offset = TCP_TIMER_10MS_QUANTUM;
2198 if (listp->scheduled) {
2199 int32_t diff;
2200 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2201 if (diff <= 0) {
2202 /* The timer will already fire sooner than needed */
2203 goto done;
2204 }
2205 }
2206 listp->mode = TCP_TIMERLIST_10MS_MODE;
2207 listp->idleruns = 0;
2208
2209 tcp_sched_timerlist(offset);
2210
2211 done:
2212 lck_mtx_unlock(listp->mtx);
2213 return;
2214 }
2215
2216 /*
2217 * Enable read probes on this connection, if:
2218 * - it is in the established state
2219 * - it doesn't have any data outstanding
2220 * - the outgoing ifp matches
2221 * - we have not already sent any read probes
2222 */
2223 static void
2224 tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp)
2225 {
2226 if (tp->t_state == TCPS_ESTABLISHED &&
2227 tp->snd_max == tp->snd_una &&
2228 tp->t_inpcb->inp_last_outifp == ifp &&
2229 !(tp->t_flagsext & TF_DETECT_READSTALL) &&
2230 tp->t_rtimo_probes == 0) {
2231 tp->t_flagsext |= TF_DETECT_READSTALL;
2232 tp->t_rtimo_probes = 0;
2233 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2234 TCP_TIMER_10MS_QUANTUM);
2235 if (tp->tentry.index == TCPT_NONE) {
2236 tp->tentry.index = TCPT_KEEP;
2237 tp->tentry.runtime = tcp_now +
2238 TCP_TIMER_10MS_QUANTUM;
2239 } else {
2240 int32_t diff = 0;
2241
2242 /* Reset runtime to be in next 10ms */
2243 diff = timer_diff(tp->tentry.runtime, 0,
2244 tcp_now, TCP_TIMER_10MS_QUANTUM);
2245 if (diff > 0) {
2246 tp->tentry.index = TCPT_KEEP;
2247 tp->tentry.runtime = tcp_now +
2248 TCP_TIMER_10MS_QUANTUM;
2249 if (tp->tentry.runtime == 0)
2250 tp->tentry.runtime++;
2251 }
2252 }
2253 }
2254 }
2255
2256 /*
2257 * Disable read probe and reset the keep alive timer
2258 */
2259 static void
2260 tcp_disable_read_probe(struct tcpcb *tp)
2261 {
2262 if (tp->t_adaptive_rtimo == 0 &&
2263 ((tp->t_flagsext & TF_DETECT_READSTALL) ||
2264 tp->t_rtimo_probes > 0)) {
2265 tcp_keepalive_reset(tp);
2266 }
2267 }
2268
2269 /*
2270 * Reschedule the tcp timerlist within the next 10ms to enable or disable
2271 * read/write probes on connections going over a particular interface.
2272 */
2273 void
2274 tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable)
2275 {
2276 int32_t offset;
2277 struct tcptimerlist *listp = &tcp_timer_list;
2278 struct inpcbinfo *pcbinfo = &tcbinfo;
2279 struct inpcb *inp, *nxt;
2280
2281 if (ifp == NULL)
2282 return;
2283
2284 /* update clock */
2285 calculate_tcp_clock();
2286
2287 /*
2288 * Enable or disable the keepalive-based read probe on all connections
2289 * that are active/established on this interface.
2290 */
2291 lck_rw_lock_shared(pcbinfo->ipi_lock);
2292
2293 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, nxt) {
2294 struct tcpcb *tp = NULL;
2295 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) ==
2296 WNT_STOPUSING)
2297 continue;
2298
2299 /* Acquire lock to look at the state of the connection */
2300 tcp_lock(inp->inp_socket, 1, 0);
2301
2302 /* Release the want count */
2303 if (inp->inp_ppcb == NULL ||
2304 (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
2305 tcp_unlock(inp->inp_socket, 1, 0);
2306 continue;
2307 }
2308 tp = intotcpcb(inp);
2309 if (enable)
2310 tcp_enable_read_probe(tp, ifp);
2311 else
2312 tcp_disable_read_probe(tp);
2313
2314 tcp_unlock(inp->inp_socket, 1, 0);
2315 }
2316 lck_rw_done(pcbinfo->ipi_lock);
2317
2318 lck_mtx_lock(listp->mtx);
2319 if (listp->running) {
2320 listp->pref_mode |= TCP_TIMERLIST_10MS_MODE;
2321 goto done;
2322 }
2323
2324 /* Reschedule within the next 10ms */
2325 offset = TCP_TIMER_10MS_QUANTUM;
2326 if (listp->scheduled) {
2327 int32_t diff;
2328 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2329 if (diff <= 0) {
2330 /* The timer will already fire sooner than needed */
2331 goto done;
2332 }
2333 }
2334 listp->mode = TCP_TIMERLIST_10MS_MODE;
2335 listp->idleruns = 0;
2336
2337 tcp_sched_timerlist(offset);
2338 done:
2339 lck_mtx_unlock(listp->mtx);
2340 return;
2341 }
2342
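/*
 * Apply the cellular interface's recommended uplink MSS. The current option
 * length (t_maxopd - t_maxseg) is preserved, t_maxopd moves to the
 * recommended tier (or back to the cached pre-clamp value when the
 * recommendation is lifted), and t_maxseg is recomputed from it.
 *
 * Example (hypothetical values): t_maxopd == 1460 and t_maxseg == 1448, so
 * optlen == 12; clamping to a tcp_mss_rec_medium of 1440 yields
 * t_maxseg == 1428 while the old 1460 is cached in t_cached_maxopd.
 */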
2343 inline void
2344 tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp)
2345 {
2346 struct if_cellular_status_v1 *ifsr;
2347 u_int32_t optlen;
2348 ifsr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
2349 if (ifsr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) {
2350 optlen = tp->t_maxopd - tp->t_maxseg;
2351
2352 if (ifsr->mss_recommended ==
2353 IF_CELL_UL_MSS_RECOMMENDED_NONE &&
2354 tp->t_cached_maxopd > 0 &&
2355 tp->t_maxopd < tp->t_cached_maxopd) {
2356 tp->t_maxopd = tp->t_cached_maxopd;
2357 tcpstat.tcps_mss_to_default++;
2358 } else if (ifsr->mss_recommended ==
2359 IF_CELL_UL_MSS_RECOMMENDED_MEDIUM &&
2360 tp->t_maxopd > tcp_mss_rec_medium) {
2361 tp->t_cached_maxopd = tp->t_maxopd;
2362 tp->t_maxopd = tcp_mss_rec_medium;
2363 tcpstat.tcps_mss_to_medium++;
2364 } else if (ifsr->mss_recommended ==
2365 IF_CELL_UL_MSS_RECOMMENDED_LOW &&
2366 tp->t_maxopd > tcp_mss_rec_low) {
2367 tp->t_cached_maxopd = tp->t_maxopd;
2368 tp->t_maxopd = tcp_mss_rec_low;
2369 tcpstat.tcps_mss_to_low++;
2370 }
2371 tp->t_maxseg = tp->t_maxopd - optlen;
2372
2373 /*
2374 * Clear the cached value if it is the same as the current one
2375 */
2376 if (tp->t_maxopd == tp->t_cached_maxopd)
2377 tp->t_cached_maxopd = 0;
2378 }
2379 }
2380
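/*
 * Entry point for the per-connection MSS recommendation, called with the
 * socket already locked (e.g. from tcp_itimer()). The adjustment is limited
 * to cellular interfaces, to connections that can still send new data
 * (state at or before CLOSE_WAIT), and to connections that are not doing
 * PMTU blackhole detection.
 */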
2381 void
2382 tcp_update_mss_locked(struct socket *so, struct ifnet *ifp)
2383 {
2384 struct inpcb *inp = sotoinpcb(so);
2385 struct tcpcb *tp = intotcpcb(inp);
2386
2387 if (ifp == NULL && inp->inp_last_outifp == NULL)
2388 return;
2389
2390 if (ifp == NULL)
2391 ifp = inp->inp_last_outifp;
2392
2393 if (!IFNET_IS_CELLULAR(ifp)) {
2394 /*
2395 * This optimization is implemented for cellular
2396 * networks only
2397 */
2398 return;
2399 }
2400 if (tp->t_state <= TCPS_CLOSE_WAIT) {
2401 /*
2402 * If the connection is currently doing or has done PMTU
2403 * blackhole detection, do not change the MSS
2404 */
2405 if (tp->t_flags & TF_BLACKHOLE)
2406 return;
2407 if (ifp->if_link_status == NULL)
2408 return;
2409 tcp_update_mss_core(tp, ifp);
2410 }
2411 }
2412
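/*
 * Periodic interface timer for the TCP pcbinfo. Walks every TCP pcb,
 * checks the extended background-idle time on its socket and, when
 * INPCBINFO_UPDATE_MSS is set, refreshes the MSS recommendation. If the
 * exclusive lock cannot be taken immediately, the work is either deferred
 * to a fast retry or done after blocking for the lock.
 */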
2413 void
2414 tcp_itimer(struct inpcbinfo *ipi)
2415 {
2416 struct inpcb *inp, *nxt;
2417
2418 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
2419 if (tcp_itimer_done == TRUE) {
2420 tcp_itimer_done = FALSE;
2421 atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
2422 return;
2423 }
2424 /* The try-lock failed; block until we can take the lock exclusively */
2425 lck_rw_lock_exclusive(ipi->ipi_lock);
2426 }
2427 tcp_itimer_done = TRUE;
2428
2429 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
2430 struct socket *so;
2431
2432 if (inp->inp_ppcb == NULL ||
2433 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
2434 continue;
2435 so = inp->inp_socket;
2436 tcp_lock(so, 1, 0);
2437 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2438 tcp_unlock(so, 1, 0);
2439 continue;
2440 }
2441 so_check_extended_bk_idle_time(so);
2442 if (ipi->ipi_flags & INPCBINFO_UPDATE_MSS) {
2443 tcp_update_mss_locked(so, NULL);
2444 }
2445 tcp_unlock(so, 1, 0);
2446 }
2447
2448 ipi->ipi_flags &= ~INPCBINFO_UPDATE_MSS;
2449 lck_rw_done(ipi->ipi_lock);
2450 }