/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
+#include <sys/domain.h>
#include <kern/locks.h>
#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#if INET6
+#include <netinet6/tcp6_var.h>
+#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
&always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+/*
+ * See tcp_syn_backoff[] for the interval values between SYN retransmits;
+ * the value set below defines the number of retransmits before we
+ * disable the timestamp and window scaling options during subsequent
+ * SYN retransmits. Setting it to 0 disables the dropping of those
+ * two options.
+ */
+static int tcp_broken_peer_syn_rxmit_thres = 7;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW,
+ &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
+ "TCP disables rfc1323 and rfc1644 during the rest of attempts");
+
+int tcp_pmtud_black_hole_detect = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW,
+ &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection");
+
+int tcp_pmtud_black_hole_mss = 1200;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW,
+ &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");
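+/*
+ * Both knobs above are runtime-tunable; a hypothetical usage sketch:
+ *	sysctl -w net.inet.tcp.pmtud_blackhole_detection=1	# enable detection
+ *	sysctl -w net.inet.tcp.pmtud_blackhole_mss=1200		# fallback MSS
+ */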
+
static int tcp_keepcnt = TCPTV_KEEPCNT;
static int tcp_gc_done = FALSE; /* performed garbage collection of "used" sockets */
/* max idle probes */
struct inpcbhead time_wait_slots[N_TIME_WAIT_SLOTS];
int cur_tw_slot = 0;
-u_long *delack_bitmask;
+u_int32_t *delack_bitmask;
void add_to_time_wait_locked(struct tcpcb *tp);
void add_to_time_wait(struct tcpcb *tp);
+static void tcp_garbage_collect(struct inpcb *, int);
void add_to_time_wait_locked(struct tcpcb *tp)
{
* Fast timeout routine for processing delayed acks
*/
void
-tcp_fasttimo()
+tcp_fasttimo(void *arg)
{
+#pragma unused(arg)
struct inpcb *inp;
register struct tcpcb *tp;
struct socket *so;
so = inp->inp_socket;
- if (so == &tcbinfo.nat_dummy_socket)
- continue;
-
if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
continue;
timeout(tcp_fasttimo, 0, hz/TCP_RETRANSHZ);
}
-void
-tcp_garbage_collect(inp, istimewait)
- struct inpcb *inp;
- int istimewait;
+static void
+tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
struct socket *so;
struct tcpcb *tp;
+ so = inp->inp_socket;
+ tp = intotcpcb(inp);
- if (inp->inp_socket == &tcbinfo.nat_dummy_socket)
- return;
-
-
- if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if still in use */
- return;
+ /*
+ * Skip if still in use or busy; it would have been more efficient
+ * if we were to test so_usecount against 0, but this isn't possible
+ * due to the current implementation of tcp_dropdropablreq() where
+ * overflow sockets that are eligible for garbage collection have
+ * their usecounts set to 1.
+ */
+ if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(inp->inpcb_mtx))
+ return;
- so = inp->inp_socket;
- tp = intotcpcb(inp);
+ /* Check again under the lock */
+ if (so->so_usecount > 1) {
+ lck_mtx_unlock(inp->inpcb_mtx);
+ return;
+ }
- if ((so->so_usecount == 1) &&
- (so->so_flags & SOF_OVERFLOW)) {
- in_pcbdetach(inp);
- so->so_usecount--;
- lck_mtx_unlock(inp->inpcb_mtx);
- return;
- }
- else {
- if (inp->inp_wantcnt != WNT_STOPUSING) {
- lck_mtx_unlock(inp->inpcb_mtx);
- return;
- }
+ /*
+ * Overflowed socket dropped from the listening queue? Do this
+ * only if we are called to clean up the time wait slots, since
+ * tcp_dropdropablreq() considers a socket to have been fully
+ * dropped after add_to_time_wait() is finished.
+ * Also handle the case of connections getting closed by the
+ * peer while still in the queue, as seen with rdar://6422317.
+ */
+ if (so->so_usecount == 1 &&
+ ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
+ ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && (so->so_head != NULL)
+ && ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
+ (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
+
+ if (inp->inp_state != INPCB_STATE_DEAD) {
+ /* Become a regular mutex */
+ lck_mtx_convert_spin(inp->inpcb_mtx);
+#if INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ in6_pcbdetach(inp);
+ else
+#endif /* INET6 */
+ in_pcbdetach(inp);
}
+ so->so_usecount--;
+ lck_mtx_unlock(inp->inpcb_mtx);
+ return;
+ } else if (inp->inp_wantcnt != WNT_STOPUSING) {
+ lck_mtx_unlock(inp->inpcb_mtx);
+ return;
+ }
-
- if (so->so_usecount == 0)
- in_pcbdispose(inp);
- else {
- /* Special case:
- * - Check for embryonic socket stuck on listener queue (4023660)
- * - overflowed socket dropped from the listening queue
- * and dispose of remaining reference
- */
- if ((so->so_usecount == 1) &&
- (((tp->t_state == TCPS_CLOSED) && (so->so_head != NULL) && (so->so_state & SS_INCOMP)) ||
- (istimewait && (so->so_flags & SOF_OVERFLOW)))) {
- so->so_usecount--;
- in_pcbdispose(inp);
- } else
- lck_mtx_unlock(inp->inpcb_mtx);
+ /*
+ * We get here because the PCB is no longer searchable (WNT_STOPUSING);
+ * detach (if needed) and dispose if it is dead (usecount is 0). This
+ * covers all cases, including overflow sockets and those that are
+ * considered as "embryonic", i.e. created by sonewconn() in the TCP
+ * input path, and have not yet been committed. For the former, we
+ * reduce the usecount to 0 as done by the code above. For the latter,
+ * the usecount would have been reduced to 0 as part of calling
+ * soabort() when the socket is dropped at the end of tcp_input().
+ */
+ if (so->so_usecount == 0) {
+ /* Become a regular mutex */
+ lck_mtx_convert_spin(inp->inpcb_mtx);
+ if (inp->inp_state != INPCB_STATE_DEAD) {
+#if INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ in6_pcbdetach(inp);
+ else
+#endif /* INET6 */
+ in_pcbdetach(inp);
}
+ in_pcbdispose(inp);
+ } else {
+ lck_mtx_unlock(inp->inpcb_mtx);
+ }
}
static int bg_cnt = 0;
#define BG_COUNTER_MAX 3
void
-tcp_slowtimo()
+tcp_slowtimo(void)
{
struct inpcb *inp, *nxt;
struct tcpcb *tp;
int ostate;
#endif
+#if KDEBUG
static int tws_checked = 0;
+#endif
struct inpcbinfo *pcbinfo = &tcbinfo;
LIST_FOREACH(inp, &tcb, inp_list) {
so = inp->inp_socket;
-
- if (so == &tcbinfo.nat_dummy_socket)
- continue;
if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
continue;
{
register int rexmt;
struct socket *so_tmp;
- struct inpcbinfo *pcbinfo = &tcbinfo;
struct tcptemp *t_template;
+ int optlen = 0;
#if TCPDEBUG
int ostate;
if (tp->t_state != TCPS_TIME_WAIT &&
tp->t_state != TCPS_FIN_WAIT_2 &&
tp->t_rcvtime < tcp_maxidle) {
- tp->t_timer[TCPT_2MSL] = (unsigned long)tcp_keepintvl;
+ tp->t_timer[TCPT_2MSL] = (u_int32_t)tcp_keepintvl;
}
else {
tp = tcp_close(tp);
tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
/*
- * Disable rfc1323 and rfc1644 if we havn't got any response to
- * our third SYN to work-around some broken terminal servers
- * (most of which have hopefully been retired) that have bad VJ
- * header compression code which trashes TCP segments containing
- * unknown-to-them TCP options.
+ * Check for potential Path MTU Discovery Black Hole
+ */
+
+ if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
+ if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && (tp->t_rxtshift == 2)) {
+ /*
+ * Enter Path MTU Black-hole Detection mechanism:
+ * - Disable Path MTU Discovery (IP "DF" bit).
+ * - Reduce the MTU to a lower value than what we negotiated with the peer.
+ */
+
+ tp->t_flags &= ~TF_PMTUD; /* Disable Path MTU Discovery for now */
+ tp->t_flags |= TF_BLACKHOLE; /* Record that we may have found a black hole */
+ optlen = tp->t_maxopd - tp->t_maxseg;
+ tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* Keep track of previous MSS */
+ if (tp->t_maxopd > tcp_pmtud_black_hole_mss)
+ tp->t_maxopd = tcp_pmtud_black_hole_mss; /* Reduce the MSS to intermediary value */
+ else {
+ tp->t_maxopd = /* use the default MSS */
+#if INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ }
+ tp->t_maxseg = tp->t_maxopd - optlen;
+ }
+ /*
+ * If further retransmissions are still unsuccessful with a lowered
+ * MTU, this probably isn't a Black Hole, so restore the previous
+ * MSS and clear the blackhole detection flags.
+ */
+ else {
+ if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) {
+ tp->t_flags |= TF_PMTUD;
+ tp->t_flags &= ~TF_BLACKHOLE;
+ optlen = tp->t_maxopd - tp->t_maxseg;
+ tp->t_maxopd = tp->t_pmtud_saved_maxopd;
+ tp->t_maxseg = tp->t_maxopd - optlen;
+ }
+ }
+ }
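+ /*
+ * Worked example for the mechanism above (hypothetical values):
+ * with timestamps in use, t_maxopd = 1460 and t_maxseg = 1448,
+ * so optlen = 12. On the second retransmit (t_rxtshift == 2),
+ * t_maxopd is clamped to tcp_pmtud_black_hole_mss (1200), giving
+ * t_maxseg = 1200 - 12 = 1188. If retransmits keep failing past
+ * t_rxtshift > 4, the saved t_maxopd is restored.
+ */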
+
+ /*
+ * Disable rfc1323 and rfc1644 if we haven't got any response to
+ * our SYN (after we reach the threshold) to work-around some
+ * broken terminal servers (most of which have hopefully been
+ * retired) that have bad VJ header compression code which
+ * trashes TCP segments containing unknown-to-them TCP options.
*/
- if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
- tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
+ if ((tp->t_state == TCPS_SYN_SENT) &&
+ (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres))
+ tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
/*
* If losing, let the lower level know and try for
* a better route. Also, if we backed off this far,
* growth is 2 mss. We don't allow the threshold
* to go below this.)
*/
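+ /*
+ * Worked example (hypothetical values): with snd_wnd = 40000,
+ * snd_cwnd = 20000 and t_maxseg = 1460, win = 20000 / 2 / 1460 = 6,
+ * so snd_ssthresh becomes 6 * 1460 = 8760 while snd_cwnd restarts
+ * slow start at a single segment (1460 bytes).
+ */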
- {
- u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_cwnd = tp->t_maxseg;
- tp->snd_ssthresh = win * tp->t_maxseg;
- tp->t_bytes_acked = 0;
- tp->t_dupacks = 0;
- tp->t_unacksegs = 0;
+ if (tp->t_state >= TCPS_ESTABLISHED) {
+ u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+ if (win < 2)
+ win = 2;
+ tp->snd_cwnd = tp->t_maxseg;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ tp->t_bytes_acked = 0;
+ tp->t_dupacks = 0;
+ tp->t_unacksegs = 0;
}
EXIT_FASTRECOVERY(tp);
(void) tcp_output(tp);
if ((always_keepalive ||
tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
(tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
- if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (unsigned long)tcp_maxidle)
+ if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle)
goto dropit;
/*
* Send a packet designed to force a response
tcpstat.tcps_keepprobe++;
t_template = tcp_maketemplate(tp);
if (t_template) {
+ unsigned int ifscope;
+
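+ /*
+ * If the socket is bound to a specific interface,
+ * scope the keepalive probe to that interface;
+ * otherwise leave it unscoped.
+ */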
+ if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
+ ifscope = tp->t_inpcb->inp_boundif;
+ else
+ ifscope = IFSCOPE_NONE;
+
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
- tp->rcv_nxt, tp->snd_una - 1, 0, NULL);
+ tp->rcv_nxt, tp->snd_una - 1, 0, ifscope);
(void) m_free(dtom(t_template));
}
tp->t_timer[TCPT_KEEP] = tcp_keepintvl;