X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..c910b4d9d2451126ae3917b931cd4390c11e1d52:/bsd/netinet/tcp_input.c diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 39a5fc252..138bcb3c7 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -210,7 +210,7 @@ struct inpcbhead tcb; struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpcb *, - u_char *, int, struct tcphdr *, struct tcpopt *); + u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -552,6 +552,19 @@ tcp_input(m, off0) #endif struct m_tag *fwd_tag; u_char ip_ecn = IPTOS_ECN_NOTECT; + unsigned int ifscope; + + /* + * Record the interface where this segment arrived on; this does not + * affect normal data output (for non-detached TCP) as it provides a + * hint about which route and interface to use for sending in the + * absence of a PCB, when scoped routing (and thus source interface + * selection) are enabled. + */ + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ifscope = m->m_pkthdr.rcvif->if_index; + else + ifscope = IFSCOPE_NONE; /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL); @@ -821,6 +834,14 @@ findpcb: ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } + /* + * Use the interface scope information from the PCB for outbound + * segments. If the PCB isn't present and if scoped routing is + * enabled, tcp_respond will use the scope of the interface where + * the segment arrived on. + */ + if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) + ifscope = inp->inp_boundif; #if IPSEC if (ipsec_bypass == 0) { #if INET6 @@ -981,6 +1002,11 @@ findpcb: struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ int ogencnt = so->so_gencnt; + unsigned int head_ifscope; + + /* Get listener's bound-to-interface, if any */ + head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; #if !IPSEC /* @@ -1107,6 +1133,21 @@ findpcb: */ dropsocket++; inp = (struct inpcb *)so->so_pcb; + + /* + * Inherit INP_BOUND_IF from listener; testing if + * head_ifscope is non-zero is sufficient, since it + * can only be set to a non-zero value earlier if + * the listener has such a flag set. + */ +#if INET6 + if (head_ifscope != IFSCOPE_NONE && !isipv6) { +#else + if (head_ifscope != IFSCOPE_NONE) { +#endif /* INET6 */ + inp->inp_flags |= INP_BOUND_IF; + inp->inp_boundif = head_ifscope; + } #if INET6 if (isipv6) inp->in6p_laddr = ip6->ip6_dst; @@ -1344,7 +1385,7 @@ findpcb: * else do it below (after getting remote address). */ if (tp->t_state != TCPS_LISTEN && optp) - tcp_dooptions(tp, optp, optlen, th, &to); + tcp_dooptions(tp, optp, optlen, th, &to, ifscope); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if (to.to_flags & TOF_SCALE) { @@ -1359,7 +1400,7 @@ findpcb: tp->ts_recent_age = tcp_now; } if (to.to_flags & TOF_MSS) - tcp_mss(tp, to.to_mss); + tcp_mss(tp, to.to_mss, ifscope); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; @@ -1406,6 +1447,11 @@ findpcb: tp->ts_recent = to.to_tsval; } + /* Force acknowledgment if we received a FIN */ + + if (thflags & TH_FIN) + tp->t_flags |= TF_ACKNOW; + if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && @@ -1700,7 +1746,7 @@ findpcb: FREE(sin, M_SONAME); } - tcp_dooptions(tp, optp, optlen, th, &to); + tcp_dooptions(tp, optp, optlen, th, &to, ifscope); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) @@ -1734,6 +1780,14 @@ findpcb: /* ECN-setup SYN */ tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); } +#ifdef IFEF_NOWINDOWSCALE + if (m->m_pkthdr.rcvif != NULL && + (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE) != 0) + { + // Timestamps are not enabled on this interface + tp->t_flags &= ~(TF_REQ_SCALE); + } +#endif goto trimthenstep6; } @@ -1794,7 +1848,6 @@ findpcb: tp->ecn_flags &= ~TE_SENDIPECT; } - soisconnected(so); #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET /* XXXMAC: recursive lock: SOCK_LOCK(so); */ mac_socketpeer_label_associate_mbuf(m, so); @@ -1835,6 +1888,10 @@ findpcb: tp->t_state = TCPS_ESTABLISHED; tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); } + /* soisconnected may lead to socket_unlock in case of upcalls, + * make sure this is done when everything is setup. + */ + soisconnected(so); } else { /* * Received initial SYN in SYN-SENT[*] state => simul- @@ -2223,7 +2280,6 @@ trimthenstep6: case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; - soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == @@ -2252,8 +2308,14 @@ trimthenstep6: (void) tcp_reass(tp, (struct tcphdr *)0, &tlen, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; + /* FALLTHROUGH */ + /* soisconnected may lead to socket_unlock in case of upcalls, + * make sure this is done when everything is setup. + */ + soisconnected(so); + /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range @@ -2385,44 +2447,72 @@ trimthenstep6: tp->t_dupacks = 0; break; } + + if (!IN_FASTRECOVERY(tp)) { + /* + * We were not in fast recovery. Reset the duplicate ack + * counter. + */ + tp->t_dupacks = 0; + } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno || tp->sack_enable) { - if (IN_FASTRECOVERY(tp)) { + else { + if (tcp_do_newreno || tp->sack_enable) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. - */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; + tcp_newreno_partial_ack(tp, th); + } + else { + if (tcp_do_newreno) { + long ss = tp->snd_max - th->th_ack; + + /* + * Complete ack. Inflate the congestion window to + * ssthresh and exit fast recovery. + * + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (ss < tp->snd_ssthresh) + tp->snd_cwnd = ss + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + } + else { + /* + * Clamp the congestion window to the crossover point + * and exit fast recovery. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + + EXIT_FASTRECOVERY(tp); + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; } } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + else { + /* + * Clamp the congestion window to the crossover point + * and exit fast recovery in non-newreno and non-SACK case. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + EXIT_FASTRECOVERY(tp); + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; + } } - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; + + /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. @@ -2577,7 +2667,6 @@ process_ACK: tp->snd_wnd -= acked; ourfinisacked = 0; } - sowwakeup(so); /* detect una wraparound */ if ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && @@ -2595,6 +2684,12 @@ process_ACK: } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; + + /* + * sowwakeup must happen after snd_una, et al. are updated so that + * the sequence numbers are in sync with so_snd + */ + sowwakeup(so); switch (tp->t_state) { @@ -2613,13 +2708,14 @@ process_ACK: * we'll hang forever. */ if (so->so_state & SS_CANTRCVMORE) { - soisdisconnected(so); tp->t_timer[TCPT_2MSL] = tcp_maxidle; add_to_time_wait(tp); + soisdisconnected(so); } tp->t_state = TCPS_FIN_WAIT_2; - goto drop; + /* fall through and make sure we also recognize data ACKed with the FIN */ } + tp->t_flags |= TF_ACKNOW; break; /* @@ -2642,6 +2738,7 @@ process_ACK: add_to_time_wait(tp); soisdisconnected(so); } + tp->t_flags |= TF_ACKNOW; break; /* @@ -2762,7 +2859,7 @@ dodata: /* XXX */ * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((tlen || (thflags&TH_FIN)) && + if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_end = th->th_seq + tlen; @@ -3007,13 +3104,13 @@ dropwithreset: if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, - TH_RST, m->m_pkthdr.rcvif); + TH_RST, ifscope); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, - (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rcvif); + (tcp_seq)0, TH_RST|TH_ACK, ifscope); } /* destroy temporarily created socket */ if (dropsocket) { @@ -3050,7 +3147,7 @@ drop: } static void -tcp_dooptions(tp, cp, cnt, th, to) +tcp_dooptions(tp, cp, cnt, th, to, input_ifscope) /* * Parse TCP options and place in tcpopt. */ @@ -3059,6 +3156,7 @@ tcp_dooptions(tp, cp, cnt, th, to) int cnt; struct tcphdr *th; struct tcpopt *to; + unsigned int input_ifscope; { u_short mss = 0; int opt, optlen; @@ -3138,7 +3236,7 @@ tcp_dooptions(tp, cp, cnt, th, to) } } if (th->th_flags & TH_SYN) - tcp_mss(tp, mss); /* sets t_maxseg */ + tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */ } /* @@ -3312,9 +3410,10 @@ tcp_maxmtu6(struct rtentry *rt) * */ void -tcp_mss(tp, offer) +tcp_mss(tp, offer, input_ifscope) struct tcpcb *tp; int offer; + unsigned int input_ifscope; { register struct rtentry *rt; struct ifnet *ifp; @@ -3349,7 +3448,7 @@ tcp_mss(tp, offer) else #endif /* INET6 */ { - rt = tcp_rtlookup(inp); + rt = tcp_rtlookup(inp, input_ifscope); if (rt && (rt->rt_gateway->sa_family == AF_LINK || rt->rt_ifp->if_flags & IFF_LOOPBACK)) isnetlocal = TRUE; @@ -3538,6 +3637,9 @@ tcp_mss(tp, offer) tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } + else + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + lck_mtx_unlock(rt_mtx); } @@ -3568,7 +3670,7 @@ tcp_mssopt(tp) rt = tcp_rtlookup6(tp->t_inpcb); else #endif /* INET6 */ - rt = tcp_rtlookup(tp->t_inpcb); + rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE); if (rt == NULL) { lck_mtx_unlock(rt_mtx); return (