/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
+#include <machine/endian.h>
+
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
static int tcp_do_rfc3465 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
&tcp_do_rfc3465, 0, "");
+
+static int tcp_do_rfc3465_lim2 = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW,
+ &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");
+
+#if CONFIG_IFEF_NOWINDOWSCALE
+int tcp_obey_ifef_nowindowscale = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW,
+ &tcp_obey_ifef_nowindowscale, 0, "");
+#endif
+
extern int tcp_TCPTV_MIN;
-u_long tcp_now;
+u_int32_t tcp_now;
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
static void tcp_dooptions(struct tcpcb *,
- u_char *, int, struct tcphdr *, struct tcpopt *);
+ u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
#define ND6_HINT(tp)
#endif
-extern u_long *delack_bitmask;
+extern u_int32_t *delack_bitmask;
extern void add_to_time_wait(struct tcpcb *);
extern void postevent(struct socket *, struct sockbuf *, int);
int dropsocket = 0;
int iss = 0;
int nosock = 0;
- u_long tiwin;
+ u_int32_t tiwin;
struct tcpopt to; /* options in this segment */
struct sockaddr_in *next_hop = NULL;
#if TCPDEBUG
#endif
struct m_tag *fwd_tag;
u_char ip_ecn = IPTOS_ECN_NOTECT;
+ unsigned int ifscope;
+
+ /*
+ * Record the interface on which this segment arrived; this does not
+ * affect normal data output (for non-detached TCP); it only provides a
+ * hint about which route and interface to use for sending in the
+ * absence of a PCB, when scoped routing (and thus source interface
+ * selection) is enabled.
+ */
+ if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
+ ifscope = m->m_pkthdr.rcvif->if_index;
+ else
+ ifscope = IFSCOPE_NONE;
/* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
- fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL);
+ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
+ KERNEL_TAG_TYPE_IPFORWARD, NULL);
+ } else {
+ fwd_tag = NULL;
+ }
if (fwd_tag != NULL) {
struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = (u_short)tlen;
+
+#if BYTE_ORDER != BIG_ENDIAN
HTONS(ipov->ih_len);
+#endif
+
pseudo = in_cksum(m, sizeof (struct ip));
*(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
len = sizeof (struct ip) + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = (u_short)tlen;
+
+#if BYTE_ORDER != BIG_ENDIAN
HTONS(ipov->ih_len);
+#endif
+
th->th_sum = in_cksum(m, len);
*(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
/*
* Convert TCP protocol specific fields to host format.
*/
+
+#if BYTE_ORDER != BIG_ENDIAN
NTOHL(th->th_seq);
NTOHL(th->th_ack);
NTOHS(th->th_win);
NTOHS(th->th_urp);
+#endif
/*
* Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
}
+ /*
+ * Use the interface scope information from the PCB for outbound
+ * segments. If the PCB isn't present and if scoped routing is
+ * enabled, tcp_respond will use the scope of the interface on
+ * which the segment arrived.
+ */
+ if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
+ ifscope = inp->inp_boundif;
#if IPSEC
if (ipsec_bypass == 0) {
#if INET6
if (isipv6) {
if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio);
+ if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
+ inp = NULL; // pretend we didn't find it
goto dropnosock;
}
} else
#endif /* INET6 */
if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
- IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
+ IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
+ if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
+ inp = NULL; // pretend we didn't find it
goto dropnosock;
}
}
}
so = inp->inp_socket;
if (so == NULL) {
- if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
- inp = NULL; // pretend we didn't find it
+ /* This case shouldn't happen, as the socket shouldn't be NULL
+ * if inp_state isn't set to INPCB_STATE_DEAD.
+ * But just in case, we pretend we didn't find the socket if we hit this case,
+ * as this isn't cause for a panic (the socket might be leaked, however).
+ */
+ inp = NULL;
#if TEMPDEBUG
- printf("tcp_input: no more socket for inp=%x\n", inp);
+ printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
#endif
goto dropnosock;
}
-#ifdef __APPLE__
- /*
- * Bogus state when listening port owned by SharedIP with loopback as the
- * only configured interface: BlueBox does not filters loopback
- */
- if (so == &tcbinfo.nat_dummy_socket)
- goto drop;
-
-#endif
- tcp_lock(so, 1, 2);
+ tcp_lock(so, 1, (void *)2);
if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
- tcp_unlock(so, 1, 2);
+ tcp_unlock(so, 1, (void *)2);
inp = NULL; // pretend we didn't find it
goto dropnosock;
}
#if INET6
struct inpcb *oinp = sotoinpcb(so);
#endif /* INET6 */
- int ogencnt = so->so_gencnt;
+ unsigned int head_ifscope;
+
+ /* Get listener's bound-to-interface, if any */
+ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+ inp->inp_boundif : IFSCOPE_NONE;
#if !IPSEC
/*
if (!so2)
goto drop;
}
- /*
- * Make sure listening socket did not get closed during socket allocation,
- * not only this is incorrect but it is know to cause panic
- */
- if (so->so_gencnt != ogencnt)
- goto drop;
+
+ /* Point "inp" and "tp" in tandem to new socket */
+ inp = (struct inpcb *)so2->so_pcb;
+ tp = intotcpcb(inp);
oso = so;
tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
* we're committed to it below in TCPS_LISTEN.
*/
dropsocket++;
- inp = (struct inpcb *)so->so_pcb;
+
+ /*
+ * Inherit INP_BOUND_IF from listener; testing if
+ * head_ifscope is non-zero is sufficient, since it
+ * can only be set to a non-zero value earlier if
+ * the listener has such a flag set.
+ */
+#if INET6
+ if (head_ifscope != IFSCOPE_NONE && !isipv6) {
+#else
+ if (head_ifscope != IFSCOPE_NONE) {
+#endif /* INET6 */
+ inp->inp_flags |= INP_BOUND_IF;
+ inp->inp_boundif = head_ifscope;
+ }
#if INET6
if (isipv6)
inp->in6p_laddr = ip6->ip6_dst;
printf("tcp_input: could not copy policy\n");
}
#endif
- tcp_unlock(oso, 1, 0); /* now drop the reference on the listener */
- tp = intotcpcb(inp);
+ /* inherit states from the listener */
tp->t_state = TCPS_LISTEN;
tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
+ tp->t_keepinit = tp0->t_keepinit;
tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
+
+ /* now drop the reference on the listener */
+ tcp_unlock(oso, 1, 0);
+
/* Compute proper scaling value from buffer space */
if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) {
tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
- so->so_rcv.sb_hiwat = lmin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES);
+ so->so_rcv.sb_hiwat = imin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES);
}
else {
while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
char ipstrbuf[MAX_IPv6_STR_LEN];
printf("too many small tcp packets from "
- "%s:%u, av. %lubyte/packet, "
+ "%s:%u, av. %ubyte/packet, "
"dropping connection\n",
#if INET6
isipv6 ?
* else do it below (after getting remote address).
*/
if (tp->t_state != TCPS_LISTEN && optp)
- tcp_dooptions(tp, optp, optlen, th, &to);
+ tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
if (to.to_flags & TOF_SCALE) {
tp->ts_recent_age = tcp_now;
}
if (to.to_flags & TOF_MSS)
- tcp_mss(tp, to.to_mss);
+ tcp_mss(tp, to.to_mss, ifscope);
if (tp->sack_enable) {
if (!(to.to_flags & TOF_SACK))
tp->sack_enable = 0;
tp->ts_recent = to.to_tsval;
}
+ /* Force acknowledgment if we received a FIN */
+
+ if (thflags & TH_FIN)
+ tp->t_flags |= TF_ACKNOW;
+
if (tlen == 0) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
FREE(sin, M_SONAME);
}
- tcp_dooptions(tp, optp, optlen, th, &to);
+ tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
if (tp->sack_enable) {
if (!(to.to_flags & TOF_SACK))
tp->t_flags |= TF_ACKNOW;
tp->t_unacksegs = 0;
tp->t_state = TCPS_SYN_RECEIVED;
- tp->t_timer[TCPT_KEEP] = tcp_keepinit;
+ tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit;
dropsocket = 0; /* committed to socket */
tcpstat.tcps_accepts++;
if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
/* ECN-setup SYN */
tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
}
-#ifdef IFEF_NOWINDOWSCALE
- if (m->m_pkthdr.rcvif != NULL &&
- (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE) != 0)
- {
- // Timestamps are not enabled on this interface
- tp->t_flags &= ~(TF_REQ_SCALE);
+#if CONFIG_IFEF_NOWINDOWSCALE
+ if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
+ (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
+ /* Window scaling is not enabled on this interface */
+ tp->t_flags &= ~TF_REQ_SCALE;
}
#endif
goto trimthenstep6;
tp->t_dupacks = 0;
break;
}
-
+ /*
+ * If the congestion window was inflated to account
+ * for the other side's cached packets, retract it.
+ */
if (!IN_FASTRECOVERY(tp)) {
/*
* We were not in fast recovery. Reset the duplicate ack
}
else {
if (tcp_do_newreno) {
- long ss = tp->snd_max - th->th_ack;
+ int32_t ss = tp->snd_max - th->th_ack;
/*
* Complete ack. Inflate the congestion window to
register u_int cw = tp->snd_cwnd;
register u_int incr = tp->t_maxseg;
- if (cw >= tp->snd_ssthresh) {
- tp->t_bytes_acked += acked;
- if (tp->t_bytes_acked >= cw) {
+ if (tcp_do_rfc3465) {
+
+ if (cw >= tp->snd_ssthresh) {
+ tp->t_bytes_acked += acked;
+ if (tp->t_bytes_acked >= cw) {
/* Time to increase the window. */
- tp->t_bytes_acked -= cw;
- } else {
+ tp->t_bytes_acked -= cw;
+ } else {
/* No need to increase yet. */
- incr = 0;
+ incr = 0;
+ }
+ } else {
+ /*
+ * If the rfc3465_lim2 sysctl is enabled and
+ * snd_nxt == snd_max, use 2*SMSS for the "L"
+ * param. Otherwise use the more conservative
+ * 1*SMSS.
+ * (See RFC 3465 2.3 Choosing the Limit)
+ */
+ u_int abc_lim;
+
+ abc_lim = (tcp_do_rfc3465_lim2 &&
+ tp->snd_nxt == tp->snd_max) ? incr * 2 : incr;
+
+ incr = lmin(acked, abc_lim);
}
- } else {
+ }
+ else {
/*
- * If the user explicitly enables RFC3465
- * use 2*SMSS for the "L" param. Otherwise
- * use the more conservative 1*SMSS.
- *
- * (See RFC 3465 2.3 Choosing the Limit)
+ * If the window gives us less than ssthresh packets
+ * in flight, open exponentially (segsz per packet).
+ * Otherwise open linearly: segsz per window
+ * (segsz^2 / cwnd per packet).
*/
- u_int abc_lim;
-
- abc_lim = (tcp_do_rfc3465 == 0) ?
- incr : incr * 2;
- incr = min(acked, abc_lim);
+
+ if (cw >= tp->snd_ssthresh) {
+ incr = max((incr * incr / cw), 1);
+ }
}
+
tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
}
if (acked > so->so_snd.sb_cc) {
soisdisconnected(so);
}
tp->t_state = TCPS_FIN_WAIT_2;
- goto drop;
+ /* fall through and make sure we also recognize data ACKed with the FIN */
}
+ tp->t_flags |= TF_ACKNOW;
break;
/*
tcp_canceltimers(tp);
/* Shorten TIME_WAIT [RFC-1644, p.28] */
if (tp->cc_recv != 0 &&
- tp->t_starttime < (u_long)tcp_msl)
+ tp->t_starttime < (u_int32_t)tcp_msl)
tp->t_timer[TCPT_2MSL] =
tp->t_rxtcur * TCPTV_TWTRUNC;
else
add_to_time_wait(tp);
soisdisconnected(so);
}
+ tp->t_flags |= TF_ACKNOW;
break;
/*
* but if two URG's are pending at once, some out-of-band
* data may creep in... ick.
*/
- if (th->th_urp <= (u_long)tlen
+ if (th->th_urp <= (u_int32_t)tlen
#if SO_OOBINLINE
&& (so->so_options & SO_OOBINLINE) == 0
#endif
* case PRU_RCVD). If a FIN has already been received on this
* connection then we just ignore the text.
*/
- if ((tlen || (thflags&TH_FIN)) &&
+ if ((tlen || (thflags & TH_FIN)) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
tcp_seq save_end = th->th_seq + tlen;
tcp_canceltimers(tp);
/* Shorten TIME_WAIT [RFC-1644, p.28] */
if (tp->cc_recv != 0 &&
- tp->t_starttime < (u_long)tcp_msl) {
+ tp->t_starttime < (u_int32_t)tcp_msl) {
tp->t_timer[TCPT_2MSL] =
tp->t_rxtcur * TCPTV_TWTRUNC;
/* For transaction client, force ACK now. */
if (thflags & TH_ACK)
/* mtod() below is safe as long as hdr dropping is delayed */
tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
- TH_RST, m->m_pkthdr.rcvif);
+ TH_RST, ifscope);
else {
if (thflags & TH_SYN)
tlen++;
/* mtod() below is safe as long as hdr dropping is delayed */
tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
- (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rcvif);
+ (tcp_seq)0, TH_RST|TH_ACK, ifscope);
}
/* destroy temporarily created socket */
if (dropsocket) {
}
static void
-tcp_dooptions(tp, cp, cnt, th, to)
+tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
/*
* Parse TCP options and place in tcpopt.
*/
int cnt;
struct tcphdr *th;
struct tcpopt *to;
+ unsigned int input_ifscope;
{
u_short mss = 0;
int opt, optlen;
if (!(th->th_flags & TH_SYN))
continue;
bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
+
+#if BYTE_ORDER != BIG_ENDIAN
NTOHS(mss);
+#endif
+
break;
case TCPOPT_WINDOW:
to->to_flags |= TOF_TS;
bcopy((char *)cp + 2,
(char *)&to->to_tsval, sizeof(to->to_tsval));
+
+#if BYTE_ORDER != BIG_ENDIAN
NTOHL(to->to_tsval);
+#endif
+
bcopy((char *)cp + 6,
(char *)&to->to_tsecr, sizeof(to->to_tsecr));
+
+#if BYTE_ORDER != BIG_ENDIAN
NTOHL(to->to_tsecr);
+#endif
/*
* A timestamp received in a SYN makes
}
}
if (th->th_flags & TH_SYN)
- tcp_mss(tp, mss); /* sets t_maxseg */
+ tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */
}
/*
{
unsigned int maxmtu;
+ RT_LOCK_ASSERT_HELD(rt);
if (rt->rt_rmx.rmx_mtu == 0)
maxmtu = rt->rt_ifp->if_mtu;
else
{
unsigned int maxmtu;
+ RT_LOCK_ASSERT_HELD(rt);
+ lck_rw_lock_shared(nd_if_rwlock);
if (rt->rt_rmx.rmx_mtu == 0)
maxmtu = IN6_LINKMTU(rt->rt_ifp);
else
maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
+ lck_rw_done(nd_if_rwlock);
return (maxmtu);
}
*
*/
void
-tcp_mss(tp, offer)
+tcp_mss(tp, offer, input_ifscope)
struct tcpcb *tp;
int offer;
+ unsigned int input_ifscope;
{
register struct rtentry *rt;
struct ifnet *ifp;
register int rtt, mss;
- u_long bufsize;
+ u_int32_t bufsize;
struct inpcb *inp;
struct socket *so;
struct rmxp_tao *taop;
int origoffer = offer;
- u_long sb_max_corrected;
+ u_int32_t sb_max_corrected;
int isnetlocal = 0;
#if INET6
int isipv6;
#else
#define min_protoh (sizeof (struct tcpiphdr))
#endif
- lck_mtx_lock(rt_mtx);
+
#if INET6
if (isipv6) {
rt = tcp_rtlookup6(inp);
- if (rt && (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || rt->rt_gateway->sa_family == AF_LINK))
- isnetlocal = TRUE;
+ if (rt != NULL &&
+ (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
+ IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
+ rt->rt_gateway->sa_family == AF_LINK))
+ isnetlocal = TRUE;
}
else
#endif /* INET6 */
{
- rt = tcp_rtlookup(inp);
- if (rt && (rt->rt_gateway->sa_family == AF_LINK ||
- rt->rt_ifp->if_flags & IFF_LOOPBACK))
+ rt = tcp_rtlookup(inp, input_ifscope);
+ if (rt != NULL &&
+ (rt->rt_gateway->sa_family == AF_LINK ||
+ rt->rt_ifp->if_flags & IFF_LOOPBACK))
isnetlocal = TRUE;
}
if (rt == NULL) {
isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
tcp_mssdflt;
- lck_mtx_unlock(rt_mtx);
return;
}
ifp = rt->rt_ifp;
*/
tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
tcpstat.tcps_usedssthresh++;
- }
- else
+ } else {
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ }
- lck_mtx_unlock(rt_mtx);
+ /* Route locked during lookup above */
+ RT_UNLOCK(rt);
}
/*
#else
#define min_protoh (sizeof (struct tcpiphdr))
#endif
- lck_mtx_lock(rt_mtx);
+
#if INET6
if (isipv6)
rt = tcp_rtlookup6(tp->t_inpcb);
else
#endif /* INET6 */
- rt = tcp_rtlookup(tp->t_inpcb);
+ rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
if (rt == NULL) {
- lck_mtx_unlock(rt_mtx);
return (
#if INET6
isipv6 ? tcp_v6mssdflt :
#else
mss = tcp_maxmtu(rt);
#endif
- lck_mtx_unlock(rt_mtx);
+ /* Route locked during lookup above */
+ RT_UNLOCK(rt);
return (mss - min_protoh);
}
struct tcphdr *th;
{
tcp_seq onxt = tp->snd_nxt;
- u_long ocwnd = tp->snd_cwnd;
+ u_int32_t ocwnd = tp->snd_cwnd;
tp->t_timer[TCPT_REXMT] = 0;
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
tcp_unlock(so, 1, 0);
}
}
+ else {
+ /* Do not try to lock the inp in in_pcb_checkstate,
+ * because the lock is already held by some other thread.
+ * Only drop the inp_wntcnt reference.
+ */
+ in_pcb_checkstate(inp, WNT_RELEASE, 1);
+ }
}
so = sonext;