X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/4a3eedf9ecc9bbe3f3a5c6ce5e53ad199d639d32..c331a0bec715536613c8dd5f34a4e115d5b15824:/bsd/netinet/tcp_input.c diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 302ab9431..b65e9d5c6 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,8 @@ #include /* before tcp_seq.h, for tcp_random18() */ +#include + #include #include #include @@ -201,16 +203,27 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW, static int tcp_do_rfc3465 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, &tcp_do_rfc3465, 0, ""); + +static int tcp_do_rfc3465_lim2 = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW, + &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS"); + +#if CONFIG_IFEF_NOWINDOWSCALE +int tcp_obey_ifef_nowindowscale = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW, + &tcp_obey_ifef_nowindowscale, 0, ""); +#endif + extern int tcp_TCPTV_MIN; -u_long tcp_now; +u_int32_t tcp_now; struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpcb *, - u_char *, int, struct tcphdr *, struct tcpopt *); + u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -234,7 +247,7 @@ do { \ #define ND6_HINT(tp) #endif -extern u_long *delack_bitmask; +extern u_int32_t *delack_bitmask; extern void add_to_time_wait(struct tcpcb *); extern void postevent(struct socket *, struct sockbuf *, int); @@ -544,7 +557,7 @@ tcp_input(m, off0) int dropsocket = 0; int iss = 0; int nosock = 0; - u_long tiwin; + u_int32_t tiwin; struct tcpopt to; /* options in this segment */ struct sockaddr_in *next_hop = NULL; #if TCPDEBUG @@ -552,9 +565,27 @@ tcp_input(m, off0) #endif struct m_tag *fwd_tag; u_char ip_ecn = IPTOS_ECN_NOTECT; + unsigned int ifscope; + + /* + * Record the interface where this segment arrived on; this does not + * affect normal data output (for non-detached TCP) as it provides a + * hint about which route and interface to use for sending in the + * absence of a PCB, when scoped routing (and thus source interface + * selection) are enabled. + */ + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ifscope = m->m_pkthdr.rcvif->if_index; + else + ifscope = IFSCOPE_NONE; /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ - fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL); + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) { + fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, NULL); + } else { + fwd_tag = NULL; + } if (fwd_tag != NULL) { struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); @@ -645,7 +676,11 @@ tcp_input(m, off0) bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ipov->ih_len); +#endif + pseudo = in_cksum(m, sizeof (struct ip)); *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0]; @@ -674,7 +709,11 @@ tcp_input(m, off0) len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ipov->ih_len); +#endif + th->th_sum = in_cksum(m, len); *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0]; @@ -760,10 +799,13 @@ tcp_input(m, off0) /* * Convert TCP protocol specific fields to host format. */ + +#if BYTE_ORDER != BIG_ENDIAN NTOHL(th->th_seq); NTOHL(th->th_ack); NTOHS(th->th_win); NTOHS(th->th_urp); +#endif /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, @@ -821,18 +863,30 @@ findpcb: ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } + /* + * Use the interface scope information from the PCB for outbound + * segments. If the PCB isn't present and if scoped routing is + * enabled, tcp_respond will use the scope of the interface where + * the segment arrived on. + */ + if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) + ifscope = inp->inp_boundif; #if IPSEC if (ipsec_bypass == 0) { #if INET6 if (isipv6) { if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); + if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING) + inp = NULL; // pretend we didn't find it goto dropnosock; } } else #endif /* INET6 */ if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { - IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); + IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); + if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING) + inp = NULL; // pretend we didn't find it goto dropnosock; } } @@ -915,26 +969,21 @@ findpcb: } so = inp->inp_socket; if (so == NULL) { - if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) - inp = NULL; // pretend we didn't find it + /* This case shouldn't happen as the socket shouldn't be null + * if inp_state isn't set to INPCB_STATE_DEAD + * But just in case, we pretend we didn't find the socket if we hit this case + * as this isn't cause for a panic (the socket might be leaked however)... + */ + inp = NULL; #if TEMPDEBUG - printf("tcp_input: no more socket for inp=%x\n", inp); + printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp); #endif goto dropnosock; } -#ifdef __APPLE__ - /* - * Bogus state when listening port owned by SharedIP with loopback as the - * only configured interface: BlueBox does not filters loopback - */ - if (so == &tcbinfo.nat_dummy_socket) - goto drop; - -#endif - tcp_lock(so, 1, 2); + tcp_lock(so, 1, (void *)2); if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { - tcp_unlock(so, 1, 2); + tcp_unlock(so, 1, (void *)2); inp = NULL; // pretend we didn't find it goto dropnosock; } @@ -958,6 +1007,10 @@ findpcb: goto drop; #endif + /* Radar 7377561: Avoid processing packets while closing a listen socket */ + if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN) == 0) + goto drop; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { #if TCPDEBUG if (so->so_options & SO_DEBUG) { @@ -980,28 +1033,68 @@ findpcb: #if INET6 struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ - int ogencnt = so->so_gencnt; + unsigned int head_ifscope; + + /* Get listener's bound-to-interface, if any */ + head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; -#if !IPSEC /* - * Current IPsec implementation makes incorrect IPsec - * cache if this check is done here. - * So delay this until duplicated socket is created. + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { - /* - * Note: dropwithreset makes sure we don't - * send a RST in response to a RST. - */ + if (thflags & TH_RST) { + goto drop; + } if (thflags & TH_ACK) { + tp = NULL; tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } + + /* We come here if there is no SYN set */ + tcpstat.tcps_badsyn++; goto drop; } -#endif KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0); + if (th->th_dport == th->th_sport) { +#if INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. + * + * Packets with a multicast source address should also + * be discarded. + */ + if (m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#if INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + goto drop; + #if INET6 /* @@ -1082,12 +1175,10 @@ findpcb: if (!so2) goto drop; } - /* - * Make sure listening socket did not get closed during socket allocation, - * not only this is incorrect but it is know to cause panic - */ - if (so->so_gencnt != ogencnt) - goto drop; + + /* Point "inp" and "tp" in tandem to new socket */ + inp = (struct inpcb *)so2->so_pcb; + tp = intotcpcb(inp); oso = so; tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */ @@ -1095,8 +1186,6 @@ findpcb: so = so2; tcp_lock(so, 1, 0); /* - * This is ugly, but .... - * * Mark socket as temporary until we're * committed to keeping it. The code at * ``drop'' and ``dropwithreset'' check the @@ -1104,9 +1193,24 @@ findpcb: * socket created here should be discarded. * We mark the socket as discardable until * we're committed to it below in TCPS_LISTEN. + * There are some error conditions in which we + * have to drop the temporary socket. */ dropsocket++; - inp = (struct inpcb *)so->so_pcb; + /* + * Inherit INP_BOUND_IF from listener; testing if + * head_ifscope is non-zero is sufficient, since it + * can only be set to a non-zero value earlier if + * the listener has such a flag set. + */ +#if INET6 + if (head_ifscope != IFSCOPE_NONE && !isipv6) { +#else + if (head_ifscope != IFSCOPE_NONE) { +#endif /* INET6 */ + inp->inp_flags |= INP_BOUND_IF; + inp->inp_boundif = head_ifscope; + } #if INET6 if (isipv6) inp->in6p_laddr = ip6->ip6_dst; @@ -1114,7 +1218,7 @@ findpcb: inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif /* INET6 */ - inp->inp_laddr = ip->ip_dst; + inp->inp_laddr = ip->ip_dst; #if INET6 } #endif /* INET6 */ @@ -1135,30 +1239,6 @@ findpcb: tcp_unlock(oso, 1, 0); goto drop; } -#if IPSEC - /* - * To avoid creating incorrectly cached IPsec - * association, this is need to be done here. - * - * Subject: (KAME-snap 748) - * From: Wayne Knowles - * ftp://ftp.kame.net/pub/mail-list/snap-users/748 - */ - if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { - /* - * Note: dropwithreset makes sure we don't - * send a RST in response to a RST. - */ - tcp_lock(oso, 0, 0); /* release ref on parent */ - tcp_unlock(oso, 1, 0); - if (thflags & TH_ACK) { - tcpstat.tcps_badsyn++; - rstreason = BANDLIM_RST_OPENPORT; - goto dropwithreset; - } - goto drop; - } -#endif #if INET6 if (isipv6) { /* @@ -1196,15 +1276,19 @@ findpcb: printf("tcp_input: could not copy policy\n"); } #endif - tcp_unlock(oso, 1, 0); /* now drop the reference on the listener */ - tp = intotcpcb(inp); + /* inherit states from the listener */ tp->t_state = TCPS_LISTEN; tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY); + tp->t_keepinit = tp0->t_keepinit; tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; + + /* now drop the reference on the listener */ + tcp_unlock(oso, 1, 0); + /* Compute proper scaling value from buffer space */ if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) { tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); - so->so_rcv.sb_hiwat = lmin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES); + so->so_rcv.sb_hiwat = imin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES); } else { while (tp->request_r_scale < TCP_MAX_WINSHIFT && @@ -1216,7 +1300,6 @@ findpcb: KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } } - #if 1 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); #endif @@ -1265,7 +1348,7 @@ findpcb: if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) { char ipstrbuf[MAX_IPv6_STR_LEN]; printf("too many small tcp packets from " - "%s:%u, av. %lubyte/packet, " + "%s:%u, av. %ubyte/packet, " "dropping connection\n", #if INET6 isipv6 ? @@ -1289,7 +1372,7 @@ findpcb: } #if TRAFFIC_MGT - if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND) { + if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE) { tcpstat.tcps_bg_rcvtotal++; /* Take snapshots of pkts recv; @@ -1344,7 +1427,7 @@ findpcb: * else do it below (after getting remote address). */ if (tp->t_state != TCPS_LISTEN && optp) - tcp_dooptions(tp, optp, optlen, th, &to); + tcp_dooptions(tp, optp, optlen, th, &to, ifscope); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if (to.to_flags & TOF_SCALE) { @@ -1359,7 +1442,7 @@ findpcb: tp->ts_recent_age = tcp_now; } if (to.to_flags & TOF_MSS) - tcp_mss(tp, to.to_mss); + tcp_mss(tp, to.to_mss, ifscope); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; @@ -1406,6 +1489,11 @@ findpcb: tp->ts_recent = to.to_tsval; } + /* Force acknowledgment if we received a FIN */ + + if (thflags & TH_FIN) + tp->t_flags |= TF_ACKNOW; + if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && @@ -1462,6 +1550,13 @@ findpcb: * Grow the congestion window, if the * connection is cwnd bound. */ + if (tp->snd_cwnd < tp->snd_wnd) { + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked > tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + tp->snd_cwnd += tp->t_maxseg; + } + } sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) @@ -1583,12 +1678,7 @@ findpcb: switch (tp->t_state) { /* - * If the state is LISTEN then ignore segment if it contains an RST. - * If the segment contains an ACK then it is bad and send a RST. - * If it does not contain a SYN then it is not interesting; drop it. - * If it is from this socket, drop it, it must be forged. - * Don't bother responding if the destination was a broadcast. - * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * Initialize tp->rcv_nxt, and tp->irs, select an initial * tp->iss, and send a segment: * * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. @@ -1605,47 +1695,6 @@ findpcb: #if 1 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); #endif - if (thflags & TH_RST) - goto drop; - if (thflags & TH_ACK) { - rstreason = BANDLIM_RST_OPENPORT; - goto dropwithreset; - } - if ((thflags & TH_SYN) == 0) - goto drop; - if (th->th_dport == th->th_sport) { -#if INET6 - if (isipv6) { - if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, - &ip6->ip6_src)) - goto drop; - } else -#endif /* INET6 */ - if (ip->ip_dst.s_addr == ip->ip_src.s_addr) - goto drop; - } - /* - * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN - * in_broadcast() should never return true on a received - * packet with M_BCAST not set. - * - * Packets with a multicast source address should also - * be discarded. - */ - if (m->m_flags & (M_BCAST|M_MCAST)) - goto drop; -#if INET6 - if (isipv6) { - if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || - IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) - goto drop; - } else -#endif - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || - ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) - goto drop; #if INET6 if (isipv6) { MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, @@ -1693,7 +1742,7 @@ findpcb: FREE(sin, M_SONAME); } - tcp_dooptions(tp, optp, optlen, th, &to); + tcp_dooptions(tp, optp, optlen, th, &to, ifscope); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) @@ -1720,13 +1769,20 @@ findpcb: tp->t_flags |= TF_ACKNOW; tp->t_unacksegs = 0; tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) { /* ECN-setup SYN */ tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); } +#if CONFIG_IFEF_NOWINDOWSCALE + if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL && + (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) { + /* Window scaling is not enabled on this interface */ + tp->t_flags &= ~TF_REQ_SCALE; + } +#endif goto trimthenstep6; } @@ -2390,40 +2446,71 @@ trimthenstep6: * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno || tp->sack_enable) { - if (IN_FASTRECOVERY(tp)) { + if (!IN_FASTRECOVERY(tp)) { + /* + * We were not in fast recovery. Reset the duplicate ack + * counter. + */ + tp->t_dupacks = 0; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + else { + if (tcp_do_newreno || tp->sack_enable) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. - */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; + tcp_newreno_partial_ack(tp, th); + } + else { + if (tcp_do_newreno) { + int32_t ss = tp->snd_max - th->th_ack; + + /* + * Complete ack. Inflate the congestion window to + * ssthresh and exit fast recovery. + * + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (ss < tp->snd_ssthresh) + tp->snd_cwnd = ss + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + } + else { + /* + * Clamp the congestion window to the crossover point + * and exit fast recovery. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + + EXIT_FASTRECOVERY(tp); + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; } } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + else { + /* + * Clamp the congestion window to the crossover point + * and exit fast recovery in non-newreno and non-SACK case. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + EXIT_FASTRECOVERY(tp); + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; + } } - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; + + /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. @@ -2543,7 +2630,8 @@ process_ACK: register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; - if ((acked > incr) && tcp_do_rfc3465) { + if (tcp_do_rfc3465) { + if (cw >= tp->snd_ssthresh) { tp->t_bytes_acked += acked; if (tp->t_bytes_acked >= cw) { @@ -2563,8 +2651,9 @@ process_ACK: */ u_int abc_lim; - abc_lim = (tcp_do_rfc3465 == 0) ? - incr : incr * 2; + abc_lim = (tcp_do_rfc3465_lim2 && + tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; + incr = lmin(acked, abc_lim); } } @@ -2577,7 +2666,7 @@ process_ACK: */ if (cw >= tp->snd_ssthresh) { - incr = incr * incr / cw; + incr = max((incr * incr / cw), 1); } } @@ -2639,8 +2728,9 @@ process_ACK: soisdisconnected(so); } tp->t_state = TCPS_FIN_WAIT_2; - goto drop; + /* fall through and make sure we also recognize data ACKed with the FIN */ } + tp->t_flags |= TF_ACKNOW; break; /* @@ -2655,7 +2745,7 @@ process_ACK: tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_starttime < (u_long)tcp_msl) + tp->t_starttime < (u_int32_t)tcp_msl) tp->t_timer[TCPT_2MSL] = tp->t_rxtcur * TCPTV_TWTRUNC; else @@ -2663,6 +2753,7 @@ process_ACK: add_to_time_wait(tp); soisdisconnected(so); } + tp->t_flags |= TF_ACKNOW; break; /* @@ -2758,7 +2849,7 @@ step6: * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ - if (th->th_urp <= (u_long)tlen + if (th->th_urp <= (u_int32_t)tlen #if SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif @@ -2783,7 +2874,7 @@ dodata: /* XXX */ * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((tlen || (thflags&TH_FIN)) && + if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_end = th->th_seq + tlen; @@ -2914,7 +3005,7 @@ dodata: /* XXX */ tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_starttime < (u_long)tcp_msl) { + tp->t_starttime < (u_int32_t)tcp_msl) { tp->t_timer[TCPT_2MSL] = tp->t_rxtcur * TCPTV_TWTRUNC; /* For transaction client, force ACK now. */ @@ -3028,13 +3119,13 @@ dropwithreset: if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, - TH_RST, m->m_pkthdr.rcvif); + TH_RST, ifscope); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, - (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rcvif); + (tcp_seq)0, TH_RST|TH_ACK, ifscope); } /* destroy temporarily created socket */ if (dropsocket) { @@ -3071,7 +3162,7 @@ drop: } static void -tcp_dooptions(tp, cp, cnt, th, to) +tcp_dooptions(tp, cp, cnt, th, to, input_ifscope) /* * Parse TCP options and place in tcpopt. */ @@ -3080,6 +3171,7 @@ tcp_dooptions(tp, cp, cnt, th, to) int cnt; struct tcphdr *th; struct tcpopt *to; + unsigned int input_ifscope; { u_short mss = 0; int opt, optlen; @@ -3108,7 +3200,11 @@ tcp_dooptions(tp, cp, cnt, th, to) if (!(th->th_flags & TH_SYN)) continue; bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(mss); +#endif + break; case TCPOPT_WINDOW: @@ -3126,10 +3222,17 @@ tcp_dooptions(tp, cp, cnt, th, to) to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); + +#if BYTE_ORDER != BIG_ENDIAN NTOHL(to->to_tsval); +#endif + bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + +#if BYTE_ORDER != BIG_ENDIAN NTOHL(to->to_tsecr); +#endif /* * A timestamp received in a SYN makes @@ -3159,7 +3262,7 @@ tcp_dooptions(tp, cp, cnt, th, to) } } if (th->th_flags & TH_SYN) - tcp_mss(tp, mss); /* sets t_maxseg */ + tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */ } /* @@ -3284,6 +3387,7 @@ tcp_maxmtu(struct rtentry *rt) { unsigned int maxmtu; + RT_LOCK_ASSERT_HELD(rt); if (rt->rt_rmx.rmx_mtu == 0) maxmtu = rt->rt_ifp->if_mtu; else @@ -3298,10 +3402,13 @@ tcp_maxmtu6(struct rtentry *rt) { unsigned int maxmtu; + RT_LOCK_ASSERT_HELD(rt); + lck_rw_lock_shared(nd_if_rwlock); if (rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(rt->rt_ifp); else maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp)); + lck_rw_done(nd_if_rwlock); return (maxmtu); } @@ -3333,19 +3440,20 @@ tcp_maxmtu6(struct rtentry *rt) * */ void -tcp_mss(tp, offer) +tcp_mss(tp, offer, input_ifscope) struct tcpcb *tp; int offer; + unsigned int input_ifscope; { register struct rtentry *rt; struct ifnet *ifp; register int rtt, mss; - u_long bufsize; + u_int32_t bufsize; struct inpcb *inp; struct socket *so; struct rmxp_tao *taop; int origoffer = offer; - u_long sb_max_corrected; + u_int32_t sb_max_corrected; int isnetlocal = 0; #if INET6 int isipv6; @@ -3360,19 +3468,23 @@ tcp_mss(tp, offer) #else #define min_protoh (sizeof (struct tcpiphdr)) #endif - lck_mtx_lock(rt_mtx); + #if INET6 if (isipv6) { rt = tcp_rtlookup6(inp); - if (rt && (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || rt->rt_gateway->sa_family == AF_LINK)) - isnetlocal = TRUE; + if (rt != NULL && + (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || + IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || + rt->rt_gateway->sa_family == AF_LINK)) + isnetlocal = TRUE; } else #endif /* INET6 */ { - rt = tcp_rtlookup(inp); - if (rt && (rt->rt_gateway->sa_family == AF_LINK || - rt->rt_ifp->if_flags & IFF_LOOPBACK)) + rt = tcp_rtlookup(inp, input_ifscope); + if (rt != NULL && + (rt->rt_gateway->sa_family == AF_LINK || + rt->rt_ifp->if_flags & IFF_LOOPBACK)) isnetlocal = TRUE; } if (rt == NULL) { @@ -3381,7 +3493,6 @@ tcp_mss(tp, offer) isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; - lck_mtx_unlock(rt_mtx); return; } ifp = rt->rt_ifp; @@ -3558,8 +3669,12 @@ tcp_mss(tp, offer) */ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); tcpstat.tcps_usedssthresh++; + } else { + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; } - lck_mtx_unlock(rt_mtx); + + /* Route locked during lookup above */ + RT_UNLOCK(rt); } /* @@ -3583,15 +3698,14 @@ tcp_mssopt(tp) #else #define min_protoh (sizeof (struct tcpiphdr)) #endif - lck_mtx_lock(rt_mtx); + #if INET6 if (isipv6) rt = tcp_rtlookup6(tp->t_inpcb); else #endif /* INET6 */ - rt = tcp_rtlookup(tp->t_inpcb); + rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE); if (rt == NULL) { - lck_mtx_unlock(rt_mtx); return ( #if INET6 isipv6 ? tcp_v6mssdflt : @@ -3614,7 +3728,8 @@ tcp_mssopt(tp) #else mss = tcp_maxmtu(rt); #endif - lck_mtx_unlock(rt_mtx); + /* Route locked during lookup above */ + RT_UNLOCK(rt); return (mss - min_protoh); } @@ -3630,7 +3745,7 @@ tcp_newreno_partial_ack(tp, th) struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; - u_long ocwnd = tp->snd_cwnd; + u_int32_t ocwnd = tp->snd_cwnd; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; @@ -3722,6 +3837,13 @@ tcp_dropdropablreq(struct socket *head) tcp_unlock(so, 1, 0); } } + else { + /* do not try to lock the inp in in_pcb_checkstate + * because the lock is already held in some other thread. + * Only drop the inp_wntcnt reference. + */ + in_pcb_checkstate(inp, WNT_RELEASE, 1); + } } so = sonext;