X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/de355530ae67247cbd0da700edb3a2a1dae884c2..743b15655a24ee3fe9f458f383003e011db0558f:/bsd/netinet/tcp_output.c?ds=inline diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index d3e5558d3..36e310fd1 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -113,7 +113,7 @@ int ss_fltsz = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, &ss_fltsz, 1, "Slow start flight size"); -int ss_fltsz_local = TCP_MAXWIN; /* something large */ +int ss_fltsz_local = 4; /* starts with four segments max */ SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, &ss_fltsz_local, 1, "Slow start flight size for local networks"); @@ -121,7 +121,14 @@ int tcp_do_newreno = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, 0, "Enable NewReno Algorithms"); -struct mbuf *m_copym_with_hdrs __P((struct mbuf*, int, int, int, struct mbuf**, int*)); +int tcp_packet_chaining = 50; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining, + 0, "Enable TCP output packet chaining"); + +struct mbuf *m_copym_with_hdrs(struct mbuf*, int, int, int, struct mbuf**, int*); +static long packchain_newlist = 0; +static long packchain_looped = 0; +static long packchain_sent = 0; /* temporary: for testing */ @@ -129,6 +136,28 @@ struct mbuf *m_copym_with_hdrs __P((struct mbuf*, int, int, int, struct mbuf**, extern int ipsec_bypass; #endif +extern int slowlink_wsize; /* window correction for slow links */ +extern u_long route_generation; +extern int fw_enable; /* firewall is on: disable packet chaining */ +extern int ipsec_bypass; + +extern vm_size_t so_cache_zone_element_size; + +static __inline__ u_int16_t +get_socket_id(struct socket * s) +{ + u_int16_t val; + + if (so_cache_zone_element_size == 0) { + return (0); + } + val = (u_int16_t)(((u_int32_t)s) / so_cache_zone_element_size); + if (val == 0) { + val = 0xffff; + } + 
return (val); +} + /* * Tcp output routine: figure out what should be sent and send it. */ @@ -148,39 +177,23 @@ tcp_output(tp) register struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; - int idle, sendalot; + int idle, sendalot, howmuchsent = 0; int maxburst = TCP_MAXBURST; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; -#if INET6 - int isipv6; -#endif - int last_off; + int last_off = 0; int m_off; struct mbuf *m_last = 0; struct mbuf *m_head = 0; - - - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + struct mbuf *packetlist = 0; + struct mbuf *lastpacket = 0; #if INET6 - if (isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)) { - - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | - (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), - 0,0,0); - } - else + int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ; #endif + short packchain_listadd = 0; + u_int16_t socket_id = get_socket_id(so); + - { - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | - (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), - 0,0,0); - } /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -188,11 +201,7 @@ tcp_output(tp) * to send, then transmit; otherwise, investigate further. 
*/ idle = (tp->snd_max == tp->snd_una); -#ifdef __APPLE__ if (idle && tp->t_rcvtime >= tp->t_rxtcur) { -#else - if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { -#endif /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- @@ -215,10 +224,74 @@ tcp_output(tp) else tp->snd_cwnd = tp->t_maxseg * ss_fltsz; } + again: + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + +#if INET6 + if (isipv6) { + + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + sendalot,0,0); + } + else +#endif + + { + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + sendalot,0,0); + /* + * If the route generation id changed, we need to check that our + * local (source) IP address is still valid. If it isn't either + * return error or silently do nothing (assuming the address will + * come back before the TCP connection times out). 
+ */ + + if ((tp->t_inpcb->inp_route.ro_rt != NULL && + (tp->t_inpcb->inp_route.ro_rt->generation_id != route_generation)) || (tp->t_inpcb->inp_route.ro_rt == NULL)) { + /* check that the source address is still valid */ + if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == 0) { + if (tp->t_state >= TCPS_CLOSE_WAIT) { + tcp_close(tp); + return(EADDRNOTAVAIL); + } + + /* set Retransmit timer if it wasn't set + * reset Persist timer and shift register as the + * advertised peer window may not be valid anymore + */ + + if (!tp->t_timer[TCPT_REXMT]) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = 0; + } + if (so->so_flags & SOF_NOADDRAVAIL) + return(EADDRNOTAVAIL); + else + return(0); /* silently ignore and keep data in socket */ + } + } + } sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) + win = min(win, slowlink_wsize); flags = tcp_outflags[tp->t_state]; /* @@ -280,6 +353,11 @@ again: off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT && taop->tao_ccsent == 0) { + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = 0; + } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } @@ -320,12 +398,16 @@ again: } if (len > tp->t_maxseg) { len = tp->t_maxseg; + howmuchsent += len; sendalot = 1; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; - win = sbspace(&so->so_rcv); + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0 ) /* Clips window size for slow links */ + win = min(sbspace(&so->so_rcv), slowlink_wsize); + else + 
win = sbspace(&so->so_rcv); /* * Sender silly window avoidance. If connection is idle @@ -423,8 +505,13 @@ again: } /* - * No reason to send a segment, just return. + * If there is no reason to send a segment, just return. + * but if there are some packets left in the packet list, send them now. */ + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); @@ -588,6 +675,7 @@ send: */ flags &= ~TH_FIN; len = tp->t_maxopd - optlen - ipoptlen; + howmuchsent += len; sendalot = 1; } @@ -668,6 +756,12 @@ send: m->m_data += max_linkhdr; m->m_len = hdrlen; } + /* makes sure we still have data left to be sent at this point */ + if (so->so_snd.sb_mb == NULL || off == -1) { + if (m != NULL) m_freem(m); + error = 0; /* should we return an error? */ + goto out; + } m_copydata(so->so_snd.sb_mb, off, (int) len, mtod(m, caddr_t) + hdrlen); m->m_len += len; @@ -694,7 +788,13 @@ send: m_last = NULL; last_off = off + len; m_head = so->so_snd.sb_mb; - + + /* makes sure we still have data left to be sent at this point */ + if (m_head == NULL) { + error = 0; /* should we return an error? 
*/ + goto out; + } + /* * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that * it acted on to fullfill the current request, whether a valid 'hint' was passed in or not @@ -740,7 +840,7 @@ send: m->m_data += max_linkhdr; m->m_len = hdrlen; } - m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.rcvif = 0; #if INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); @@ -795,9 +895,31 @@ send: win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); - if (win > (long)TCP_MAXWIN << tp->rcv_scale) + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { + if (win > (long)slowlink_wsize) + win = slowlink_wsize; + th->th_win = htons((u_short) (win>>tp->rcv_scale)); + } + else { + + if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; - th->th_win = htons((u_short) (win>>tp->rcv_scale)); + th->th_win = htons((u_short) (win>>tp->rcv_scale)); + } + + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised + * a 0 window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. This can occur if the receiver is attempting + * to read more data than can be buffered prior to transmitting on + * the connection. 
+ */ + if (win == 0) + tp->t_flags |= TF_RXWIN0SENT; + else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; @@ -928,17 +1050,18 @@ send: goto out; } #endif /*IPSEC*/ + m->m_pkthdr.socket_id = socket_id; error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->in6p_route, - (so->so_options & SO_DONTROUTE), NULL, NULL); + (so->so_options & SO_DONTROUTE), NULL, NULL, 0); } else #endif /* INET6 */ { struct rtentry *rt; ip->ip_len = m->m_pkthdr.len; #if INET6 - if (INP_CHECK_SOCKAF(so, AF_INET6)) + if (isipv6) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, tp->t_inpcb->in6p_route.ro_rt ? tp->t_inpcb->in6p_route.ro_rt->rt_ifp @@ -984,9 +1107,49 @@ send: if (ipsec_bypass == 0) ipsec_setsocket(m, so); #endif /*IPSEC*/ - error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, - (so->so_options & SO_DONTROUTE), 0); - } + + /* + * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active. 
+ */ + + m->m_pkthdr.socket_id = socket_id; + if (packetlist) { + m->m_nextpkt = NULL; + lastpacket->m_nextpkt = m; + lastpacket = m; + packchain_listadd++; + } + else { + m->m_nextpkt = NULL; + packchain_newlist++; + packetlist = lastpacket = m; + packchain_listadd=0; + } + + if ((ipsec_bypass == 0) || fw_enable || sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) || + (tp->snd_cwnd <= (tp->snd_wnd / 4)) || + (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 || + packchain_listadd >= tcp_packet_chaining) { + lastpacket->m_nextpkt = 0; + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = packchain_listadd; + packchain_sent++; + packetlist = NULL; + if (error == 0) + howmuchsent = 0; + } + else { + error = 0; + packchain_looped++; + tcpstat.tcps_sndtotal++; + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + goto again; + } + } if (error) { /* @@ -998,15 +1161,19 @@ send: * No need to check for TH_FIN here because * the TF_SENTFIN flag handles that case. */ - if ((flags & TH_SYN) == 0) - tp->snd_nxt -= len; + if ((flags & TH_SYN) == 0) + tp->snd_nxt -= howmuchsent; } + howmuchsent = 0; out: if (error == ENOBUFS) { if (!tp->t_timer[TCPT_REXMT] && !tp->t_timer[TCPT_PERSIST]) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; tcp_quench(tp->t_inpcb, 0); + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } @@ -1018,18 +1185,28 @@ out: * not do so here. 
*/ tcp_mtudisc(tp->t_inpcb, 0); + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (error); } +sentit: tcpstat.tcps_sndtotal++; /* @@ -1042,9 +1219,10 @@ out: tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); - if (sendalot) + + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); + if (sendalot && (!tcp_do_newreno || --maxburst)) goto again; - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } @@ -1053,7 +1231,6 @@ tcp_setpersist(tp) register struct tcpcb *tp; { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; - int tt; if (tp->t_timer[TCPT_REXMT]) panic("tcp_setpersist: retransmit pending");