X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/1c79356b52d46aa6b508fb032f5ae709b1f2897b..743b15655a24ee3fe9f458f383003e011db0558f:/bsd/netinet/tcp_output.c diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 14535b6bb..36e310fd1 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -52,17 +52,16 @@ * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $ */ -#if ISFB31 -#include "opt_tcpdebug.h" -#endif #define _IP_VHL -#include #include #include +#include +#include #include #include #include @@ -74,13 +73,13 @@ #include #include #include +#include #include #if INET6 +#include #include -#include #include #endif -#include #include #define TCPOUTFLAGS #include @@ -93,6 +92,10 @@ #endif #include +#if IPSEC +#include +#endif /*IPSEC*/ + #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) @@ -102,6 +105,58 @@ extern struct mbuf *m_copypack(); #endif +static int path_mtu_discovery = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, + &path_mtu_discovery, 1, "Enable Path MTU Discovery"); + +int ss_fltsz = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz, 1, "Slow start flight size"); + +int ss_fltsz_local = 4; /* starts with four segments max */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz_local, 1, "Slow start flight size for local networks"); + +int tcp_do_newreno = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, + 0, "Enable NewReno Algorithms"); + +int tcp_packet_chaining = 50; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining, + 0, "Enable TCP output packet chaining"); + +struct mbuf *m_copym_with_hdrs(struct mbuf*, int, int, int, struct mbuf**, int*); 
+static long packchain_newlist = 0; +static long packchain_looped = 0; +static long packchain_sent = 0; + + +/* temporary: for testing */ +#if IPSEC +extern int ipsec_bypass; +#endif + +extern int slowlink_wsize; /* window correction for slow links */ +extern u_long route_generation; +extern int fw_enable; /* firewall is on: disable packet chaining */ +extern int ipsec_bypass; + +extern vm_size_t so_cache_zone_element_size; + +static __inline__ u_int16_t +get_socket_id(struct socket * s) +{ + u_int16_t val; + + if (so_cache_zone_element_size == 0) { + return (0); + } + val = (u_int16_t)(((u_int32_t)s) / so_cache_zone_element_size); + if (val == 0) { + val = 0xffff; + } + return (val); +} /* * Tcp output routine: figure out what should be sent and send it. @@ -115,26 +170,29 @@ tcp_output(tp) int off, flags, error; register struct mbuf *m; struct ip *ip = NULL; - struct ipovly *ipov = NULL; + register struct ipovly *ipov = NULL; #if INET6 struct ip6_hdr *ip6 = NULL; #endif /* INET6 */ - struct tcphdr *th; + register struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; - int idle, sendalot; + int idle, sendalot, howmuchsent = 0; + int maxburst = TCP_MAXBURST; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; + int last_off = 0; + int m_off; + struct mbuf *m_last = 0; + struct mbuf *m_head = 0; + struct mbuf *packetlist = 0; + struct mbuf *lastpacket = 0; #if INET6 - int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; + int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ; #endif + short packchain_listadd = 0; + u_int16_t socket_id = get_socket_id(so); - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_template->th_dport << 16) | tp->t_template->th_sport), - (((tp->t_template->th_src.s_addr & 0xffff) << 16) | - (tp->t_template->th_dst.s_addr & 0xffff)), - 0,0,0); /* * Determine length of data that should be transmitted, @@ -143,17 +201,97 @@ tcp_output(tp) * to send, then transmit; 
otherwise, investigate further. */ idle = (tp->snd_max == tp->snd_una); - if (idle && tp->t_idle >= tp->t_rxtcur) + if (idle && tp->t_rcvtime >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. - */ - tp->snd_cwnd = tp->t_maxseg; + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + */ + if ( +#if INET6 + (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(tp->t_inpcb->inp_faddr) +#if INET6 + ) +#endif + ) + tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * ss_fltsz; + } + again: + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + +#if INET6 + if (isipv6) { + + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + sendalot,0,0); + } + else +#endif + + { + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + sendalot,0,0); + /* + * If the route generation id changed, we need to check that our + * local (source) IP address is still valid. If it isn't either + * return error or silently do nothing (assuming the address will + * come back before the TCP connection times out). 
+ */ + + if ((tp->t_inpcb->inp_route.ro_rt != NULL && + (tp->t_inpcb->inp_route.ro_rt->generation_id != route_generation)) || (tp->t_inpcb->inp_route.ro_rt == NULL)) { + /* check that the source address is still valid */ + if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == 0) { + if (tp->t_state >= TCPS_CLOSE_WAIT) { + tcp_close(tp); + return(EADDRNOTAVAIL); + } + + /* set Retransmit timer if it wasn't set + * reset Persist timer and shift register as the + * advertised peer window may not be valid anymore + */ + + if (!tp->t_timer[TCPT_REXMT]) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = 0; + } + if (so->so_flags & SOF_NOADDRAVAIL) + return(EADDRNOTAVAIL); + else + return(0); /* silently ignore and keep data in socket */ + } + } + } sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) + win = min(win, slowlink_wsize); flags = tcp_outflags[tp->t_state]; /* @@ -215,6 +353,11 @@ again: off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT && taop->tao_ccsent == 0) { + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = 0; + } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } @@ -255,12 +398,16 @@ again: } if (len > tp->t_maxseg) { len = tp->t_maxseg; + howmuchsent += len; sendalot = 1; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; - win = sbspace(&so->so_rcv); + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0 ) /* Clips window size for slow links */ + win = min(sbspace(&so->so_rcv), slowlink_wsize); + else + 
win = sbspace(&so->so_rcv); /* * Sender silly window avoidance. If connection is idle @@ -358,8 +505,13 @@ again: } /* - * No reason to send a segment, just return. + * If there is no reason to send a segment, just return. + * but if there is some packets left in the packet list, send them now. */ + if (packetlist) { + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); @@ -370,12 +522,12 @@ send: * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. - * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN + * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #if INET6 if (isipv6) - hdrlen = sizeof (struct tcpip6hdr); + hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); @@ -386,7 +538,7 @@ send: opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; - mss = htons((u_short) tcp_mssopt(tp, isipv6)); + mss = htons((u_short) tcp_mssopt(tp)); (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; @@ -498,18 +650,17 @@ send: ipoptlen = ip6_optlen(tp->t_inpcb); else #endif - if (tp->t_inpcb->inp_options) { - ipoptlen = tp->t_inpcb->inp_options->m_len - + { + if (tp->t_inpcb->inp_options) { + ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); - } else { - ipoptlen = 0; + } else { + ipoptlen = 0; + } } #if IPSEC -#if INET6 - ipoptlen += ipsec_hdrsiz_tcp(tp, isipv6); -#else - ipoptlen += ipsec_hdrsiz_tcp(tp, 0); -#endif + if (ipsec_bypass == 0) + ipoptlen += ipsec_hdrsiz_tcp(tp); #endif /* @@ -524,12 +675,18 @@ send: */ flags &= ~TH_FIN; len = tp->t_maxopd - optlen - ipoptlen; + howmuchsent += len; sendalot = 1; } /*#ifdef DIAGNOSTIC*/ +#if INET6 + if (max_linkhdr + hdrlen > MCLBYTES) + panic("tcphdr too 
big"); +#else if (max_linkhdr + hdrlen > MHLEN) panic("tcphdr too big"); +#endif /*#endif*/ /* @@ -559,33 +716,95 @@ send: m->m_len += hdrlen; m->m_data -= hdrlen; #else - MGETHDR(m, M_DONTWAIT, MT_HEADER); - if (m == NULL) { - error = ENOBUFS; - goto out; - } + /* + * try to use the new interface that allocates all + * the necessary mbuf hdrs under 1 mbuf lock and + * avoids rescanning the socket mbuf list if + * certain conditions are met. This routine can't + * be used in the following cases... + * 1) the protocol headers exceed the capacity of + * of a single mbuf header's data area (no cluster attached) + * 2) the length of the data being transmitted plus + * the protocol headers fits into a single mbuf header's + * data area (no cluster attached) + */ + m = NULL; #if INET6 if (MHLEN < hdrlen + max_linkhdr) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); error = ENOBUFS; goto out; } + m->m_data += max_linkhdr; + m->m_len = hdrlen; } #endif - m->m_data += max_linkhdr; - m->m_len = hdrlen; if (len <= MHLEN - hdrlen - max_linkhdr) { + if (m == NULL) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + /* makes sure we still have data left to be sent at this point */ + if (so->so_snd.sb_mb == NULL || off == -1) { + if (m != NULL) m_freem(m); + error = 0; /* should we return an error? 
*/ + goto out; + } m_copydata(so->so_snd.sb_mb, off, (int) len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { - m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); - if (m->m_next == 0) { - (void) m_free(m); - error = ENOBUFS; - goto out; + if (m != NULL) { + m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; + goto out; + } + } else { + /* + * determine whether the mbuf pointer and offset passed back by the 'last' call + * to m_copym_with_hdrs are still valid... if the head of the socket chain has + * changed (due to an incoming ACK for instance), or the offset into the chain we + * just computed is different from the one last returned by m_copym_with_hdrs (perhaps + * we're re-transmitting a packet sent earlier), then we can't pass the mbuf pointer and + * offset into it as valid hints for m_copym_with_hdrs to use (if valid, these hints allow + * m_copym_with_hdrs to avoid rescanning from the beginning of the socket buffer mbuf list). + * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism. + */ + if (m_head != so->so_snd.sb_mb || last_off != off) + m_last = NULL; + last_off = off + len; + m_head = so->so_snd.sb_mb; + + /* makes sure we still have data left to be sent at this point */ + if (m_head == NULL) { + error = 0; /* should we return an error? 
*/ + goto out; + } + + /* + * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that + * it acted on to fulfill the current request, whether a valid 'hint' was passed in or not + */ + if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, (int) len, M_DONTWAIT, &m_last, &m_off)) == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; } } #endif @@ -613,35 +832,29 @@ send: goto out; } #if INET6 - if (isipv6) { + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } - m->m_pkthdr.rcvif = (struct ifnet *)0; - if (tp->t_template == 0) - panic("tcp_output"); + m->m_pkthdr.rcvif = 0; #if INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); - bcopy((caddr_t)&tp->t_template->tt_i6, (caddr_t)ip6, - sizeof(struct ip6_hdr)); - bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, - sizeof(struct tcphdr)); - } else { + tcp_fillheaders(tp, ip6, th); + } else #endif /* INET6 */ - ip = mtod(m, struct ip *); - ipov = (struct ipovly *)ip; - th = (struct tcphdr *)(ip + 1); - bcopy((caddr_t)&tp->t_template->tt_i, (caddr_t)ip, sizeof(struct ip)); - bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, - sizeof(struct tcphdr)); -#if INET6 + { + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)(ip + 1); + /* this picks up the pseudo header (w/o the length) */ + tcp_fillheaders(tp, ip, th); } -#endif /* INET6 */ /* * Fill in fields, remembering maximum advertised @@ -682,9 +895,31 @@ send: win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); - if (win > (long)TCP_MAXWIN << tp->rcv_scale) + if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { + if (win > (long)slowlink_wsize) + win = slowlink_wsize; + th->th_win = htons((u_short) (win>>tp->rcv_scale)); + } + else { + + if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = 
(long)TCP_MAXWIN << tp->rcv_scale; - th->th_win = htons((u_short) (win>>tp->rcv_scale)); + th->th_win = htons((u_short) (win>>tp->rcv_scale)); + } + + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised + * a 0 window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. This can occur if the receiver is attempting + * to read more data than can be buffered prior to transmitting on + * the connection. + */ + if (win == 0) + tp->t_flags |= TF_RXWIN0SENT; + else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; @@ -701,25 +936,28 @@ send: * Put TCP length in extended header, and then * checksum extended header and data. */ - m->m_pkthdr.len = hdrlen + len; + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ #if INET6 - if (isipv6) { -#if 0 /* ip6_plen will be filled in ip6_output. */ - ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) + - optlen + len)); -#endif - + if (isipv6) + /* + * ip6_plen does not need to be filled in now; it will be filled + * in ip6_output. 
+ */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen + len); - } else { -#endif /* INET6 */ - if (len + optlen) - ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) + - optlen + len)); - th->th_sum = in_cksum(m, (int)(hdrlen + len)); -#if INET6 - } + else #endif /* INET6 */ + { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + if (len + optlen) + th->th_sum = in_addword(th->th_sum, + htons((u_short)(optlen + len))); + + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __FUNCTION__, ip->ip_v)); + } /* * In transmit state, time the transmission and arrange for @@ -746,8 +984,8 @@ send: * Time this transmission if not a retransmission and * not currently timing anything. */ - if (tp->t_rtt == 0) { - tp->t_rtt = 1; + if (tp->t_rtttime == 0) { + tp->t_rtttime = 1; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } @@ -777,23 +1015,9 @@ send: /* * Trace. */ - if (so->so_options & SO_DEBUG) { -#if INET6 - if (isipv6) - ip6->ip6_vfc = IPV6_VERSION; - else - ip->ip_vhl = IP_MAKE_VHL(IPVERSION, - IP_VHL_HL(ip->ip_vhl)); -#endif /* INET6 */ - tcp_trace(TA_OUTPUT, tp->t_state, tp, -#if INET6 - isipv6 ? (void *)ip6 : -#endif /* INET6 */ - ip, - th, 0); - - } -#endif /* TCPDEBUG */ + if (so->so_options & SO_DEBUG) + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); +#endif /* * Fill in IP length and desired time to live and @@ -801,9 +1025,13 @@ send: * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ + /* + * m->m_pkthdr.len should have been set before cksum calculation, + * because in6_cksum() needs it. + */ #if INET6 if (isipv6) { - /* + /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. 
* Also, desired default hop limit might be changed via @@ -816,37 +1044,52 @@ send: /* TODO: IPv6 IP6TOS_ECT bit on */ #if IPSEC - ipsec_setsocket(m, so); + if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) { + m_freem(m); + error = ENOBUFS; + goto out; + } #endif /*IPSEC*/ + m->m_pkthdr.socket_id = socket_id; error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->in6p_route, - (so->so_options & SO_DONTROUTE) /* | IP6_DONTFRAG */, - NULL, NULL); + (so->so_options & SO_DONTROUTE), NULL, NULL, 0); } else #endif /* INET6 */ - { -#if 1 + { struct rtentry *rt; -#endif ip->ip_len = m->m_pkthdr.len; #if INET6 - if (INP_CHECK_SOCKAF(so, AF_INET6)) - ip->ip_ttl = in6_selecthlim(tp->t_inpcb, - tp->t_inpcb->in6p_route.ro_rt ? - tp->t_inpcb->in6p_route.ro_rt->rt_ifp - : NULL); - else + if (isipv6) + ip->ip_ttl = in6_selecthlim(tp->t_inpcb, + tp->t_inpcb->in6p_route.ro_rt ? + tp->t_inpcb->in6p_route.ro_rt->rt_ifp + : NULL); + else #endif /* INET6 */ ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ - KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), - (((th->th_src.s_addr & 0xffff) << 16) | (th->th_dst.s_addr & 0xffff)), - th->th_seq, th->th_ack, th->th_win); +#if INET6 + if (isipv6) { + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + 0,0,0); + } + else +#endif + { + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + 0,0,0); + } -#if 1 /* * See if we should do MTU discovery. 
We do it only if the following * are true: @@ -854,28 +1097,86 @@ send: * 2) the MTU is not locked (if it is, then discovery has been * disabled) */ - if ((rt = tp->t_inpcb->inp_route.ro_rt) + if (path_mtu_discovery + && (rt = tp->t_inpcb->inp_route.ro_rt) && rt->rt_flags & RTF_UP && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { ip->ip_off |= IP_DF; } -#endif - #if IPSEC - ipsec_setsocket(m, so); + if (ipsec_bypass == 0) + ipsec_setsocket(m, so); #endif /*IPSEC*/ - error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, - so->so_options & SO_DONTROUTE, 0); - } + /* + * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active. + */ + + m->m_pkthdr.socket_id = socket_id; + if (packetlist) { + m->m_nextpkt = NULL; + lastpacket->m_nextpkt = m; + lastpacket = m; + packchain_listadd++; + } + else { + m->m_nextpkt = NULL; + packchain_newlist++; + packetlist = lastpacket = m; + packchain_listadd=0; + } + + if ((ipsec_bypass == 0) || fw_enable || sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) || + (tp->snd_cwnd <= (tp->snd_wnd / 4)) || + (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 || + packchain_listadd >= tcp_packet_chaining) { + lastpacket->m_nextpkt = 0; + error = ip_output_list(packetlist, packchain_listadd, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + tp->t_lastchain = packchain_listadd; + packchain_sent++; + packetlist = NULL; + if (error == 0) + howmuchsent = 0; + } + else { + error = 0; + packchain_looped++; + tcpstat.tcps_sndtotal++; + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + goto again; + } + } if (error) { + + /* + * We know that the packet was lost, so back out the + * sequence number advance, if any. 
+ */ + if (tp->t_force == 0 || !tp->t_timer[TCPT_PERSIST]) { + /* + * No need to check for TH_FIN here because + * the TF_SENTFIN flag handles that case. + */ + if ((flags & TH_SYN) == 0) + tp->snd_nxt -= howmuchsent; + } + howmuchsent = 0; out: if (error == ENOBUFS) { + if (!tp->t_timer[TCPT_REXMT] && + !tp->t_timer[TCPT_PERSIST]) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; tcp_quench(tp->t_inpcb, 0); + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } -#if 1 if (error == EMSGSIZE) { /* * ip_output() will have already fixed the route @@ -884,19 +1185,28 @@ out: * not do so here. */ tcp_mtudisc(tp->t_inpcb, 0); + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } -#endif if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } + if (packetlist) + m_freem_list(packetlist); + tp->t_lastchain = 0; KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (error); } +sentit: tcpstat.tcps_sndtotal++; /* @@ -909,9 +1219,10 @@ out: tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); - if (sendalot) + + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); + if (sendalot && (!tcp_do_newreno || --maxburst)) goto again; - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } @@ -919,10 +1230,10 @@ void tcp_setpersist(tp) register struct tcpcb *tp; { - register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; if (tp->t_timer[TCPT_REXMT]) - panic("tcp_output REXMT"); + panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */