+ } else if (to->to_tsecr != 0) {
+ /*
+ * If the timestamp shows that one RTT has
+ * completed, we can stop counting the
+ * bytes. Here we consider increasing
+ * the socket buffer if the bandwidth measured in
+ * last rtt, is more than half of sb_hiwat, this will
+ * help to scale the buffer according to the bandwidth
+ * on the link.
+ */
+ if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
+ if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
+ (sbrcv->sb_hiwat >> 1))) {
+ int32_t rcvbuf_inc, min_incr;
+ /*
+ * Increment the receive window by a
+ * multiple of maximum sized segments.
+ * This will prevent a connection from
+ * sending smaller segments on wire if it
+ * is limited by the receive window.
+ *
+ * Set the ideal size based on current
+ * bandwidth measurements. We set the
+ * ideal size on receive socket buffer to
+ * be twice the bandwidth delay product.
+ */
+ rcvbuf_inc = (tp->rfbuf_cnt << 1)
+ - sbrcv->sb_hiwat;
+
+ /*
+ * Make the increment equal to 8 segments
+ * at least
+ */
+ min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+ if (rcvbuf_inc < min_incr)
+ rcvbuf_inc = min_incr;
+
+ rcvbuf_inc =
+ (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
+ tcp_sbrcv_reserve(tp, sbrcv,
+ sbrcv->sb_hiwat + rcvbuf_inc,
+ (tp->rfbuf_cnt * 2));
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ }
+out:
+ /* Restart the measurement */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ return;
+}
+
+ /*
+  * Give back receive socket buffer space that was added on top of the
+  * ideal size (for instance to help a slow-reading application).
+  *
+  * The ideal size is either derived from the measured link bandwidth or
+  * set by the application; this routine moves sb_hiwat back toward it.
+  *
+  * tp:    connection whose rcv_adv, rcv_nxt and t_maxseg are consulted
+  * sbrcv: receive socket buffer to shrink
+  *
+  * Nothing happens unless tcp_do_autorcvbuf is 1, an ideal size is
+  * recorded and sb_hiwat currently exceeds it.
+  */
+ void
+ tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
+     u_int32_t excess, adv_window, keep;
+     int32_t shrink;
+
+     /* Auto-sizing disabled or no ideal size recorded: leave it alone */
+     if (tcp_do_autorcvbuf != 1 || !(sbrcv->sb_idealsize > 0))
+         return;
+
+     /* Buffer is not larger than its ideal size */
+     if (!(sbrcv->sb_hiwat > sbrcv->sb_idealsize))
+         return;
+
+     /* Amount by which the buffer currently exceeds the ideal size */
+     excess = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
+
+     /* Receive window that has already been advertised to the peer */
+     adv_window = tp->rcv_adv - tp->rcv_nxt;
+
+     /*
+      * Space that must stay available on top of the queued data:
+      *   - never cut into the window already advertised
+      *   - keep at least 1/16 of the ideal size so the window
+      *     does not close completely
+      * If both are zero, fall back to a few segments worth of space.
+      */
+     keep = max(adv_window, (sbrcv->sb_idealsize >> 4));
+     if (keep == 0)
+         keep = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+
+     /* Trim what is left over, but never more than the excess */
+     shrink = sbrcv->sb_hiwat - (sbrcv->sb_cc + keep);
+     shrink = imin(shrink, (int32_t)excess);
+
+     if (shrink > 0)
+         sbreserve(sbrcv, (sbrcv->sb_hiwat - shrink));
+ }
+
+ /*
+  * Shrink the send socket buffer toward its ideal size.
+  *
+  * Trimming is wanted in two situations:
+  * 1. the rtt seen on the connection is climbing, so the buffers
+  *    should not be filled any further.
+  * 2. the congestion window backed off, so there is no point in
+  *    holding more mbufs than the cwnd will allow.
+  *
+  * sbsnd: send socket buffer; it is only resized when both SB_AUTOSIZE
+  *        and SB_TRIM are set, tcp_do_autosendbuf is 1 and sb_hiwat is
+  *        above a non-zero sb_idealsize.
+  *
+  * SB_TRIM is cleared once sb_hiwat is no longer above sb_idealsize.
+  */
+ void
+ tcp_sbsnd_trim(struct sockbuf *sbsnd) {
+     int trim_requested =
+         ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
+         (SB_AUTOSIZE | SB_TRIM));
+     int above_ideal = (sbsnd->sb_idealsize > 0) &&
+         (sbsnd->sb_hiwat > sbsnd->sb_idealsize);
+
+     if (tcp_do_autosendbuf == 1 && trim_requested && above_ideal) {
+         /*
+          * Drop straight to the ideal size when the queued data
+          * fits in it; otherwise only drop to the amount of data
+          * currently queued.
+          */
+         u_int32_t excess =
+             (sbsnd->sb_cc <= sbsnd->sb_idealsize) ?
+             (sbsnd->sb_hiwat - sbsnd->sb_idealsize) :
+             (sbsnd->sb_hiwat - sbsnd->sb_cc);
+
+         sbreserve(sbsnd, (sbsnd->sb_hiwat - excess));
+     }
+
+     /* Checked after the resize above: stop trimming once at ideal size */
+     if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
+         sbsnd->sb_flags &= ~(SB_TRIM);
+ }
+
+/*
+ * If timestamp option was not negotiated on this connection
+ * and this connection is on the receiving side of a stream
+ * then we can not measure the delay on the link accurately.
+ * Instead of enabling automatic receive socket buffer
+ * resizing, just give more space to the receive socket buffer.
+ */
+static inline void
+tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
+ struct socket *so = tp->t_inpcb->inp_socket;
+ u_int32_t newsize = 2 * tcp_recvspace;
+ struct sockbuf *sbrcv = &so->so_rcv;
+
+ if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
+ (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
+ (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
+ tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
+ }
+}
+
+ /*
+  * Decide whether the receiver may start stretching acks to reduce ack
+  * traffic. All of the following have to hold:
+  * 1. tcp_delack_enabled is set to 3 (not checked in this function)
+  * 2. the bytes received in the current measurement window reach the
+  *    threshold defined by maxseg_unacked segments
+  * 3. the connection has not been idle for the tcp_maxrcvidle period
+  * 4. enough packets were seen to let slow-start finish after
+  *    connection establishment or after some packet loss
+  *
+  * Stretching stops on congestion/reordering (packets on the reassembly
+  * queue or an ECN indication), or when the delayed-ack timer fires
+  * while stretching, which means the packet flow dropped below the
+  * maxseg_unacked threshold. The receiver gets no explicit signal that
+  * slow-start completed or that the connection went idle, which is why
+  * tcp_rcvsspktcnt and tcp_maxrcvidle are used as stand-ins.
+  *
+  * Returns 1 when stretch acks can be enabled, 0 otherwise.
+  */
+ static inline int
+ tcp_stretch_ack_enable(struct tcpcb *tp)
+ {
+     /* Stretch acks are switched off for this connection */
+     if (tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK))
+         return(0);
+
+     /* Not enough bytes received in the current window */
+     if (tp->rcv_by_unackwin < (maxseg_unacked * tp->t_maxseg))
+         return(0);
+
+     /* The connection has been idle for too long */
+     if (!TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now))
+         return(0);
+
+     /* Still waiting for the sender's slow-start to get going */
+     if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) &&
+         tp->rcv_waitforss < tcp_rcvsspktcnt)
+         return(0);
+
+     return(1);
+ }
+
+/*
+ * Reset the state related to stretch-ack algorithm. This will make
+ * the receiver generate an ack every other packet. The receiver
+ * will start re-evaluating the rate at which packets come to decide
+ * if it can benefit by lowering the ack traffic.
+ */
+void
+tcp_reset_stretch_ack(struct tcpcb *tp)
+{
+ tp->t_flags &= ~(TF_STRETCHACK);
+ tp->rcv_by_unackwin = 0;
+ tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
+
+ /*
+ * When there is packet loss or packet re-ordering or CWR due to
+ * ECN, the sender's congestion window is reduced. In these states,
+ * generate an ack for every other packet for some time to allow
+ * the sender's congestion window to grow.
+ */
+ tp->t_flagsext |= TF_RCVUNACK_WAITSS;
+ tp->rcv_waitforss = 0;
+}
+
+/*
+ * The last packet was a retransmission, check if this ack
+ * indicates that the retransmission was spurious.
+ *
+ * If the connection supports timestamps, we could use it to
+ * detect if the last retransmit was not needed. Otherwise,
+ * we check if the ACK arrived within RTT/2 window, then it
+ * was a mistake to do the retransmit in the first place.
+ *
+ * This function will return 1 if it is a spurious retransmit,
+ * 0 otherwise.
+ */
+int
+tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
+ struct tcpopt *to, u_int32_t rxtime)
+{
+ int32_t tdiff, bad_rexmt_win;
+ bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+
+ /* If the ack has ECN CE bit, then cwnd has to be adjusted */
+ if ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)
+ && (th->th_flags & TH_ECE))
+ return (0);
+ if (TSTMP_SUPPORTED(tp)) {
+ if (rxtime > 0 && (to->to_flags & TOF_TS)
+ && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, rxtime))
+ return (1);
+ } else {
+ if ((tp->t_rxtshift == 1
+ || (tp->t_flagsext & TF_SENT_TLPROBE))
+ && rxtime > 0) {
+ tdiff = (int32_t)(tcp_now - rxtime);
+ if (tdiff < bad_rexmt_win)
+ return(1);
+ }
+ }
+ return(0);
+}
+
+
+/*
+ * Restore congestion window state if a spurious timeout
+ * was detected.
+ */
+static void
+tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
+{
+ if (TSTMP_SUPPORTED(tp)) {
+ u_int32_t fsize, acked;
+ fsize = tp->snd_max - th->th_ack;
+ acked = BYTES_ACKED(th, tp);
+
+ /*
+ * Implement bad retransmit recovery as
+ * described in RFC 4015.
+ */
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+
+ /* Initialize cwnd to the initial window */
+ if (CC_ALGO(tp)->cwnd_init != NULL)
+ CC_ALGO(tp)->cwnd_init(tp);
+
+ tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
+
+ } else {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ if (tp->t_flags & TF_WASFRECOVERY)
+ ENTER_FASTRECOVERY(tp);
+ }
+ tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+ tp->snd_recover = tp->snd_recover_prev;
+ tp->snd_nxt = tp->snd_max;
+ tp->t_rxtshift = 0;
+ tp->t_rxtstart = 0;
+
+ /* Fix send socket buffer to reflect the change in cwnd */
+ tcp_bad_rexmt_fix_sndbuf(tp);
+
+ /*
+ * This RTT might reflect the extra delay induced
+ * by the network. Skip using this sample for RTO
+ * calculation and mark the connection so we can
+ * recompute RTT when the next eligible sample is
+ * found.
+ */
+ tp->t_flagsext |= TF_RECOMPUTE_RTT;
+ tp->t_badrexmt_time = tcp_now;
+ tp->t_rtttime = 0;
+}
+
+/*
+ * If the previous packet was sent in retransmission timer, and it was
+ * not needed, then restore the congestion window to the state before that
+ * transmission.
+ *
+ * If the last packet was sent in tail loss probe timeout, check if that
+ * recovered the last packet. If so, that will indicate a real loss and
+ * the congestion window needs to be lowered.
+ */
+static void
+tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
+{
+ if (tp->t_rxtshift > 0 &&
+ tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
+ ++tcpstat.tcps_sndrexmitbad;
+ tcp_bad_rexmt_restore_state(tp, th);
+ tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
+ } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
+ && tp->t_tlphighrxt > 0
+ && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
+ && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
+ /*
+ * The tail loss probe recovered the last packet and
+ * we need to adjust the congestion window to take
+ * this loss into account.
+ */
+ ++tcpstat.tcps_tlp_recoverlastpkt;
+ if (!IN_FASTRECOVERY(tp)) {
+ tcp_reduce_congestion_window(tp);
+ EXIT_FASTRECOVERY(tp);
+ }
+ tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
+ }
+
+ tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+ tp->t_tlphighrxt = 0;
+ tp->t_tlpstart = 0;
+
+ /*
+ * check if the latest ack was for a segment sent during PMTU
+ * blackhole detection. If the timestamp on the ack is before
+ * PMTU blackhole detection, then revert the size of the max
+ * segment to previous size.
+ */
+ if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
+ tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
+ if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
+ tcp_pmtud_revert_segment_size(tp);
+ }
+ }
+ if (tp->t_pmtud_start_ts > 0)
+ tp->t_pmtud_start_ts = 0;
+}
+
+/*
+ * Check if early retransmit can be attempted according to RFC 5827.
+ *
+ * If packet reordering is detected on a connection, fast recovery will
+ * be delayed until it is clear that the packet was lost and not reordered.
+ * But reordering detection is done only when SACK is enabled.
+ *
+ * On connections that do not support SACK, there is a limit on the number
+ * of early retransmits that can be done per minute. This limit is needed
+ * to make sure that too many packets are not retransmitted when there is
+ * packet reordering.
+ */
+static void
+tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
+{
+ u_int32_t obytes, snd_off;
+ int32_t snd_len;
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ if (early_rexmt && (SACK_ENABLED(tp) ||
+ tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) &&
+ (tp->t_dupacks == 1 ||
+ (SACK_ENABLED(tp) &&
+ !TAILQ_EMPTY(&tp->snd_holes)))) {
+ /*
+ * If there are only a few outstanding
+ * segments on the connection, we might need
+ * to lower the retransmit threshold. This
+ * will allow us to do Early Retransmit as
+ * described in RFC 5827.
+ */
+ if (SACK_ENABLED(tp) &&
+ !TAILQ_EMPTY(&tp->snd_holes)) {
+ obytes = (tp->snd_max - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+ } else {
+ obytes = (tp->snd_max - tp->snd_una);
+ }
+
+ /*
+ * In order to lower retransmit threshold the
+ * following two conditions must be met.
+ * 1. the amount of outstanding data is less
+ * than 4*SMSS bytes
+ * 2. there is no unsent data ready for
+ * transmission or the advertised window
+ * will limit sending new segments.
+ */
+ snd_off = tp->snd_max - tp->snd_una;
+ snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
+ if (obytes < (tp->t_maxseg << 2) &&
+ snd_len <= 0) {
+ u_int32_t osegs;
+
+ osegs = obytes / tp->t_maxseg;
+ if ((osegs * tp->t_maxseg) < obytes)
+ osegs++;
+
+ /*
+ * Since the connection might have already
+ * received some dupacks, we add them to
+ * to the outstanding segments count to get
+ * the correct retransmit threshold.
+ *
+ * By checking for early retransmit after
+ * receiving some duplicate acks when SACK
+ * is supported, the connection will
+ * enter fast recovery even if multiple
+ * segments are lost in the same window.
+ */
+ osegs += tp->t_dupacks;
+ if (osegs < 4) {
+ tp->t_rexmtthresh =
+ ((osegs - 1) > 1) ? (osegs - 1) : 1;
+ tp->t_rexmtthresh =
+ min(tp->t_rexmtthresh, tcprexmtthresh);
+ tp->t_rexmtthresh =
+ max(tp->t_rexmtthresh, tp->t_dupacks);
+
+ if (tp->t_early_rexmt_count == 0)
+ tp->t_early_rexmt_win = tcp_now;
+
+ if (tp->t_flagsext & TF_SENT_TLPROBE) {
+ tcpstat.tcps_tlp_recovery++;
+ tcp_ccdbg_trace(tp, th,
+ TCP_CC_TLP_RECOVERY);
+ } else {
+ tcpstat.tcps_early_rexmt++;
+ tp->t_early_rexmt_count++;
+ tcp_ccdbg_trace(tp, th,
+ TCP_CC_EARLY_RETRANSMIT);
+ }
+ }
+ }
+ }
+
+ /*
+ * If we ever sent a TLP probe, the acknowledgement will trigger
+ * early retransmit because the value of snd_fack will be close
+ * to snd_max. This will take care of adjustments to the
+ * congestion window. So we can reset TF_SENT_PROBE flag.
+ */
+ tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+ tp->t_tlphighrxt = 0;
+ tp->t_tlpstart = 0;
+}
+
+void
+tcp_input(m, off0)
+ struct mbuf *m;
+ int off0;
+{
+ register struct tcphdr *th;
+ register struct ip *ip = NULL;
+ register struct inpcb *inp;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int tlen, off;
+ int drop_hdrlen;
+ register struct tcpcb *tp = 0;
+ register int thflags;
+ struct socket *so = 0;
+ int todrop, acked, ourfinisacked, needoutput = 0;
+ struct in_addr laddr;
+#if INET6
+ struct in6_addr laddr6;
+#endif
+ int dropsocket = 0;
+ int iss = 0, nosock = 0;
+ u_int32_t tiwin, sack_bytes_acked = 0;
+ struct tcpopt to; /* options in this segment */
+ struct sockaddr_in *next_hop = NULL;
+#if TCPDEBUG
+ short ostate = 0;
+#endif
+ struct m_tag *fwd_tag;
+ u_char ip_ecn = IPTOS_ECN_NOTECT;
+ unsigned int ifscope;
+ uint8_t isconnected, isdisconnected;
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+ int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
+ int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
+ int turnoff_lro = 0, win;
+#if MPTCP
+ struct mptcb *mp_tp = NULL;
+ uint16_t mptcp_csum = 0;
+#endif /* MPTCP */
+ boolean_t cell = IFNET_IS_CELLULAR(ifp);
+ boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
+ boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
+ struct tcp_respond_args tra;
+
+#define TCP_INC_VAR(stat, npkts) do { \
+ stat += npkts; \
+} while (0)
+
+ TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
+
+ /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
+ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
+ KERNEL_TAG_TYPE_IPFORWARD, NULL);
+ } else {
+ fwd_tag = NULL;
+ }
+ if (fwd_tag != NULL) {
+ struct ip_fwd_tag *ipfwd_tag =
+ (struct ip_fwd_tag *)(fwd_tag+1);
+
+ next_hop = ipfwd_tag->next_hop;
+ m_tag_delete(m, fwd_tag);
+ }
+
+#if INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+#endif /* INET6 */
+ int rstreason; /* For badport_bandlim accounting purposes */
+ struct proc *proc0=current_proc();
+
+ KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
+
+#if INET6
+ isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+#endif
+ bzero((char *)&to, sizeof(to));
+
+#if INET6
+ if (isipv6) {
+ /*
+ * Expect 32-bit aligned data pointer on
+ * strict-align platforms
+ */
+ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
+ /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
+ ip6 = mtod(m, struct ip6_hdr *);
+ tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+ th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
+
+ if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
+ goto dropnosock;
+
+ KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
+ (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
+ th->th_seq, th->th_ack, th->th_win);
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate unbounded/unconnected pcb,
+ * unspecified IPv6 address can be used to confuse us.
+ *
+ * Note that packets with unspecified IPv6 destination is
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ IF_TCP_STATINC(ifp, unspecv6);
+ goto dropnosock;
+ }
+ DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
+ struct ip6_hdr *, ip6, struct tcpcb *, NULL,
+ struct tcphdr *, th);
+
+ ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
+ } else