+ u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
+
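+ /*
+ * Allow growth only while the total socket-buffer mbuf count is
+ * below its limit and this buffer's high water mark is below the
+ * per-buffer space limit, derived here from 1/16th of the mbuf
+ * cluster pool.
+ */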
+ if ((total_sbmb_cnt < mblim) &&
+ (sb->sb_hiwat < sbspacelim)) {
+ return 1;
+ } else {
+ OSIncrementAtomic64(&sbmb_limreached);
+ }
+ return 0;
+}
+
+static void
+tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
+ u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max)
+{
+ /* newsize should not exceed max */
+ newsize = min(newsize, rcvbuf_max);
+
+ /* The receive window scale negotiated at the
+ * beginning of the connection will also set a
+ * limit on the socket buffer size
+ */
+ newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
+
+ /* Set new socket buffer size */
+ if (newsize > sbrcv->sb_hiwat &&
+ (sbreserve(sbrcv, newsize) == 1)) {
+ sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
+ (idealsize != 0) ? idealsize : newsize), rcvbuf_max);
+
+ /* Again check the limit set by the advertised
+ * window scale
+ */
+ sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
+ TCP_MAXWIN << tp->rcv_scale);
+ }
+}
+
+/*
+ * This function is used to grow a receive socket buffer. It
+ * will take into account system-level memory usage and the
+ * bandwidth available on the link to make a decision.
+ */
+static void
+tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
+ struct tcpopt *to, u_int32_t pktlen, u_int32_t rcvbuf_max)
+{
+ struct socket *so = sbrcv->sb_so;
+
+ /*
+ * Do not grow the receive socket buffer if
+ * - auto resizing is disabled, globally or on this socket
+ * - the high water mark already reached the maximum
+ * - the stream is in background and receive side is being
+ * throttled
+ * - there are segments in the reassembly queue indicating loss;
+ * there is no need to grow the receive window during recovery
+ * because more data is not going to be sent. A duplicate ack
+ * sent during recovery should not change the receive window
+ */
+ if (tcp_do_autorcvbuf == 0 ||
+ (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
+ tcp_cansbgrow(sbrcv) == 0 ||
+ sbrcv->sb_hiwat >= rcvbuf_max ||
+ (tp->t_flagsext & TF_RECV_THROTTLE) ||
+ (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
+ !LIST_EMPTY(&tp->t_segq)) {
+ /* Can not resize the socket buffer, just return */
+ goto out;
+ }
+
+ if (TSTMP_GT(tcp_now,
+ tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
+ /* If there has been an idle period in the
+ * connection, just restart the measurement
+ */
+ goto out;
+ }
+
+ if (!TSTMP_SUPPORTED(tp)) {
+ /*
+ * The timestamp option is not supported on this connection.
+ * If the connection has reached a state indicating that
+ * the receive socket buffer needs to grow, increase
+ * the high water mark.
+ */
+ if (TSTMP_GEQ(tcp_now,
+ tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
+ if (tp->rfbuf_cnt + pktlen >= TCP_RCVNOTS_BYTELEVEL) {
+ tcp_sbrcv_reserve(tp, sbrcv,
+ tcp_autorcvbuf_max, 0,
+ tcp_autorcvbuf_max);
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ } else if (to->to_tsecr != 0) {
+ /*
+ * If the timestamp shows that one RTT has
+ * completed, we can stop counting the
+ * bytes. We consider increasing the socket
+ * buffer if the data received in the last
+ * RTT is more than half of sb_hiwat; this
+ * helps scale the buffer according to the
+ * bandwidth on the link.
+ */
+ if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
+ if (tp->rfbuf_cnt + pktlen > (sbrcv->sb_hiwat -
+ (sbrcv->sb_hiwat >> 1))) {
+ tp->rfbuf_cnt += pktlen;
+ int32_t rcvbuf_inc, min_incr;
+ /*
+ * Increment the receive window by a
+ * multiple of maximum-sized segments.
+ * This will prevent a connection from
+ * sending smaller segments on the wire
+ * if it is limited by the receive window.
+ *
+ * Set the ideal size based on current
+ * bandwidth measurements. We set the
+ * ideal size of the receive socket buffer
+ * to be twice the bandwidth-delay product.
+ */
+ rcvbuf_inc = (tp->rfbuf_cnt << 1)
+ - sbrcv->sb_hiwat;
+
+ /*
+ * Make the increment at least 8 segments
+ */
+ min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+ if (rcvbuf_inc < min_incr) {
+ rcvbuf_inc = min_incr;
+ }
+
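+ /*
+ * Round the increment down to a multiple of the maximum
+ * segment size so the buffer grows in whole segments.
+ */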
+ rcvbuf_inc =
+ (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
+ tcp_sbrcv_reserve(tp, sbrcv,
+ sbrcv->sb_hiwat + rcvbuf_inc,
+ (tp->rfbuf_cnt << 1), rcvbuf_max);
+ }
+ /* Measure instantaneous receive bandwidth */
+ if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 &&
+ TSTMP_GT(tcp_now, tp->rfbuf_ts)) {
+ u_int32_t rcv_bw;
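+ /* bytes received divided by the elapsed timer ticks */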
+ rcv_bw = tp->rfbuf_cnt /
+ (int)(tcp_now - tp->rfbuf_ts);
+ if (tp->t_bwmeas->bw_rcvbw_max == 0) {
+ tp->t_bwmeas->bw_rcvbw_max = rcv_bw;
+ } else {
+ tp->t_bwmeas->bw_rcvbw_max = max(
+ tp->t_bwmeas->bw_rcvbw_max, rcv_bw);
+ }
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ }
+out:
+ /* Restart the measurement */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ return;
+}
+
+/* This function will trim the excess space added to the socket buffer
+ * to help a slow-reading app. The ideal size of a socket buffer depends
+ * on the link bandwidth, or is set by the application; we aim to
+ * reach that size.
+ */
+void
+tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv)
+{
+ if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
+ sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
+ int32_t trim;
+ /* compute the difference between ideal and current sizes */
+ u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
+
+ /* Compute the portion of the advertised window
+ * that is still outstanding on this connection.
+ */
+ u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
+
+ /* How much can we trim the receive socket buffer?
+ * 1. it cannot be trimmed below the advertised receive window
+ * 2. if possible, leave 1/16 of the bandwidth-delay product to
+ * avoid closing the window completely
+ */
+ u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
+
+ /* Sometimes leave can be zero; in that case leave at least
+ * a few segments' worth of space.
+ */
+ if (leave == 0) {
+ leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+ }
+
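+ /* Trim down to the buffered data plus the space we chose to
+ * leave, but never by more than the excess over the ideal size.
+ */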
+ trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
+ trim = imin(trim, (int32_t)diff);
+
+ if (trim > 0) {
+ sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
+ }
+ }
+}
+
+/* We may need to trim the send socket buffer size for two reasons:
+ * 1. if the rtt seen on the connection is climbing, we do not
+ * want to fill the buffers any further.
+ * 2. if the congestion window on the socket has backed off, there is
+ * no need to hold more mbufs for the connection than the cwnd allows.
+ */
+void
+tcp_sbsnd_trim(struct sockbuf *sbsnd)
+{
+ if (tcp_do_autosendbuf == 1 &&
+ ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
+ (SB_AUTOSIZE | SB_TRIM)) &&
+ (sbsnd->sb_idealsize > 0) &&
+ (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
+ u_int32_t trim = 0;
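+ /* If the buffered data fits under the ideal size, trim all
+ * the way down to it; otherwise only release the space that
+ * is not holding data.
+ */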
+ if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
+ trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
+ } else {
+ trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
+ }
+ sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
+ }
+ if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) {
+ sbsnd->sb_flags &= ~(SB_TRIM);
+ }
+}
+
+/*
+ * If the timestamp option was not negotiated on this connection
+ * and this connection is on the receiving side of a stream,
+ * then we cannot measure the delay on the link accurately.
+ * Instead of enabling automatic receive socket buffer
+ * resizing, just give more space to the receive socket buffer.
+ */
+static inline void
+tcp_sbrcv_tstmp_check(struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ u_int32_t newsize = 2 * tcp_recvspace;
+ struct sockbuf *sbrcv = &so->so_rcv;
+
+ if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
+ (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
+ (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
+ tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize);
+ }
+}
+
+/* A receiver will evaluate the flow of packets on a connection
+ * to see if it can reduce ack traffic. The receiver will start
+ * stretching acks if all of the following conditions are met:
+ * 1. tcp_delack_enabled is set to 3
+ * 2. the bytes received in the last 100 ms are greater than a threshold
+ * defined by maxseg_unacked
+ * 3. the connection has not been idle for the tcp_maxrcvidle period
+ * 4. the connection has seen enough packets to let the slow-start
+ * finish after connection establishment or after some packet loss
+ *
+ * The receiver will stop stretching acks if there is congestion/reordering
+ * as indicated by packets in the reassembly queue or by ECN. If the delayed-ack
+ * timer fires while stretching acks, it means that the packet flow has gone
+ * below the threshold defined by maxseg_unacked and the receiver will stop
+ * stretching acks. The receiver gets no indication when slow-start is completed
+ * or when the connection reaches an idle state. That is why we use
+ * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
+ * state.
+ */
+static inline int
+tcp_stretch_ack_enable(struct tcpcb *tp, int thflags)
+{
+ if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
+ TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) {
+ tp->t_flags |= TF_STREAMING_ON;
+ } else {
+ tp->t_flags &= ~TF_STREAMING_ON;
+ }
+
+ /* If there has been an idle time, reset streaming detection */
+ if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) {
+ tp->t_flags &= ~TF_STREAMING_ON;
+ }
+
+ /*
+ * If there are flags other than TH_ACK set, reset streaming
+ * detection
+ */
+ if (thflags & ~TH_ACK) {
+ tp->t_flags &= ~TF_STREAMING_ON;
+ }
+
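+ /* If stretch acks were explicitly disabled earlier, re-enable
+ * them once enough packets have been received without stretching.
+ */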
+ if (tp->t_flagsext & TF_DISABLE_STRETCHACK) {
+ if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) {
+ tp->t_flagsext &= ~TF_DISABLE_STRETCHACK;
+ tp->rcv_nostrack_pkts = 0;
+ tp->rcv_nostrack_ts = 0;
+ } else {
+ tp->rcv_nostrack_pkts++;
+ }
+ }
+
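+ /* Stretch acks only if streaming has been detected, stretching is
+ * not disabled on this connection and, when waiting for slow-start,
+ * enough packets have been seen.
+ */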
+ if (!(tp->t_flagsext & (TF_NOSTRETCHACK | TF_DISABLE_STRETCHACK)) &&
+ (tp->t_flags & TF_STREAMING_ON) &&
+ (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
+ (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Reset the state related to the stretch-ack algorithm. This will make
+ * the receiver generate an ack every other packet. The receiver
+ * will start re-evaluating the rate at which packets arrive to decide
+ * whether it can benefit from lowering the ack traffic.
+ */
+void
+tcp_reset_stretch_ack(struct tcpcb *tp)
+{
+ tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON);
+ tp->rcv_by_unackwin = 0;
+ tp->rcv_by_unackhalfwin = 0;
+ tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
+
+ /*
+ * When there is packet loss or packet re-ordering or CWR due to
+ * ECN, the sender's congestion window is reduced. In these states,
+ * generate an ack for every other packet for some time to allow
+ * the sender's congestion window to grow.
+ */
+ tp->t_flagsext |= TF_RCVUNACK_WAITSS;
+ tp->rcv_waitforss = 0;
+}
+
+/*
+ * The last packet was a retransmission, check if this ack
+ * indicates that the retransmission was spurious.
+ *
+ * If the connection supports timestamps, we can use them to
+ * detect whether the last retransmit was needed. Otherwise,
+ * we check whether the ACK arrived within an RTT/2 window; if
+ * it did, the retransmit was a mistake in the first place.
+ *
+ * This function will return 1 if it is a spurious retransmit,
+ * 0 otherwise.
+ */
+int
+tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
+ struct tcpopt *to, u_int32_t rxtime)
+{
+ int32_t tdiff, bad_rexmt_win;
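+ /* t_srtt is kept with TCP_RTT_SHIFT bits of fraction, so
+ * shifting by one extra bit yields a window of RTT/2.
+ */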
+ bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+
+ /* If the ack carries an ECN-Echo (ECE), cwnd has to be adjusted */
+ if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE)) {
+ return 0;
+ }
+ if (TSTMP_SUPPORTED(tp)) {
+ if (rxtime > 0 && (to->to_flags & TOF_TS)
+ && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, rxtime)) {
+ return 1;
+ }
+ } else {
+ if ((tp->t_rxtshift == 1
+ || (tp->t_flagsext & TF_SENT_TLPROBE))
+ && rxtime > 0) {
+ tdiff = (int32_t)(tcp_now - rxtime);
+ if (tdiff < bad_rexmt_win) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Restore congestion window state if a spurious timeout
+ * was detected.
+ */
+static void
+tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
+{
+ if (TSTMP_SUPPORTED(tp)) {
+ u_int32_t fsize, acked;
+ fsize = tp->snd_max - th->th_ack;
+ acked = BYTES_ACKED(th, tp);
+
+ /*
+ * Implement bad retransmit recovery as
+ * described in RFC 4015.
+ */
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+
+ /* Initialize cwnd to the initial window */
+ if (CC_ALGO(tp)->cwnd_init != NULL) {
+ CC_ALGO(tp)->cwnd_init(tp);
+ }
+
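+ /* Resume with the outstanding flight plus the newly acked data,
+ * capping the acked portion at the initial window set up above.
+ */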
+ tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
+ } else {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ if (tp->t_flags & TF_WASFRECOVERY) {
+ ENTER_FASTRECOVERY(tp);
+ }
+
+ /* Do not use the loss flight size in this case */
+ tp->t_lossflightsize = 0;
+ }
+ tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+ tp->snd_recover = tp->snd_recover_prev;
+ tp->snd_nxt = tp->snd_max;
+
+ /* Fix send socket buffer to reflect the change in cwnd */
+ tcp_bad_rexmt_fix_sndbuf(tp);
+
+ /*
+ * This RTT might reflect the extra delay induced
+ * by the network. Skip using this sample for RTO
+ * calculation and mark the connection so we can
+ * recompute RTT when the next eligible sample is
+ * found.
+ */
+ tp->t_flagsext |= TF_RECOMPUTE_RTT;
+ tp->t_badrexmt_time = tcp_now;
+ tp->t_rtttime = 0;
+}
+
+/*
+ * If the previous packet was sent by the retransmission timer and it was
+ * not needed, then restore the congestion window to the state before that
+ * transmission.
+ *
+ * If the last packet was sent in a tail loss probe timeout, check whether
+ * that probe recovered the last packet. If so, that indicates a real loss and
+ * the congestion window needs to be lowered.
+ */
+static void
+tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
+{
+ if (tp->t_rxtshift > 0 &&
+ tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
+ ++tcpstat.tcps_sndrexmitbad;
+ tcp_bad_rexmt_restore_state(tp, th);
+ tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
+ } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
+ && tp->t_tlphighrxt > 0
+ && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
+ && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
+ /*
+ * check DSACK information also to make sure that
+ * the TLP was indeed needed
+ */
+ if (tcp_rxtseg_dsack_for_tlp(tp)) {
+ /*
+ * received a DSACK to indicate that TLP was
+ * not needed
+ */
+ tcp_rxtseg_clean(tp);
+ goto out;
+ }
+
+ /*
+ * The tail loss probe recovered the last packet and
+ * we need to adjust the congestion window to take
+ * this loss into account.
+ */
+ ++tcpstat.tcps_tlp_recoverlastpkt;
+ if (!IN_FASTRECOVERY(tp)) {
+ tcp_reduce_congestion_window(tp);
+ EXIT_FASTRECOVERY(tp);
+ }
+ tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
+ } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
+ /*
+ * All of the retransmitted segments were duplicated; this
+ * can be an indication of a bad fast retransmit.
+ */
+ tcpstat.tcps_dsack_badrexmt++;
+ tcp_bad_rexmt_restore_state(tp, th);
+ tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
+ tcp_rxtseg_clean(tp);
+ }
+out:
+ tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+ tp->t_tlphighrxt = 0;
+ tp->t_tlpstart = 0;
+
+ /*
+ * Check if the latest ack was for a segment sent during PMTU
+ * blackhole detection. If the timestamp on the ack predates the
+ * start of PMTU blackhole detection, then revert the maximum
+ * segment size to its previous value.
+ */
+ if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
+ tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
+ if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
+ tcp_pmtud_revert_segment_size(tp);
+ }
+ }
+ if (tp->t_pmtud_start_ts > 0) {
+ tp->t_pmtud_start_ts = 0;
+ }
+}
+
+/*
+ * Check if early retransmit can be attempted according to RFC 5827.
+ *
+ * If packet reordering is detected on a connection, fast recovery will
+ * be delayed until it is clear that the packet was lost and not reordered.
+ * But reordering detection is done only when SACK is enabled.
+ *
+ * On connections that do not support SACK, there is a limit on the number
+ * of early retransmits that can be done per minute. This limit is needed
+ * to make sure that too many packets are not retransmitted when there is
+ * packet reordering.
+ */
+static void
+tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th)
+{
+ u_int32_t obytes, snd_off;
+ int32_t snd_len;
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ if (early_rexmt && (SACK_ENABLED(tp) ||
+ tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) &&
+ (tp->t_dupacks == 1 ||
+ (SACK_ENABLED(tp) &&
+ !TAILQ_EMPTY(&tp->snd_holes)))) {
+ /*
+ * If there are only a few outstanding
+ * segments on the connection, we might need
+ * to lower the retransmit threshold. This
+ * will allow us to do Early Retransmit as
+ * described in RFC 5827.
+ */
+ if (SACK_ENABLED(tp) &&
+ !TAILQ_EMPTY(&tp->snd_holes)) {
+ obytes = (tp->snd_max - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+ } else {
+ obytes = (tp->snd_max - tp->snd_una);
+ }
+
+ /*
+ * In order to lower the retransmit threshold, the
+ * following two conditions must be met:
+ * 1. the amount of outstanding data is less
+ * than 4*SMSS bytes
+ * 2. there is no unsent data ready for
+ * transmission or the advertised window
+ * will limit sending new segments.
+ */
+ snd_off = tp->snd_max - tp->snd_una;
+ snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
+ if (obytes < (tp->t_maxseg << 2) &&
+ snd_len <= 0) {
+ u_int32_t osegs;
+
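+ /* Count the outstanding segments, rounding a partial
+ * segment up to a whole one.
+ */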
+ osegs = obytes / tp->t_maxseg;
+ if ((osegs * tp->t_maxseg) < obytes) {
+ osegs++;
+ }
+
+ /*
+ * Since the connection might have already
+ * received some dupacks, we add them to
+ * the outstanding segment count to get
+ * the correct retransmit threshold.
+ *
+ * By checking for early retransmit after
+ * receiving some duplicate acks when SACK
+ * is supported, the connection will
+ * enter fast recovery even if multiple
+ * segments are lost in the same window.
+ */
+ osegs += tp->t_dupacks;
+ if (osegs < 4) {
+ tp->t_rexmtthresh =
+ ((osegs - 1) > 1) ? (osegs - 1) : 1;
+ tp->t_rexmtthresh =
+ min(tp->t_rexmtthresh, tcprexmtthresh);
+ tp->t_rexmtthresh =
+ max(tp->t_rexmtthresh, tp->t_dupacks);
+
+ if (tp->t_early_rexmt_count == 0) {
+ tp->t_early_rexmt_win = tcp_now;
+ }
+
+ if (tp->t_flagsext & TF_SENT_TLPROBE) {
+ tcpstat.tcps_tlp_recovery++;
+ tcp_ccdbg_trace(tp, th,
+ TCP_CC_TLP_RECOVERY);
+ } else {
+ tcpstat.tcps_early_rexmt++;
+ tp->t_early_rexmt_count++;
+ tcp_ccdbg_trace(tp, th,
+ TCP_CC_EARLY_RETRANSMIT);
+ }
+ }
+ }
+ }
+
+ /*
+ * If we ever sent a TLP probe, the acknowledgement will trigger
+ * early retransmit because the value of snd_fack will be close
+ * to snd_max. This will take care of adjustments to the
+ * congestion window. So we can reset the TF_SENT_TLPROBE flag.
+ */
+ tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+ tp->t_tlphighrxt = 0;
+ tp->t_tlpstart = 0;
+}
+
+static boolean_t
+tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to)
+{
+ u_char out[CCAES_BLOCK_SIZE];
+ unsigned char len;
+
+ if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
+ !(tcp_fastopen & TCP_FASTOPEN_SERVER)) {
+ return FALSE;
+ }
+
+ if ((to->to_flags & TOF_TFOREQ)) {
+ tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
+
+ tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
+ tcpstat.tcps_tfo_cookie_req_rcv++;
+ return FALSE;
+ }
+
+ /* Ok, then it must be an offered cookie. We need to check that ... */
+ tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
+
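+ /* to_tfo points at the option's length byte; the cookie follows
+ * and its length is the option length minus the option header.
+ */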
+ len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
+ to->to_tfo++;
+ if (memcmp(out, to->to_tfo, len)) {
+ /* Cookies are different! Let's return and offer a new cookie */
+ tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
+
+ tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
+ tcpstat.tcps_tfo_cookie_invalid++;
+ return FALSE;
+ }
+
+ if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
+ /* Need to decrement again as we just increased it... */
+ OSDecrementAtomic(&tcp_tfo_halfcnt);
+ return FALSE;
+ }
+
+ tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
+
+ tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
+ tcpstat.tcps_tfo_syn_data_rcv++;
+
+ return TRUE;
+}
+
+static void
+tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to)
+{
+ if (to->to_flags & TOF_TFO) {
+ unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
+
+ /*
+ * If this happens, things have gone terribly wrong. len should
+ * have been checked in tcp_dooptions.
+ */
+ VERIFY(len <= TFO_COOKIE_LEN_MAX);
+
+ to->to_tfo++;
+
+ tcp_cache_set_cookie(tp, to->to_tfo, len);
+ tcp_heuristic_tfo_success(tp);
+
+ tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
+ tcpstat.tcps_tfo_cookie_rcv++;
+ if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) {
+ tcpstat.tcps_tfo_cookie_wrong++;
+ tp->t_tfo_stats |= TFO_S_COOKIE_WRONG;
+ }
+ } else {
+ /*
+ * There is no cookie in the response, but we either asked for one
+ * or sent SYN+DATA. Now, we need to check whether we had to
+ * rexmit the SYN. If that's the case, it's better to start
+ * backing off TFO-cookie requests.
+ */
+ if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+ tp->t_tfo_flags & TFO_F_SYN_LOSS) {
+ tp->t_tfo_stats |= TFO_S_SYN_LOSS;
+ tcpstat.tcps_tfo_syn_loss++;
+
+ tcp_heuristic_tfo_loss(tp);
+ } else {
+ if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) {
+ tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV;
+ tcpstat.tcps_tfo_no_cookie_rcv++;
+ }
+
+ tcp_heuristic_tfo_success(tp);
+ }
+ }
+}
+
+static void
+tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
+{
+ if (tlen != 0) {
+ return;
+ }
+
+ tp->t_tfo_probe_state = TFO_PROBE_PROBING;
+
+ /*
+ * We send the probe out rather quickly (after one RTO). It does not
+ * really hurt that much; it's only one additional segment on the wire.
+ */
+ tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
+}
+
+static void
+tcp_tfo_rcv_data(struct tcpcb *tp)
+{
+ /* Transition from PROBING to NONE as data has been received */
+ if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
+ tp->t_tfo_probe_state = TFO_PROBE_NONE;
+ }
+}
+
+static void
+tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
+{
+ if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
+ tp->t_tfo_probes > 0) {
+ if (th->th_seq == tp->rcv_nxt) {
+ /* No hole, so stop probing */
+ tp->t_tfo_probe_state = TFO_PROBE_NONE;
+ } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
+ /* There is a hole! Wait a bit for data... */
+ tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
+ tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+ TCP_REXMTVAL(tp));
+ }
+ }
+}
+
+/*
+ * Update snd_wnd information.
+ */
+static inline bool
+tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
+ u_int32_t tiwin, int tlen)
+{
+ /* Don't look at the window if there is no ACK flag */
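+ /* Accept the update only if this segment is not older than the one
+ * that last updated the window: SND.WL1 < SEG.SEQ, or SND.WL1 equals
+ * SEG.SEQ and SND.WL2 <= SEG.ACK, with a strictly larger window when
+ * both match.
+ */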
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+ tcpstat.tcps_rcvwinupd++;
+ }
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd) {
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+
+ if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) {
+ mptcp_update_window_wakeup(tp);
+ }
+ return true;
+ }
+ return false;
+}
+
+static void
+tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup)
+{
+ if (read_wakeup != 0) {
+ sorwakeup(so);
+ }
+ if (write_wakeup != 0) {
+ sowwakeup(so);
+ }
+}
+
+void
+tcp_input(struct mbuf *m, int off0)
+{
+ struct tcphdr *th;
+ struct ip *ip = NULL;
+ struct inpcb *inp;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int tlen, off;
+ int drop_hdrlen;
+ struct tcpcb *tp = 0;
+ int thflags;
+ struct socket *so = 0;
+ int todrop, acked, ourfinisacked, needoutput = 0;
+ int read_wakeup = 0;
+ int write_wakeup = 0;
+ struct in_addr laddr;