+ /* If the ack has ECN CE bit, then cwnd has to be adjusted */
+ if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE)) {
+ return 0;
+ }
+ if (TSTMP_SUPPORTED(tp)) {
+ if (rxtime > 0 && (to->to_flags & TOF_TS)
+ && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, rxtime)) {
+ return 1;
+ }
+ } else {
+ if ((tp->t_rxtshift == 1
+ || (tp->t_flagsext & TF_SENT_TLPROBE))
+ && rxtime > 0) {
+ tdiff = (int32_t)(tcp_now - rxtime);
+ if (tdiff < bad_rexmt_win) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Restore congestion window state if a spurious timeout
+ * was detected.
+ *
+ * With timestamps (RFC 4015 path): ssthresh is restored to its
+ * pre-retransmit snapshot, cwnd is re-initialized by the CC module
+ * and then grown to cover the current flight plus the newly acked
+ * bytes.  Without timestamps, the saved cwnd/ssthresh snapshot is
+ * restored verbatim.
+ */
+static void
+tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
+{
+	if (TSTMP_SUPPORTED(tp)) {
+		u_int32_t fsize, acked;
+		/* fsize: bytes still outstanding beyond this ack */
+		fsize = tp->snd_max - th->th_ack;
+		acked = BYTES_ACKED(th, tp);
+
+		/*
+		 * Implement bad retransmit recovery as
+		 * described in RFC 4015.
+		 */
+		tp->snd_ssthresh = tp->snd_ssthresh_prev;
+
+		/* Initialize cwnd to the initial window */
+		if (CC_ALGO(tp)->cwnd_init != NULL) {
+			CC_ALGO(tp)->cwnd_init(tp);
+		}
+
+		/* Grow cwnd to flight size plus what was just acked */
+		tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
+	} else {
+		/* No timestamps: restore the pre-timeout snapshot directly */
+		tp->snd_cwnd = tp->snd_cwnd_prev;
+		tp->snd_ssthresh = tp->snd_ssthresh_prev;
+		if (tp->t_flags & TF_WASFRECOVERY) {
+			ENTER_FASTRECOVERY(tp);
+		}
+
+		/* Do not use the loss flight size in this case */
+		tp->t_lossflightsize = 0;
+	}
+	/* Never leave cwnd below the initial window */
+	tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+	tp->snd_recover = tp->snd_recover_prev;
+	tp->snd_nxt = tp->snd_max;
+
+	/* Fix send socket buffer to reflect the change in cwnd */
+	tcp_bad_rexmt_fix_sndbuf(tp);
+
+	/*
+	 * This RTT might reflect the extra delay induced
+	 * by the network. Skip using this sample for RTO
+	 * calculation and mark the connection so we can
+	 * recompute RTT when the next eligible sample is
+	 * found.
+	 */
+	tp->t_flagsext |= TF_RECOMPUTE_RTT;
+	tp->t_badrexmt_time = tcp_now;
+	tp->t_rtttime = 0;
+}
+
+/*
+ * If the previous packet was sent in retransmission timer, and it was
+ * not needed, then restore the congestion window to the state before that
+ * transmission.
+ *
+ * If the last packet was sent in tail loss probe timeout, check if that
+ * recovered the last packet. If so, that will indicate a real loss and
+ * the congestion window needs to be lowered.
+ *
+ * In all cases, TLP bookkeeping and the PMTU-blackhole start timestamp
+ * are reset on the way out.
+ */
+static void
+tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
+{
+	/* Case 1: an RTO retransmission turned out to be spurious */
+	if (tp->t_rxtshift > 0 &&
+	    tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
+		++tcpstat.tcps_sndrexmitbad;
+		tcp_bad_rexmt_restore_state(tp, th);
+		tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
+	} else if ((tp->t_flagsext & TF_SENT_TLPROBE)
+	    && tp->t_tlphighrxt > 0
+	    && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
+	    && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
+		/*
+		 * Case 2: this ack covers the segment retransmitted by a
+		 * tail loss probe, and the probe was not itself spurious.
+		 *
+		 * check DSACK information also to make sure that
+		 * the TLP was indeed needed
+		 */
+		if (tcp_rxtseg_dsack_for_tlp(tp)) {
+			/*
+			 * received a DSACK to indicate that TLP was
+			 * not needed
+			 */
+			tcp_rxtseg_clean(tp);
+			goto out;
+		}
+
+		/*
+		 * The tail loss probe recovered the last packet and
+		 * we need to adjust the congestion window to take
+		 * this loss into account.
+		 */
+		++tcpstat.tcps_tlp_recoverlastpkt;
+		if (!IN_FASTRECOVERY(tp)) {
+			tcp_reduce_congestion_window(tp);
+			EXIT_FASTRECOVERY(tp);
+		}
+		tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
+	} else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
+		/*
+		 * Case 3: All of the retransmitted segments were duplicated,
+		 * this can be an indication of bad fast retransmit.
+		 */
+		tcpstat.tcps_dsack_badrexmt++;
+		tcp_bad_rexmt_restore_state(tp, th);
+		tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
+		tcp_rxtseg_clean(tp);
+	}
+out:
+	/* TLP state is single-shot; clear it for the next probe */
+	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+	tp->t_tlphighrxt = 0;
+	tp->t_tlpstart = 0;
+
+	/*
+	 * check if the latest ack was for a segment sent during PMTU
+	 * blackhole detection. If the timestamp on the ack is before
+	 * PMTU blackhole detection, then revert the size of the max
+	 * segment to previous size.
+	 */
+	if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
+	    tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
+		if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
+		    && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
+			tcp_pmtud_revert_segment_size(tp);
+		}
+	}
+	if (tp->t_pmtud_start_ts > 0) {
+		tp->t_pmtud_start_ts = 0;
+	}
+}
+
+/*
+ * Check if early retransmit can be attempted according to RFC 5827.
+ *
+ * If packet reordering is detected on a connection, fast recovery will
+ * be delayed until it is clear that the packet was lost and not reordered.
+ * But reordering detection is done only when SACK is enabled.
+ *
+ * On connections that do not support SACK, there is a limit on the number
+ * of early retransmits that can be done per minute. This limit is needed
+ * to make sure that too many packets are not retransmitted when there is
+ * packet reordering.
+ */
+static void
+tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th)
+{
+	u_int32_t obytes, snd_off;
+	int32_t snd_len;
+	struct socket *so = tp->t_inpcb->inp_socket;
+
+	/*
+	 * Only consider early retransmit when the feature is enabled,
+	 * the per-minute limit has not been hit (non-SACK case), there
+	 * is unacked data, and either the first dupack arrived or SACK
+	 * holes exist.
+	 */
+	if (early_rexmt && (SACK_ENABLED(tp) ||
+	    tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
+	    SEQ_GT(tp->snd_max, tp->snd_una) &&
+	    (tp->t_dupacks == 1 ||
+	    (SACK_ENABLED(tp) &&
+	    !TAILQ_EMPTY(&tp->snd_holes)))) {
+		/*
+		 * If there are only a few outstanding
+		 * segments on the connection, we might need
+		 * to lower the retransmit threshold. This
+		 * will allow us to do Early Retransmit as
+		 * described in RFC 5827.
+		 */
+		if (SACK_ENABLED(tp) &&
+		    !TAILQ_EMPTY(&tp->snd_holes)) {
+			obytes = (tp->snd_max - tp->snd_fack) +
+			    tp->sackhint.sack_bytes_rexmit;
+		} else {
+			obytes = (tp->snd_max - tp->snd_una);
+		}
+
+		/*
+		 * In order to lower retransmit threshold the
+		 * following two conditions must be met.
+		 * 1. the amount of outstanding data is less
+		 * than 4*SMSS bytes
+		 * 2. there is no unsent data ready for
+		 * transmission or the advertised window
+		 * will limit sending new segments.
+		 */
+		snd_off = tp->snd_max - tp->snd_una;
+		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
+		if (obytes < (tp->t_maxseg << 2) &&
+		    snd_len <= 0) {
+			u_int32_t osegs;
+
+			/* Round the outstanding byte count up to whole segments */
+			osegs = obytes / tp->t_maxseg;
+			if ((osegs * tp->t_maxseg) < obytes) {
+				osegs++;
+			}
+
+			/*
+			 * Since the connection might have already
+			 * received some dupacks, we add them to
+			 * the outstanding segments count to get
+			 * the correct retransmit threshold.
+			 *
+			 * By checking for early retransmit after
+			 * receiving some duplicate acks when SACK
+			 * is supported, the connection will
+			 * enter fast recovery even if multiple
+			 * segments are lost in the same window.
+			 */
+			osegs += tp->t_dupacks;
+			if (osegs < 4) {
+				/* Clamp threshold to [t_dupacks, tcprexmtthresh] */
+				tp->t_rexmtthresh =
+				    ((osegs - 1) > 1) ? (osegs - 1) : 1;
+				tp->t_rexmtthresh =
+				    min(tp->t_rexmtthresh, tcprexmtthresh);
+				tp->t_rexmtthresh =
+				    max(tp->t_rexmtthresh, tp->t_dupacks);
+
+				/* Start a new rate-limit window on first use */
+				if (tp->t_early_rexmt_count == 0) {
+					tp->t_early_rexmt_win = tcp_now;
+				}
+
+				if (tp->t_flagsext & TF_SENT_TLPROBE) {
+					tcpstat.tcps_tlp_recovery++;
+					tcp_ccdbg_trace(tp, th,
+					    TCP_CC_TLP_RECOVERY);
+				} else {
+					tcpstat.tcps_early_rexmt++;
+					tp->t_early_rexmt_count++;
+					tcp_ccdbg_trace(tp, th,
+					    TCP_CC_EARLY_RETRANSMIT);
+				}
+			}
+		}
+	}
+
+	/*
+	 * If we ever sent a TLP probe, the acknowledgement will trigger
+	 * early retransmit because the value of snd_fack will be close
+	 * to snd_max. This will take care of adjustments to the
+	 * congestion window. So we can reset the TF_SENT_TLPROBE flag.
+	 */
+	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+	tp->t_tlphighrxt = 0;
+	tp->t_tlpstart = 0;
+}
+
+/*
+ * Handle TCP Fast Open options on an incoming SYN (server side).
+ *
+ * Returns TRUE only when a valid TFO cookie was presented and the
+ * half-open backlog still has room, i.e. the SYN's data may be
+ * accepted immediately.  Returns FALSE for a bare cookie request,
+ * an invalid cookie, or a full backlog; in the first two cases a
+ * (new) cookie will be offered in the SYN-ACK.
+ */
+static boolean_t
+tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to)
+{
+	u_char out[CCAES_BLOCK_SIZE];
+	unsigned char len;
+
+	/* Nothing to do unless a TFO option is present and server TFO is on */
+	if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
+	    !(tcp_fastopen & TCP_FASTOPEN_SERVER)) {
+		return FALSE;
+	}
+
+	/* Client asked for a cookie; remember to offer one in the SYN-ACK */
+	if ((to->to_flags & TOF_TFOREQ)) {
+		tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
+
+		tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
+		tcpstat.tcps_tfo_cookie_req_rcv++;
+		return FALSE;
+	}
+
+	/* Ok, then it must be an offered cookie. We need to check that ... */
+	tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
+
+	/* Cookie length is the option length minus the kind/len header */
+	len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
+	to->to_tfo++;
+	if (memcmp(out, to->to_tfo, len)) {
+		/* Cookies are different! Let's return and offer a new cookie */
+		tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
+
+		tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
+		tcpstat.tcps_tfo_cookie_invalid++;
+		return FALSE;
+	}
+
+	/* Enforce the cap on TFO half-open connections */
+	if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
+		/* Need to decrement again as we just increased it... */
+		OSDecrementAtomic(&tcp_tfo_halfcnt);
+		return FALSE;
+	}
+
+	tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
+
+	tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
+	tcpstat.tcps_tfo_syn_data_rcv++;
+
+	return TRUE;
+}
+
+/*
+ * Process TCP Fast Open state on receipt of a SYN-ACK (client side):
+ * store a received cookie and update the TFO heuristics, or — when no
+ * cookie came back — record success or SYN-loss for future decisions.
+ */
+static void
+tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to)
+{
+	if (to->to_flags & TOF_TFO) {
+		unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
+
+		/*
+		 * If this happens, things have gone terribly wrong. len should
+		 * have been checked in tcp_dooptions.
+		 */
+		VERIFY(len <= TFO_COOKIE_LEN_MAX);
+
+		to->to_tfo++;
+
+		/* Remember the cookie for future connections to this peer */
+		tcp_cache_set_cookie(tp, to->to_tfo, len);
+		tcp_heuristic_tfo_success(tp);
+
+		tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
+		tcpstat.tcps_tfo_cookie_rcv++;
+		if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) {
+			/* We sent a cookie yet still got a fresh one back */
+			tcpstat.tcps_tfo_cookie_wrong++;
+			tp->t_tfo_stats |= TFO_S_COOKIE_WRONG;
+		}
+	} else {
+		/*
+		 * Thus, no cookie in the response, but we either asked for one
+		 * or sent SYN+DATA. Now, we need to check whether we had to
+		 * rexmit the SYN. If that's the case, it's better to start
+		 * backing off TFO-cookie requests.
+		 */
+		if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+		    tp->t_tfo_flags & TFO_F_SYN_LOSS) {
+			tp->t_tfo_stats |= TFO_S_SYN_LOSS;
+			tcpstat.tcps_tfo_syn_loss++;
+
+			tcp_heuristic_tfo_loss(tp);
+		} else {
+			if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) {
+				tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV;
+				tcpstat.tcps_tfo_no_cookie_rcv++;
+			}
+
+			tcp_heuristic_tfo_success(tp);
+		}
+	}
+}
+
+/*
+ * Arm a TFO probe when a pure (zero-length) segment is received:
+ * move to the PROBING state and schedule the probe on the keepalive
+ * timer after one retransmission timeout.
+ */
+static void
+tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
+{
+	/* Only a segment carrying no payload triggers probing */
+	if (tlen == 0) {
+		tp->t_tfo_probe_state = TFO_PROBE_PROBING;
+
+		/*
+		 * We send the probe out rather quickly (after one RTO).
+		 * It does not really hurt that much, it's only one
+		 * additional segment on the wire.
+		 */
+		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_REXMTVAL(tp));
+	}
+}
+
+/*
+ * Data has arrived: leave any TFO probing (or later) state and
+ * return to TFO_PROBE_NONE.
+ */
+static void
+tcp_tfo_rcv_data(struct tcpcb *tp)
+{
+	if (tp->t_tfo_probe_state < TFO_PROBE_PROBING) {
+		return;
+	}
+	tp->t_tfo_probe_state = TFO_PROBE_NONE;
+}
+
+/*
+ * Evaluate an incoming ack while TFO-probing: an in-sequence segment
+ * ends probing; a segment beyond rcv_nxt reveals a hole, so wait for
+ * data and re-arm the keepalive timer with one RTO.
+ */
+static void
+tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
+{
+	/* Nothing to do unless we are actively probing */
+	if (tp->t_tfo_probe_state != TFO_PROBE_PROBING ||
+	    !(tp->t_tfo_probes > 0)) {
+		return;
+	}
+
+	if (th->th_seq == tp->rcv_nxt) {
+		/* No hole, so stop probing */
+		tp->t_tfo_probe_state = TFO_PROBE_NONE;
+	} else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
+		/* There is a hole! Wait a bit for data... */
+		tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
+		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+		    TCP_REXMTVAL(tp));
+	}
+}
+
+/*
+ * Update snd_wnd information.
+ *
+ * Implements the standard send-window update test: the advertised
+ * window is accepted only if the segment carries an ACK and is
+ * "newer" than the last window update — by sequence number first,
+ * then by ack number, then (on a tie) by a strictly larger window.
+ * Returns true when snd_wnd was updated, false otherwise.
+ */
+static inline bool
+tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
+    u_int32_t tiwin, int tlen)
+{
+	/* Don't look at the window if there is no ACK flag */
+	if ((thflags & TH_ACK) &&
+	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+		/* keep track of pure window updates */
+		if (tlen == 0 &&
+		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+			tcpstat.tcps_rcvwinupd++;
+		}
+		tp->snd_wnd = tiwin;
+		/* Remember seq/ack of the segment used for this update */
+		tp->snd_wl1 = th->th_seq;
+		tp->snd_wl2 = th->th_ack;
+		if (tp->snd_wnd > tp->max_sndwnd) {
+			tp->max_sndwnd = tp->snd_wnd;
+		}
+
+		/* Let MPTCP know a subflow's window changed */
+		if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) {
+			mptcp_update_window_wakeup(tp);
+		}
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Issue any deferred socket wakeups: wake readers and/or writers
+ * depending on which flags were set during segment processing.
+ */
+static void
+tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup)
+{
+	if (read_wakeup) {
+		sorwakeup(so);
+	}
+	if (write_wakeup) {
+		sowwakeup(so);
+	}
+}
+
+void
+tcp_input(struct mbuf *m, int off0)
+{
+ struct tcphdr *th;
+ struct ip *ip = NULL;
+ struct inpcb *inp;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int tlen, off;
+ int drop_hdrlen;
+ struct tcpcb *tp = 0;
+ int thflags;
+ struct socket *so = 0;
+ int todrop, acked, ourfinisacked, needoutput = 0;
+ int read_wakeup = 0;
+ int write_wakeup = 0;
+ struct in_addr laddr;
+#if INET6
+ struct in6_addr laddr6;
+#endif
+ int dropsocket = 0;
+ int iss = 0, nosock = 0;
+ u_int32_t tiwin, sack_bytes_acked = 0;
+ struct tcpopt to; /* options in this segment */
+#if TCPDEBUG
+ short ostate = 0;
+#endif
+#if IPFIREWALL
+ struct sockaddr_in *next_hop = NULL;
+ struct m_tag *fwd_tag;
+#endif /* IPFIREWALL */
+ u_char ip_ecn = IPTOS_ECN_NOTECT;
+ unsigned int ifscope;
+ uint8_t isconnected, isdisconnected;
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+ int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
+ int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
+ int turnoff_lro = 0, win;
+#if MPTCP
+ struct mptcb *mp_tp = NULL;
+#endif /* MPTCP */
+ boolean_t cell = IFNET_IS_CELLULAR(ifp);
+ boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
+ boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
+ boolean_t recvd_dsack = FALSE;
+ struct tcp_respond_args tra;
+ int prev_t_state;
+ boolean_t check_cfil = cfil_filter_present();
+ bool findpcb_iterated = false;
+ /*
+ * The mbuf may be freed after it has been added to the receive socket
+ * buffer or the reassembly queue, so we reinitialize th to point to a
+ * safe copy of the TCP header
+ */
+ struct tcphdr saved_tcphdr = {};
+ /*
+ * Save copy of the IPv4/IPv6 header.
+ * Note: use array of uint32_t to silence compiler warning when casting
+ * to a struct ip6_hdr pointer.
+ */
+#define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t))
+ uint32_t saved_hdr[MAX_IPWORDS];
+
+#define TCP_INC_VAR(stat, npkts) do { \
+ stat += npkts; \
+} while (0)
+
+ TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
+#if IPFIREWALL
+ /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
+ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
+ KERNEL_TAG_TYPE_IPFORWARD, NULL);