+ goto out;
+ }
+
+ if (!TSTMP_SUPPORTED(tp)) {
+ /*
+ * Timestamp option is not supported on this connection.
+ * If the connection reached a state to indicate that
+ * the receive socket buffer needs to grow, increase
+ * the high water mark.
+ */
+ if (TSTMP_GEQ(tcp_now,
+ tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
+ if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
+ tcp_sbrcv_reserve(tp, sbrcv,
+ tcp_autorcvbuf_max, 0);
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ } else if (to->to_tsecr != 0) {
+ /* If the timestamp shows that one RTT has
+ * completed, we can stop counting the
+ * bytes. Here we consider increasing
+ * the socket buffer if it fits the following
+ * criteria:
+ * 1. the bandwidth measured in last rtt, is more
+ * than half of sb_hiwat, this will help to scale the
+ * buffer according to the bandwidth on the link.
+ * 2. the space left in sbrcv is less than
+ * one fourth of the bandwidth measured in last rtt, this
+ * will help to accommodate an application reading slowly.
+ */
+ if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
+ if ((tp->rfbuf_cnt > (sbrcv->sb_hiwat -
+ (sbrcv->sb_hiwat >> tcp_rbuf_hiwat_shift)) ||
+ (sbrcv->sb_hiwat - sbrcv->sb_cc) <
+ (tp->rfbuf_cnt >> tcp_rbuf_win_shift))) {
+ u_int32_t rcvbuf_inc;
+ /*
+ * Increment the receive window by a multiple of
+ * maximum sized segments. This will prevent a
+ * connection from sending smaller segments on
+ * wire if it is limited by the receive window.
+ *
+ * Set the ideal size based on current bandwidth
+ * measurements. We set the ideal size on receive
+ * socket buffer to be twice the bandwidth delay
+ * product.
+ */
+ rcvbuf_inc = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+ tcp_sbrcv_reserve(tp, sbrcv,
+ sbrcv->sb_hiwat + rcvbuf_inc,
+ (tp->rfbuf_cnt * 2));
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ }
+out:
+ /* Restart the measurement */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ return;
+}
+
+/*
+ * Trim the excess space that was added to a receive socket buffer to
+ * help a slow-reading application. The ideal size of the buffer depends
+ * on the link bandwidth, or is set by the application, and this routine
+ * shrinks the buffer back toward that size.
+ *
+ * Nothing happens unless tcp_do_autorcvbuf is 1, sb_idealsize is greater
+ * than zero and sb_hiwat currently exceeds sb_idealsize.
+ */
+void
+tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv)
+{
+	int32_t shrink;
+	u_int32_t excess, adv_window, keep;
+
+	if (tcp_do_autorcvbuf != 1 || !(sbrcv->sb_idealsize > 0) ||
+	    sbrcv->sb_hiwat <= sbrcv->sb_idealsize)
+		return;
+
+	/* How far the current high-water mark is above the ideal size */
+	excess = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
+
+	/* Window currently advertised to the peer on this connection */
+	adv_window = tp->rcv_adv - tp->rcv_nxt;
+
+	/*
+	 * Space that must stay available after trimming:
+	 * - never go below the window that was already advertised
+	 * - if possible keep one sixteenth of the ideal size so the
+	 *   window is not closed completely
+	 */
+	keep = max(adv_window, (sbrcv->sb_idealsize >> 4));
+
+	/*
+	 * Both candidates can be zero; fall back to a multiple of the
+	 * maximum segment size in that case.
+	 */
+	if (keep == 0)
+		keep = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+
+	/*
+	 * Signed on purpose: when the queued data plus the reserve already
+	 * reaches sb_hiwat the result is not positive and nothing is trimmed.
+	 */
+	shrink = sbrcv->sb_hiwat - (sbrcv->sb_cc + keep);
+
+	/* Never trim below the ideal size */
+	shrink = imin(shrink, (int32_t)excess);
+
+	if (shrink > 0)
+		sbreserve(sbrcv, (sbrcv->sb_hiwat - shrink));
+}
+
+/*
+ * Trim the send socket buffer. There are two reasons to do so:
+ * 1. the rtt seen on the connection is climbing, so we do not want to
+ *    fill the buffers any further.
+ * 2. the congestion window backed off, so there is no need to hold more
+ *    mbufs for the connection than the cwnd will allow.
+ *
+ * A trim is attempted only when tcp_do_autosendbuf is 1, the buffer has
+ * both SB_AUTOSIZE and SB_TRIM set, sb_idealsize is greater than zero and
+ * sb_hiwat is above sb_idealsize. SB_TRIM is cleared whenever sb_hiwat is
+ * at or below sb_idealsize when the function finishes.
+ */
+void
+tcp_sbsnd_trim(struct sockbuf *sbsnd)
+{
+	if (tcp_do_autosendbuf == 1 &&
+	    (sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
+	    (SB_AUTOSIZE | SB_TRIM) &&
+	    sbsnd->sb_idealsize > 0 &&
+	    sbsnd->sb_hiwat > sbsnd->sb_idealsize) {
+		/*
+		 * Go all the way down to the ideal size when the queued
+		 * data fits in it, otherwise only down to the amount of
+		 * data currently queued.
+		 */
+		u_int32_t excess = (sbsnd->sb_cc <= sbsnd->sb_idealsize) ?
+		    (sbsnd->sb_hiwat - sbsnd->sb_idealsize) :
+		    (sbsnd->sb_hiwat - sbsnd->sb_cc);
+
+		sbreserve(sbsnd, (sbsnd->sb_hiwat - excess));
+	}
+
+	/* Target reached (or never exceeded): no more trimming is pending */
+	if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) {
+		sbsnd->sb_flags &= ~(SB_TRIM);
+	}
+}
+
+/*
+ * Without the timestamp option on a connection that is on the receiving
+ * side of a stream, the delay on the link can not be measured
+ * accurately. Rather than enabling automatic receive socket buffer
+ * resizing, simply give the receive socket buffer more space.
+ *
+ * The reservation of twice tcp_recvspace is made only when the receive
+ * buffer has SB_AUTOSIZE set and TF_REQ_TSTMP and TF_RCVD_TSTMP are not
+ * both set in t_flags.
+ */
+static inline void
+tcp_sbrcv_tstmp_check(struct tcpcb *tp)
+{
+	struct socket *sock = tp->t_inpcb->inp_socket;
+	struct sockbuf *rcvbuf = &sock->so_rcv;
+	u_int32_t grown_size = 2 * tcp_recvspace;
+
+	/* Timestamps were both requested and received: nothing to do */
+	if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) ==
+	    (TF_REQ_TSTMP | TF_RCVD_TSTMP))
+		return;
+
+	/* Leave buffers that are not auto-sized alone */
+	if ((rcvbuf->sb_flags & SB_AUTOSIZE) == 0)
+		return;
+
+	tcp_sbrcv_reserve(tp, rcvbuf, grown_size, 0);
+}
+
+/*
+ * A receiver evaluates the flow of packets on a connection to see if it
+ * can reduce ack traffic. It will start stretching acks if all of the
+ * following conditions are met:
+ * 1. tcp_delack_enabled is set to 3 (not tested in this function)
+ * 2. The bytes received in the last 100ms exceed a threshold defined by
+ *    maxseg_unacked
+ * 3. The connection has not been idle for the tcp_maxrcvidle period
+ * 4. The connection has seen enough packets to let slow-start finish
+ *    after connection establishment or after some packet loss
+ *
+ * The receiver stops stretching acks if there is congestion/reordering,
+ * as indicated by packets on the reassembly queue or an ECN. If the
+ * delayed-ack timer fires while stretching acks, the packet flow has
+ * dropped below the threshold defined by maxseg_unacked and the receiver
+ * stops stretching acks. The receiver gets no indication when slow-start
+ * completes or when the connection becomes idle, which is why
+ * tcp_rcvsspktcnt is used to cover slow-start and tcp_maxrcvidle to
+ * identify the idle state.
+ *
+ * Returns 1 when ack stretching may be enabled, 0 otherwise.
+ */
+static inline int
+tcp_stretch_ack_enable(struct tcpcb *tp)
+{
+	/* Stretching acks is disabled for this connection */
+	if (tp->t_flagsext & TF_NOSTRETCHACK)
+		return (0);
+
+	/* Not enough bytes received in the current window */
+	if (tp->rcv_by_unackwin < (maxseg_unacked * tp->t_maxseg))
+		return (0);
+
+	/* The connection has been idle for too long */
+	if (!TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now))
+		return (0);
+
+	/* Still waiting for enough packets to cover slow-start */
+	if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) &&
+	    tp->rcv_waitforss < tcp_rcvsspktcnt)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * Reset the state related to the stretch-ack algorithm: clear
+ * TF_STRETCHACK, zero the byte count for the current window and start a
+ * new window that ends tcp_rcvunackwin from now. This makes the receiver
+ * generate an ack every other packet again, and it will start
+ * re-evaluating the packet arrival rate to decide whether lowering the
+ * ack traffic is beneficial.
+ */
+void
+tcp_reset_stretch_ack(struct tcpcb *tp)
+{
+	/* Begin a fresh measurement window */
+	tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
+	tp->rcv_by_unackwin = 0;
+
+	/* Stop stretching acks */
+	tp->t_flags &= ~(TF_STRETCHACK);
+}
+
+/*
+ * The last packet was a retransmission; decide whether this ack shows
+ * that the retransmission was spurious.
+ *
+ * When the connection supports timestamps and the segment carries a
+ * non-zero echoed timestamp that is older than the start of the
+ * retransmission, the retransmit was not needed. Otherwise, if this was
+ * the first retransmit (t_rxtshift == 1) and the ack arrived within an
+ * RTT/2 window of the retransmission start, the retransmit is also
+ * considered a mistake.
+ *
+ * Returns 1 if the retransmit was spurious, 0 otherwise.
+ */
+static int
+tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcpopt *to)
+{
+	int32_t elapsed;
+	int32_t half_rtt;
+
+	/* Time since the retransmission started, and the RTT/2 window */
+	elapsed = (int32_t)(tcp_now - tp->t_rxtstart);
+	half_rtt = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+
+	/* Timestamp based detection */
+	if (TSTMP_SUPPORTED(tp) && tp->t_rxtstart > 0 &&
+	    (to->to_flags & TOF_TS) != 0 &&
+	    to->to_tsecr != 0 &&
+	    TSTMP_LT(to->to_tsecr, tp->t_rxtstart))
+		return (1);
+
+	/* Timing based detection, first retransmit only */
+	if (tp->t_rxtshift == 1 && elapsed < half_rtt)
+		return (1);
+
+	return (0);
+}
+
+
+/*
+ * Restore congestion window state after a spurious timeout was
+ * detected.
+ *
+ * Without timestamp support the congestion window and slow-start
+ * threshold saved before the retransmit are put back, and fast recovery
+ * is re-entered if TF_WASFRECOVERY is set. With timestamp support the
+ * slow-start threshold is restored and the congestion window is rebuilt
+ * from the outstanding data, following RFC 4015.
+ *
+ * In both cases the recovery point and retransmit bookkeeping are reset,
+ * the send socket buffer is adjusted, and the connection is marked so
+ * that the RTT is recomputed from the next eligible sample.
+ */
+static void
+tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
+{
+	if (!TSTMP_SUPPORTED(tp)) {
+		/* Put back the values saved before the retransmit */
+		tp->snd_cwnd = tp->snd_cwnd_prev;
+		tp->snd_ssthresh = tp->snd_ssthresh_prev;
+		if (tp->t_flags & TF_WASFRECOVERY)
+			ENTER_FASTRECOVERY(tp);
+	} else {
+		u_int32_t in_flight, newly_acked;
+
+		/* Data still outstanding, and data covered by this ack */
+		in_flight = tp->snd_max - th->th_ack;
+		newly_acked = BYTES_ACKED(th, tp);
+
+		/*
+		 * Implement bad retransmit recovery as
+		 * described in RFC 4015.
+		 */
+		tp->snd_ssthresh = tp->snd_ssthresh_prev;
+
+		/* Initialize cwnd to the initial window */
+		if (CC_ALGO(tp)->cwnd_init != NULL)
+			CC_ALGO(tp)->cwnd_init(tp);
+
+		/*
+		 * Evaluated after cwnd_init so that snd_cwnd below is the
+		 * value it left behind.
+		 */
+		tp->snd_cwnd = in_flight + min(newly_acked, tp->snd_cwnd);
+	}
+
+	/* Undo the retransmit bookkeeping */
+	tp->snd_recover = tp->snd_recover_prev;
+	tp->snd_nxt = tp->snd_max;
+	tp->t_rxtshift = 0;
+	tp->t_rxtstart = 0;
+
+	/* Fix send socket buffer to reflect the change in cwnd */
+	tcp_bad_rexmt_fix_sndbuf(tp);
+
+	/*
+	 * This RTT might reflect the extra delay induced by the network.
+	 * Skip using this sample for RTO calculation and mark the
+	 * connection so the RTT can be recomputed when the next eligible
+	 * sample is found.
+	 */
+	tp->t_flagsext |= TF_RECOMPUTE_RTT;
+	tp->t_badrexmt_time = tcp_now;
+	tp->t_rtttime = 0;
+}
+
+void
+tcp_input(m, off0)
+ struct mbuf *m;
+ int off0;
+{
+ register struct tcphdr *th;
+ register struct ip *ip = NULL;
+ register struct inpcb *inp;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int tlen, off;
+ int drop_hdrlen;
+ register struct tcpcb *tp = 0;
+ register int thflags;
+ struct socket *so = 0;
+ int todrop, acked, ourfinisacked, needoutput = 0;
+ struct in_addr laddr;
+#if INET6
+ struct in6_addr laddr6;
+#endif
+ int dropsocket = 0;
+ int iss = 0, nosock = 0;
+ u_int32_t tiwin, sack_bytes_acked = 0;
+ struct tcpopt to; /* options in this segment */
+ struct sockaddr_in *next_hop = NULL;
+#if TCPDEBUG
+ short ostate = 0;
+#endif
+ struct m_tag *fwd_tag;
+ u_char ip_ecn = IPTOS_ECN_NOTECT;
+ unsigned int ifscope, nocell = 0;
+ uint8_t isconnected, isdisconnected;
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+ int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
+ int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
+ int turnoff_lro = 0, win;
+#if MPTCP
+ struct mptcb *mp_tp = NULL;
+ uint16_t mptcp_csum = 0;
+#endif /* MPTCP */
+ boolean_t cell = IFNET_IS_CELLULAR(ifp);
+ boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
+
+#define TCP_INC_VAR(stat, npkts) do { \
+ stat += npkts; \
+} while (0)
+
+ TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
+
+ /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
+ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
+ KERNEL_TAG_TYPE_IPFORWARD, NULL);
+ } else {
+ fwd_tag = NULL;
+ }
+ if (fwd_tag != NULL) {
+ struct ip_fwd_tag *ipfwd_tag =
+ (struct ip_fwd_tag *)(fwd_tag+1);
+
+ next_hop = ipfwd_tag->next_hop;
+ m_tag_delete(m, fwd_tag);
+ }
+
+#if INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+#endif /* INET6 */
+ int rstreason; /* For badport_bandlim accounting purposes */
+ struct proc *proc0=current_proc();
+
+ KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
+
+#if INET6
+ isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+#endif
+ bzero((char *)&to, sizeof(to));
+
+#if INET6
+ if (isipv6) {
+ /*
+ * Expect 32-bit aligned data pointer on
+ * strict-align platforms
+ */
+ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
+ /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
+ ip6 = mtod(m, struct ip6_hdr *);
+ tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+ th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
+
+ if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
+ goto dropnosock;
+
+ KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
+ (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
+ th->th_seq, th->th_ack, th->th_win);
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate unbounded/unconnected pcb,
+ * unspecified IPv6 address can be used to confuse us.
+ *
+ * Note that packets with unspecified IPv6 destination are
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ IF_TCP_STATINC(ifp, unspecv6);