+ uint32_t ia6_flags;
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+
+ IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
+
+ /* Expect 32-bit aligned data pointer on strict-align platforms */
+ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
+ /*
+ * draft-itojun-ipv6-tcp-to-anycast
+	 * Is there a better place to put this?
+ */
+ if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
+ if (ia6_flags & IN6_IFF_ANYCAST) {
+ struct ip6_hdr *ip6;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ icmp6_error(m, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_ADDR,
+ (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
+
+ IF_TCP_STATINC(ifp, icmp6unreach);
+
+ return (IPPROTO_DONE);
+ }
+ }
+
+ tcp_input(m, *offp);
+ return (IPPROTO_DONE);
+}
+#endif
+
+/* Return true or false depending on how much of the mbuf space in the
+ * system is in use. This is used to determine whether a socket buffer
+ * can take more memory from the system for auto-tuning.
+ */
+u_int8_t
+tcp_cansbgrow(struct sockbuf *sb)
+{
+ /* Calculate the host level space limit in terms of MSIZE buffers.
+ * We can use a maximum of half of the available mbuf space for
+ * socket buffers.
+ */
+ u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
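+	/*
+	 * For example, assuming 2 KB clusters and 256-byte MSIZE mbufs
+	 * (the actual shift values are platform-dependent), this works
+	 * out to (nmbclusters / 2) * 8 MSIZE buffers.
+	 */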
+
+	/* Calculate the per-socket-buffer limit in bytes. We optimize this
+	 * limit for up to 16 socket buffers.
+ */
+
+ u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
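+	/*
+	 * That is, a single socket buffer may use at most 1/16 of the
+	 * total cluster pool, or (nmbclusters * (1 << MCLSHIFT)) / 16
+	 * bytes.
+	 */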
+
+ if ((total_sbmb_cnt < mblim) &&
+ (sb->sb_hiwat < sbspacelim)) {
+ return(1);
+ } else {
+ OSIncrementAtomic64(&sbmb_limreached);
+ }
+ return(0);
+}
+
+static void
+tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
+ u_int32_t newsize, u_int32_t idealsize)
+{
+
+ /* newsize should not exceed max */
+ newsize = min(newsize, tcp_autorcvbuf_max);
+
+ /* The receive window scale negotiated at the
+ * beginning of the connection will also set a
+ * limit on the socket buffer size
+ */
+ newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
+
+ /* Set new socket buffer size */
+ if (newsize > sbrcv->sb_hiwat &&
+ (sbreserve(sbrcv, newsize) == 1)) {
+ sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
+ (idealsize != 0) ? idealsize : newsize),
+ tcp_autorcvbuf_max);
+
+ /* Again check the limit set by the advertised
+ * window scale
+ */
+ sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
+ TCP_MAXWIN << tp->rcv_scale);
+ }
+}
+
+/*
+ * This function is used to grow a receive socket buffer. It
+ * will take into account system-level memory usage and the
+ * bandwidth available on the link to make a decision.
+ */
+static void
+tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
+ struct tcpopt *to, u_int32_t pktlen) {
+
+ /*
+	 * Do not grow the receive socket buffer if
+	 * - auto resizing is disabled, globally or on this socket
+	 * - the high water mark already reached the maximum
+	 * - the stream is in the background and the receive side is
+	 *   being throttled
+	 * - there are segments in the reassembly queue indicating loss;
+	 *   there is no need to increase the receive window during
+	 *   recovery because more data is not going to be sent, and a
+	 *   duplicate ack sent during recovery should not change the
+	 *   receive window
+ */
+ if (tcp_do_autorcvbuf == 0 ||
+ (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
+ tcp_cansbgrow(sbrcv) == 0 ||
+ sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
+ (tp->t_flagsext & TF_RECV_THROTTLE) ||
+ !LIST_EMPTY(&tp->t_segq)) {
+		/* Cannot resize the socket buffer; just return */
+ goto out;
+ }
+
+ if (TSTMP_GT(tcp_now,
+ tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
+ /* If there has been an idle period in the
+ * connection, just restart the measurement
+ */
+ goto out;
+ }
+
+ if (!TSTMP_SUPPORTED(tp)) {
+ /*
+ * Timestamp option is not supported on this connection.
+ * If the connection reached a state to indicate that
+ * the receive socket buffer needs to grow, increase
+ * the high water mark.
+ */
+ if (TSTMP_GEQ(tcp_now,
+ tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
+ if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
+ tcp_sbrcv_reserve(tp, sbrcv,
+ tcp_autorcvbuf_max, 0);
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ } else if (to->to_tsecr != 0) {
+ /*
+		 * If the timestamp shows that one RTT has
+		 * completed, we can stop counting the bytes.
+		 * Consider increasing the socket buffer if the
+		 * bandwidth measured in the last RTT is more than
+		 * half of sb_hiwat; this helps scale the buffer
+		 * according to the bandwidth on the link.
+ */
+ if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
+ if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
+ (sbrcv->sb_hiwat >> 1))) {
+ int32_t rcvbuf_inc, min_incr;
+ /*
+ * Increment the receive window by a
+ * multiple of maximum sized segments.
+				 * This prevents a connection from sending
+				 * smaller segments on the wire when it is
+				 * limited by the receive window.
+ *
+ * Set the ideal size based on current
+ * bandwidth measurements. We set the
+ * ideal size on receive socket buffer to
+ * be twice the bandwidth delay product.
+ */
+ rcvbuf_inc = (tp->rfbuf_cnt << 1)
+ - sbrcv->sb_hiwat;
+
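+				/*
+				 * For example, if sb_hiwat is 64 KB and
+				 * 48 KB arrived in the last RTT, the
+				 * target of twice the measured amount is
+				 * 96 KB, so rcvbuf_inc is 32 KB before
+				 * the minimum and segment-size rounding
+				 * below.
+				 */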
+ /*
+				 * Make the increment at least
+				 * (1 << tcp_autorcvbuf_inc_shift) segments
+ */
+ min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+ if (rcvbuf_inc < min_incr)
+ rcvbuf_inc = min_incr;
+
+ rcvbuf_inc =
+ (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
+ tcp_sbrcv_reserve(tp, sbrcv,
+ sbrcv->sb_hiwat + rcvbuf_inc,
+ (tp->rfbuf_cnt * 2));
+ }
+ goto out;
+ } else {
+ tp->rfbuf_cnt += pktlen;
+ return;
+ }
+ }
+out:
+ /* Restart the measurement */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ return;
+}
+
+/* This function trims the excess space added to the socket buffer
+ * to help a slow-reading app. The ideal size of a socket buffer
+ * depends on the link bandwidth, or is set by the application, and
+ * we aim to reach that size.
+ */
+void
+tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
+ if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
+ sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
+ int32_t trim;
+ /* compute the difference between ideal and current sizes */
+ u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
+
+ /* Compute the maximum advertised window for
+ * this connection.
+ */
+ u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
+
+ /* How much can we trim the receive socket buffer?
+		 * 1. It cannot be trimmed beyond the max receive window
+		 *    advertised.
+		 * 2. If possible, leave 1/16 of bandwidth*delay to avoid
+		 *    closing the window completely.
+ */
+ u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
+
+		/* Sometimes leave can be zero; in that case leave at least
+		 * a few segments' worth of space.
+ */
+ if (leave == 0)
+ leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
+
+ trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
+ trim = imin(trim, (int32_t)diff);
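+		/*
+		 * The resulting high water mark is
+		 * max(sb_cc + leave, sb_idealsize), so trimming never
+		 * cuts into buffered data plus head room and never goes
+		 * below the ideal size.
+		 */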
+
+ if (trim > 0)
+ sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
+ }
+}
+
+/* We may need to trim the send socket buffer size for two reasons:
+ * 1. if the RTT seen on the connection is climbing, we do not want
+ *    to fill the buffers any further.
+ * 2. if the congestion window on the socket backed off, there is no
+ *    need to hold more mbufs for that connection than the cwnd allows.
+ */
+void
+tcp_sbsnd_trim(struct sockbuf *sbsnd) {
+ if (tcp_do_autosendbuf == 1 &&
+ ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
+ (SB_AUTOSIZE | SB_TRIM)) &&
+ (sbsnd->sb_idealsize > 0) &&
+ (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
+ u_int32_t trim = 0;
+ if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
+ trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
+ } else {
+ trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
+ }
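+		/*
+		 * The new high water mark is max(sb_cc, sb_idealsize):
+		 * shrink to the ideal size when the queued data fits in
+		 * it, otherwise only down to what is currently queued.
+		 */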
+ sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
+ }
+ if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
+ sbsnd->sb_flags &= ~(SB_TRIM);
+}
+
+/*
+ * If the timestamp option was not negotiated on this connection
+ * and this connection is on the receiving side of a stream,
+ * then we cannot measure the delay on the link accurately.
+ * Instead of enabling automatic receive socket buffer
+ * resizing, just give more space to the receive socket buffer.
+ */
+static inline void
+tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
+ struct socket *so = tp->t_inpcb->inp_socket;
+ u_int32_t newsize = 2 * tcp_recvspace;
+ struct sockbuf *sbrcv = &so->so_rcv;
+
+ if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
+ (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
+ (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
+ tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
+ }
+}
+
+/* A receiver will evaluate the flow of packets on a connection
+ * to see if it can reduce ack traffic. The receiver will start
+ * stretching acks if all of the following conditions are met:
+ * 1. tcp_delack_enabled is set to 3.
+ * 2. The bytes received in the last 100ms exceed a threshold
+ *      defined by maxseg_unacked.
+ * 3. The connection has not been idle for a tcp_maxrcvidle period.
+ * 4. The connection has seen enough packets to let slow-start
+ *      finish after connection establishment or after some packet loss.
+ *
+ * The receiver will stop stretching acks if there is congestion/reordering
+ * as indicated by packets on the reassembly queue or by ECN. If the delayed-ack
+ * timer fires while stretching acks, it means that the packet flow has gone
+ * below the threshold defined by maxseg_unacked and the receiver will stop
+ * stretching acks. The receiver gets no indication when slow-start is completed
+ * or when the connection reaches an idle state. That is why we use
+ * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
+ * state.
+ */
+static inline int
+tcp_stretch_ack_enable(struct tcpcb *tp)
+{
+ if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
+ tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
+ TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
+ (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
+ (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
+ return(1);
+ }
+
+ return(0);
+}
+
+/*
+ * Reset the state related to the stretch-ack algorithm. This will make
+ * the receiver generate an ack every other packet. The receiver
+ * will start re-evaluating the rate at which packets arrive to decide
+ * if it can benefit from lowering the ack traffic.
+ */
+void
+tcp_reset_stretch_ack(struct tcpcb *tp)
+{
+ tp->t_flags &= ~(TF_STRETCHACK);
+ tp->rcv_by_unackwin = 0;
+ tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
+
+ /*
+ * When there is packet loss or packet re-ordering or CWR due to
+ * ECN, the sender's congestion window is reduced. In these states,
+ * generate an ack for every other packet for some time to allow
+ * the sender's congestion window to grow.
+ */
+ tp->t_flagsext |= TF_RCVUNACK_WAITSS;
+ tp->rcv_waitforss = 0;
+}
+
+/*
+ * The last packet was a retransmission; check if this ack
+ * indicates that the retransmission was spurious.
+ *
+ * If the connection supports timestamps, we use them to detect
+ * whether the last retransmit was needed. Otherwise, if the ACK
+ * arrived within an RTT/2 window of the retransmission, the
+ * retransmit was a mistake in the first place.
+ *
+ * This function returns 1 if the retransmit was spurious,
+ * 0 otherwise.
+ */
+int
+tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
+ struct tcpopt *to, u_int32_t rxtime)
+{
+ int32_t tdiff, bad_rexmt_win;
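+	/*
+	 * t_srtt is stored left-shifted by TCP_RTT_SHIFT, so shifting by
+	 * (TCP_RTT_SHIFT + 1) yields roughly srtt / 2 in timer ticks.
+	 */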
+ bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+
+ /* If the ack has ECN CE bit, then cwnd has to be adjusted */
+ if ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)
+ && (th->th_flags & TH_ECE))
+ return (0);
+ if (TSTMP_SUPPORTED(tp)) {
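+		/*
+		 * If the echoed timestamp predates the time of the
+		 * retransmission, the peer is acknowledging the original
+		 * transmission, so the retransmit was spurious.
+		 */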
+ if (rxtime > 0 && (to->to_flags & TOF_TS)
+ && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, rxtime))
+ return (1);
+ } else {
+ if ((tp->t_rxtshift == 1
+ || (tp->t_flagsext & TF_SENT_TLPROBE))
+ && rxtime > 0) {
+ tdiff = (int32_t)(tcp_now - rxtime);
+ if (tdiff < bad_rexmt_win)
+ return(1);
+ }
+ }
+ return(0);
+}
+
+
+/*
+ * Restore congestion window state if a spurious timeout
+ * was detected.
+ */
+static void
+tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
+{
+ if (TSTMP_SUPPORTED(tp)) {
+ u_int32_t fsize, acked;
+ fsize = tp->snd_max - th->th_ack;
+ acked = BYTES_ACKED(th, tp);
+
+ /*
+ * Implement bad retransmit recovery as
+ * described in RFC 4015.
+ */
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+
+ /* Initialize cwnd to the initial window */
+ if (CC_ALGO(tp)->cwnd_init != NULL)
+ CC_ALGO(tp)->cwnd_init(tp);
+
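+		/*
+		 * cwnd becomes the data still outstanding (snd_max -
+		 * th_ack) plus the newly acked bytes, with the acked
+		 * bytes capped at the current cwnd (typically the
+		 * initial window set just above).
+		 */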
+ tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
+
+ } else {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ if (tp->t_flags & TF_WASFRECOVERY)
+ ENTER_FASTRECOVERY(tp);
+ }
+ tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+ tp->snd_recover = tp->snd_recover_prev;
+ tp->snd_nxt = tp->snd_max;
+ tp->t_rxtshift = 0;
+ tp->t_rxtstart = 0;
+
+ /* Fix send socket buffer to reflect the change in cwnd */
+ tcp_bad_rexmt_fix_sndbuf(tp);
+
+ /*
+ * This RTT might reflect the extra delay induced
+ * by the network. Skip using this sample for RTO
+ * calculation and mark the connection so we can
+ * recompute RTT when the next eligible sample is
+ * found.
+ */
+ tp->t_flagsext |= TF_RECOMPUTE_RTT;
+ tp->t_badrexmt_time = tcp_now;
+ tp->t_rtttime = 0;
+}
+
+/*
+ * If the previous packet was sent by the retransmission timer and was
+ * not needed, restore the congestion window to its state before that
+ * transmission.
+ *
+ * If the last packet was sent during a tail loss probe timeout, check
+ * whether this ack recovered the last packet. If so, that indicates a
+ * real loss and the congestion window needs to be lowered.
+ */
+static void
+tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
+{
+ if (tp->t_rxtshift > 0 &&
+ tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
+ ++tcpstat.tcps_sndrexmitbad;
+ tcp_bad_rexmt_restore_state(tp, th);
+ tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
+ } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
+ && tp->t_tlphighrxt > 0
+ && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
+ && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
+ /*
+ * The tail loss probe recovered the last packet and
+ * we need to adjust the congestion window to take
+ * this loss into account.
+ */
+ ++tcpstat.tcps_tlp_recoverlastpkt;
+ if (!IN_FASTRECOVERY(tp)) {
+ tcp_reduce_congestion_window(tp);
+ EXIT_FASTRECOVERY(tp);
+ }
+ tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
+ }
+
+ tp->t_flagsext &= ~(TF_SENT_TLPROBE);
+ tp->t_tlphighrxt = 0;
+ tp->t_tlpstart = 0;
+
+ /*
+	 * Check if the latest ack was for a segment sent during PMTU
+	 * blackhole detection. If the timestamp on the ack predates the
+	 * start of PMTU blackhole detection, revert the maximum segment
+	 * size to its previous value.
+ */
+ if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
+ tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
+ if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
+ && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
+ tcp_pmtud_revert_segment_size(tp);
+ }
+ }
+ if (tp->t_pmtud_start_ts > 0)
+ tp->t_pmtud_start_ts = 0;
+}
+
+/*
+ * Check if early retransmit can be attempted according to RFC 5827.
+ *
+ * If packet reordering is detected on a connection, fast recovery will
+ * be delayed until it is clear that the packet was lost and not reordered.
+ * But reordering detection is done only when SACK is enabled.
+ *
+ * On connections that do not support SACK, there is a limit on the number
+ * of early retransmits that can be done per minute. This limit is needed
+ * to avoid retransmitting too many packets when there is packet
+ * reordering.
+ */
+static void
+tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
+{
+ u_int32_t obytes, snd_off;
+ int32_t snd_len;
+ struct socket *so = tp->t_inpcb->inp_socket;