diff --git a/bsd/net/classq/classq_fq_codel.c b/bsd/net/classq/classq_fq_codel.c
index 75a568d2c521be2ac2063ca5cf58fd98c458151f..e78ceefdf2d46392a3d26be9e51927ca41267cea 100644
--- a/bsd/net/classq/classq_fq_codel.c
+++ b/bsd/net/classq/classq_fq_codel.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -38,6 +38,7 @@
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/kauth.h>
+#include <sys/sdt.h>
 #include <kern/zalloc.h>
 #include <netinet/in.h>
 
@@ -47,6 +48,8 @@
 #include <net/pktsched/pktsched_fq_codel.h>
 #include <net/classq/classq_fq_codel.h>
 
+#include <netinet/tcp_var.h>
+
 static uint32_t flowq_size;                     /* size of flowq */
 static struct mcache *flowq_cache = NULL;       /* mcache for flowq */
 
@@ -69,6 +72,7 @@ fq_codel_init(void)
        if (flowq_cache == NULL) {
                panic("%s: failed to allocate flowq_cache", __func__);
                /* NOTREACHED */
+               __builtin_unreachable();
        }
 }
 
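The added `__builtin_unreachable()` tells the compiler what the `/* NOTREACHED */` comment tells humans: `panic()` never returns, so everything after it is dead code. A minimal user-space sketch of the same pattern, assuming a stand-in panic function the compiler cannot see as noreturn:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for panic(); not declared noreturn, so the compiler
 * cannot tell on its own that it never comes back. */
static void my_panic(const char *msg)
{
	fprintf(stderr, "panic: %s\n", msg);
	abort();
}

int checked_div(int a, int b)
{
	if (b == 0) {
		my_panic("divide by zero");
		/* NOTREACHED */
		__builtin_unreachable();  /* lets the optimizer drop the dead
		                           * path and silences fallthrough
		                           * warnings */
	}
	return a / b;
}
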
@@ -84,7 +88,7 @@ fq_alloc(classq_pkt_type_t ptype)
        fq_t *fq = NULL;
        fq = mcache_alloc(flowq_cache, MCR_SLEEP);
        if (fq == NULL) {
-               log(LOG_ERR, "%s: unable to allocate from flowq_cache\n");
+               log(LOG_ERR, "%s: unable to allocate from flowq_cache\n", __func__);
                return NULL;
        }
 
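The hunk above fixes a classic varargs bug: the format string consumes a `%s` argument that was never passed, so `log()` would read an arbitrary pointer off the stack. A user-space equivalent of the broken and fixed calls, using `syslog()` as a stand-in for the kernel's `log()`:

#include <syslog.h>

void report_alloc_failure(void)
{
	/* BUG: "%s" consumes an argument that is not supplied; the call
	 * reads whatever happens to sit in the vararg slot. */
	/* syslog(LOG_ERR, "%s: unable to allocate from flowq_cache\n"); */

	/* FIXED: __func__ supplies the string the format expects. */
	syslog(LOG_ERR, "%s: unable to allocate from flowq_cache\n", __func__);
}
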
@@ -93,19 +97,23 @@ fq_alloc(classq_pkt_type_t ptype)
        if (ptype == QP_MBUF) {
                MBUFQ_INIT(&fq->fq_mbufq);
        }
+       CLASSQ_PKT_INIT(&fq->fq_dq_head);
+       CLASSQ_PKT_INIT(&fq->fq_dq_tail);
+       fq->fq_in_dqlist = false;
        return fq;
 }
 
 void
 fq_destroy(fq_t *fq)
 {
+       VERIFY(fq->fq_flags & FQF_DESTROYED);
        VERIFY(fq_empty(fq));
        VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)));
        VERIFY(fq->fq_bytes == 0);
        mcache_free(flowq_cache, fq);
 }
 
-static void
+static inline void
 fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
     u_int64_t *now)
 {
@@ -123,6 +131,10 @@ fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
                 */
                FQ_SET_DELAY_HIGH(flowq);
                fq_cl->fcl_stat.fcl_dequeue_stall++;
+               os_log_error(OS_LOG_DEFAULT, "%s: dequeue stall num: %d, "
+                   "scidx: %d, flow: 0x%x, iface: %s", __func__,
+                   fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index,
+                   flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp));
        }
 }
 
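The condition guarding this branch is outside the diff context, but the shape of a dequeue-stall check is conventional: a flow that still holds packets yet has not been serviced within the allowed window gets flagged delay-high. A hedged sketch with illustrative field names (not the XNU ones):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical model of a dequeue-stall check; field and threshold
 * names are stand-ins for the real flowq state. */
struct flow {
	uint64_t bytes_queued;   /* backlog waiting in this flow */
	uint64_t last_getqtime;  /* last dequeue timestamp (ns) */
	bool     delay_high;     /* mirror of FQF_DELAY_HIGH */
};

static void detect_dequeue_stall(struct flow *f, uint64_t now_ns,
    uint64_t stall_threshold_ns)
{
	if (f->bytes_queued > 0 &&
	    now_ns - f->last_getqtime >= stall_threshold_ns) {
		f->delay_high = true;  /* analogous to FQ_SET_DELAY_HIGH() */
	}
}
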
@@ -130,12 +142,13 @@ void
 fq_head_drop(fq_if_t *fqs, fq_t *fq)
 {
        pktsched_pkt_t pkt;
-       uint32_t *pkt_flags;
+       volatile uint32_t *pkt_flags;
        uint64_t *pkt_timestamp;
        struct ifclassq *ifq = fqs->fqs_ifq;
 
        _PKTSCHED_PKT_INIT(&pkt);
-       if (fq_getq_flow_internal(fqs, fq, &pkt) == NULL) {
+       fq_getq_flow_internal(fqs, fq, &pkt);
+       if (pkt.pktsched_pkt_mbuf == NULL) {
                return;
        }
 
@@ -143,8 +156,14 @@ fq_head_drop(fq_if_t *fqs, fq_t *fq)
            NULL, NULL);
 
        *pkt_timestamp = 0;
-       if (pkt.pktsched_ptype == QP_MBUF) {
+       switch (pkt.pktsched_ptype) {
+       case QP_MBUF:
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
        IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
@@ -152,6 +171,62 @@ fq_head_drop(fq_if_t *fqs, fq_t *fq)
        pktsched_free_pkt(&pkt);
 }
 
+
+static int
+fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl,
+    pktsched_pkt_t *pkt)
+{
+       classq_pkt_type_t ptype = fq->fq_ptype;
+       uint32_t comp_gencnt = 0;
+       uint64_t *pkt_timestamp;
+       uint64_t old_timestamp = 0;
+       uint32_t old_pktlen = 0;
+       struct ifclassq *ifq = fqs->fqs_ifq;
+
+       if (__improbable(!tcp_do_ack_compression)) {
+               return 0;
+       }
+
+       pktsched_get_pkt_vars(pkt, NULL, &pkt_timestamp, NULL, NULL, NULL,
+           &comp_gencnt);
+
+       if (comp_gencnt == 0) {
+               return 0;
+       }
+
+       fq_cl->fcl_stat.fcl_pkts_compressible++;
+
+       if (fq_empty(fq)) {
+               return 0;
+       }
+
+       if (ptype == QP_MBUF) {
+               struct mbuf *m = MBUFQ_LAST(&fq->fq_mbufq);
+
+               if (comp_gencnt != m->m_pkthdr.comp_gencnt) {
+                       return 0;
+               }
+
+               /* If we get this far, we should merge/replace the segment */
+               MBUFQ_REMOVE(&fq->fq_mbufq, m);
+               old_pktlen = m_pktlen(m);
+               old_timestamp = m->m_pkthdr.pkt_timestamp;
+
+               IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
+               m_freem(m);
+       }
+
+       fq->fq_bytes -= old_pktlen;
+       fq_cl->fcl_stat.fcl_byte_cnt -= old_pktlen;
+       fq_cl->fcl_stat.fcl_pkt_cnt--;
+       IFCQ_DEC_LEN(ifq);
+       IFCQ_DEC_BYTES(ifq, old_pktlen);
+
+       *pkt_timestamp = old_timestamp;
+
+       return CLASSQEQ_COMPRESSED;
+}
+
 int
 fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
 {
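`fq_compressor()` above implements ACK compression at enqueue time: when the arriving packet and the packet at the tail of the flow queue carry the same nonzero `comp_gencnt`, the queued one is superseded, and its timestamp is inherited so queue-delay accounting still measures from the first ACK's arrival. A minimal model of that decision, with stand-in types rather than the real mbuf/flowq machinery:

#include <stddef.h>
#include <stdint.h>

/* Illustrative packet; comp_gencnt of 0 means "not compressible". */
struct pkt {
	uint32_t comp_gencnt;
	uint32_t len;
	uint64_t timestamp;
};

/* Returns 1 if 'incoming' should replace 'tail' (inheriting its
 * timestamp, as the kernel code does), 0 if both must be queued. */
static int ack_supersedes_tail(struct pkt *incoming, const struct pkt *tail)
{
	if (incoming->comp_gencnt == 0 || tail == NULL) {
		return 0;       /* nothing to merge with */
	}
	if (incoming->comp_gencnt != tail->comp_gencnt) {
		return 0;       /* different generation: keep both */
	}
	/* Same generation: the newer ACK supersedes the queued one. */
	incoming->timestamp = tail->timestamp;
	return 1;
}
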
@@ -159,72 +234,95 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
        u_int64_t now;
        fq_t *fq = NULL;
        uint64_t *pkt_timestamp;
-       uint32_t *pkt_flags;
-       uint32_t pkt_flowid, pkt_tx_start_seq;
+       volatile uint32_t *pkt_flags;
+       uint32_t pkt_flowid, cnt;
        uint8_t pkt_proto, pkt_flowsrc;
 
+       cnt = pkt->pktsched_pcnt;
        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
-           &pkt_flowsrc, &pkt_proto, &pkt_tx_start_seq);
+           &pkt_flowsrc, &pkt_proto, NULL);
 
-       if (pkt->pktsched_ptype == QP_MBUF) {
+       /*
+        * XXX Not walking the chain to set this flag on every packet.
+        * This flag is only used for debugging. Nothing is affected if it's
+        * not set.
+        */
+       switch (pkt->pktsched_ptype) {
+       case QP_MBUF:
                /* See comments in <rdar://problem/14040693> */
                VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
                *pkt_flags |= PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
 
-       if (*pkt_timestamp > 0) {
-               now = *pkt_timestamp;
-       } else {
-               struct timespec now_ts;
-               nanouptime(&now_ts);
-               now = (now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;
-               *pkt_timestamp = now;
-       }
+       /*
+        * Timestamps for every packet must be set prior to entering this path.
+        */
+       now = *pkt_timestamp;
+       ASSERT(now > 0);
 
        /* find the flowq for this packet */
        fq = fq_if_hash_pkt(fqs, pkt_flowid, pktsched_get_pkt_svc(pkt),
            now, TRUE, pkt->pktsched_ptype);
-       if (fq == NULL) {
+       if (__improbable(fq == NULL)) {
+               DTRACE_IP1(memfail__drop, fq_if_t *, fqs);
                /* drop the packet if we could not allocate a flow queue */
-               fq_cl->fcl_stat.fcl_drop_memfailure++;
-               IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
+               fq_cl->fcl_stat.fcl_drop_memfailure += cnt;
                return CLASSQEQ_DROP;
        }
        VERIFY(fq->fq_ptype == pkt->pktsched_ptype);
 
        fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);
 
-       if (FQ_IS_DELAYHIGH(fq)) {
+       if (__improbable(FQ_IS_DELAYHIGH(fq))) {
                if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
                    (*pkt_flags & PKTF_FLOW_ADV)) {
                        fc_adv = 1;
                        /*
                         * If the flow is suspended or it is not
-                        * TCP, drop the packet
+                        * TCP/QUIC, drop the chain.
                         */
-                       if (pkt_proto != IPPROTO_TCP) {
+                       if ((pkt_proto != IPPROTO_TCP) &&
+                           (pkt_proto != IPPROTO_QUIC)) {
                                droptype = DTYPE_EARLY;
-                               fq_cl->fcl_stat.fcl_drop_early++;
+                               fq_cl->fcl_stat.fcl_drop_early += cnt;
                        }
+                       DTRACE_IP6(flow__adv, fq_if_t *, fqs,
+                           fq_if_classq_t *, fq_cl, fq_t *, fq,
+                           int, droptype, pktsched_pkt_t *, pkt,
+                           uint32_t, cnt);
                } else {
                        /*
-                        * Need to drop a packet, instead of dropping this
-                        * one, try to drop from the head of the queue
+                        * Need to drop packets to make room for the new
+                        * ones. Try to drop from the head of the queue
+                        * instead of the latest packets.
                         */
                        if (!fq_empty(fq)) {
-                               fq_head_drop(fqs, fq);
+                               uint32_t i;
+
+                               for (i = 0; i < cnt; i++) {
+                                       fq_head_drop(fqs, fq);
+                               }
                                droptype = DTYPE_NODROP;
                        } else {
                                droptype = DTYPE_EARLY;
                        }
-                       fq_cl->fcl_stat.fcl_drop_early++;
+                       fq_cl->fcl_stat.fcl_drop_early += cnt;
+
+                       DTRACE_IP6(no__flow__adv, fq_if_t *, fqs,
+                           fq_if_classq_t *, fq_cl, fq_t *, fq,
+                           int, droptype, pktsched_pkt_t *, pkt,
+                           uint32_t, cnt);
                }
        }
 
        /* Set the return code correctly */
-       if (fc_adv == 1 && droptype != DTYPE_FORCED) {
-               if (fq_if_add_fcentry(fqs, pkt, pkt_flowid, pkt_flowsrc,
-                   fq_cl)) {
+       if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) {
+               if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
                        fq->fq_flags |= FQF_FLOWCTL_ON;
                        /* deliver flow control advisory error */
                        if (droptype == DTYPE_NODROP) {
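Condensing the delay-high branch above: when the flow supports flow-control advisories and the packet carries `PKTF_FLOW_ADV`, only transports that react to advisories (TCP, and now QUIC) are kept; everything else is dropped early. A sketch of just that advisory decision, with placeholder protocol constants:

#include <stdbool.h>
#include <stdint.h>

#define SK_IPPROTO_TCP   6
#define SK_IPPROTO_QUIC  253   /* placeholder value; the real constant
                                * comes from the kernel headers */

/* Returns true when a delay-high, advisory-capable flow should drop
 * the arriving chain early rather than rely on flow control. */
static bool drop_early_under_advisory(uint8_t proto)
{
	/* Flow advisories only help transports that back off. */
	return proto != SK_IPPROTO_TCP && proto != SK_IPPROTO_QUIC;
}
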
@@ -242,49 +340,87 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
                        ret = CLASSQEQ_DROP_FC;
                        fq_cl->fcl_stat.fcl_flow_control_fail++;
                }
+               DTRACE_IP3(fc__ret, fq_if_t *, fqs, int, droptype, int, ret);
        }
 
        /*
-        * If the queue length hits the queue limit, drop a packet from the
-        * front of the queue for a flow with maximum number of bytes. This
-        * will penalize heavy and unresponsive flows. It will also avoid a
-        * tail drop.
+        * If the queue length hits the queue limit, drop a chain with the
+        * same number of packets from the front of the queue for a flow with
+        * maximum number of bytes. This will penalize heavy and unresponsive
+        * flows. It will also avoid a tail drop.
         */
-       if (droptype == DTYPE_NODROP && fq_if_at_drop_limit(fqs)) {
+       if (__improbable(droptype == DTYPE_NODROP &&
+           fq_if_at_drop_limit(fqs))) {
+               uint32_t i;
+
                if (fqs->fqs_large_flow == fq) {
                        /*
                         * Drop from the head of the current fq. Since a
                         * new packet will be added to the tail, it is ok
                         * to leave fq in place.
                         */
-                       fq_head_drop(fqs, fq);
+                       DTRACE_IP5(large__flow, fq_if_t *, fqs,
+                           fq_if_classq_t *, fq_cl, fq_t *, fq,
+                           pktsched_pkt_t *, pkt, uint32_t, cnt);
+
+                       for (i = 0; i < cnt; i++) {
+                               fq_head_drop(fqs, fq);
+                       }
                } else {
                        if (fqs->fqs_large_flow == NULL) {
                                droptype = DTYPE_FORCED;
-                               fq_cl->fcl_stat.fcl_drop_overflow++;
+                               fq_cl->fcl_stat.fcl_drop_overflow += cnt;
                                ret = CLASSQEQ_DROP;
 
+                               DTRACE_IP5(no__large__flow, fq_if_t *, fqs,
+                                   fq_if_classq_t *, fq_cl, fq_t *, fq,
+                                   pktsched_pkt_t *, pkt, uint32_t, cnt);
+
                                /*
                                 * if this fq was freshly created and there
                                 * is nothing to enqueue, free it
                                 */
                                if (fq_empty(fq) && !(fq->fq_flags &
                                    (FQF_NEW_FLOW | FQF_OLD_FLOW))) {
-                                       fq_if_destroy_flow(fqs, fq_cl, fq);
+                                       fq_if_destroy_flow(fqs, fq_cl, fq, true);
                                        fq = NULL;
                                }
                        } else {
-                               fq_if_drop_packet(fqs);
+                               DTRACE_IP5(different__large__flow,
+                                   fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
+                                   fq_t *, fq, pktsched_pkt_t *, pkt,
+                                   uint32_t, cnt);
+
+                               for (i = 0; i < cnt; i++) {
+                                       fq_if_drop_packet(fqs);
+                               }
                        }
                }
        }
 
-       if (droptype == DTYPE_NODROP) {
-               uint32_t pkt_len = pktsched_get_pkt_len(pkt);
-               fq_enqueue(fq, pkt->pktsched_pkt);
-               fq->fq_bytes += pkt_len;
-               fq_cl->fcl_stat.fcl_byte_cnt += pkt_len;
-               fq_cl->fcl_stat.fcl_pkt_cnt++;
+       if (__probable(droptype == DTYPE_NODROP)) {
+               uint32_t chain_len = pktsched_get_pkt_len(pkt);
+
+               /*
+                * We do not compress if we are enqueuing a chain.
+                * Traversing the chain to look for acks would defeat the
+                * purpose of batch enqueueing.
+                */
+               if (cnt == 1) {
+                       ret = fq_compressor(fqs, fq, fq_cl, pkt);
+                       if (ret != CLASSQEQ_COMPRESSED) {
+                               ret = CLASSQEQ_SUCCESS;
+                       } else {
+                               fq_cl->fcl_stat.fcl_pkts_compressed++;
+                       }
+               }
+               DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
+                   fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt);
+               fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt);
+
+               fq->fq_bytes += chain_len;
+               fq_cl->fcl_stat.fcl_byte_cnt += chain_len;
+               fq_cl->fcl_stat.fcl_pkt_cnt += cnt;
 
                /*
                 * check if this queue will qualify to be the next
@@ -292,7 +428,7 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
                 */
                fq_if_is_flow_heavy(fqs, fq);
        } else {
-               IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
+               DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret);
                return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
        }
 
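The drop-limit logic above now operates on chains: arriving with `cnt` packets while the scheduler sits at its limit sheds the same number of packets from the head of the heaviest flow, so the new chain can still be enqueued at the tail. A condensed sketch, with `head_drop()` standing in for `fq_head_drop()`/`fq_if_drop_packet()`:

#include <stdint.h>

typedef struct flowq flowq_t;
extern void head_drop(flowq_t *heavy_flow);   /* stand-in declaration */

/* Free as many slots as the incoming chain needs, taken from the head
 * of the heaviest flow; this penalizes heavy, unresponsive flows and
 * avoids tail-dropping the new traffic. */
static void make_room_for_chain(flowq_t *heavy_flow, uint32_t cnt)
{
	for (uint32_t i = 0; i < cnt; i++) {
		head_drop(heavy_flow);
	}
}
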
@@ -312,20 +448,21 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
        return ret;
 }
 
-void *
+void
 fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
 {
-       void *p;
+       classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
        uint32_t plen;
        fq_if_classq_t *fq_cl;
        struct ifclassq *ifq = fqs->fqs_ifq;
 
-       fq_dequeue(fq, p);
-       if (p == NULL) {
-               return NULL;
+       fq_dequeue(fq, &p);
+       if (p.cp_ptype == QP_INVALID) {
+               VERIFY(p.cp_mbuf == NULL);
+               return;
        }
 
-       pktsched_pkt_encap(pkt, fq->fq_ptype, p);
+       pktsched_pkt_encap(pkt, &p);
        plen = pktsched_get_pkt_len(pkt);
 
        VERIFY(fq->fq_bytes >= plen);
@@ -341,28 +478,26 @@ fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
        if (fq_empty(fq)) {
                fq->fq_getqtime = 0;
        }
-
-       return p;
 }
 
-void *
+void
 fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
 {
-       void *p;
        fq_if_classq_t *fq_cl;
        u_int64_t now;
        int64_t qdelay = 0;
        struct timespec now_ts;
-       uint32_t *pkt_flags, pkt_tx_start_seq;
+       volatile uint32_t *pkt_flags;
        uint64_t *pkt_timestamp;
 
-       p = fq_getq_flow_internal(fqs, fq, pkt);
-       if (p == NULL) {
-               return NULL;
+       fq_getq_flow_internal(fqs, fq, pkt);
+       if (pkt->pktsched_ptype == QP_INVALID) {
+               VERIFY(pkt->pktsched_pkt_mbuf == NULL);
+               return;
        }
 
        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
-           NULL, &pkt_tx_start_seq);
+           NULL, NULL);
 
        nanouptime(&now_ts);
        now = (now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;
@@ -381,12 +516,15 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
                if (fq->fq_min_qdelay > fqs->fqs_target_qdelay) {
                        if (!FQ_IS_DELAYHIGH(fq)) {
                                FQ_SET_DELAY_HIGH(fq);
+                               os_log_error(OS_LOG_DEFAULT,
+                                   "%s: high delay idx: %d, %llu, flow: 0x%x, "
+                                   "iface: %s", __func__, fq->fq_sc_index,
+                                   fq->fq_min_qdelay, fq->fq_flowhash,
+                                   if_name(fqs->fqs_ifq->ifcq_ifp));
                        }
                } else {
                        FQ_CLEAR_DELAY_HIGH(fq);
                }
-
-
                /* Reset measured queue delay and update time */
                fq->fq_updatetime = now + fqs->fqs_update_interval;
                fq->fq_min_qdelay = 0;
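This hunk sits inside the per-interval delay evaluation visible in the surrounding context: once the update interval elapses, the minimum queueing delay observed over the window is compared against the target, the flow's delay-high state is set or cleared, and the window resets. A self-contained sketch of that bookkeeping, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative per-flow delay state; names are stand-ins. */
struct flow_state {
	uint64_t min_qdelay;   /* min delay seen this interval (ns) */
	uint64_t updatetime;   /* end of the current interval (ns) */
	bool     delay_high;
};

static void update_delay_state(struct flow_state *f, uint64_t now,
    uint64_t target_qdelay, uint64_t update_interval)
{
	if (now < f->updatetime) {
		return;        /* measurement window not over yet */
	}
	f->delay_high = (f->min_qdelay > target_qdelay);
	/* Reset the window, as the kernel code does. */
	f->updatetime = now + update_interval;
	f->min_qdelay = 0;
}
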
@@ -407,9 +545,13 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
        fq_if_is_flow_heavy(fqs, fq);
 
        *pkt_timestamp = 0;
-       if (pkt->pktsched_ptype == QP_MBUF) {
+       switch (pkt->pktsched_ptype) {
+       case QP_MBUF:
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
+               break;
+       default:
+               VERIFY(0);
+               /* NOTREACHED */
+               __builtin_unreachable();
        }
-
-       return p;
 }
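
With `fq_getq_flow()` and `fq_getq_flow_internal()` now returning `void`, callers detect an empty flow from the out-parameter instead of a NULL return, exactly as the reworked `fq_head_drop()` above does. An abbreviated caller sketch using the identifiers from the diff (declarations elided, so this is illustrative rather than compilable in isolation):

pktsched_pkt_t pkt;

_PKTSCHED_PKT_INIT(&pkt);          /* leaves pktsched_ptype == QP_INVALID */
fq_getq_flow(fqs, fq, &pkt);
if (pkt.pktsched_ptype == QP_INVALID) {
	VERIFY(pkt.pktsched_pkt_mbuf == NULL);  /* nothing was dequeued */
	return;
}
/* ... hand the dequeued packet to the dequeue path ... */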