/*
 * Copyright (c) 2011-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/random.h>
#include <sys/kernel_types.h>
#include <sys/sysctl.h>

#include <kern/zalloc.h>

#include <net/net_osdep.h>
#include <net/classq/classq.h>
#include <pexpert/pexpert.h>
#include <net/classq/classq_sfb.h>
#include <net/classq/classq_fq_codel.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <net/flowadv.h>

#include <libkern/libkern.h>
static errno_t ifclassq_dequeue_common(struct ifclassq *, mbuf_svc_class_t,
    u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *,
    u_int32_t *, boolean_t);
static void ifclassq_tbr_dequeue_common(struct ifclassq *, mbuf_svc_class_t,
    boolean_t, classq_pkt_t *);
static u_int64_t ifclassq_target_qdelay = 0;
SYSCTL_QUAD(_net_classq, OID_AUTO, target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ifclassq_target_qdelay, "target queue delay in nanoseconds");

static u_int64_t ifclassq_update_interval = 0;
SYSCTL_QUAD(_net_classq, OID_AUTO, update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_update_interval,
    "update interval in nanoseconds");
_CASSERT(MBUF_TC_BE == 0);
_CASSERT(MBUF_SC_BE == 0);
_CASSERT(IFCQ_SC_MAX == MBUF_SC_MAX_CLASSES);
int
ifclassq_setup(struct ifnet *ifp, u_int32_t sflags, boolean_t reuse)
{
	struct ifclassq *ifq = &ifp->if_snd;
	int err = 0;

	IFCQ_LOCK(ifq);
	VERIFY(IFCQ_IS_EMPTY(ifq));
	bzero(&ifq->ifcq_xmitcnt, sizeof(ifq->ifcq_xmitcnt));
	bzero(&ifq->ifcq_dropcnt, sizeof(ifq->ifcq_dropcnt));

	VERIFY(!IFCQ_TBR_IS_ENABLED(ifq));
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
	VERIFY(ifq->ifcq_flags == 0);
	VERIFY(ifq->ifcq_sflags == 0);
	VERIFY(ifq->ifcq_disc == NULL);

	if (ifp->if_eflags & IFEF_TXSTART) {
		u_int32_t maxlen = 0;

		if ((maxlen = IFCQ_MAXLEN(ifq)) == 0) {
			maxlen = if_sndq_maxlen;
		}
		IFCQ_SET_MAXLEN(ifq, maxlen);

		if (IFCQ_MAXLEN(ifq) != if_sndq_maxlen &&
		    IFCQ_TARGET_QDELAY(ifq) == 0) {
			/*
			 * Choose static queues because the interface has
			 * a maximum queue size set.
			 */
			sflags &= ~PKTSCHEDF_QALG_DELAYBASED;
		}
		ifq->ifcq_sflags = sflags;
		err = ifclassq_pktsched_setup(ifq);
		if (err == 0) {
			ifq->ifcq_flags = (IFCQF_READY | IFCQF_ENABLED);
		}
	}
	IFCQ_UNLOCK(ifq);
	return err;
}
void
ifclassq_teardown(struct ifnet *ifp)
{
	struct ifclassq *ifq = &ifp->if_snd;

	IFCQ_LOCK(ifq);

	if (IFCQ_IS_READY(ifq)) {
		if (IFCQ_TBR_IS_ENABLED(ifq)) {
			struct tb_profile tb =
			    { .rate = 0, .percent = 0, .depth = 0 };
			(void) ifclassq_tbr_set(ifq, &tb, FALSE);
		}
		pktsched_teardown(ifq);
		ifq->ifcq_flags = 0;
	}
	ifq->ifcq_sflags = 0;

	VERIFY(IFCQ_IS_EMPTY(ifq));
	VERIFY(!IFCQ_TBR_IS_ENABLED(ifq));
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
	VERIFY(ifq->ifcq_flags == 0);
	VERIFY(ifq->ifcq_sflags == 0);
	VERIFY(ifq->ifcq_disc == NULL);
	IFCQ_MAXLEN(ifq) = 0;
	bzero(&ifq->ifcq_xmitcnt, sizeof(ifq->ifcq_xmitcnt));
	bzero(&ifq->ifcq_dropcnt, sizeof(ifq->ifcq_dropcnt));

	IFCQ_UNLOCK(ifq);
}
static errno_t
ifclassq_pktsched_setup(struct ifclassq *ifq)
{
	struct ifnet *ifp = ifq->ifcq_ifp;
	classq_pkt_type_t ptype = QP_MBUF;
	errno_t err;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifp->if_eflags & IFEF_TXSTART);

	err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL, ifq->ifcq_sflags, ptype);

	return err;
}
void
ifclassq_set_maxlen(struct ifclassq *ifq, u_int32_t maxqlen)
{
	IFCQ_LOCK(ifq);
	if (maxqlen == 0) {
		maxqlen = if_sndq_maxlen;
	}
	IFCQ_SET_MAXLEN(ifq, maxqlen);
	IFCQ_UNLOCK(ifq);
}

u_int32_t
ifclassq_get_maxlen(struct ifclassq *ifq)
{
	return IFCQ_MAXLEN(ifq);
}
int
ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int32_t *packets,
    u_int32_t *bytes)
{
	int err = 0;

	IFCQ_LOCK(ifq);
	if (sc == MBUF_SC_UNSPEC) {
		VERIFY(packets != NULL);
		*packets = IFCQ_LEN(ifq);
	} else {
		cqrq_stat_sc_t req = { sc, 0, 0 };

		VERIFY(MBUF_VALID_SC(sc));
		VERIFY(packets != NULL && bytes != NULL);

		err = fq_if_request_classq(ifq, CLASSQRQ_STAT_SC, &req);
		if (packets != NULL) {
			*packets = req.packets;
		}
		if (bytes != NULL) {
			*bytes = req.bytes;
		}
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
static inline void
ifclassq_set_packet_metadata(struct ifclassq *ifq, struct ifnet *ifp,
    classq_pkt_t *p)
{
	if (!IFNET_IS_CELLULAR(ifp)) {
		return;
	}

	switch (p->cp_ptype) {
	case QP_MBUF: {
		struct mbuf *m = p->cp_mbuf;
		m->m_pkthdr.pkt_flags |= PKTF_VALID_UNSENT_DATA;
		m->m_pkthdr.bufstatus_if = IFCQ_BYTES(ifq);
		m->m_pkthdr.bufstatus_sndbuf = (uint32_t)ifp->if_sndbyte_unsent;
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
errno_t
ifclassq_enqueue(struct ifclassq *ifq, classq_pkt_t *head, classq_pkt_t *tail,
    u_int32_t cnt, u_int32_t bytes, boolean_t *pdrop)
{
	return fq_if_enqueue_classq(ifq, head, tail, cnt, bytes, pdrop);
}
errno_t
ifclassq_dequeue(struct ifclassq *ifq, u_int32_t pkt_limit,
    u_int32_t byte_limit, classq_pkt_t *head, classq_pkt_t *tail,
    u_int32_t *cnt, u_int32_t *len)
{
	return ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, pkt_limit,
	    byte_limit, head, tail, cnt, len, FALSE);
}
errno_t
ifclassq_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head,
    classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len)
{
	return ifclassq_dequeue_common(ifq, sc, pkt_limit, byte_limit,
	    head, tail, cnt, len, TRUE);
}
static errno_t
ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head,
    classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt)
{
	struct ifnet *ifp = ifq->ifcq_ifp;
	u_int32_t i = 0, l = 0;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);

	VERIFY(!drvmgt || MBUF_VALID_SC(sc));

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		goto dequeue_loop;
	}

	/*
	 * If the scheduler supports dequeueing multiple packets at the
	 * same time, call that one instead.
	 */
	if (drvmgt) {
		int err;

		IFCQ_LOCK_SPIN(ifq);
		err = fq_if_dequeue_sc_classq_multi(ifq, sc, pkt_limit,
		    byte_limit, head, tail, cnt, len);
		IFCQ_UNLOCK(ifq);

		if (err == 0 && head->cp_mbuf == NULL) {
			err = EAGAIN;
		}
		return err;
	} else {
		int err;

		IFCQ_LOCK_SPIN(ifq);
		err = fq_if_dequeue_classq_multi(ifq, pkt_limit, byte_limit,
		    head, tail, cnt, len);
		IFCQ_UNLOCK(ifq);

		if (err == 0 && head->cp_mbuf == NULL) {
			err = EAGAIN;
		}
		return err;
	}

dequeue_loop:
	IFCQ_LOCK_SPIN(ifq);

	while (i < pkt_limit && l < byte_limit) {
		if (drvmgt) {
			if (IFCQ_TBR_IS_ENABLED(ifq)) {
				IFCQ_TBR_DEQUEUE_SC(ifq, sc, head);
			} else {
				fq_if_dequeue_sc_classq(ifq, sc, head);
			}
		} else {
			if (IFCQ_TBR_IS_ENABLED(ifq)) {
				IFCQ_TBR_DEQUEUE(ifq, head);
			} else {
				fq_if_dequeue_classq(ifq, head);
			}
		}

		if (head->cp_mbuf == NULL) {
			break;
		}

		if (first.cp_mbuf == NULL) {
			first = *head;
		}

		switch (head->cp_ptype) {
		case QP_MBUF:
			head->cp_mbuf->m_nextpkt = NULL;
			l += head->cp_mbuf->m_pkthdr.len;
			ifclassq_set_packet_metadata(ifq, ifp, head);
			if (last.cp_mbuf != NULL) {
				last.cp_mbuf->m_nextpkt = head->cp_mbuf;
			}
			break;

		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		last = *head;
		i++;
	}

	IFCQ_UNLOCK(ifq);

	if (tail != NULL) {
		*tail = last;
	}
	if (cnt != NULL) {
		*cnt = i;
	}
	if (len != NULL) {
		*len = l;
	}

	*head = first;
	return (first.cp_mbuf != NULL) ? 0 : EAGAIN;
}
static errno_t
ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head,
    classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt)
{
	return ifclassq_dequeue_common_default(ifq, sc,
	    pkt_limit, byte_limit, head, tail, cnt, len, drvmgt);
}
void
ifclassq_update(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(IFCQ_IS_READY(ifq));
	fq_if_request_classq(ifq, CLASSQRQ_EVENT, (void *)ev);
}
int
ifclassq_attach(struct ifclassq *ifq, u_int32_t type, void *discipline)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	ifq->ifcq_type = type;
	ifq->ifcq_disc = discipline;
	return 0;
}
int
ifclassq_detach(struct ifclassq *ifq)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	ifq->ifcq_type = PKTSCHEDT_NONE;
	return 0;
}
int
ifclassq_getqstats(struct ifclassq *ifq, u_int32_t qid, void *ubuf,
    u_int32_t *nbytes)
{
	struct if_ifclassq_stats *ifqs;
	int err;

	if (*nbytes < sizeof(*ifqs)) {
		return EINVAL;
	}

	ifqs = _MALLOC(sizeof(*ifqs), M_TEMP, M_WAITOK | M_ZERO);
	if (ifqs == NULL) {
		return ENOMEM;
	}

	IFCQ_LOCK(ifq);
	if (!IFCQ_IS_READY(ifq)) {
		IFCQ_UNLOCK(ifq);
		_FREE(ifqs, M_TEMP);
		return ENXIO;
	}

	ifqs->ifqs_len = IFCQ_LEN(ifq);
	ifqs->ifqs_maxlen = IFCQ_MAXLEN(ifq);
	*(&ifqs->ifqs_xmitcnt) = *(&ifq->ifcq_xmitcnt);
	*(&ifqs->ifqs_dropcnt) = *(&ifq->ifcq_dropcnt);
	ifqs->ifqs_scheduler = ifq->ifcq_type;

	err = pktsched_getqstats(ifq, qid, ifqs);
	IFCQ_UNLOCK(ifq);

	if (err == 0 && (err = copyout((caddr_t)ifqs,
	    (user_addr_t)(uintptr_t)ubuf, sizeof(*ifqs))) == 0) {
		*nbytes = sizeof(*ifqs);
	}

	_FREE(ifqs, M_TEMP);

	return err;
}
const char *
ifclassq_ev2str(cqev_t ev)
{
	const char *c;

	switch (ev) {
	case CLASSQ_EV_LINK_BANDWIDTH:
		c = "LINK_BANDWIDTH";
		break;
	case CLASSQ_EV_LINK_LATENCY:
		c = "LINK_LATENCY";
		break;
	case CLASSQ_EV_LINK_MTU:
		c = "LINK_MTU";
		break;
	case CLASSQ_EV_LINK_UP:
		c = "LINK_UP";
		break;
	case CLASSQ_EV_LINK_DOWN:
		c = "LINK_DOWN";
		break;
	default:
		c = "UNKNOWN";
		break;
	}

	return c;
}
/*
 * internal representation of token bucket parameters
 *	rate:	byte_per_unittime << 32
 *		(((bits_per_sec) / 8) << 32) / machclk_freq
 *	depth:	byte << 32
 */
#define TBR_SHIFT	32
#define TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
#define TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
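/*
 * For example (illustrative numbers only): for a 100 Mbps profile rate and
 * an assumed machclk_freq of 1 GHz,
 *
 *	tbr_rate = TBR_SCALE(100000000 / 8) / machclk_freq
 *	         = (12500000LL << 32) / 1000000000
 *	        ~= 53687091
 *
 * i.e. about 0.0125 bytes of credit (scaled by 2^32) earned per
 * machine-clock tick, which works out to 12.5 MB/s.
 */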
void
ifclassq_tbr_dequeue(struct ifclassq *ifq, classq_pkt_t *pkt)
{
	ifclassq_tbr_dequeue_common(ifq, MBUF_SC_UNSPEC, FALSE, pkt);
}

void
ifclassq_tbr_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc,
    classq_pkt_t *pkt)
{
	ifclassq_tbr_dequeue_common(ifq, sc, TRUE, pkt);
}
static void
ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc,
    boolean_t drvmgt, classq_pkt_t *pkt)
{
	struct tb_regulator *tbr;
	int64_t interval;
	u_int64_t now;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	VERIFY(!drvmgt || MBUF_VALID_SC(sc));
	VERIFY(IFCQ_TBR_IS_ENABLED(ifq));

	*pkt = CLASSQ_PKT_INITIALIZER(*pkt);
	tbr = &ifq->ifcq_tbr;
	/* update token only when it is negative */
	if (tbr->tbr_token <= 0) {
		now = read_machclk();
		interval = now - tbr->tbr_last;
		if (interval >= tbr->tbr_filluptime) {
			tbr->tbr_token = tbr->tbr_depth;
		} else {
			tbr->tbr_token += interval * tbr->tbr_rate;
			if (tbr->tbr_token > tbr->tbr_depth) {
				tbr->tbr_token = tbr->tbr_depth;
			}
		}
		tbr->tbr_last = now;
	}
	/* if token is still negative, don't allow dequeue */
	if (tbr->tbr_token <= 0) {
		return;
	}

	/*
	 * ifclassq takes precedence over ALTQ queue;
	 * ifcq_drain count is adjusted by the caller.
	 */
	if (drvmgt) {
		fq_if_dequeue_sc_classq(ifq, sc, pkt);
	} else {
		fq_if_dequeue_classq(ifq, pkt);
	}

	if (pkt->cp_mbuf != NULL) {
		switch (pkt->cp_ptype) {
		case QP_MBUF:
			tbr->tbr_token -= TBR_SCALE(m_pktlen(pkt->cp_mbuf));
			break;

		default:
			VERIFY(0);
			/* NOTREACHED */
		}
	}
}
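/*
 * For illustration of the accounting above: with tbr_rate as computed in
 * ifclassq_tbr_set, an idle interval of `interval' machine-clock ticks
 * earns `interval * tbr_rate' scaled bytes of credit, capped at tbr_depth.
 * Dequeueing, say, a 1500-byte packet then charges TBR_SCALE(1500) against
 * tbr_token, possibly driving it negative; further dequeues are held off
 * until refill brings the token back above zero.
 */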
/*
 * set a token bucket regulator.
 * if the specified rate is zero, the token bucket regulator is deleted.
 */
int
ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile,
    boolean_t update)
{
	struct tb_regulator *tbr;
	struct ifnet *ifp = ifq->ifcq_ifp;
	u_int64_t rate, old_rate;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(IFCQ_IS_READY(ifq));

	VERIFY(machclk_freq != 0);

	tbr = &ifq->ifcq_tbr;
	old_rate = tbr->tbr_rate_raw;

	rate = profile->rate;
	if (profile->percent > 0) {
		u_int64_t eff_rate;

		if (profile->percent > 100) {
			return EINVAL;
		}
		if ((eff_rate = ifp->if_output_bw.eff_bw) == 0) {
			return ENODEV;
		}
		rate = (eff_rate * profile->percent) / 100;
	}

	if (rate == 0) {
		if (!IFCQ_TBR_IS_ENABLED(ifq)) {
			return 0;
		}

		if (pktsched_verbose) {
			printf("%s: TBR disabled\n", if_name(ifp));
		}

		/* disable this TBR */
		ifq->ifcq_flags &= ~IFCQF_TBR;
		bzero(tbr, sizeof(*tbr));
		ifnet_set_start_cycle(ifp, NULL);
		if (update) {
			ifclassq_update(ifq, CLASSQ_EV_LINK_BANDWIDTH);
		}
		return 0;
	}

	if (pktsched_verbose) {
		printf("%s: TBR %s (rate %llu bps depth %u)\n", if_name(ifp),
		    (ifq->ifcq_flags & IFCQF_TBR) ? "reconfigured" :
		    "enabled", rate, profile->depth);
	}

	/* set the new TBR */
	bzero(tbr, sizeof(*tbr));
	tbr->tbr_rate_raw = rate;
	tbr->tbr_percent = profile->percent;
	ifq->ifcq_flags |= IFCQF_TBR;
	/*
	 * Note that the TBR fill up time (hence the ifnet restart time)
	 * is directly related to the specified TBR depth.  The ideal
	 * depth value should be computed such that the interval time
	 * between each successive wakeup is adequately spaced apart,
	 * in order to reduce scheduling overheads.  A target interval
	 * of 10 ms seems to provide good performance balance.  This can be
	 * overridden by specifying the depth profile.  Values smaller than
	 * the ideal depth will reduce delay at the expense of CPU cycles.
	 */
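	/*
	 * Worked example (illustrative numbers only): at 100 Mbps the link
	 * moves roughly 125000 bytes in 10 ms, so with a 1500-byte MTU the
	 * search below settles on an ideal depth of about 84 MTU-sized
	 * packets.  A smaller profile depth would wake the interface up
	 * more often, trading CPU cycles for lower delay.
	 */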
	tbr->tbr_rate = TBR_SCALE(rate / 8) / machclk_freq;
	if (tbr->tbr_rate > 0) {
		u_int32_t mtu = ifp->if_mtu;
		int64_t ival, idepth = 0;
		int i;

		if (mtu < IF_MINMTU) {
			mtu = IF_MINMTU;
		}

		ival = pktsched_nsecs_to_abstime(10 * NSEC_PER_MSEC); /* 10ms */

		for (i = 1;; i++) {
			idepth = TBR_SCALE(i * mtu);
			if ((idepth / tbr->tbr_rate) > ival) {
				break;
			}
		}

		tbr->tbr_depth = TBR_SCALE(profile->depth);
		if (tbr->tbr_depth == 0) {
			tbr->tbr_filluptime = idepth / tbr->tbr_rate;
			/* a little fudge factor to get closer to rate */
			tbr->tbr_depth = idepth + (idepth >> 3);
		} else {
			tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
		}
	} else {
		tbr->tbr_depth = TBR_SCALE(profile->depth);
		tbr->tbr_filluptime = 0xffffffffffffffffLL;
	}
	tbr->tbr_token = tbr->tbr_depth;
	tbr->tbr_last = read_machclk();

	if (tbr->tbr_rate > 0 && (ifp->if_flags & IFF_UP)) {
		struct timespec ts =
		    { 0, (long)pktsched_abs_to_nsecs(tbr->tbr_filluptime) };
		if (pktsched_verbose) {
			printf("%s: TBR calculated tokens %lld "
			    "filluptime %llu ns\n", if_name(ifp),
			    TBR_UNSCALE(tbr->tbr_token),
			    pktsched_abs_to_nsecs(tbr->tbr_filluptime));
		}
		ifnet_set_start_cycle(ifp, &ts);
	} else {
		if (pktsched_verbose) {
			if (tbr->tbr_rate == 0) {
				printf("%s: TBR calculated tokens %lld "
				    "infinite filluptime\n", if_name(ifp),
				    TBR_UNSCALE(tbr->tbr_token));
			} else if (!(ifp->if_flags & IFF_UP)) {
				printf("%s: TBR suspended (link is down)\n",
				    if_name(ifp));
			}
		}
		ifnet_set_start_cycle(ifp, NULL);
	}
	if (update && tbr->tbr_rate_raw != old_rate) {
		ifclassq_update(ifq, CLASSQ_EV_LINK_BANDWIDTH);
	}

	return 0;
}
void
ifclassq_calc_target_qdelay(struct ifnet *ifp, u_int64_t *if_target_qdelay)
{
	u_int64_t qdelay = 0;
	qdelay = IFCQ_TARGET_QDELAY(&ifp->if_snd);

	if (ifclassq_target_qdelay != 0) {
		qdelay = ifclassq_target_qdelay;
	}

	/*
	 * If we do not know the effective bandwidth, use the default
	 * target queue delay.
	 */
	if (qdelay == 0) {
		qdelay = IFQ_TARGET_DELAY;
	}

	/*
	 * If a delay has been added to the ifnet start callback for
	 * coalescing, we have to add that to the pre-set target delay
	 * because the packets can be in the queue longer.
	 */
	if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ifp->if_start_delay_timeout > 0) {
		qdelay += ifp->if_start_delay_timeout;
	}

	*(if_target_qdelay) = qdelay;
}
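/*
 * For example (illustrative): with no net.classq.target_qdelay override and
 * no per-interface target configured, qdelay falls back to IFQ_TARGET_DELAY
 * above; a driver that also sets IFEF_ENQUEUE_MULTI with a non-zero start
 * delay timeout gets that timeout added on top, since enqueued packets may
 * sit in the queue for that much longer.
 */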
void
ifclassq_calc_update_interval(u_int64_t *update_interval)
{
	u_int64_t uint = 0;

	/* If the system level override is set, use it */
	if (ifclassq_update_interval != 0) {
		uint = ifclassq_update_interval;
	} else {
		/* Otherwise use the default value */
		uint = IFQ_UPDATE_INTERVAL;
	}

	*update_interval = uint;
}
void
ifclassq_reap_caches(boolean_t purge)
{
	fq_codel_reap_caches(purge);
	flowadv_reap_caches(purge);
}