/*
 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/cdefs.h>
#include <sys/param.h>

#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <kern/zalloc.h>
#include <netinet/in.h>

#include <net/classq/classq.h>
#include <net/classq/if_classq.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <net/classq/classq_fq_codel.h>

#include <netinet/tcp_var.h>
static uint32_t flowq_size;                     /* size of flowq */
static struct mcache *flowq_cache = NULL;       /* mcache for flowq */
#define FQ_ZONE_MAX     (32 * 1024)     /* across all interfaces */

#define DTYPE_NODROP    0       /* no drop */
#define DTYPE_FORCED    1       /* a "forced" drop */
#define DTYPE_EARLY     2       /* an "unforced" (early) drop */
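
/*
 * One-time scheduler setup: size the flowq structure and create the
 * mcache that backs all flow queue allocations.  Failure to create the
 * cache is fatal.
 */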
void
fq_codel_init(void)
{
    if (flowq_cache != NULL) {
        return;
    }

    flowq_size = sizeof(fq_t);
    flowq_cache = mcache_create("fq.flowq", flowq_size, sizeof(uint64_t),
        0, MCR_SLEEP);
    if (flowq_cache == NULL) {
        panic("%s: failed to allocate flowq_cache", __func__);
        /* NOTREACHED */
        __builtin_unreachable();
    }
}
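
/*
 * Memory pressure callback: ask the flowq mcache to release cached
 * buffers now; 'purge' requests a full reap rather than a trim.
 */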
void
fq_codel_reap_caches(boolean_t purge)
{
    mcache_reap_now(flowq_cache, purge);
}
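
/*
 * Allocate and zero-initialize a flow queue.  The allocation may block
 * (MCR_SLEEP); for mbuf-based flows the embedded mbuf queue is also
 * initialized, and the dequeue-list linkage starts out empty.
 */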
fq_t *
fq_alloc(classq_pkt_type_t ptype)
{
    fq_t *fq = NULL;

    fq = mcache_alloc(flowq_cache, MCR_SLEEP);
    if (fq == NULL) {
        log(LOG_ERR, "%s: unable to allocate from flowq_cache\n", __func__);
        return NULL;
    }

    bzero(fq, flowq_size);
    fq->fq_ptype = ptype;
    if (ptype == QP_MBUF) {
        MBUFQ_INIT(&fq->fq_mbufq);
    }
    CLASSQ_PKT_INIT(&fq->fq_dq_head);
    CLASSQ_PKT_INIT(&fq->fq_dq_tail);
    fq->fq_in_dqlist = false;

    return fq;
}
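
/*
 * Release a flow queue back to the mcache.  The caller must have
 * drained the queue, cleared its byte count, taken it off the new/old
 * flow lists and marked it destroyed; the VERIFYs below check that.
 */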
void
fq_destroy(fq_t *fq)
{
    VERIFY(fq->fq_flags & FQF_DESTROYED);
    VERIFY(fq_empty(fq));
    VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)));
    VERIFY(fq->fq_bytes == 0);
    mcache_free(flowq_cache, fq);
}
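
/*
 * Dequeue-stall detection, run on the enqueue path: a flow that holds
 * at least FQ_MIN_FC_THRESHOLD_BYTES and has not been dequeued for a
 * full update interval is marked delay-high so that flow control and
 * early drops can take effect.
 */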
static void
fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
    u_int64_t *now)
{
    u_int64_t maxgetqtime;

    if (FQ_IS_DELAYHIGH(flowq) || flowq->fq_getqtime == 0 ||
        flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) {
        return;
    }

    maxgetqtime = flowq->fq_getqtime + fqs->fqs_update_interval;
    if ((*now) > maxgetqtime) {
        /*
         * there was no dequeue in an update interval worth of
         * time. It means that the queue is stalled.
         */
        FQ_SET_DELAY_HIGH(flowq);
        fq_cl->fcl_stat.fcl_dequeue_stall++;
        os_log_error(OS_LOG_DEFAULT, "%s: dequeue stall num: %d, "
            "scidx: %d, flow: 0x%x, iface: %s", __func__,
            fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index,
            flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp));
    }
}
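
/*
 * Drop one packet from the head of the given flow and charge the drop
 * to the interface classq counters.
 */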
static void
fq_head_drop(fq_if_t *fqs, fq_t *fq)
{
    pktsched_pkt_t pkt;
    volatile uint32_t *pkt_flags;
    uint64_t *pkt_timestamp;
    struct ifclassq *ifq = fqs->fqs_ifq;

    _PKTSCHED_PKT_INIT(&pkt);
    fq_getq_flow_internal(fqs, fq, &pkt);
    if (pkt.pktsched_pkt_mbuf == NULL) {
        return;
    }

    pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
        NULL, NULL);

    *pkt_timestamp = 0;
    switch (pkt.pktsched_ptype) {
    case QP_MBUF:
        *pkt_flags &= ~PKTF_PRIV_GUARDED;
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
    IFCQ_CONVERT_LOCK(ifq);
    pktsched_free_pkt(&pkt);
}
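
/*
 * ACK compression on the enqueue path: if the incoming packet carries
 * the same compression generation count (comp_gencnt) as the packet at
 * the tail of the flow, the queued packet is superseded and removed,
 * the new packet inherits its timestamp, and CLASSQEQ_COMPRESSED is
 * returned so the caller can account for the replacement.
 */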
static int
fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl,
    pktsched_pkt_t *pkt)
{
    classq_pkt_type_t ptype = fq->fq_ptype;
    uint32_t comp_gencnt = 0;
    uint64_t *pkt_timestamp;
    uint64_t old_timestamp = 0;
    uint32_t old_pktlen = 0;
    struct ifclassq *ifq = fqs->fqs_ifq;

    if (__improbable(!tcp_do_ack_compression)) {
        return CLASSQEQ_SUCCESS;
    }

    pktsched_get_pkt_vars(pkt, NULL, &pkt_timestamp, NULL, NULL, NULL,
        &comp_gencnt);

    if (comp_gencnt == 0) {
        return CLASSQEQ_SUCCESS;
    }

    fq_cl->fcl_stat.fcl_pkts_compressible++;

    if (fq_empty(fq)) {
        return CLASSQEQ_SUCCESS;
    }

    if (ptype == QP_MBUF) {
        struct mbuf *m = MBUFQ_LAST(&fq->fq_mbufq);

        if (comp_gencnt != m->m_pkthdr.comp_gencnt) {
            return CLASSQEQ_SUCCESS;
        }

        /* If we got until here, we should merge/replace the segment */
        MBUFQ_REMOVE(&fq->fq_mbufq, m);
        old_pktlen = m_pktlen(m);
        old_timestamp = m->m_pkthdr.pkt_timestamp;

        IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
        m_freem(m);
    }

    fq->fq_bytes -= old_pktlen;
    fq_cl->fcl_stat.fcl_byte_cnt -= old_pktlen;
    fq_cl->fcl_stat.fcl_pkt_cnt--;

    IFCQ_DEC_BYTES(ifq, old_pktlen);

    *pkt_timestamp = old_timestamp;

    return CLASSQEQ_COMPRESSED;
}
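
/*
 * Enqueue entry point.  The packet (or chain) is hashed to its flow
 * queue, dequeue-stall detection and ACK compression are applied, flow
 * control advisories and drop decisions are made, and the chain is
 * finally appended to the flow, which is activated on the new-flows
 * list if it was idle.
 */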
int
fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
{
    int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS;
    u_int64_t now;
    fq_t *fq = NULL;
    uint64_t *pkt_timestamp;
    volatile uint32_t *pkt_flags;
    uint32_t pkt_flowid, cnt;
    uint8_t pkt_proto, pkt_flowsrc;

    cnt = pkt->pktsched_pcnt;
    pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
        &pkt_flowsrc, &pkt_proto, NULL);

    /*
     * XXX Not walking the chain to set this flag on every packet.
     * This flag is only used for debugging. Nothing is affected if it's
     * not set.
     */
    switch (pkt->pktsched_ptype) {
    case QP_MBUF:
        /* See comments in <rdar://problem/14040693> */
        VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
        *pkt_flags |= PKTF_PRIV_GUARDED;
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    /*
     * Timestamps for every packet must be set prior to entering this path.
     */
    now = *pkt_timestamp;

    /* find the flowq for this packet */
    fq = fq_if_hash_pkt(fqs, pkt_flowid, pktsched_get_pkt_svc(pkt),
        now, TRUE, pkt->pktsched_ptype);
    if (__improbable(fq == NULL)) {
        DTRACE_IP1(memfail__drop, fq_if_t *, fqs);
        /* drop the packet if we could not allocate a flow queue */
        fq_cl->fcl_stat.fcl_drop_memfailure += cnt;
        return CLASSQEQ_DROP;
    }
    VERIFY(fq->fq_ptype == pkt->pktsched_ptype);

    fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);

    if (__improbable(FQ_IS_DELAYHIGH(fq))) {
        if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
            (*pkt_flags & PKTF_FLOW_ADV)) {
            fc_adv = 1;
            /*
             * If the flow is suspended or it is not
             * TCP/QUIC, drop the chain.
             */
            if ((pkt_proto != IPPROTO_TCP) &&
                (pkt_proto != IPPROTO_QUIC)) {
                droptype = DTYPE_EARLY;
                fq_cl->fcl_stat.fcl_drop_early += cnt;
            }
            DTRACE_IP6(flow__adv, fq_if_t *, fqs,
                fq_if_classq_t *, fq_cl, fq_t *, fq,
                int, droptype, pktsched_pkt_t *, pkt,
                uint32_t, cnt);
        } else {
            /*
             * Need to drop packets to make room for the new
             * ones. Try to drop from the head of the queue
             * instead of the latest packets.
             */
            if (!fq_empty(fq)) {
                uint32_t i;

                for (i = 0; i < cnt; i++) {
                    fq_head_drop(fqs, fq);
                }
                droptype = DTYPE_NODROP;
            } else {
                droptype = DTYPE_EARLY;
            }
            fq_cl->fcl_stat.fcl_drop_early += cnt;

            DTRACE_IP6(no__flow__adv, fq_if_t *, fqs,
                fq_if_classq_t *, fq_cl, fq_t *, fq,
                int, droptype, pktsched_pkt_t *, pkt,
                uint32_t, cnt);
        }
    }

    /* Set the return code correctly */
    if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) {
        if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
            fq->fq_flags |= FQF_FLOWCTL_ON;
            /* deliver flow control advisory error */
            if (droptype == DTYPE_NODROP) {
                ret = CLASSQEQ_SUCCESS_FC;
            } else {
                /* dropped due to flow control */
                ret = CLASSQEQ_DROP_FC;
            }
        } else {
            /*
             * if we could not flow control the flow, it is
             * better to drop it.
             */
            droptype = DTYPE_FORCED;
            ret = CLASSQEQ_DROP_FC;
            fq_cl->fcl_stat.fcl_flow_control_fail++;
        }
        DTRACE_IP3(fc__ret, fq_if_t *, fqs, int, droptype, int, ret);
    }

    /*
     * If the queue length hits the queue limit, drop a chain with the
     * same number of packets from the front of the queue for a flow with
     * maximum number of bytes. This will penalize heavy and unresponsive
     * flows. It will also avoid a tail drop.
     */
    if (__improbable(droptype == DTYPE_NODROP &&
        fq_if_at_drop_limit(fqs))) {
        uint32_t i;

        if (fqs->fqs_large_flow == fq) {
            /*
             * Drop from the head of the current fq. Since a
             * new packet will be added to the tail, it is ok
             * to leave fq in place.
             */
            DTRACE_IP5(large__flow, fq_if_t *, fqs,
                fq_if_classq_t *, fq_cl, fq_t *, fq,
                pktsched_pkt_t *, pkt, uint32_t, cnt);

            for (i = 0; i < cnt; i++) {
                fq_head_drop(fqs, fq);
            }
        } else {
            if (fqs->fqs_large_flow == NULL) {
                droptype = DTYPE_FORCED;
                fq_cl->fcl_stat.fcl_drop_overflow += cnt;

                DTRACE_IP5(no__large__flow, fq_if_t *, fqs,
                    fq_if_classq_t *, fq_cl, fq_t *, fq,
                    pktsched_pkt_t *, pkt, uint32_t, cnt);

                /*
                 * if this fq was freshly created and there
                 * is nothing to enqueue, free it
                 */
                if (fq_empty(fq) && !(fq->fq_flags &
                    (FQF_NEW_FLOW | FQF_OLD_FLOW))) {
                    fq_if_destroy_flow(fqs, fq_cl, fq, true);
                    fq = NULL;
                }
            } else {
                DTRACE_IP5(different__large__flow,
                    fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
                    fq_t *, fq, pktsched_pkt_t *, pkt,
                    uint32_t, cnt);

                for (i = 0; i < cnt; i++) {
                    fq_if_drop_packet(fqs);
                }
            }
        }
    }

    if (__probable(droptype == DTYPE_NODROP)) {
        uint32_t chain_len = pktsched_get_pkt_len(pkt);

        /*
         * We do not compress if we are enqueuing a chain.
         * Traversing the chain to look for acks would defeat the
         * purpose of batch enqueueing.
         */
        if (cnt == 1) {
            ret = fq_compressor(fqs, fq, fq_cl, pkt);
            if (ret != CLASSQEQ_COMPRESSED) {
                ret = CLASSQEQ_SUCCESS;
            } else {
                fq_cl->fcl_stat.fcl_pkts_compressed++;
            }
        }
        DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
            fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt);
        fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt);

        fq->fq_bytes += chain_len;
        fq_cl->fcl_stat.fcl_byte_cnt += chain_len;
        fq_cl->fcl_stat.fcl_pkt_cnt += cnt;

        /*
         * check if this queue will qualify to be the next
         * largest flow.
         */
        fq_if_is_flow_heavy(fqs, fq);
    } else {
        DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret);
        return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
    }

    /*
     * If the queue is not currently active, add it to the end of new
     * flows list for that service class.
     */
    if ((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == 0) {
        VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL);
        STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink);
        fq->fq_flags |= FQF_NEW_FLOW;

        fq_cl->fcl_stat.fcl_newflows_cnt++;

        fq->fq_deficit = fq_cl->fcl_quantum;
    }
    return ret;
}
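
/*
 * Dequeue a single packet from a flow without any delay bookkeeping:
 * pop the head, fix up per-flow, per-class and interface accounting,
 * and return the packet through 'pkt'.  Shared by the normal dequeue
 * path and fq_head_drop().
 */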
void
fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
    classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
    uint32_t plen;
    fq_if_classq_t *fq_cl;
    struct ifclassq *ifq = fqs->fqs_ifq;

    fq_dequeue(fq, &p);
    if (p.cp_ptype == QP_INVALID) {
        VERIFY(p.cp_mbuf == NULL);
        return;
    }

    pktsched_pkt_encap(pkt, &p);
    plen = pktsched_get_pkt_len(pkt);

    VERIFY(fq->fq_bytes >= plen);
    fq->fq_bytes -= plen;

    fq_cl = &fqs->fqs_classq[fq->fq_sc_index];
    fq_cl->fcl_stat.fcl_byte_cnt -= plen;
    fq_cl->fcl_stat.fcl_pkt_cnt--;
    IFCQ_DEC_LEN(ifq);
    IFCQ_DEC_BYTES(ifq, plen);

    /* Reset getqtime so that we don't count idle times */
    if (fq_empty(fq)) {
        fq->fq_getqtime = 0;
    }
}
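
/*
 * Normal dequeue path: fetch a packet via fq_getq_flow_internal() and
 * run the CoDel-style delay tracking.  The minimum queueing delay seen
 * during an update interval is compared against the target delay to
 * set or clear the flow's delay-high state, and flow control asserted
 * at enqueue time is released once the flow drains or recovers.
 */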
void
fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
    fq_if_classq_t *fq_cl;
    u_int64_t now;
    int64_t qdelay = 0;
    struct timespec now_ts;
    volatile uint32_t *pkt_flags;
    uint64_t *pkt_timestamp;

    fq_getq_flow_internal(fqs, fq, pkt);
    if (pkt->pktsched_ptype == QP_INVALID) {
        VERIFY(pkt->pktsched_pkt_mbuf == NULL);
        return;
    }

    pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
        NULL, NULL);

    nanouptime(&now_ts);
    now = (now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

    /* this will compute qdelay in nanoseconds */
    if (now > *pkt_timestamp) {
        qdelay = now - *pkt_timestamp;
    }
    fq_cl = &fqs->fqs_classq[fq->fq_sc_index];

    if (fq->fq_min_qdelay == 0 ||
        (qdelay > 0 && (u_int64_t)qdelay < fq->fq_min_qdelay)) {
        fq->fq_min_qdelay = qdelay;
    }
    if (now >= fq->fq_updatetime) {
        if (fq->fq_min_qdelay > fqs->fqs_target_qdelay) {
            if (!FQ_IS_DELAYHIGH(fq)) {
                FQ_SET_DELAY_HIGH(fq);
                os_log_error(OS_LOG_DEFAULT,
                    "%s: high delay idx: %d, %llu, flow: 0x%x, "
                    "iface: %s", __func__, fq->fq_sc_index,
                    fq->fq_min_qdelay, fq->fq_flowhash,
                    if_name(fqs->fqs_ifq->ifcq_ifp));
            }
        } else {
            FQ_CLEAR_DELAY_HIGH(fq);
        }
        /* Reset measured queue delay and update time */
        fq->fq_updatetime = now + fqs->fqs_update_interval;
        fq->fq_min_qdelay = 0;
    }
    if (!FQ_IS_DELAYHIGH(fq) || fq_empty(fq)) {
        FQ_CLEAR_DELAY_HIGH(fq);
        if (fq->fq_flags & FQF_FLOWCTL_ON) {
            fq_if_flow_feedback(fqs, fq, fq_cl);
        }
    }

    if (fq_empty(fq)) {
        /* Reset getqtime so that we don't count idle times */
        fq->fq_getqtime = 0;
    } else {
        fq->fq_getqtime = now;
    }
    fq_if_is_flow_heavy(fqs, fq);

    switch (pkt->pktsched_ptype) {
    case QP_MBUF:
        *pkt_flags &= ~PKTF_PRIV_GUARDED;
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }
}