/*
 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/sdt.h>
#include <kern/zalloc.h>
#include <netinet/in.h>

#include <net/classq/classq.h>
#include <net/classq/if_classq.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <net/classq/classq_fq_codel.h>

#include <netinet/tcp_var.h>

static uint32_t flowq_size;			/* size of flowq */
static struct mcache *flowq_cache = NULL;	/* mcache for flowq */

#define FQ_ZONE_MAX	(32 * 1024)	/* across all interfaces */

#define DTYPE_NODROP	0	/* no drop */
#define DTYPE_FORCED	1	/* a "forced" drop */
#define DTYPE_EARLY	2	/* an "unforced" (early) drop */

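/*
 * One-time setup: create the mcache from which per-flow queue state
 * (fq_t) is allocated.
 */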
void
fq_codel_init(void)
{
	if (flowq_cache != NULL) {
		return;
	}

	flowq_size = sizeof(fq_t);
	flowq_cache = mcache_create("fq.flowq", flowq_size, sizeof(uint64_t),
	    0, MCR_SLEEP);
	if (flowq_cache == NULL) {
		panic("%s: failed to allocate flowq_cache", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

void
fq_codel_reap_caches(boolean_t purge)
{
	mcache_reap_now(flowq_cache, purge);
}

fq_t *
fq_alloc(classq_pkt_type_t ptype)
{
	fq_t *fq = NULL;
	fq = mcache_alloc(flowq_cache, MCR_SLEEP);
	if (fq == NULL) {
		log(LOG_ERR, "%s: unable to allocate from flowq_cache\n", __func__);
		return NULL;
	}

	bzero(fq, flowq_size);
	fq->fq_ptype = ptype;
	if (ptype == QP_MBUF) {
		MBUFQ_INIT(&fq->fq_mbufq);
	}
	return fq;
}

void
fq_destroy(fq_t *fq)
{
	VERIFY(fq_empty(fq));
	VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)));
	VERIFY(fq->fq_bytes == 0);
	mcache_free(flowq_cache, fq);
}

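/*
 * Mark a flow as delay-high if it has at least FQ_MIN_FC_THRESHOLD_BYTES
 * queued but has not seen a dequeue for a full update interval.
 */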
static inline void
fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
    u_int64_t *now)
{
	u_int64_t maxgetqtime;
	if (FQ_IS_DELAYHIGH(flowq) || flowq->fq_getqtime == 0 ||
	    fq_empty(flowq) ||
	    flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) {
		return;
	}
	maxgetqtime = flowq->fq_getqtime + fqs->fqs_update_interval;
	if ((*now) > maxgetqtime) {
		/*
		 * There was no dequeue for an entire update interval,
		 * which means the queue is stalled.
		 */
		FQ_SET_DELAY_HIGH(flowq);
		fq_cl->fcl_stat.fcl_dequeue_stall++;
	}
}

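/*
 * Drop a single packet from the head of the flow queue and account
 * for the drop in the interface classq statistics.
 */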
void
fq_head_drop(fq_if_t *fqs, fq_t *fq)
{
	pktsched_pkt_t pkt;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;
	struct ifclassq *ifq = fqs->fqs_ifq;

	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	if (pkt.pktsched_pkt_mbuf == NULL) {
		return;
	}

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL);

	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
	IFCQ_CONVERT_LOCK(ifq);
	pktsched_free_pkt(&pkt);
}


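/*
 * ACK compression: if the packet at the tail of the flow queue carries
 * the same comp_gencnt as the incoming packet, remove the queued packet
 * and let the new one inherit its timestamp, so that only the latest
 * ACK of a generation stays queued.
 */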
static int
fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl,
    pktsched_pkt_t *pkt)
{
	classq_pkt_type_t ptype = fq->fq_ptype;
	uint32_t comp_gencnt = 0;
	uint64_t *pkt_timestamp;
	uint64_t old_timestamp = 0;
	uint32_t old_pktlen = 0;
	struct ifclassq *ifq = fqs->fqs_ifq;

	if (__improbable(!tcp_do_ack_compression)) {
		return 0;
	}

	pktsched_get_pkt_vars(pkt, NULL, &pkt_timestamp, NULL, NULL, NULL,
	    &comp_gencnt);

	if (comp_gencnt == 0) {
		return 0;
	}

	fq_cl->fcl_stat.fcl_pkts_compressible++;

	if (fq_empty(fq)) {
		return 0;
	}

	if (ptype == QP_MBUF) {
		struct mbuf *m = MBUFQ_LAST(&fq->fq_mbufq);

		if (comp_gencnt != m->m_pkthdr.comp_gencnt) {
			return 0;
		}

		/* If we got this far, we should merge/replace the segment */
		MBUFQ_REMOVE(&fq->fq_mbufq, m);
		old_pktlen = m_pktlen(m);
		old_timestamp = m->m_pkthdr.pkt_timestamp;

		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		m_freem(m);
	}

	fq->fq_bytes -= old_pktlen;
	fq_cl->fcl_stat.fcl_byte_cnt -= old_pktlen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, old_pktlen);

	*pkt_timestamp = old_timestamp;

	return CLASSQEQ_COMPRESSED;
}

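/*
 * Enqueue path: hash the packet (or chain) to its flow queue, detect
 * dequeue stalls, issue a flow-control advisory or drop early when the
 * flow is delay-high, drop from the largest flow when the scheduler is
 * at its drop limit, attempt ACK compression for single packets, and
 * place the flow on the new-flows list if it was idle.
 */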
int
fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
{
	int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS;
	u_int64_t now;
	fq_t *fq = NULL;
	uint64_t *pkt_timestamp;
	volatile uint32_t *pkt_flags;
	uint32_t pkt_flowid, cnt;
	uint8_t pkt_proto, pkt_flowsrc;

	cnt = pkt->pktsched_pcnt;
	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
	    &pkt_flowsrc, &pkt_proto, NULL);

	/*
	 * XXX Not walking the chain to set this flag on every packet.
	 * This flag is only used for debugging. Nothing is affected if it's
	 * not set.
	 */
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		/* See comments in <rdar://problem/14040693> */
		VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
		*pkt_flags |= PKTF_PRIV_GUARDED;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * Timestamps for every packet must be set prior to entering this path.
	 */
	now = *pkt_timestamp;
	ASSERT(now > 0);

	/* find the flowq for this packet */
	fq = fq_if_hash_pkt(fqs, pkt_flowid, pktsched_get_pkt_svc(pkt),
	    now, TRUE, pkt->pktsched_ptype);
	if (__improbable(fq == NULL)) {
		DTRACE_IP1(memfail__drop, fq_if_t *, fqs);
		/* drop the packet if we could not allocate a flow queue */
		fq_cl->fcl_stat.fcl_drop_memfailure += cnt;
		return CLASSQEQ_DROP;
	}
	VERIFY(fq->fq_ptype == pkt->pktsched_ptype);

	fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);

	if (__improbable(FQ_IS_DELAYHIGH(fq))) {
		if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
		    (*pkt_flags & PKTF_FLOW_ADV)) {
			fc_adv = 1;
			/*
			 * If the flow is suspended or it is not
			 * TCP/QUIC, drop the chain.
			 */
			if ((pkt_proto != IPPROTO_TCP) &&
			    (pkt_proto != IPPROTO_QUIC)) {
				droptype = DTYPE_EARLY;
				fq_cl->fcl_stat.fcl_drop_early += cnt;
			}
			DTRACE_IP6(flow__adv, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    int, droptype, pktsched_pkt_t *, pkt,
			    uint32_t, cnt);
		} else {
			/*
			 * Need to drop packets to make room for the new
			 * ones. Try to drop from the head of the queue
			 * instead of the latest packets.
			 */
			if (!fq_empty(fq)) {
				uint32_t i;

				for (i = 0; i < cnt; i++) {
					fq_head_drop(fqs, fq);
				}
				droptype = DTYPE_NODROP;
			} else {
				droptype = DTYPE_EARLY;
			}
			fq_cl->fcl_stat.fcl_drop_early += cnt;

			DTRACE_IP6(no__flow__adv, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    int, droptype, pktsched_pkt_t *, pkt,
			    uint32_t, cnt);
		}
	}

	/* Set the return code correctly */
	if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) {
		if (fq_if_add_fcentry(fqs, pkt, pkt_flowid, pkt_flowsrc,
		    fq_cl)) {
			fq->fq_flags |= FQF_FLOWCTL_ON;
			/* deliver flow control advisory error */
			if (droptype == DTYPE_NODROP) {
				ret = CLASSQEQ_SUCCESS_FC;
			} else {
				/* dropped due to flow control */
				ret = CLASSQEQ_DROP_FC;
			}
		} else {
			/*
			 * if we could not flow control the flow, it is
			 * better to drop
			 */
			droptype = DTYPE_FORCED;
			ret = CLASSQEQ_DROP_FC;
			fq_cl->fcl_stat.fcl_flow_control_fail++;
		}
		DTRACE_IP3(fc__ret, fq_if_t *, fqs, int, droptype, int, ret);
	}

	/*
	 * If the queue length hits the queue limit, drop the same number
	 * of packets from the front of the queue of the flow with the
	 * maximum number of bytes queued. This penalizes heavy and
	 * unresponsive flows and also avoids a tail drop.
	 */
	if (__improbable(droptype == DTYPE_NODROP &&
	    fq_if_at_drop_limit(fqs))) {
		uint32_t i;

		if (fqs->fqs_large_flow == fq) {
			/*
			 * Drop from the head of the current fq. Since a
			 * new packet will be added to the tail, it is ok
			 * to leave fq in place.
			 */
			DTRACE_IP5(large__flow, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    pktsched_pkt_t *, pkt, uint32_t, cnt);

			for (i = 0; i < cnt; i++) {
				fq_head_drop(fqs, fq);
			}
		} else {
			if (fqs->fqs_large_flow == NULL) {
				droptype = DTYPE_FORCED;
				fq_cl->fcl_stat.fcl_drop_overflow += cnt;
				ret = CLASSQEQ_DROP;

				DTRACE_IP5(no__large__flow, fq_if_t *, fqs,
				    fq_if_classq_t *, fq_cl, fq_t *, fq,
				    pktsched_pkt_t *, pkt, uint32_t, cnt);

				/*
				 * If this fq was freshly created and there
				 * is nothing to enqueue, free it.
				 */
				if (fq_empty(fq) && !(fq->fq_flags &
				    (FQF_NEW_FLOW | FQF_OLD_FLOW))) {
					fq_if_destroy_flow(fqs, fq_cl, fq);
					fq = NULL;
				}
			} else {
				DTRACE_IP5(different__large__flow,
				    fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
				    fq_t *, fq, pktsched_pkt_t *, pkt,
				    uint32_t, cnt);

				for (i = 0; i < cnt; i++) {
					fq_if_drop_packet(fqs);
				}
			}
		}
	}

	if (__probable(droptype == DTYPE_NODROP)) {
		uint32_t chain_len = pktsched_get_pkt_len(pkt);

		/*
		 * We do not compress if we are enqueuing a chain.
		 * Traversing the chain to look for ACKs would defeat the
		 * purpose of batch enqueueing.
		 */
		if (cnt == 1) {
			ret = fq_compressor(fqs, fq, fq_cl, pkt);
			if (ret != CLASSQEQ_COMPRESSED) {
				ret = CLASSQEQ_SUCCESS;
			} else {
				fq_cl->fcl_stat.fcl_pkts_compressed++;
			}
		}
		DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
		    fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt);
		fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt);

		fq->fq_bytes += chain_len;
		fq_cl->fcl_stat.fcl_byte_cnt += chain_len;
		fq_cl->fcl_stat.fcl_pkt_cnt += cnt;

		/*
		 * Check if this queue will qualify to be the next
		 * victim queue.
		 */
		fq_if_is_flow_heavy(fqs, fq);
	} else {
		DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret);
		return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
	}

	/*
	 * If the queue is not currently active, add it to the end of the
	 * new-flows list for that service class.
	 */
	if ((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == 0) {
		VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL);
		STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink);
		fq->fq_flags |= FQF_NEW_FLOW;

		fq_cl->fcl_stat.fcl_newflows_cnt++;

		fq->fq_deficit = fq_cl->fcl_quantum;
	}
	return ret;
}

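/*
 * Dequeue a single packet from the flow and update the per-flow,
 * per-class and interface counters; getqtime is reset when the flow
 * drains so idle time is not counted.
 */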
void
fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
	classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
	uint32_t plen;
	fq_if_classq_t *fq_cl;
	struct ifclassq *ifq = fqs->fqs_ifq;

	fq_dequeue(fq, &p);
	if (p.cp_ptype == QP_INVALID) {
		VERIFY(p.cp_mbuf == NULL);
		return;
	}

	pktsched_pkt_encap(pkt, &p);
	plen = pktsched_get_pkt_len(pkt);

	VERIFY(fq->fq_bytes >= plen);
	fq->fq_bytes -= plen;

	fq_cl = &fqs->fqs_classq[fq->fq_sc_index];
	fq_cl->fcl_stat.fcl_byte_cnt -= plen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, plen);

	/* Reset getqtime so that we don't count idle times */
	if (fq_empty(fq)) {
		fq->fq_getqtime = 0;
	}
}

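/*
 * Dequeue a packet and run the delay-based logic: compute the packet's
 * sojourn time, track the minimum delay over an update interval, set or
 * clear the delay-high state against the target delay, and give
 * flow-control feedback once the flow recovers or drains.
 */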
void
fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
	fq_if_classq_t *fq_cl;
	u_int64_t now;
	int64_t qdelay = 0;
	struct timespec now_ts;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;

	fq_getq_flow_internal(fqs, fq, pkt);
	if (pkt->pktsched_ptype == QP_INVALID) {
		VERIFY(pkt->pktsched_pkt_mbuf == NULL);
		return;
	}

	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL);

	nanouptime(&now_ts);
	now = (now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

	/* this will compute qdelay in nanoseconds */
	if (now > *pkt_timestamp) {
		qdelay = now - *pkt_timestamp;
	}
	fq_cl = &fqs->fqs_classq[fq->fq_sc_index];

	if (fq->fq_min_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay < fq->fq_min_qdelay)) {
		fq->fq_min_qdelay = qdelay;
	}
	if (now >= fq->fq_updatetime) {
		if (fq->fq_min_qdelay > fqs->fqs_target_qdelay) {
			if (!FQ_IS_DELAYHIGH(fq)) {
				FQ_SET_DELAY_HIGH(fq);
			}
		} else {
			FQ_CLEAR_DELAY_HIGH(fq);
		}
		/* Reset measured queue delay and update time */
		fq->fq_updatetime = now + fqs->fqs_update_interval;
		fq->fq_min_qdelay = 0;
	}
	if (!FQ_IS_DELAYHIGH(fq) || fq_empty(fq)) {
		FQ_CLEAR_DELAY_HIGH(fq);
		if (fq->fq_flags & FQF_FLOWCTL_ON) {
			fq_if_flow_feedback(fqs, fq, fq_cl);
		}
	}

	if (fq_empty(fq)) {
		/* Reset getqtime so that we don't count idle times */
		fq->fq_getqtime = 0;
	} else {
		fq->fq_getqtime = now;
	}
	fq_if_is_flow_heavy(fqs, fq);

	*pkt_timestamp = 0;
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}