/*
 * Copyright (c) 2011-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <kern/zalloc.h>

#include <net/if_var.h>
#include <net/if_types.h>
#include <net/flowadv.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#include <net/classq/classq_sfb.h>
#include <net/flowhash.h>
#include <net/net_osdep.h>
#include <dev/random/randomdev.h>
/*
 * Stochastic Fair Blue
 *
 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
 *
 * Based on the NS code with the following parameters:
 *
 *   bytes:      false
 *   decrement:  0.001
 *   increment:  0.005
 *   hold-time:  10ms-50ms (randomized)
 *   algorithm:  0
 *   pbox:       1
 *   pbox-time:  50-100ms (randomized)
 *   hinterval:  11-23 (randomized)
 *
 * This implementation uses L = 2 and N = 32 for 2 sets of:
 *
 *   B[L][N]: L x N array of bins (L levels, N bins per level)
 *
 * Each set effectively creates 32^2 virtual buckets (bin combinations)
 * while using only O(32*2) states.
 *
 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
 * used as index for the bins across the 2 levels, where level 1 uses [0,2]
 * and level 2 uses [1,3].  The 2 values per level correspond to the indices
 * for the current and warm-up sets (section 4.4 of the SFB paper regarding
 * Moving Hash Functions explains the purposes of these 2 sets.)
 */

/*
 * Use Murmur3A_x86_32 for hash function.  It seems to perform consistently
 * across platforms for 1-word key (32-bit flowhash value).  See flowhash.h
 * for other alternatives.  We only need 16-bit hash output.
 */
#define SFB_HASH        net_flowhash_mh3_x86_32
#define SFB_HASHMASK    HASHMASK(16)

#define SFB_BINMASK(_x) \
    ((_x) & HASHMASK(SFB_BINS_SHIFT))

#define SFB_BINST(_sp, _l, _n, _c) \
    (&(*(_sp)->sfb_bins)[_c].stats[_l][_n])

#define SFB_BINFT(_sp, _l, _n, _c) \
    (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])

#define SFB_FC_LIST(_sp, _n) \
    (&(*(_sp)->sfb_fc_lists)[_n])
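
/*
 * Illustrative sketch (not part of the build): how a 32-bit flowhash is
 * carved into per-level bin indices, per the "octets [0,1,2,3]" scheme
 * described above.  The helper name is hypothetical; it assumes
 * SFB_BINS_SHIFT is 5, i.e. N = 32 bins per level as stated in the
 * comment block above.
 */
#if 0
static void
example_bin_indices(uint32_t pkt_sfb_hash, int s /* current set: 0 or 1 */)
{
    uint8_t *h8 = (uint8_t *)&pkt_sfb_hash;

    /* level 0 uses octet [0] for set 0, octet [2] for set 1 */
    uint32_t l0 = SFB_BINMASK(h8[(s << 1)]);
    /* level 1 uses octet [1] for set 0, octet [3] for set 1 */
    uint32_t l1 = SFB_BINMASK(h8[(s << 1) + 1]);

    /* two 5-bit indices: 32 * 32 = 1024 virtual buckets per set */
    (void)l0;
    (void)l1;
}
#endif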
/*
 * The holdtime parameter determines the minimum time interval between
 * two successive updates of the marking probability.  In the event the
 * uplink speed is not known, a default value is chosen and is randomized
 * to be within the following range.
 */
#define HOLDTIME_BASE   (100ULL * 1000 * 1000)  /* 100ms */
#define HOLDTIME_MIN    (10ULL * 1000 * 1000)   /* 10ms */
#define HOLDTIME_MAX    (100ULL * 1000 * 1000)  /* 100ms */
/*
 * The pboxtime parameter determines the bandwidth allocated for rogue
 * flows, i.e. the rate limiting bandwidth.  In the event the uplink speed
 * is not known, a default value is chosen and is randomized to be within
 * the following range.
 */
#define PBOXTIME_BASE   (300ULL * 1000 * 1000)  /* 300ms */
#define PBOXTIME_MIN    (30ULL * 1000 * 1000)   /* 30ms */
#define PBOXTIME_MAX    (300ULL * 1000 * 1000)  /* 300ms */
/*
 * Target queueing delay is the amount of extra delay that can be added
 * to accommodate variations in the link bandwidth.  The queue should be
 * large enough to induce this much delay and nothing more than that.
 */
#define TARGET_QDELAY_BASE      (10ULL * 1000 * 1000)           /* 10ms */
#define TARGET_QDELAY_MIN       (10ULL * 1000)                  /* 10us */
#define TARGET_QDELAY_MAX       (20ULL * 1000 * 1000 * 1000)    /* 20s */
/*
 * Update interval for checking the extra delay added by the queue.  This
 * should be 90-95 percentile of RTT experienced by any TCP connection
 * so that it will take care of the burst traffic.
 */
#define UPDATE_INTERVAL_BASE    (100ULL * 1000 * 1000)          /* 100ms */
#define UPDATE_INTERVAL_MIN     (100ULL * 1000 * 1000)          /* 100ms */
#define UPDATE_INTERVAL_MAX     (10ULL * 1000 * 1000 * 1000)    /* 10s */

#define SFB_RANDOM(sp, tmin, tmax)      ((sfb_random(sp) % (tmax)) + (tmin))
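
/*
 * Illustrative sketch (not part of the build): how the randomized defaults
 * are drawn when the uplink speed is unknown.  Note that SFB_RANDOM()
 * yields a value in [tmin, tmin + tmax), so the default holdtime below can
 * land anywhere from 10ms up to just under 110ms.
 */
#if 0
static void
example_randomized_holdtime(struct sfb *sp)
{
    /* a randomized holdtime, in nanoseconds */
    u_int64_t holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
    (void)holdtime;
}
#endif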
#define SFB_PKT_PBOX    0x1     /* in penalty box */

/* The following mantissa values are in SFB_FP_SHIFT Q format */
#define SFB_MAX_PMARK   (1 << SFB_FP_SHIFT)     /* Q14 representation of 1.00 */

/*
 * These are d1 (increment) and d2 (decrement) parameters, used to determine
 * the amount by which the marking probability is incremented when the queue
 * overflows, or is decremented when the link is idle.  d1 is set higher than
 * d2, because link underutilization can occur when congestion management is
 * either too conservative or too aggressive, but packet loss occurs only
 * when congestion management is too conservative.  By weighing heavily
 * against packet loss, it can quickly reach a substantial increase in
 * the marking probability.
 */
#define SFB_INCREMENT   82      /* Q14 representation of 0.005 */
#define SFB_DECREMENT   16      /* Q14 representation of 0.001 */

#define SFB_PMARK_TH    16056   /* Q14 representation of 0.98 */
#define SFB_PMARK_WARM  3276    /* Q14 representation of 0.2 */
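
/*
 * Illustrative sketch (not part of the build): the Q14 fixed-point encoding
 * used for the marking probability.  With SFB_FP_SHIFT == 14, a real value
 * p is stored as the integer p * 2^14, so 82/16384 ~= 0.005 (d1) and
 * 16/16384 ~= 0.001 (d2).  The helper name is hypothetical.
 */
#if 0
static void
example_q14(void)
{
    int32_t pmark = 0;

    pmark += SFB_INCREMENT;         /* p += 0.005 */
    pmark -= SFB_DECREMENT;         /* p -= 0.001 */

    /* convert back to an approximate percentage, e.g. for logging */
    int pct = (pmark * 100) >> SFB_FP_SHIFT;
    (void)pct;
}
#endif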
#define SFB_PMARK_INC(_bin) do { \
    (_bin)->pmark += sfb_increment; \
    if ((_bin)->pmark > SFB_MAX_PMARK) \
            (_bin)->pmark = SFB_MAX_PMARK; \
} while (0)

#define SFB_PMARK_DEC(_bin) do { \
    if ((_bin)->pmark > 0) { \
            (_bin)->pmark -= sfb_decrement; \
            if ((_bin)->pmark < 0) \
                    (_bin)->pmark = 0; \
    } \
} while (0)
/* Minimum number of bytes in queue to get flow controlled */
#define SFB_MIN_FC_THRESHOLD_BYTES      7500

#define SFB_SET_DELAY_HIGH(_sp_, _q_) do { \
    (_sp_)->sfb_flags |= SFBF_DELAYHIGH; \
    (_sp_)->sfb_fc_threshold = ulmax(SFB_MIN_FC_THRESHOLD_BYTES, \
        (uint32_t)(qsize((_q_)) >> 3)); \
} while (0)

#define SFB_QUEUE_DELAYBASED(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYBASED)
#define SFB_IS_DELAYHIGH(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYHIGH)
#define SFB_QUEUE_DELAYBASED_MAXSIZE    2048    /* max pkts */
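
/*
 * Illustrative sketch (not part of the build): the flow-control threshold
 * picked by SFB_SET_DELAY_HIGH() is the larger of 7500 bytes and 1/8 of
 * the current queue size, so a 128KB backlog yields a 16KB threshold.
 * The helper name is hypothetical.
 */
#if 0
static uint32_t
example_fc_threshold(uint32_t queue_bytes)
{
    uint32_t thresh = queue_bytes >> 3;     /* 1/8 of queue size */

    if (thresh < SFB_MIN_FC_THRESHOLD_BYTES) {
        thresh = SFB_MIN_FC_THRESHOLD_BYTES;
    }
    return thresh;
}
#endif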
#define HINTERVAL_MIN   (10)    /* 10 seconds */
#define HINTERVAL_MAX   (20)    /* 20 seconds */
#define SFB_HINTERVAL(sp) ((sfb_random(sp) % HINTERVAL_MAX) + HINTERVAL_MIN)

#define DEQUEUE_DECAY   7       /* ilog2 of EWMA decay rate, (128) */
#define DEQUEUE_SPIKE(_new, _old) \
    ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
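
/*
 * Illustrative sketch (not part of the build): the EWMA used for the
 * average inter-dequeue time in sfb_getq_flow().  With DEQUEUE_DECAY == 7
 * the decay rate is 2^7 = 128, i.e. avg = (127*avg + new) / 128.  A sample
 * more than 2048x (1 << 11) the current average counts as a spike; the
 * dequeue path then bumps the decay so the old average is weighted even
 * more heavily.  The helper name and the exact bump amount here are
 * assumptions for illustration.
 */
#if 0
static u_int64_t
example_dequeue_ewma(u_int64_t avg, u_int64_t new)
{
    int decay = DEQUEUE_DECAY;

    if (DEQUEUE_SPIKE(new, avg)) {
        decay += 5;     /* assumed spike handling; see dequeue path */
    }
    return (((avg << decay) - avg) + new) >> decay;
}
#endif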
/* Place the flow control entries in current bin on level 0 */
#define SFB_FC_LEVEL    0
static ZONE_DECLARE(sfb_zone, "classq_sfb",
    sizeof(struct sfb), ZC_ZFREE_CLEARMEM);

static ZONE_DECLARE(sfb_bins_zone, "classq_sfb_bins",
    sizeof(struct sfb_bins), ZC_ZFREE_CLEARMEM);

static ZONE_DECLARE(sfb_fcl_zone, "classq_sfb_fcl",
    sizeof(struct sfb_fcl), ZC_ZFREE_CLEARMEM);
/* internal function prototypes */
static u_int32_t sfb_random(struct sfb *);
static void *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t, boolean_t,
    pktsched_pkt_t *);
static void sfb_resetq(struct sfb *, cqev_t);
static void sfb_calc_holdtime(struct sfb *, u_int64_t);
static void sfb_calc_pboxtime(struct sfb *, u_int64_t);
static void sfb_calc_hinterval(struct sfb *, u_int64_t *);
static void sfb_calc_update_interval(struct sfb *, u_int64_t);
static void sfb_swap_bins(struct sfb *, u_int32_t);
static inline int sfb_pcheck(struct sfb *, uint32_t);
static int sfb_penalize(struct sfb *, uint32_t, uint32_t *, struct timespec *);
static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *, boolean_t);
static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *);
static void sfb_increment_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *);
static inline void sfb_dq_update_bins(struct sfb *, uint32_t, uint32_t,
    struct timespec *, u_int64_t qsize);
static inline void sfb_eq_update_bins(struct sfb *, uint32_t, uint32_t);
static int sfb_drop_early(struct sfb *, uint32_t, u_int16_t *,
    struct timespec *);
static boolean_t sfb_bin_addfcentry(struct sfb *, pktsched_pkt_t *,
    uint32_t, uint8_t, uint32_t);
static void sfb_fclist_append(struct sfb *, struct sfb_fcl *);
static void sfb_fclists_clean(struct sfb *sp);
static int sfb_bin_mark_or_drop(struct sfb *sp, struct sfbbinstats *bin);
static void sfb_detect_dequeue_stall(struct sfb *sp, class_queue_t *,
    struct timespec *);
SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "SFB");

static u_int64_t sfb_holdtime = 0;      /* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_holdtime, "SFB freeze time in nanoseconds");

static u_int64_t sfb_pboxtime = 0;      /* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_pboxtime, "SFB penalty box time in nanoseconds");

static u_int64_t sfb_hinterval;
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_hinterval, "SFB hash interval in nanoseconds");

static u_int32_t sfb_increment = SFB_INCREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_increment, SFB_INCREMENT, "SFB increment [d1]");

static u_int32_t sfb_decrement = SFB_DECREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]");

static u_int32_t sfb_allocation = 0;    /* 0 means "automatic" */
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_allocation, 0, "SFB bin allocation");

static u_int32_t sfb_ratelimit = 0;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_ratelimit, 0, "SFB rate limit");
#define KBPS    (1ULL * 1000)           /* 1 Kbits per second */
#define MBPS    (1ULL * 1000 * 1000)    /* 1 Mbits per second */
#define GBPS    (MBPS * 1000)           /* 1 Gbits per second */

struct sfb_time_tbl {
    u_int64_t speed;        /* uplink speed */
    u_int64_t holdtime;     /* hold time */
    u_int64_t pboxtime;     /* penalty box time */
};

static struct sfb_time_tbl sfb_ttbl[] = {
    { .speed = 1 * MBPS, .holdtime = HOLDTIME_BASE * 1000, .pboxtime = PBOXTIME_BASE * 1000 },
    { .speed = 10 * MBPS, .holdtime = HOLDTIME_BASE * 100, .pboxtime = PBOXTIME_BASE * 100 },
    { .speed = 100 * MBPS, .holdtime = HOLDTIME_BASE * 10, .pboxtime = PBOXTIME_BASE * 10 },
    { .speed = 1 * GBPS, .holdtime = HOLDTIME_BASE, .pboxtime = PBOXTIME_BASE },
    { .speed = 10 * GBPS, .holdtime = HOLDTIME_BASE / 10, .pboxtime = PBOXTIME_BASE / 10 },
    { .speed = 100 * GBPS, .holdtime = HOLDTIME_BASE / 100, .pboxtime = PBOXTIME_BASE / 100 },
    { .speed = 0, .holdtime = 0, .pboxtime = 0 }
};
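
/*
 * Illustrative sketch (not part of the build): how sfb_ttbl[] is consulted
 * by the holdtime/pboxtime calculations below.  The scan keeps the last
 * entry whose speed is <= the measured link rate, so e.g. a 500 Mbps link
 * uses the 100 Mbps row (holdtime = 1s, pboxtime = 3s with the bases
 * above).  The helper name is hypothetical.
 */
#if 0
static u_int64_t
example_holdtime_for_rate(u_int64_t outbw)
{
    u_int64_t n = sfb_ttbl[0].holdtime;
    int i;

    for (i = 0; sfb_ttbl[i].speed != 0; i++) {
        if (outbw < sfb_ttbl[i].speed) {
            break;
        }
        n = sfb_ttbl[i].holdtime;
    }
    return n;
}
#endif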
static_assert(SFBF_ECN4 == CLASSQF_ECN4);
static_assert(SFBF_ECN6 == CLASSQF_ECN6);
static u_int32_t
sfb_random(struct sfb *sp)
{
    IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
    return RandomULong();
}
static void
sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw)
{
    u_int64_t holdtime;

    if (sfb_holdtime != 0) {
        holdtime = sfb_holdtime;
    } else if (outbw == 0) {
        holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
    } else {
        u_int64_t n, i;

        n = sfb_ttbl[0].holdtime;
        for (i = 0; sfb_ttbl[i].speed != 0; i++) {
            if (outbw < sfb_ttbl[i].speed) {
                break;
            }
            n = sfb_ttbl[i].holdtime;
        }
        holdtime = n;
    }
    net_nsectimer(&holdtime, &sp->sfb_holdtime);
}
static void
sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw)
{
    u_int64_t pboxtime;

    if (sfb_pboxtime != 0) {
        pboxtime = sfb_pboxtime;
    } else if (outbw == 0) {
        pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX);
    } else {
        u_int64_t n, i;

        n = sfb_ttbl[0].pboxtime;
        for (i = 0; sfb_ttbl[i].speed != 0; i++) {
            if (outbw < sfb_ttbl[i].speed) {
                break;
            }
            n = sfb_ttbl[i].pboxtime;
        }
        pboxtime = n;
    }
    net_nsectimer(&pboxtime, &sp->sfb_pboxtime);
    net_timerclear(&sp->sfb_pboxfreeze);
}
static void
sfb_calc_hinterval(struct sfb *sp, u_int64_t *t)
{
    u_int64_t hinterval = 0;
    struct timespec now;

    if (t != NULL) {
        /*
         * TODO adi@apple.com: use dq_avg to derive hinterval.
         */
        hinterval = *t;
    }

    if (sfb_hinterval != 0) {
        hinterval = sfb_hinterval;
    } else if (t == NULL || hinterval == 0) {
        hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC);
    }

    net_nsectimer(&hinterval, &sp->sfb_hinterval);

    nanouptime(&now);
    net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
}
static void
sfb_calc_update_interval(struct sfb *sp, u_int64_t out_bw)
{
#pragma unused(out_bw)
    u_int64_t update_interval = 0;
    ifclassq_calc_update_interval(&update_interval);
    net_nsectimer(&update_interval, &sp->sfb_update_interval);
}
/*
 * sfb support routines
 */
struct sfb *
sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags)
{
    struct sfb *sp;
    int i;

    VERIFY(ifp != NULL && qlim > 0);

    sp = zalloc_flags(sfb_zone, Z_WAITOK | Z_ZERO);
    sp->sfb_bins = zalloc_flags(sfb_bins_zone, Z_WAITOK | Z_ZERO);
    sp->sfb_fc_lists = zalloc_flags(sfb_fcl_zone, Z_WAITOK | Z_ZERO);

    for (i = 0; i < SFB_BINS; ++i) {
        STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist);
    }

    sp->sfb_ifp = ifp;
    sp->sfb_qlim = qlim;
    sp->sfb_qid = qid;
    sp->sfb_flags = (flags & SFBF_USERFLAGS);
#if !PF_ECN
    if (sp->sfb_flags & SFBF_ECN) {
        sp->sfb_flags &= ~SFBF_ECN;
        log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring "
            "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid);
    }
#endif /* !PF_ECN */

    sfb_resetq(sp, CLASSQ_EV_INIT);

    return sp;
}
static void
sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl)
{
    IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
    VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0);
    sp->sfb_stats.flow_feedback += fcl->cnt;
    fcl->cnt = 0;

    flowadv_add(&fcl->fclist);
    VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist));
}
static void
sfb_fclists_clean(struct sfb *sp)
{
    int i;

    /* Move all the flow control entries to the flowadv list */
    for (i = 0; i < SFB_BINS; ++i) {
        struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
        if (!STAILQ_EMPTY(&fcl->fclist)) {
            sfb_fclist_append(sp, fcl);
        }
    }
}
void
sfb_destroy(struct sfb *sp)
{
    sfb_fclists_clean(sp);
    if (sp->sfb_bins != NULL) {
        zfree(sfb_bins_zone, sp->sfb_bins);
        sp->sfb_bins = NULL;
    }
    if (sp->sfb_fc_lists != NULL) {
        zfree(sfb_fcl_zone, sp->sfb_fc_lists);
        sp->sfb_fc_lists = NULL;
    }
    zfree(sfb_zone, sp);
}
static void
sfb_resetq(struct sfb *sp, cqev_t ev)
{
    struct ifnet *ifp = sp->sfb_ifp;
    u_int64_t eff_rate;

    VERIFY(ifp != NULL);

    if (ev != CLASSQ_EV_LINK_DOWN) {
        (*sp->sfb_bins)[0].fudge = sfb_random(sp);
        (*sp->sfb_bins)[1].fudge = sfb_random(sp);
        sp->sfb_allocation = sfb_allocation == 0 ?
            (uint16_t)(sp->sfb_qlim / 3) :
            (uint16_t)sfb_allocation;
        sp->sfb_drop_thresh = sp->sfb_allocation +
            (sp->sfb_allocation >> 1);
    }

    sp->sfb_clearpkts = 0;
    sp->sfb_current = 0;

    eff_rate = ifnet_output_linkrate(ifp);
    sp->sfb_eff_rate = eff_rate;

    sfb_calc_holdtime(sp, eff_rate);
    sfb_calc_pboxtime(sp, eff_rate);
    sfb_calc_hinterval(sp, NULL);
    ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay);
    sfb_calc_update_interval(sp, eff_rate);

    if (ev == CLASSQ_EV_LINK_DOWN ||
        ev == CLASSQ_EV_LINK_UP) {
        sfb_fclists_clean(sp);
    }

    bzero(sp->sfb_bins, sizeof(*sp->sfb_bins));
    bzero(&sp->sfb_stats, sizeof(sp->sfb_stats));

    if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose) {
        return;
    }

    log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, "
        "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
        "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps, "
        "target_qdelay=%llu nsec "
        "update_interval=%llu sec %llu nsec flags=0x%x\n",
        if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec,
        (u_int64_t)sp->sfb_pboxtime.tv_nsec,
        (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh,
        (int)sp->sfb_hinterval.tv_sec, (int)sizeof(*sp->sfb_bins),
        eff_rate, (u_int64_t)sp->sfb_target_qdelay,
        (u_int64_t)sp->sfb_update_interval.tv_sec,
        (u_int64_t)sp->sfb_update_interval.tv_nsec, sp->sfb_flags);
}
void
sfb_getstats(struct sfb *sp, struct sfb_stats *sps)
{
    sps->allocation = sp->sfb_allocation;
    sps->dropthresh = sp->sfb_drop_thresh;
    sps->clearpkts = sp->sfb_clearpkts;
    sps->current = sp->sfb_current;
    sps->target_qdelay = sp->sfb_target_qdelay;
    sps->min_estdelay = sp->sfb_min_qdelay;
    sps->delay_fcthreshold = (uint32_t)sp->sfb_fc_threshold;
    sps->flags = sp->sfb_flags;

    net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time);
    net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time);
    net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval);
    net_timernsec(&sp->sfb_update_interval, &sps->update_interval);
    *(&(sps->sfbstats)) = *(&(sp->sfb_stats));

    _CASSERT(sizeof((*sp->sfb_bins)[0].stats) ==
        sizeof(sps->binstats[0].stats));

    bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats,
        sizeof(sps->binstats[0].stats));
    bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats,
        sizeof(sps->binstats[1].stats));
}
static void
sfb_swap_bins(struct sfb *sp, u_int32_t len)
{
    int i, j, s;

    if (sp->sfb_flags & SFBF_SUSPENDED) {
        return;
    }

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    (*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */
    sp->sfb_clearpkts = len;
    sp->sfb_stats.num_rehash++;

    s = (sp->sfb_current ^= 1);     /* flip the bit (swap current) */

    if (classq_verbose) {
        log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, "
            "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len);
    }

    /* clear freezetime for all current bins */
    bzero(&(*sp->sfb_bins)[s].freezetime,
        sizeof((*sp->sfb_bins)[s].freezetime));

    /* clear/adjust bin statistics and flow control lists */
    for (i = 0; i < SFB_BINS; i++) {
        struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);

        if (!STAILQ_EMPTY(&fcl->fclist)) {
            sfb_fclist_append(sp, fcl);
        }

        for (j = 0; j < SFB_LEVELS; j++) {
            struct sfbbinstats *cbin, *wbin;

            cbin = SFB_BINST(sp, j, i, s);          /* current */
            wbin = SFB_BINST(sp, j, i, s ^ 1);      /* warm-up */

            cbin->pkts = 0;
            cbin->bytes = 0;
            if (cbin->pmark > SFB_MAX_PMARK) {
                cbin->pmark = SFB_MAX_PMARK;
            }
            if (cbin->pmark < 0) {
                cbin->pmark = 0;
            }

            /*
             * Keep pmark from before to identify
             * non-responsives immediately.
             */
            if (wbin->pmark > SFB_PMARK_WARM) {
                wbin->pmark = SFB_PMARK_WARM;
            }
        }
    }
}
static inline int
sfb_pcheck(struct sfb *sp, uint32_t pkt_sfb_hash)
{
#if SFB_LEVELS != 2
    int i, n;
#endif /* SFB_LEVELS != 2 */
    uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;
    int s;

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    /*
     * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
     * 0 otherwise; optimize for SFB_LEVELS=2.
     */
#if SFB_LEVELS == 2
    /*
     * Level 0: bin index at [0] for set 0; [2] for set 1
     * Level 1: bin index at [1] for set 0; [3] for set 1
     */
    if (SFB_BINST(sp, 0, SFB_BINMASK(pkt_sfb_hash8[(s << 1)]),
        s)->pmark < SFB_PMARK_TH ||
        SFB_BINST(sp, 1, SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]),
        s)->pmark < SFB_PMARK_TH) {
        return 0;
    }
#else /* SFB_LEVELS != 2 */
    for (i = 0; i < SFB_LEVELS; i++) {
        if (s == 0) {           /* set 0, bin index [0,1] */
            n = SFB_BINMASK(pkt_sfb_hash8[i]);
        } else {                /* set 1, bin index [2,3] */
            n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
        }

        if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH) {
            return 0;
        }
    }
#endif /* SFB_LEVELS != 2 */
    return 1;
}
static int
sfb_penalize(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t *pkt_sfb_flags,
    struct timespec *now)
{
    struct timespec delta = { .tv_sec = 0, .tv_nsec = 0 };
    uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

    /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
    if (!sfb_ratelimit || !sfb_pcheck(sp, pkt_sfb_hash)) {
        return 0;
    }

    net_timersub(now, &sp->sfb_pboxfreeze, &delta);
    if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) {
#if SFB_LEVELS != 2
        int i;
#endif /* SFB_LEVELS != 2 */
        struct sfbbinstats *bin;
        int n, w;

        w = sp->sfb_current ^ 1;
        VERIFY((w + (w ^ 1)) == 1);

        /*
         * Update warm-up bins; optimize for SFB_LEVELS=2
         */
#if SFB_LEVELS == 2
        /* Level 0: bin index at [0] for set 0; [2] for set 1 */
        n = SFB_BINMASK(pkt_sfb_hash8[(w << 1)]);
        bin = SFB_BINST(sp, 0, n, w);
        if (bin->pkts >= sp->sfb_allocation) {
            sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now);
        }

        /* Level 1: bin index at [1] for set 0; [3] for set 1 */
        n = SFB_BINMASK(pkt_sfb_hash8[(w << 1) + 1]);
        bin = SFB_BINST(sp, 1, n, w);
        if (bin->pkts >= sp->sfb_allocation) {
            sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now);
        }
#else /* SFB_LEVELS != 2 */
        for (i = 0; i < SFB_LEVELS; i++) {
            if (w == 0) {           /* set 0, bin index [0,1] */
                n = SFB_BINMASK(pkt_sfb_hash8[i]);
            } else {                /* set 1, bin index [2,3] */
                n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
            }

            bin = SFB_BINST(sp, i, n, w);
            if (bin->pkts >= sp->sfb_allocation) {
                sfb_increment_bin(sp, bin,
                    SFB_BINFT(sp, i, n, w), now);
            }
        }
#endif /* SFB_LEVELS != 2 */
        return 1;
    }

    /* non-conformant or else misclassified flow; queue it anyway */
    *pkt_sfb_flags |= SFB_PKT_PBOX;
    *(&sp->sfb_pboxfreeze) = *now;

    return 0;
}
static void
sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now, boolean_t inc)
{
    struct timespec delta;

    net_timersub(now, ft, &delta);
    if (net_timercmp(&delta, &sp->sfb_holdtime, <)) {
        if (classq_verbose > 1) {
            log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen "
                "(delta=%llu nsec)\n", if_name(sp->sfb_ifp),
                sp->sfb_qid, inc ? "increment" : "decrement",
                (u_int64_t)delta.tv_nsec);
        }
        return;
    }

    /* increment/decrement marking probability */
    *ft = *now;
    if (inc) {
        SFB_PMARK_INC(bin);
    } else {
        SFB_PMARK_DEC(bin);
    }
}
static void
sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now)
{
    return sfb_adjust_bin(sp, bin, ft, now, FALSE);
}

static void
sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now)
{
    return sfb_adjust_bin(sp, bin, ft, now, TRUE);
}
static inline void
sfb_dq_update_bins(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t pkt_len,
    struct timespec *now, u_int64_t qsize)
{
#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
    int i;
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
    struct sfbbinstats *bin;
    int s, n;
    struct sfb_fcl *fcl = NULL;
    uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    /*
     * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
     */
#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
    /* Level 0: bin index at [0] for set 0; [2] for set 1 */
    n = SFB_BINMASK(pkt_sfb_hash8[(s << 1)]);
    bin = SFB_BINST(sp, 0, n, s);

    VERIFY(bin->pkts > 0 && bin->bytes >= pkt_len);
    bin->pkts--;
    bin->bytes -= pkt_len;

    if (bin->pkts == 0) {
        sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
    }

    /* Deliver flow control feedback to the sockets */
    if (SFB_QUEUE_DELAYBASED(sp)) {
        if (!(SFB_IS_DELAYHIGH(sp)) ||
            bin->bytes <= sp->sfb_fc_threshold ||
            bin->pkts == 0 || qsize == 0) {
            fcl = SFB_FC_LIST(sp, n);
        }
    } else if (bin->pkts <= (sp->sfb_allocation >> 2)) {
        fcl = SFB_FC_LIST(sp, n);
    }

    if (fcl != NULL && !STAILQ_EMPTY(&fcl->fclist)) {
        sfb_fclist_append(sp, fcl);
    }
    fcl = NULL;

    /* Level 1: bin index at [1] for set 0; [3] for set 1 */
    n = SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]);
    bin = SFB_BINST(sp, 1, n, s);

    VERIFY(bin->pkts > 0 && bin->bytes >= (u_int64_t)pkt_len);
    bin->pkts--;
    bin->bytes -= pkt_len;
    if (bin->pkts == 0) {
        sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
    }
#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
    for (i = 0; i < SFB_LEVELS; i++) {
        if (s == 0) {           /* set 0, bin index [0,1] */
            n = SFB_BINMASK(pkt_sfb_hash8[i]);
        } else {                /* set 1, bin index [2,3] */
            n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
        }

        bin = SFB_BINST(sp, i, n, s);

        VERIFY(bin->pkts > 0 && bin->bytes >= pkt_len);
        bin->pkts--;
        bin->bytes -= pkt_len;
        if (bin->pkts == 0) {
            sfb_decrement_bin(sp, bin,
                SFB_BINFT(sp, i, n, s), now);
        }
        if (i != SFB_FC_LEVEL) {
            continue;
        }

        if (SFB_QUEUE_DELAYBASED(sp)) {
            if (!(SFB_IS_DELAYHIGH(sp)) ||
                bin->bytes <= sp->sfb_fc_threshold) {
                fcl = SFB_FC_LIST(sp, n);
            }
        } else if (bin->pkts <= (sp->sfb_allocation >> 2)) {
            fcl = SFB_FC_LIST(sp, n);
        }
        if (fcl != NULL && !STAILQ_EMPTY(&fcl->fclist)) {
            sfb_fclist_append(sp, fcl);
        }
        fcl = NULL;
    }
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
}
static inline void
sfb_eq_update_bins(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t pkt_len)
{
#if SFB_LEVELS != 2
    int i, n;
#endif /* SFB_LEVELS != 2 */
    int s;
    struct sfbbinstats *bin;
    uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    /*
     * Update current bins; optimize for SFB_LEVELS=2
     */
#if SFB_LEVELS == 2
    /* Level 0: bin index at [0] for set 0; [2] for set 1 */
    bin = SFB_BINST(sp, 0,
        SFB_BINMASK(pkt_sfb_hash8[(s << 1)]), s);
    bin->pkts++;
    bin->bytes += pkt_len;

    /* Level 1: bin index at [1] for set 0; [3] for set 1 */
    bin = SFB_BINST(sp, 1,
        SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]), s);
    bin->pkts++;
    bin->bytes += pkt_len;
#else /* SFB_LEVELS != 2 */
    for (i = 0; i < SFB_LEVELS; i++) {
        if (s == 0) {           /* set 0, bin index [0,1] */
            n = SFB_BINMASK(pkt_sfb_hash8[i]);
        } else {                /* set 1, bin index [2,3] */
            n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
        }

        bin = SFB_BINST(sp, i, n, s);
        bin->pkts++;
        bin->bytes += pkt_len;
    }
#endif /* SFB_LEVELS != 2 */
}
static boolean_t
sfb_bin_addfcentry(struct sfb *sp, pktsched_pkt_t *pkt, uint32_t pkt_sfb_hash,
    uint8_t flowsrc, uint32_t flowid)
{
    struct flowadv_fcentry *fce;
    struct sfb_fcl *fcl;
    int s;
    uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    if (flowid == 0) {
        sp->sfb_stats.null_flowid++;
        return FALSE;
    }

    /*
     * Use value at index 0 for set 0 and
     * value at index 2 for set 1
     */
    fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt_sfb_hash8[(s << 1)]));
    STAILQ_FOREACH(fce, &fcl->fclist, fce_link) {
        if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
            fce->fce_flowid == flowid) {
            /* Already on flow control list; just return */
            return TRUE;
        }
    }

    IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
    fce = pktsched_alloc_fcentry(pkt, sp->sfb_ifp, M_WAITOK);
    if (fce != NULL) {
        STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link);
        fcl->cnt++;
        sp->sfb_stats.flow_controlled++;
    }

    return fce != NULL;
}
/*
 * check if this flow needs to be flow-controlled or if this
 * packet needs to be dropped.
 */
static int
sfb_bin_mark_or_drop(struct sfb *sp, struct sfbbinstats *bin)
{
    int ret = 0;
    if (SFB_QUEUE_DELAYBASED(sp)) {
        /*
         * Mark or drop if this bin has more
         * bytes than the flowcontrol threshold.
         */
        if (SFB_IS_DELAYHIGH(sp) &&
            bin->bytes >= (sp->sfb_fc_threshold << 1)) {
            ret = 1;
        }
    } else {
        if (bin->pkts >= sp->sfb_allocation &&
            bin->pkts >= sp->sfb_drop_thresh) {
            ret = 1;        /* drop or mark */
        }
    }
    return ret;
}
/*
 * early-drop probability is kept in pmark of each bin of the flow
 */
static int
sfb_drop_early(struct sfb *sp, uint32_t pkt_sfb_hash, u_int16_t *pmin,
    struct timespec *now)
{
#if SFB_LEVELS != 2
    int i;
#endif /* SFB_LEVELS != 2 */
    struct sfbbinstats *bin;
    int s, n, ret = 0;
    uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    *pmin = (u_int16_t)-1;

    /*
     * Update current bins; optimize for SFB_LEVELS=2
     */
#if SFB_LEVELS == 2
    /* Level 0: bin index at [0] for set 0; [2] for set 1 */
    n = SFB_BINMASK(pkt_sfb_hash8[(s << 1)]);
    bin = SFB_BINST(sp, 0, n, s);
    if (*pmin > (u_int16_t)bin->pmark) {
        *pmin = (u_int16_t)bin->pmark;
    }

    /* Update SFB probability */
    if (bin->pkts >= sp->sfb_allocation) {
        sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
    }

    ret = sfb_bin_mark_or_drop(sp, bin);

    /* Level 1: bin index at [1] for set 0; [3] for set 1 */
    n = SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]);
    bin = SFB_BINST(sp, 1, n, s);
    if (*pmin > (u_int16_t)bin->pmark) {
        *pmin = (u_int16_t)bin->pmark;
    }
    if (bin->pkts >= sp->sfb_allocation) {
        sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
    }
#else /* SFB_LEVELS != 2 */
    for (i = 0; i < SFB_LEVELS; i++) {
        if (s == 0) {           /* set 0, bin index [0,1] */
            n = SFB_BINMASK(pkt_sfb_hash8[i]);
        } else {                /* set 1, bin index [2,3] */
            n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
        }

        bin = SFB_BINST(sp, i, n, s);
        if (*pmin > (u_int16_t)bin->pmark) {
            *pmin = (u_int16_t)bin->pmark;
        }

        if (bin->pkts >= sp->sfb_allocation) {
            sfb_increment_bin(sp, bin,
                SFB_BINFT(sp, i, n, s), now);
        }
        if (i == SFB_FC_LEVEL) {
            ret = sfb_bin_mark_or_drop(sp, bin);
        }
    }
#endif /* SFB_LEVELS != 2 */

    if (sp->sfb_flags & SFBF_SUSPENDED) {
        ret = 1;        /* drop or mark */
    }

    return ret;
}
static void
sfb_detect_dequeue_stall(struct sfb *sp, class_queue_t *q,
    struct timespec *now)
{
    struct timespec max_getqtime;

    if (!SFB_QUEUE_DELAYBASED(sp) || SFB_IS_DELAYHIGH(sp) ||
        qsize(q) <= SFB_MIN_FC_THRESHOLD_BYTES ||
        !net_timerisset(&sp->sfb_getqtime)) {
        return;
    }

    net_timeradd(&sp->sfb_getqtime, &sp->sfb_update_interval,
        &max_getqtime);
    if (net_timercmp(now, &max_getqtime, >)) {
        /*
         * No packets have been dequeued in an update interval
         * worth of time; it means that the queue is stalled.
         */
        SFB_SET_DELAY_HIGH(sp, q);
        sp->sfb_stats.dequeue_stall++;
    }
}
#define DTYPE_NODROP    0       /* no drop */
#define DTYPE_FORCED    1       /* a "forced" drop */
#define DTYPE_EARLY     2       /* an "unforced" (early) drop */
int
sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt,
    struct pf_mtag *t)
{
#if !PF_ECN
#pragma unused(t)
#endif /* !PF_ECN */
    struct timespec now;
    int droptype, s;
    u_int16_t pmin;
    int fc_adv = 0;
    int ret = CLASSQEQ_SUCCESS;
    uint32_t maxqsize = 0;
    uint64_t *pkt_timestamp;
    uint32_t *pkt_sfb_hash;
    uint16_t *pkt_sfb_hash16;
    uint32_t *pkt_sfb_flags;
    uint32_t pkt_flowid;
    volatile uint32_t *pkt_flags;
    uint8_t pkt_proto, pkt_flowsrc;

    s = sp->sfb_current;
    VERIFY((s + (s ^ 1)) == 1);

    pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
        &pkt_flowsrc, &pkt_proto, NULL);
    pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);
    pkt_sfb_hash16 = (uint16_t *)pkt_sfb_hash;

    switch (pkt->pktsched_ptype) {
    case QP_MBUF:
        /* See comments in <rdar://problem/14040693> */
        VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
        *pkt_flags |= PKTF_PRIV_GUARDED;
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    if (*pkt_timestamp > 0) {
        net_nsectimer(pkt_timestamp, &now);
    } else {
        nanouptime(&now);
        net_timernsec(&now, pkt_timestamp);
    }

    /* time to swap the bins? */
    if (net_timercmp(&now, &sp->sfb_nextreset, >=)) {
        net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
        sfb_swap_bins(sp, qlen(q));
        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);
    }

    if (!net_timerisset(&sp->sfb_update_time)) {
        net_timeradd(&now, &sp->sfb_update_interval,
            &sp->sfb_update_time);
    }

    /*
     * If getq time is not set because this is the first packet
     * or after idle time, set it now so that we can detect a stall.
     */
    if (qsize(q) == 0 && !net_timerisset(&sp->sfb_getqtime)) {
        *(&sp->sfb_getqtime) = *(&now);
    }

    *pkt_sfb_flags = 0;
    pkt_sfb_hash16[s] =
        (SFB_HASH(&pkt_flowid, sizeof(pkt_flowid),
        (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
    pkt_sfb_hash16[s ^ 1] =
        (SFB_HASH(&pkt_flowid, sizeof(pkt_flowid),
        (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK);

    /* check if the queue has been stalled */
    sfb_detect_dequeue_stall(sp, q, &now);

    /* see if we drop early */
    droptype = DTYPE_NODROP;
    if (sfb_drop_early(sp, *pkt_sfb_hash, &pmin, &now)) {
        /* flow control, mark or drop by sfb */
        if ((sp->sfb_flags & SFBF_FLOWCTL) &&
            (*pkt_flags & PKTF_FLOW_ADV)) {
            fc_adv = 1;
            /* drop all during suspension or for non-TCP */
            if ((sp->sfb_flags & SFBF_SUSPENDED) ||
                pkt_proto != IPPROTO_TCP) {
                droptype = DTYPE_EARLY;
                sp->sfb_stats.drop_early++;
            }
        }
#if PF_ECN
        /* XXX: only supported for mbuf */
        else if ((sp->sfb_flags & SFBF_ECN) &&
            (pkt->pktsched_ptype == QP_MBUF) &&
            (pkt_proto == IPPROTO_TCP) && /* only for TCP */
            ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) &&
            mark_ecn(m, t, sp->sfb_flags) &&
            !(sp->sfb_flags & SFBF_SUSPENDED)) {
            /* successfully marked; do not drop. */
            sp->sfb_stats.marked_packets++;
        }
#endif /* PF_ECN */
        else {
            /* unforced drop by sfb */
            droptype = DTYPE_EARLY;
            sp->sfb_stats.drop_early++;
        }
    }

    /* non-responsive flow penalty? */
    if (droptype == DTYPE_NODROP && sfb_penalize(sp, *pkt_sfb_hash,
        pkt_sfb_flags, &now)) {
        droptype = DTYPE_FORCED;
        sp->sfb_stats.drop_pbox++;
    }

    if (SFB_QUEUE_DELAYBASED(sp)) {
        maxqsize = SFB_QUEUE_DELAYBASED_MAXSIZE;
    } else {
        maxqsize = qlimit(q);
    }

    /*
     * When the queue length hits the queue limit, make it a forced
     * drop
     */
    if (droptype == DTYPE_NODROP && qlen(q) >= maxqsize) {
        if (pkt_proto == IPPROTO_TCP &&
            qlen(q) < (maxqsize + (maxqsize >> 1)) &&
            ((*pkt_flags & PKTF_TCP_REXMT) ||
            (sp->sfb_flags & SFBF_LAST_PKT_DROPPED))) {
            /*
             * At some level, dropping packets will make the
             * flows back off and will keep memory requirements
             * under control. But we should not cause a tail
             * drop because it can take a long time for a
             * TCP flow to recover. We should try to drop
             * alternate packets instead.
             */
            sp->sfb_flags &= ~SFBF_LAST_PKT_DROPPED;
        } else {
            droptype = DTYPE_FORCED;
            sp->sfb_stats.drop_queue++;
            sp->sfb_flags |= SFBF_LAST_PKT_DROPPED;
        }
    }

    if (fc_adv == 1 && droptype != DTYPE_FORCED &&
        sfb_bin_addfcentry(sp, pkt, *pkt_sfb_hash, pkt_flowsrc,
        pkt_flowid)) {
        /* deliver flow control advisory error */
        if (droptype == DTYPE_NODROP) {
            ret = CLASSQEQ_SUCCESS_FC;
            VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED));
        } else if (sp->sfb_flags & SFBF_SUSPENDED) {
            /* drop due to suspension */
            ret = CLASSQEQ_DROP_SP;
        } else {
            /* drop due to flow-control */
            ret = CLASSQEQ_DROP_FC;
        }
    }

    /* if successful enqueue this packet, else drop it */
    if (droptype == DTYPE_NODROP) {
        VERIFY(pkt->pktsched_ptype == qptype(q));
        _addq(q, &pkt->pktsched_pkt);
    } else {
        IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
        return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
    }

    if (!(*pkt_sfb_flags & SFB_PKT_PBOX)) {
        sfb_eq_update_bins(sp, *pkt_sfb_hash,
            pktsched_get_pkt_len(pkt));
    } else {
        sp->sfb_stats.pbox_packets++;
    }

    /* successfully queued */
    return ret;
}
static void *
sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
    pktsched_pkt_t *pkt)
{
    struct timespec now;
    uint64_t *pkt_timestamp;
    volatile uint32_t *pkt_flags;
    uint32_t *pkt_sfb_flags;
    uint32_t *pkt_sfb_hash;
    classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);

    if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) {
        return NULL;
    }

    nanouptime(&now);

    /* flow of 0 means head of queue */
    if (flow == 0) {
        _getq(q, &p);
    } else {
        _getq_flow(q, &p, flow);
    }

    if (p.cp_ptype == QP_INVALID) {
        if (!purge) {
            net_timerclear(&sp->sfb_getqtime);
        }
        return NULL;
    }

    pktsched_pkt_encap(pkt, &p);
    pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL,
        NULL, NULL, NULL);
    pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);

    /* See comments in <rdar://problem/14040693> */
    switch (p.cp_ptype) {
    case QP_MBUF:
        VERIFY(*pkt_flags & PKTF_PRIV_GUARDED);
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    if (!purge) {
        /* calculate EWMA of dequeues */
        if (net_timerisset(&sp->sfb_getqtime)) {
            struct timespec delta;
            u_int64_t avg, new;

            net_timersub(&now, &sp->sfb_getqtime, &delta);
            net_timernsec(&delta, &new);
            avg = sp->sfb_stats.dequeue_avg;
            if (avg > 0) {
                int decay = DEQUEUE_DECAY;
                /*
                 * If the time since last dequeue is
                 * significantly greater than the current
                 * average, weigh the average more against
                 * the old value.
                 */
                if (DEQUEUE_SPIKE(new, avg)) {
                    decay += 5;
                }
                avg = (((avg << decay) - avg) + new) >> decay;
            } else {
                avg = new;
            }
            sp->sfb_stats.dequeue_avg = avg;
        }
        *(&sp->sfb_getqtime) = *(&now);
    }

    if (!purge && SFB_QUEUE_DELAYBASED(sp)) {
        u_int64_t dequeue_ns, queue_delay = 0;
        net_timernsec(&now, &dequeue_ns);
        if (dequeue_ns > *pkt_timestamp) {
            queue_delay = dequeue_ns - *pkt_timestamp;
        }

        if (sp->sfb_min_qdelay == 0 ||
            (queue_delay > 0 && queue_delay < sp->sfb_min_qdelay)) {
            sp->sfb_min_qdelay = queue_delay;
        }
        if (net_timercmp(&now, &sp->sfb_update_time, >=)) {
            if (sp->sfb_min_qdelay > sp->sfb_target_qdelay) {
                if (!SFB_IS_DELAYHIGH(sp)) {
                    SFB_SET_DELAY_HIGH(sp, q);
                }
            } else {
                sp->sfb_flags &= ~(SFBF_DELAYHIGH);
                sp->sfb_fc_threshold = 0;
            }
            net_timeradd(&now, &sp->sfb_update_interval,
                &sp->sfb_update_time);
            sp->sfb_min_qdelay = 0;
        }
    }
    *pkt_timestamp = 0;

    /*
     * Clearpkts are the ones which were in the queue when the hash
     * function was perturbed.  Since the perturbation value (fudge),
     * and thus bin information for these packets is not known, we do
     * not change accounting information while dequeuing these packets.
     * It is important not to set the hash interval too small due to
     * this reason.  A rule of thumb is to set it to K*D, where D is
     * the time taken to drain queue.
     */
    if (*pkt_sfb_flags & SFB_PKT_PBOX) {
        *pkt_sfb_flags &= ~SFB_PKT_PBOX;
        if (sp->sfb_clearpkts > 0) {
            sp->sfb_clearpkts--;
        }
    } else if (sp->sfb_clearpkts > 0) {
        sp->sfb_clearpkts--;
    } else {
        sfb_dq_update_bins(sp, *pkt_sfb_hash, pktsched_get_pkt_len(pkt),
            &now, qsize(q));
    }

    switch (p.cp_ptype) {
    case QP_MBUF:
        /* See comments in <rdar://problem/14040693> */
        *pkt_flags &= ~PKTF_PRIV_GUARDED;
        break;
    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    /*
     * If the queue becomes empty before the update interval, reset
     * the flow control threshold
     */
    if (qsize(q) == 0) {
        sp->sfb_flags &= ~SFBF_DELAYHIGH;
        sp->sfb_min_qdelay = 0;
        sp->sfb_fc_threshold = 0;
        net_timerclear(&sp->sfb_update_time);
        net_timerclear(&sp->sfb_getqtime);
    }
    return pkt->pktsched_pkt_mbuf;
}
void
sfb_getq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt)
{
    sfb_getq_flow(sp, q, 0, FALSE, pkt);
}
void
sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
    u_int32_t *bytes)
{
    u_int32_t cnt = 0, len = 0;
    pktsched_pkt_t pkt;

    IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
    while (sfb_getq_flow(sp, q, flow, TRUE, &pkt) != NULL) {
        cnt++;
        len += pktsched_get_pkt_len(&pkt);
        pktsched_free_pkt(&pkt);
    }

    if (packets != NULL) {
        *packets = cnt;
    }
    if (bytes != NULL) {
        *bytes = len;
    }
}
void
sfb_updateq(struct sfb *sp, cqev_t ev)
{
    struct ifnet *ifp = sp->sfb_ifp;

    VERIFY(ifp != NULL);

    switch (ev) {
    case CLASSQ_EV_LINK_BANDWIDTH: {
        u_int64_t eff_rate = ifnet_output_linkrate(ifp);

        /* update parameters only if rate has changed */
        if (eff_rate == sp->sfb_eff_rate) {
            break;
        }

        if (classq_verbose) {
            log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new "
                "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid,
                eff_rate);
        }
        sfb_calc_holdtime(sp, eff_rate);
        sfb_calc_pboxtime(sp, eff_rate);
        ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay);
        sfb_calc_update_interval(sp, eff_rate);
        sp->sfb_eff_rate = eff_rate;
        break;
    }

    case CLASSQ_EV_LINK_UP:
    case CLASSQ_EV_LINK_DOWN:
        if (classq_verbose) {
            log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to "
                "link %s\n", if_name(ifp), sp->sfb_qid,
                (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN");
        }
        sfb_resetq(sp, ev);
        break;

    case CLASSQ_EV_LINK_LATENCY:
    case CLASSQ_EV_LINK_MTU:
    default:
        break;
    }
}
void
sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on)
{
    struct ifnet *ifp = sp->sfb_ifp;

    VERIFY(ifp != NULL);

    if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) ||
        (!on && !(sp->sfb_flags & SFBF_SUSPENDED))) {
        return;
    }

    if (!(sp->sfb_flags & SFBF_FLOWCTL)) {
        log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since "
            "flow-control is not enabled", if_name(ifp), sp->sfb_qid,
            (on ? "suspend" : "resume"));
        return;
    }

    if (classq_verbose) {
        log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s",
            if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING"));
    }

    if (on) {
        sp->sfb_flags |= SFBF_SUSPENDED;
    } else {
        sp->sfb_flags &= ~SFBF_SUSPENDED;
        sfb_swap_bins(sp, qlen(q));
    }
}