/*
 * Copyright (c) 2011-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <kern/zalloc.h>

#include <net/if_var.h>
#include <net/if_types.h>
#include <net/flowadv.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#include <net/classq/classq_sfb.h>
#include <net/flowhash.h>
#include <net/net_osdep.h>
#include <dev/random/randomdev.h>
/*
 * Stochastic Fair Blue
 *
 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
 *
 * Based on the NS code with the following parameters:
 *
 *   hold-time: 10ms-50ms (randomized)
 *   pbox-time: 50-100ms (randomized)
 *   hinterval: 11-23 (randomized)
 *
 * This implementation uses L = 2 and N = 32 for 2 sets of:
 *
 *   B[L][N]: L x N array of bins (L levels, N bins per level)
 *
 * Each set effectively creates 32^2 virtual buckets (bin combinations)
 * while using only O(32*2) states.
 *
 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
 * used as index for the bins across the 2 levels, where level 1 uses [0,2]
 * and level 2 uses [1,3]. The 2 values per level correspond to the indices
 * for the current and warm-up sets (section 4.4 in the SFB paper regarding
 * Moving Hash Functions explains the purposes of these 2 sets.)
 */

/*
 * Use Murmur3A_x86_32 for hash function. It seems to perform consistently
 * across platforms for 1-word key (32-bit flowhash value). See flowhash.h
 * for other alternatives. We only need 16-bit hash output.
 */
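/*
 * Worked example (illustrative; not part of the original sources): on a
 * little-endian machine a 32-bit flow hash of 0x11223344 is seen as the
 * octets [0]=0x44, [1]=0x33, [2]=0x22, [3]=0x11. With set 0 current,
 * octets [0] and [1] index levels 0 and 1 of the current set, while
 * octets [2] and [3] index the warm-up set. SFB_BINMASK() below keeps
 * the low 5 bits of each octet, so this packet hashes to bins 0x04
 * (level 0) and 0x13 (level 1) of the current set.
 */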
#define SFB_HASH	net_flowhash_mh3_x86_32
#define SFB_HASHMASK	HASHMASK(16)

#define SFB_BINMASK(_x) \
	((_x) & HASHMASK(SFB_BINS_SHIFT))

#define SFB_BINST(_sp, _l, _n, _c) \
	(&(*(_sp)->sfb_bins)[_c].stats[_l][_n])

#define SFB_BINFT(_sp, _l, _n, _c) \
	(&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])

#define SFB_FC_LIST(_sp, _n) \
	(&(*(_sp)->sfb_fc_lists)[_n])
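/*
 * Note: with N = 32 bins per level as described above, SFB_BINS_SHIFT
 * (defined in classq_sfb.h) is expected to be 5, so SFB_BINMASK() keeps
 * the low 5 bits of a hash octet, e.g. SFB_BINMASK(0x44) == 0x04.
 */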
/*
 * The holdtime parameter determines the minimum time interval between
 * two successive updates of the marking probability. In the event the
 * uplink speed is not known, a default value is chosen and is randomized
 * to be within the following range.
 */
#define HOLDTIME_BASE	(100ULL * 1000 * 1000)	/* 100ms */
#define HOLDTIME_MIN	(10ULL * 1000 * 1000)	/* 10ms */
#define HOLDTIME_MAX	(100ULL * 1000 * 1000)	/* 100ms */
/*
 * The pboxtime parameter determines the bandwidth allocated for rogue
 * flows, i.e. the rate limiting bandwidth. In the event the uplink speed
 * is not known, a default value is chosen and is randomized to be within
 * the following range.
 */
#define PBOXTIME_BASE	(300ULL * 1000 * 1000)	/* 300ms */
#define PBOXTIME_MIN	(30ULL * 1000 * 1000)	/* 30ms */
#define PBOXTIME_MAX	(300ULL * 1000 * 1000)	/* 300ms */
/*
 * Target queueing delay is the amount of extra delay that can be added
 * to accommodate variations in the link bandwidth. The queue should be
 * large enough to induce this much delay and nothing more than that.
 */
#define TARGET_QDELAY_BASE	(10ULL * 1000 * 1000)		/* 10ms */
#define TARGET_QDELAY_MIN	(10ULL * 1000)			/* 10us */
#define TARGET_QDELAY_MAX	(20ULL * 1000 * 1000 * 1000)	/* 20s */
/*
 * Update interval for checking the extra delay added by the queue. This
 * should be the 90th-95th percentile of the RTT experienced by any TCP
 * connection, so that it accommodates bursts of traffic.
 */
#define UPDATE_INTERVAL_BASE	(100ULL * 1000 * 1000)		/* 100ms */
#define UPDATE_INTERVAL_MIN	(100ULL * 1000 * 1000)		/* 100ms */
#define UPDATE_INTERVAL_MAX	(10ULL * 1000 * 1000 * 1000)	/* 10s */

#define SFB_RANDOM(sp, tmin, tmax)	((sfb_random(sp) % (tmax)) + (tmin))
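/*
 * Note that the modulo form yields a value in [tmin, tmin + tmax) rather
 * than [tmin, tmax]; the result is only used to seed randomized defaults
 * within the ranges documented above.
 */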
#define SFB_PKT_PBOX	0x1	/* in penalty box */

/* The following mantissa values are in SFB_FP_SHIFT Q format */
#define SFB_MAX_PMARK	(1 << SFB_FP_SHIFT)	/* Q14 representation of 1.00 */

/*
 * These are d1 (increment) and d2 (decrement) parameters, used to determine
 * the amount by which the marking probability is incremented when the queue
 * overflows, or is decremented when the link is idle. d1 is set higher than
 * d2, because link underutilization can occur when congestion management is
 * either too conservative or too aggressive, but packet loss occurs only
 * when congestion management is too conservative. By weighing heavily
 * against packet loss, it can quickly react to a substantial increase in
 * traffic load.
 */
#define SFB_INCREMENT	82	/* Q14 representation of 0.005 */
#define SFB_DECREMENT	16	/* Q14 representation of 0.001 */

#define SFB_PMARK_TH	16056	/* Q14 representation of 0.98 */
#define SFB_PMARK_WARM	3276	/* Q14 representation of 0.2 */
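/*
 * Q14 arithmetic, for reference: a Q14 value v represents v / 2^14,
 * so SFB_INCREMENT is 82 / 16384 ~= 0.005, SFB_DECREMENT is
 * 16 / 16384 ~= 0.001, SFB_PMARK_TH is 16056 / 16384 ~= 0.98 and
 * SFB_PMARK_WARM is 3276 / 16384 ~= 0.2.
 */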
#define SFB_PMARK_INC(_bin) do {					\
	(_bin)->pmark += sfb_increment;					\
	if ((_bin)->pmark > SFB_MAX_PMARK)				\
		(_bin)->pmark = SFB_MAX_PMARK;				\
} while (0)

#define SFB_PMARK_DEC(_bin) do {					\
	if ((_bin)->pmark > 0) {					\
		(_bin)->pmark -= sfb_decrement;				\
		if ((_bin)->pmark < 0)					\
			(_bin)->pmark = 0;				\
	}								\
} while (0)
/* Minimum number of bytes in queue to get flow controlled */
#define SFB_MIN_FC_THRESHOLD_BYTES	7500

#define SFB_SET_DELAY_HIGH(_sp_, _q_) do {				\
	(_sp_)->sfb_flags |= SFBF_DELAYHIGH;				\
	(_sp_)->sfb_fc_threshold = max(SFB_MIN_FC_THRESHOLD_BYTES,	\
	    (qsize((_q_)) >> 3));					\
} while (0)
#define SFB_QUEUE_DELAYBASED(_sp_)	((_sp_)->sfb_flags & SFBF_DELAYBASED)
#define SFB_IS_DELAYHIGH(_sp_)		((_sp_)->sfb_flags & SFBF_DELAYHIGH)
#define SFB_QUEUE_DELAYBASED_MAXSIZE	2048	/* max pkts */

#define HINTERVAL_MIN	(10)	/* 10 seconds */
#define HINTERVAL_MAX	(20)	/* 20 seconds */
#define SFB_HINTERVAL(sp) ((sfb_random(sp) % HINTERVAL_MAX) + HINTERVAL_MIN)
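/*
 * With the defaults above, SFB_HINTERVAL() evaluates to a rehash interval
 * of 10-29 seconds ((random % 20) + 10), comparable to the 11-23s
 * randomized hinterval quoted from the NS parameters at the top of
 * this file.
 */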
#define DEQUEUE_DECAY	7		/* ilog2 of EWMA decay rate, (128) */
#define DEQUEUE_SPIKE(_new, _old)	\
	((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
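/*
 * For reference: with DEQUEUE_DECAY = 7, the EWMA computed in
 * sfb_getq_flow() is avg = ((avg << 7) - avg + new) >> 7, i.e.
 * avg = (127 * avg + new) / 128, so each new dequeue interval carries
 * a 1/128 weight. DEQUEUE_SPIKE() is true when a sample deviates from
 * the current average by more than 2048 (1 << 11) times the average,
 * in which case the decay is lengthened to dampen the spike.
 */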
#define SFB_ZONE_MAX		32	/* maximum elements in zone */
#define SFB_ZONE_NAME		"classq_sfb"	/* zone name */

#define SFB_BINS_ZONE_MAX	32	/* maximum elements in zone */
#define SFB_BINS_ZONE_NAME	"classq_sfb_bins"	/* zone name */

#define SFB_FCL_ZONE_MAX	32	/* maximum elements in zone */
#define SFB_FCL_ZONE_NAME	"classq_sfb_fcl"	/* zone name */

/* Place the flow control entries in current bin on level 0 */
#define SFB_FC_LEVEL	0
static unsigned int sfb_size;		/* size of zone element */
static struct zone *sfb_zone;		/* zone for sfb */

static unsigned int sfb_bins_size;	/* size of zone element */
static struct zone *sfb_bins_zone;	/* zone for sfb_bins */

static unsigned int sfb_fcl_size;	/* size of zone element */
static struct zone *sfb_fcl_zone;	/* zone for sfb_fc_lists */
/* internal function prototypes */
static u_int32_t sfb_random(struct sfb *);
static void *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t, boolean_t,
    pktsched_pkt_t *);
static void sfb_resetq(struct sfb *, cqev_t);
static void sfb_calc_holdtime(struct sfb *, u_int64_t);
static void sfb_calc_pboxtime(struct sfb *, u_int64_t);
static void sfb_calc_hinterval(struct sfb *, u_int64_t *);
static void sfb_calc_update_interval(struct sfb *, u_int64_t);
static void sfb_swap_bins(struct sfb *, u_int32_t);
static inline int sfb_pcheck(struct sfb *, uint32_t);
static int sfb_penalize(struct sfb *, uint32_t, uint32_t *, struct timespec *);
static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *, boolean_t);
static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *);
static void sfb_increment_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *);
static inline void sfb_dq_update_bins(struct sfb *, uint32_t, uint32_t,
    struct timespec *, u_int32_t qsize);
static inline void sfb_eq_update_bins(struct sfb *, uint32_t, uint32_t);
static int sfb_drop_early(struct sfb *, uint32_t, u_int16_t *,
    struct timespec *);
static boolean_t sfb_bin_addfcentry(struct sfb *, pktsched_pkt_t *,
    uint32_t, uint8_t, uint32_t);
static void sfb_fclist_append(struct sfb *, struct sfb_fcl *);
static void sfb_fclists_clean(struct sfb *sp);
static int sfb_bin_mark_or_drop(struct sfb *sp, struct sfbbinstats *bin);
static void sfb_detect_dequeue_stall(struct sfb *sp, class_queue_t *,
    struct timespec *);
SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "SFB");
static u_int64_t sfb_holdtime = 0;	/* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_holdtime, "SFB freeze time in nanoseconds");

static u_int64_t sfb_pboxtime = 0;	/* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_pboxtime, "SFB penalty box time in nanoseconds");

static u_int64_t sfb_hinterval;
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_hinterval, "SFB hash interval in nanoseconds");

static u_int32_t sfb_increment = SFB_INCREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_increment, SFB_INCREMENT, "SFB increment [d1]");

static u_int32_t sfb_decrement = SFB_DECREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]");

static u_int32_t sfb_allocation = 0;	/* 0 means "automatic" */
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_allocation, 0, "SFB bin allocation");

static u_int32_t sfb_ratelimit = 0;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_ratelimit, 0, "SFB rate limit");
#define KBPS	(1ULL * 1000)		/* 1 Kbits per second */
#define MBPS	(1ULL * 1000 * 1000)	/* 1 Mbits per second */
#define GBPS	(MBPS * 1000)		/* 1 Gbits per second */
struct sfb_time_tbl {
	u_int64_t speed;	/* uplink speed */
	u_int64_t holdtime;	/* hold time */
	u_int64_t pboxtime;	/* penalty box time */
};
static struct sfb_time_tbl sfb_ttbl[] = {
	{ .speed = 1 * MBPS,   .holdtime = HOLDTIME_BASE * 1000, .pboxtime = PBOXTIME_BASE * 1000 },
	{ .speed = 10 * MBPS,  .holdtime = HOLDTIME_BASE * 100,  .pboxtime = PBOXTIME_BASE * 100  },
	{ .speed = 100 * MBPS, .holdtime = HOLDTIME_BASE * 10,   .pboxtime = PBOXTIME_BASE * 10   },
	{ .speed = 1 * GBPS,   .holdtime = HOLDTIME_BASE,        .pboxtime = PBOXTIME_BASE        },
	{ .speed = 10 * GBPS,  .holdtime = HOLDTIME_BASE / 10,   .pboxtime = PBOXTIME_BASE / 10   },
	{ .speed = 100 * GBPS, .holdtime = HOLDTIME_BASE / 100,  .pboxtime = PBOXTIME_BASE / 100  },
	{ .speed = 0,          .holdtime = 0,                    .pboxtime = 0                    }
};
void
sfb_init(void)
{
	_CASSERT(SFBF_ECN4 == CLASSQF_ECN4);
	_CASSERT(SFBF_ECN6 == CLASSQF_ECN6);

	sfb_size = sizeof(struct sfb);
	sfb_zone = zinit(sfb_size, SFB_ZONE_MAX * sfb_size,
	    0, SFB_ZONE_NAME);
	if (sfb_zone == NULL) {
		panic("%s: failed allocating %s", __func__, SFB_ZONE_NAME);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	zone_change(sfb_zone, Z_EXPAND, TRUE);
	zone_change(sfb_zone, Z_CALLERACCT, TRUE);

	sfb_bins_size = sizeof(struct sfb_bins);
	sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size,
	    0, SFB_BINS_ZONE_NAME);
	if (sfb_bins_zone == NULL) {
		panic("%s: failed allocating %s", __func__, SFB_BINS_ZONE_NAME);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	zone_change(sfb_bins_zone, Z_EXPAND, TRUE);
	zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE);

	sfb_fcl_size = sizeof(struct sfb_fcl);
	sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size,
	    0, SFB_FCL_ZONE_NAME);
	if (sfb_fcl_zone == NULL) {
		panic("%s: failed allocating %s", __func__, SFB_FCL_ZONE_NAME);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	zone_change(sfb_fcl_zone, Z_EXPAND, TRUE);
	zone_change(sfb_fcl_zone, Z_CALLERACCT, TRUE);
}
static u_int32_t
sfb_random(struct sfb *sp)
{
	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
	return RandomULong();
}
static void
sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw)
{
	u_int64_t holdtime;

	if (sfb_holdtime != 0) {
		holdtime = sfb_holdtime;
	} else if (outbw == 0) {
		holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
	} else {
		u_int64_t n, i;

		n = sfb_ttbl[0].holdtime;
		for (i = 0; sfb_ttbl[i].speed != 0; i++) {
			if (outbw < sfb_ttbl[i].speed) {
				break;
			}
			n = sfb_ttbl[i].holdtime;
		}
		holdtime = n;
	}
	net_nsectimer(&holdtime, &sp->sfb_holdtime);
}
static void
sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw)
{
	u_int64_t pboxtime;

	if (sfb_pboxtime != 0) {
		pboxtime = sfb_pboxtime;
	} else if (outbw == 0) {
		pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX);
	} else {
		u_int64_t n, i;

		n = sfb_ttbl[0].pboxtime;
		for (i = 0; sfb_ttbl[i].speed != 0; i++) {
			if (outbw < sfb_ttbl[i].speed) {
				break;
			}
			n = sfb_ttbl[i].pboxtime;
		}
		pboxtime = n;
	}
	net_nsectimer(&pboxtime, &sp->sfb_pboxtime);
	net_timerclear(&sp->sfb_pboxfreeze);
}
static void
sfb_calc_hinterval(struct sfb *sp, u_int64_t *t)
{
	u_int64_t hinterval = 0;
	struct timespec now;

	if (t != NULL) {
		/*
		 * TODO adi@apple.com: use dq_avg to derive hinterval.
		 */
		hinterval = *t;
	}

	if (sfb_hinterval != 0) {
		hinterval = sfb_hinterval;
	} else if (t == NULL || hinterval == 0) {
		hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC);
	}

	net_nsectimer(&hinterval, &sp->sfb_hinterval);

	nanouptime(&now);
	net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
}
static void
sfb_calc_update_interval(struct sfb *sp, u_int64_t out_bw)
{
#pragma unused(out_bw)
	u_int64_t update_interval = 0;

	ifclassq_calc_update_interval(&update_interval);
	net_nsectimer(&update_interval, &sp->sfb_update_interval);
}
/*
 * sfb support routines
 */
struct sfb *
sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags)
{
	struct sfb *sp;
	int i;

	VERIFY(ifp != NULL && qlim > 0);

	sp = zalloc(sfb_zone);
	if (sp == NULL) {
		log(LOG_ERR, "%s: SFB unable to allocate\n", if_name(ifp));
		return NULL;
	}
	bzero(sp, sfb_size);

	if ((sp->sfb_bins = zalloc(sfb_bins_zone)) == NULL) {
		log(LOG_ERR, "%s: SFB unable to allocate bins\n", if_name(ifp));
		sfb_destroy(sp);
		return NULL;
	}
	bzero(sp->sfb_bins, sfb_bins_size);

	if ((sp->sfb_fc_lists = zalloc(sfb_fcl_zone)) == NULL) {
		log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n",
		    if_name(ifp));
		sfb_destroy(sp);
		return NULL;
	}
	bzero(sp->sfb_fc_lists, sfb_fcl_size);

	for (i = 0; i < SFB_BINS; ++i) {
		STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist);
	}

	sp->sfb_ifp = ifp;
	sp->sfb_qlim = qlim;
	sp->sfb_qid = qid;
	sp->sfb_flags = (flags & SFBF_USERFLAGS);
#if !PF_ECN
	if (sp->sfb_flags & SFBF_ECN) {
		sp->sfb_flags &= ~SFBF_ECN;
		log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring "
		    "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid);
	}
#endif /* !PF_ECN */

	sfb_resetq(sp, CLASSQ_EV_INIT);

	return sp;
}
static void
sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl)
{
	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
	VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0);
	sp->sfb_stats.flow_feedback += fcl->cnt;
	fcl->cnt = 0;

	flowadv_add(&fcl->fclist);
	VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist));
}
static void
sfb_fclists_clean(struct sfb *sp)
{
	int i;

	/* Move all the flow control entries to the flowadv list */
	for (i = 0; i < SFB_BINS; ++i) {
		struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
		if (!STAILQ_EMPTY(&fcl->fclist)) {
			sfb_fclist_append(sp, fcl);
		}
	}
}
void
sfb_destroy(struct sfb *sp)
{
	sfb_fclists_clean(sp);
	if (sp->sfb_bins != NULL) {
		zfree(sfb_bins_zone, sp->sfb_bins);
		sp->sfb_bins = NULL;
	}
	if (sp->sfb_fc_lists != NULL) {
		zfree(sfb_fcl_zone, sp->sfb_fc_lists);
		sp->sfb_fc_lists = NULL;
	}
	zfree(sfb_zone, sp);
}
static void
sfb_resetq(struct sfb *sp, cqev_t ev)
{
	struct ifnet *ifp = sp->sfb_ifp;
	u_int64_t eff_rate;

	VERIFY(ifp != NULL);

	if (ev != CLASSQ_EV_LINK_DOWN) {
		(*sp->sfb_bins)[0].fudge = sfb_random(sp);
		(*sp->sfb_bins)[1].fudge = sfb_random(sp);
		sp->sfb_allocation = ((sfb_allocation == 0) ?
		    (sp->sfb_qlim / 3) : sfb_allocation);
		sp->sfb_drop_thresh = sp->sfb_allocation +
		    (sp->sfb_allocation >> 1);
	}

	sp->sfb_clearpkts = 0;
	sp->sfb_current = 0;

	eff_rate = ifnet_output_linkrate(ifp);
	sp->sfb_eff_rate = eff_rate;

	sfb_calc_holdtime(sp, eff_rate);
	sfb_calc_pboxtime(sp, eff_rate);
	sfb_calc_hinterval(sp, NULL);
	ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay);
	sfb_calc_update_interval(sp, eff_rate);

	if (ev == CLASSQ_EV_LINK_DOWN ||
	    ev == CLASSQ_EV_LINK_UP) {
		sfb_fclists_clean(sp);
	}

	bzero(sp->sfb_bins, sizeof(*sp->sfb_bins));
	bzero(&sp->sfb_stats, sizeof(sp->sfb_stats));

	if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose) {
		return;
	}

	log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, "
	    "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
	    "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps, "
	    "target_qdelay=%llu nsec, "
	    "update_interval=%llu sec %llu nsec flags=0x%x\n",
	    if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec,
	    (u_int64_t)sp->sfb_pboxtime.tv_nsec,
	    (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh,
	    (int)sp->sfb_hinterval.tv_sec, (int)sizeof(*sp->sfb_bins),
	    eff_rate, (u_int64_t)sp->sfb_target_qdelay,
	    (u_int64_t)sp->sfb_update_interval.tv_sec,
	    (u_int64_t)sp->sfb_update_interval.tv_nsec, sp->sfb_flags);
}
void
sfb_getstats(struct sfb *sp, struct sfb_stats *sps)
{
	sps->allocation = sp->sfb_allocation;
	sps->dropthresh = sp->sfb_drop_thresh;
	sps->clearpkts = sp->sfb_clearpkts;
	sps->current = sp->sfb_current;
	sps->target_qdelay = sp->sfb_target_qdelay;
	sps->min_estdelay = sp->sfb_min_qdelay;
	sps->delay_fcthreshold = sp->sfb_fc_threshold;
	sps->flags = sp->sfb_flags;

	net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time);
	net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time);
	net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval);
	net_timernsec(&sp->sfb_update_interval, &sps->update_interval);
	*(&(sps->sfbstats)) = *(&(sp->sfb_stats));

	_CASSERT(sizeof((*sp->sfb_bins)[0].stats) ==
	    sizeof(sps->binstats[0].stats));

	bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats,
	    sizeof(sps->binstats[0].stats));
	bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats,
	    sizeof(sps->binstats[1].stats));
}
static void
sfb_swap_bins(struct sfb *sp, u_int32_t len)
{
	int i, j, s;

	if (sp->sfb_flags & SFBF_SUSPENDED) {
		return;
	}

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	(*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */
	sp->sfb_clearpkts = len;
	sp->sfb_stats.num_rehash++;

	s = (sp->sfb_current ^= 1);	/* flip the bit (swap current) */

	if (classq_verbose) {
		log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, "
		    "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len);
	}

	/* clear freezetime for all current bins */
	bzero(&(*sp->sfb_bins)[s].freezetime,
	    sizeof((*sp->sfb_bins)[s].freezetime));

	/* clear/adjust bin statistics and flow control lists */
	for (i = 0; i < SFB_BINS; i++) {
		struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);

		if (!STAILQ_EMPTY(&fcl->fclist)) {
			sfb_fclist_append(sp, fcl);
		}

		for (j = 0; j < SFB_LEVELS; j++) {
			struct sfbbinstats *cbin, *wbin;

			cbin = SFB_BINST(sp, j, i, s);		/* current */
			wbin = SFB_BINST(sp, j, i, s ^ 1);	/* warm-up */

			cbin->pkts = 0;
			cbin->bytes = 0;
			if (cbin->pmark > SFB_MAX_PMARK) {
				cbin->pmark = SFB_MAX_PMARK;
			}
			if (cbin->pmark < 0) {
				cbin->pmark = 0;
			}

			/*
			 * Keep pmark from before to identify
			 * non-responsives immediately.
			 */
			if (wbin->pmark > SFB_PMARK_WARM) {
				wbin->pmark = SFB_PMARK_WARM;
			}
		}
	}
}
static inline int
sfb_pcheck(struct sfb *sp, uint32_t pkt_sfb_hash)
{
#if SFB_LEVELS != 2
	int i, n;
#endif /* SFB_LEVELS != 2 */
	uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;
	int s;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/*
	 * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
	 * 0 otherwise; optimize for SFB_LEVELS=2.
	 */
#if SFB_LEVELS == 2
	/*
	 * Level 0: bin index at [0] for set 0; [2] for set 1
	 * Level 1: bin index at [1] for set 0; [3] for set 1
	 */
	if (SFB_BINST(sp, 0, SFB_BINMASK(pkt_sfb_hash8[(s << 1)]),
	    s)->pmark < SFB_PMARK_TH ||
	    SFB_BINST(sp, 1, SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]),
	    s)->pmark < SFB_PMARK_TH) {
		return 0;
	}
#else /* SFB_LEVELS != 2 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0) {		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt_sfb_hash8[i]);
		} else {		/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
		}

		if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH) {
			return 0;
		}
	}
#endif /* SFB_LEVELS != 2 */
	return 1;
}
static int
sfb_penalize(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t *pkt_sfb_flags,
    struct timespec *now)
{
	struct timespec delta = { .tv_sec = 0, .tv_nsec = 0 };
	uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

	/* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
	if (!sfb_ratelimit || !sfb_pcheck(sp, pkt_sfb_hash)) {
		return 0;
	}

	net_timersub(now, &sp->sfb_pboxfreeze, &delta);
	if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) {
#if SFB_LEVELS != 2
		int i;
#endif /* SFB_LEVELS != 2 */
		struct sfbbinstats *bin;
		int n, w;

		w = sp->sfb_current ^ 1;
		VERIFY((w + (w ^ 1)) == 1);

		/*
		 * Update warm-up bins; optimize for SFB_LEVELS=2
		 */
#if SFB_LEVELS == 2
		/* Level 0: bin index at [0] for set 0; [2] for set 1 */
		n = SFB_BINMASK(pkt_sfb_hash8[(w << 1)]);
		bin = SFB_BINST(sp, 0, n, w);
		if (bin->pkts >= sp->sfb_allocation) {
			sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now);
		}

		/* Level 1: bin index at [1] for set 0; [3] for set 1 */
		n = SFB_BINMASK(pkt_sfb_hash8[(w << 1) + 1]);
		bin = SFB_BINST(sp, 1, n, w);
		if (bin->pkts >= sp->sfb_allocation) {
			sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now);
		}
#else /* SFB_LEVELS != 2 */
		for (i = 0; i < SFB_LEVELS; i++) {
			if (w == 0) {	/* set 0, bin index [0,1] */
				n = SFB_BINMASK(pkt_sfb_hash8[i]);
			} else {	/* set 1, bin index [2,3] */
				n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
			}

			bin = SFB_BINST(sp, i, n, w);
			if (bin->pkts >= sp->sfb_allocation) {
				sfb_increment_bin(sp, bin,
				    SFB_BINFT(sp, i, n, w), now);
			}
		}
#endif /* SFB_LEVELS != 2 */
		return 1;
	}

	/* non-conformant or else misclassified flow; queue it anyway */
	*pkt_sfb_flags |= SFB_PKT_PBOX;
	*(&sp->sfb_pboxfreeze) = *now;

	return 0;
}
static void
sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now, boolean_t inc)
{
	struct timespec delta;

	net_timersub(now, ft, &delta);
	if (net_timercmp(&delta, &sp->sfb_holdtime, <)) {
		if (classq_verbose > 1) {
			log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen "
			    "(delta=%llu nsec)\n", if_name(sp->sfb_ifp),
			    sp->sfb_qid, inc ? "increment" : "decrement",
			    (u_int64_t)delta.tv_nsec);
		}
		return;
	}

	/* increment/decrement marking probability */
	*ft = *now;
	if (inc) {
		SFB_PMARK_INC(bin);
	} else {
		SFB_PMARK_DEC(bin);
	}
}
static void
sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now)
{
	return sfb_adjust_bin(sp, bin, ft, now, FALSE);
}

static void
sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now)
{
	return sfb_adjust_bin(sp, bin, ft, now, TRUE);
}
static inline void
sfb_dq_update_bins(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t pkt_len,
    struct timespec *now, u_int32_t qsize)
{
#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
	int i;
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
	struct sfbbinstats *bin;
	int s, n;
	struct sfb_fcl *fcl = NULL;
	uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/*
	 * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
	 */
#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
	/* Level 0: bin index at [0] for set 0; [2] for set 1 */
	n = SFB_BINMASK(pkt_sfb_hash8[(s << 1)]);
	bin = SFB_BINST(sp, 0, n, s);

	VERIFY(bin->pkts > 0 && bin->bytes >= pkt_len);
	bin->pkts--;
	bin->bytes -= pkt_len;

	if (bin->pkts == 0) {
		sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
	}

	/* Deliver flow control feedback to the sockets */
	if (SFB_QUEUE_DELAYBASED(sp)) {
		if (!(SFB_IS_DELAYHIGH(sp)) ||
		    bin->bytes <= sp->sfb_fc_threshold ||
		    bin->pkts == 0 || qsize == 0) {
			fcl = SFB_FC_LIST(sp, n);
		}
	} else if (bin->pkts <= (sp->sfb_allocation >> 2)) {
		fcl = SFB_FC_LIST(sp, n);
	}

	if (fcl != NULL && !STAILQ_EMPTY(&fcl->fclist)) {
		sfb_fclist_append(sp, fcl);
	}
	fcl = NULL;

	/* Level 1: bin index at [1] for set 0; [3] for set 1 */
	n = SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]);
	bin = SFB_BINST(sp, 1, n, s);

	VERIFY(bin->pkts > 0 && bin->bytes >= (u_int64_t)pkt_len);
	bin->pkts--;
	bin->bytes -= pkt_len;
	if (bin->pkts == 0) {
		sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
	}
#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0) {		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt_sfb_hash8[i]);
		} else {		/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
		}

		bin = SFB_BINST(sp, i, n, s);

		VERIFY(bin->pkts > 0 && bin->bytes >= pkt_len);
		bin->pkts--;
		bin->bytes -= pkt_len;
		if (bin->pkts == 0) {
			sfb_decrement_bin(sp, bin,
			    SFB_BINFT(sp, i, n, s), now);
		}
		if (i != SFB_FC_LEVEL) {
			continue;
		}
		if (SFB_QUEUE_DELAYBASED(sp)) {
			if (!(SFB_IS_DELAYHIGH(sp)) ||
			    bin->bytes <= sp->sfb_fc_threshold) {
				fcl = SFB_FC_LIST(sp, n);
			}
		} else if (bin->pkts <= (sp->sfb_allocation >> 2)) {
			fcl = SFB_FC_LIST(sp, n);
		}
		if (fcl != NULL && !STAILQ_EMPTY(&fcl->fclist)) {
			sfb_fclist_append(sp, fcl);
		}
		fcl = NULL;
	}
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
}
static inline void
sfb_eq_update_bins(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t pkt_len)
{
#if SFB_LEVELS != 2
	int i, n;
#endif /* SFB_LEVELS != 2 */
	int s;
	struct sfbbinstats *bin;
	uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/*
	 * Update current bins; optimize for SFB_LEVELS=2
	 */
#if SFB_LEVELS == 2
	/* Level 0: bin index at [0] for set 0; [2] for set 1 */
	bin = SFB_BINST(sp, 0,
	    SFB_BINMASK(pkt_sfb_hash8[(s << 1)]), s);
	bin->pkts++;
	bin->bytes += pkt_len;

	/* Level 1: bin index at [1] for set 0; [3] for set 1 */
	bin = SFB_BINST(sp, 1,
	    SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]), s);
	bin->pkts++;
	bin->bytes += pkt_len;
#else /* SFB_LEVELS != 2 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0) {		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt_sfb_hash8[i]);
		} else {		/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
		}

		bin = SFB_BINST(sp, i, n, s);
		bin->pkts++;
		bin->bytes += pkt_len;
	}
#endif /* SFB_LEVELS != 2 */
}
static boolean_t
sfb_bin_addfcentry(struct sfb *sp, pktsched_pkt_t *pkt, uint32_t pkt_sfb_hash,
    uint8_t flowsrc, uint32_t flowid)
{
	struct flowadv_fcentry *fce;
	struct sfb_fcl *fcl;
	int s;
	uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	if (flowid == 0) {
		sp->sfb_stats.null_flowid++;
		return FALSE;
	}

	/*
	 * Use value at index 0 for set 0 and
	 * value at index 2 for set 1
	 */
	fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt_sfb_hash8[(s << 1)]));
	STAILQ_FOREACH(fce, &fcl->fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == flowid) {
			/* Already on flow control list; just return */
			return TRUE;
		}
	}

	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
	fce = pktsched_alloc_fcentry(pkt, sp->sfb_ifp, M_WAITOK);
	if (fce != NULL) {
		STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link);
		fcl->cnt++;
		sp->sfb_stats.flow_controlled++;
	}

	return (fce != NULL);
}
/*
 * check if this flow needs to be flow-controlled or if this
 * packet needs to be dropped.
 */
static int
sfb_bin_mark_or_drop(struct sfb *sp, struct sfbbinstats *bin)
{
	int ret = 0;

	if (SFB_QUEUE_DELAYBASED(sp)) {
		/*
		 * Mark or drop if this bin has more
		 * bytes than the flowcontrol threshold.
		 */
		if (SFB_IS_DELAYHIGH(sp) &&
		    bin->bytes >= (sp->sfb_fc_threshold << 1)) {
			ret = 1;
		}
	} else {
		if (bin->pkts >= sp->sfb_allocation &&
		    bin->pkts >= sp->sfb_drop_thresh) {
			ret = 1;	/* drop or mark */
		}
	}

	return ret;
}
/*
 * early-drop probability is kept in pmark of each bin of the flow
 */
static int
sfb_drop_early(struct sfb *sp, uint32_t pkt_sfb_hash, u_int16_t *pmin,
    struct timespec *now)
{
#if SFB_LEVELS != 2
	int i;
#endif /* SFB_LEVELS != 2 */
	struct sfbbinstats *bin;
	int s, n, ret = 0;
	uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	*pmin = (u_int16_t)-1;

	/*
	 * Update current bins; optimize for SFB_LEVELS=2
	 */
#if SFB_LEVELS == 2
	/* Level 0: bin index at [0] for set 0; [2] for set 1 */
	n = SFB_BINMASK(pkt_sfb_hash8[(s << 1)]);
	bin = SFB_BINST(sp, 0, n, s);
	if (*pmin > (u_int16_t)bin->pmark) {
		*pmin = (u_int16_t)bin->pmark;
	}

	/* Update SFB probability */
	if (bin->pkts >= sp->sfb_allocation) {
		sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
	}

	ret = sfb_bin_mark_or_drop(sp, bin);

	/* Level 1: bin index at [1] for set 0; [3] for set 1 */
	n = SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]);
	bin = SFB_BINST(sp, 1, n, s);
	if (*pmin > (u_int16_t)bin->pmark) {
		*pmin = (u_int16_t)bin->pmark;
	}
	if (bin->pkts >= sp->sfb_allocation) {
		sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
	}
#else /* SFB_LEVELS != 2 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0) {		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt_sfb_hash8[i]);
		} else {		/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
		}

		bin = SFB_BINST(sp, i, n, s);
		if (*pmin > (u_int16_t)bin->pmark) {
			*pmin = (u_int16_t)bin->pmark;
		}

		if (bin->pkts >= sp->sfb_allocation) {
			sfb_increment_bin(sp, bin,
			    SFB_BINFT(sp, i, n, s), now);
		}
		if (i == SFB_FC_LEVEL) {
			ret = sfb_bin_mark_or_drop(sp, bin);
		}
	}
#endif /* SFB_LEVELS != 2 */

	if (sp->sfb_flags & SFBF_SUSPENDED) {
		ret = 1;	/* drop or mark */
	}

	return ret;
}
static void
sfb_detect_dequeue_stall(struct sfb *sp, class_queue_t *q,
    struct timespec *now)
{
	struct timespec max_getqtime;

	if (!SFB_QUEUE_DELAYBASED(sp) || SFB_IS_DELAYHIGH(sp) ||
	    qsize(q) <= SFB_MIN_FC_THRESHOLD_BYTES ||
	    !net_timerisset(&sp->sfb_getqtime)) {
		return;
	}

	net_timeradd(&sp->sfb_getqtime, &sp->sfb_update_interval,
	    &max_getqtime);
	if (net_timercmp(now, &max_getqtime, >)) {
		/*
		 * No packets have been dequeued in an update interval
		 * worth of time. It means that the queue is stalled.
		 */
		SFB_SET_DELAY_HIGH(sp, q);
		sp->sfb_stats.dequeue_stall++;
	}
}
#define DTYPE_NODROP	0	/* no drop */
#define DTYPE_FORCED	1	/* a "forced" drop */
#define DTYPE_EARLY	2	/* an "unforced" (early) drop */
int
sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt,
    struct pf_mtag *t)
{
#if !PF_ECN
#pragma unused(t)
#endif /* !PF_ECN */
	struct timespec now;
	int droptype, s;
	uint16_t pmin;
	int fc_adv = 0;
	int ret = CLASSQEQ_SUCCESS;
	uint32_t maxqsize = 0;
	uint64_t *pkt_timestamp;
	uint32_t *pkt_sfb_hash;
	uint16_t *pkt_sfb_hash16;
	uint32_t *pkt_sfb_flags;
	uint32_t pkt_flowid;
	volatile uint32_t *pkt_flags;
	uint8_t pkt_proto, pkt_flowsrc;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
	    &pkt_flowsrc, &pkt_proto, NULL);
	pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);
	pkt_sfb_hash16 = (uint16_t *)pkt_sfb_hash;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		/* See comments in <rdar://problem/14040693> */
		VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
		*pkt_flags |= PKTF_PRIV_GUARDED;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (*pkt_timestamp > 0) {
		net_nsectimer(pkt_timestamp, &now);
	} else {
		nanouptime(&now);
		net_timernsec(&now, pkt_timestamp);
	}

	/* time to swap the bins? */
	if (net_timercmp(&now, &sp->sfb_nextreset, >=)) {
		net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
		sfb_swap_bins(sp, qlen(q));
		s = sp->sfb_current;
		VERIFY((s + (s ^ 1)) == 1);
	}

	if (!net_timerisset(&sp->sfb_update_time)) {
		net_timeradd(&now, &sp->sfb_update_interval,
		    &sp->sfb_update_time);
	}

	/*
	 * If getq time is not set because this is the first packet
	 * or after idle time, set it now so that we can detect a stall.
	 */
	if (qsize(q) == 0 && !net_timerisset(&sp->sfb_getqtime)) {
		*(&sp->sfb_getqtime) = *(&now);
	}

	*pkt_timestamp = 0;
	pkt_sfb_hash16[s] =
	    (SFB_HASH(&pkt_flowid, sizeof(pkt_flowid),
	    (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
	pkt_sfb_hash16[s ^ 1] =
	    (SFB_HASH(&pkt_flowid, sizeof(pkt_flowid),
	    (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK);

	/* check if the queue has been stalled */
	sfb_detect_dequeue_stall(sp, q, &now);

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (sfb_drop_early(sp, *pkt_sfb_hash, &pmin, &now)) {
		/* flow control, mark or drop by sfb */
		if ((sp->sfb_flags & SFBF_FLOWCTL) &&
		    (*pkt_flags & PKTF_FLOW_ADV)) {
			fc_adv = 1;
			/* drop all during suspension or for non-TCP */
			if ((sp->sfb_flags & SFBF_SUSPENDED) ||
			    pkt_proto != IPPROTO_TCP) {
				droptype = DTYPE_EARLY;
				sp->sfb_stats.drop_early++;
			}
		}
#if PF_ECN
		/* XXX: only supported for mbuf */
		else if ((sp->sfb_flags & SFBF_ECN) &&
		    (pkt->pktsched_ptype == QP_MBUF) &&
		    (pkt_proto == IPPROTO_TCP) && /* only for TCP */
		    ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) &&
		    mark_ecn(m, t, sp->sfb_flags) &&
		    !(sp->sfb_flags & SFBF_SUSPENDED)) {
			/* successfully marked; do not drop. */
			sp->sfb_stats.marked_packets++;
		}
#endif /* PF_ECN */
		else {
			/* unforced drop by sfb */
			droptype = DTYPE_EARLY;
			sp->sfb_stats.drop_early++;
		}
	}

	/* non-responsive flow penalty? */
	if (droptype == DTYPE_NODROP && sfb_penalize(sp, *pkt_sfb_hash,
	    pkt_sfb_flags, &now)) {
		droptype = DTYPE_FORCED;
		sp->sfb_stats.drop_pbox++;
	}

	if (SFB_QUEUE_DELAYBASED(sp)) {
		maxqsize = SFB_QUEUE_DELAYBASED_MAXSIZE;
	} else {
		maxqsize = qlimit(q);
	}

	/*
	 * When the queue length hits the queue limit, make it a forced
	 * drop
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= maxqsize) {
		if (pkt_proto == IPPROTO_TCP &&
		    qlen(q) < (maxqsize + (maxqsize >> 1)) &&
		    ((*pkt_flags & PKTF_TCP_REXMT) ||
		    (sp->sfb_flags & SFBF_LAST_PKT_DROPPED))) {
			/*
			 * At some level, dropping packets will make the
			 * flows backoff and will keep memory requirements
			 * under control. But we should not cause a tail
			 * drop because it can take a long time for a
			 * TCP flow to recover. We should try to drop
			 * alternate packets instead.
			 */
			sp->sfb_flags &= ~SFBF_LAST_PKT_DROPPED;
		} else {
			droptype = DTYPE_FORCED;
			sp->sfb_stats.drop_queue++;
			sp->sfb_flags |= SFBF_LAST_PKT_DROPPED;
		}
	}

	if (fc_adv == 1 && droptype != DTYPE_FORCED &&
	    sfb_bin_addfcentry(sp, pkt, *pkt_sfb_hash, pkt_flowsrc,
	    pkt_flowid)) {
		/* deliver flow control advisory error */
		if (droptype == DTYPE_NODROP) {
			ret = CLASSQEQ_SUCCESS_FC;
			VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED));
		} else if (sp->sfb_flags & SFBF_SUSPENDED) {
			/* drop due to suspension */
			ret = CLASSQEQ_DROP_SP;
		} else {
			/* drop due to flow-control */
			ret = CLASSQEQ_DROP_FC;
		}
	}

	/* if successful enqueue this packet, else drop it */
	if (droptype == DTYPE_NODROP) {
		VERIFY(pkt->pktsched_ptype == qptype(q));
		_addq(q, &pkt->pktsched_pkt);
	} else {
		IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
		return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
	}

	if (!(*pkt_sfb_flags & SFB_PKT_PBOX)) {
		sfb_eq_update_bins(sp, *pkt_sfb_hash,
		    pktsched_get_pkt_len(pkt));
	} else {
		sp->sfb_stats.pbox_packets++;
	}

	/* successfully queued */
	return ret;
}
static void *
sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
    pktsched_pkt_t *pkt)
{
	struct timespec now;
	uint64_t *pkt_timestamp;
	volatile uint32_t *pkt_flags;
	uint32_t *pkt_sfb_flags;
	uint32_t *pkt_sfb_hash;
	classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);

	if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) {
		return NULL;
	}

	nanouptime(&now);

	/* flow of 0 means head of queue */
	if (flow == 0) {
		_getq(q, &p);
	} else {
		_getq_flow(q, &p, flow);
	}

	if (p.cp_ptype == QP_INVALID) {
		if (!purge) {
			net_timerclear(&sp->sfb_getqtime);
		}
		return NULL;
	}

	pktsched_pkt_encap(pkt, &p);
	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL,
	    NULL, NULL, NULL);
	pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);

	/* See comments in <rdar://problem/14040693> */
	switch (p.cp_ptype) {
	case QP_MBUF:
		VERIFY(*pkt_flags & PKTF_PRIV_GUARDED);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (!purge) {
		/* calculate EWMA of dequeues */
		if (net_timerisset(&sp->sfb_getqtime)) {
			struct timespec delta;
			u_int64_t avg, new;

			net_timersub(&now, &sp->sfb_getqtime, &delta);
			net_timernsec(&delta, &new);
			avg = sp->sfb_stats.dequeue_avg;
			if (avg > 0) {
				int decay = DEQUEUE_DECAY;
				/*
				 * If the time since last dequeue is
				 * significantly greater than the current
				 * average, weigh the average more against
				 * the old value.
				 */
				if (DEQUEUE_SPIKE(new, avg)) {
					decay += 5;
				}
				avg = (((avg << decay) - avg) + new) >> decay;
			} else {
				avg = new;
			}
			sp->sfb_stats.dequeue_avg = avg;
		}
		*(&sp->sfb_getqtime) = *(&now);
	}

	if (!purge && SFB_QUEUE_DELAYBASED(sp)) {
		u_int64_t dequeue_ns, queue_delay = 0;
		net_timernsec(&now, &dequeue_ns);
		if (dequeue_ns > *pkt_timestamp) {
			queue_delay = dequeue_ns - *pkt_timestamp;
		}

		if (sp->sfb_min_qdelay == 0 ||
		    (queue_delay > 0 && queue_delay < sp->sfb_min_qdelay)) {
			sp->sfb_min_qdelay = queue_delay;
		}
		if (net_timercmp(&now, &sp->sfb_update_time, >=)) {
			if (sp->sfb_min_qdelay > sp->sfb_target_qdelay) {
				if (!SFB_IS_DELAYHIGH(sp)) {
					SFB_SET_DELAY_HIGH(sp, q);
				}
			} else {
				sp->sfb_flags &= ~(SFBF_DELAYHIGH);
				sp->sfb_fc_threshold = 0;
			}
			net_timeradd(&now, &sp->sfb_update_interval,
			    &sp->sfb_update_time);
			sp->sfb_min_qdelay = 0;
		}
	}
	*pkt_timestamp = 0;

	/*
	 * Clearpkts are the ones which were in the queue when the hash
	 * function was perturbed. Since the perturbation value (fudge),
	 * and thus bin information for these packets is not known, we do
	 * not change accounting information while dequeuing these packets.
	 * It is important not to set the hash interval too small due to
	 * this reason. A rule of thumb is to set it to K*D, where D is
	 * the time taken to drain queue.
	 */
	if (*pkt_sfb_flags & SFB_PKT_PBOX) {
		*pkt_sfb_flags &= ~SFB_PKT_PBOX;
		if (sp->sfb_clearpkts > 0) {
			sp->sfb_clearpkts--;
		}
	} else if (sp->sfb_clearpkts > 0) {
		sp->sfb_clearpkts--;
	} else {
		sfb_dq_update_bins(sp, *pkt_sfb_hash, pktsched_get_pkt_len(pkt),
		    &now, qsize(q));
	}

	switch (p.cp_ptype) {
	case QP_MBUF:
		/* See comments in <rdar://problem/14040693> */
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * If the queue becomes empty before the update interval, reset
	 * the flow control threshold
	 */
	if (qsize(q) == 0) {
		sp->sfb_flags &= ~SFBF_DELAYHIGH;
		sp->sfb_min_qdelay = 0;
		sp->sfb_fc_threshold = 0;
		net_timerclear(&sp->sfb_update_time);
		net_timerclear(&sp->sfb_getqtime);
	}
	return pkt->pktsched_pkt_mbuf;
}
void
sfb_getq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt)
{
	sfb_getq_flow(sp, q, 0, FALSE, pkt);
}
void
sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
    u_int32_t *bytes)
{
	u_int32_t cnt = 0, len = 0;
	pktsched_pkt_t pkt;

	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
	while (sfb_getq_flow(sp, q, flow, TRUE, &pkt) != NULL) {
		cnt++;
		len += pktsched_get_pkt_len(&pkt);
		pktsched_free_pkt(&pkt);
	}

	if (packets != NULL) {
		*packets = cnt;
	}
	if (bytes != NULL) {
		*bytes = len;
	}
}
void
sfb_updateq(struct sfb *sp, cqev_t ev)
{
	struct ifnet *ifp = sp->sfb_ifp;

	VERIFY(ifp != NULL);

	switch (ev) {
	case CLASSQ_EV_LINK_BANDWIDTH: {
		u_int64_t eff_rate = ifnet_output_linkrate(ifp);

		/* update parameters only if rate has changed */
		if (eff_rate == sp->sfb_eff_rate) {
			break;
		}

		if (classq_verbose) {
			log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new "
			    "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid,
			    eff_rate);
		}
		sfb_calc_holdtime(sp, eff_rate);
		sfb_calc_pboxtime(sp, eff_rate);
		ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay);
		sfb_calc_update_interval(sp, eff_rate);
		sp->sfb_eff_rate = eff_rate;
		break;
	}

	case CLASSQ_EV_LINK_UP:
	case CLASSQ_EV_LINK_DOWN:
		if (classq_verbose) {
			log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to "
			    "link %s\n", if_name(ifp), sp->sfb_qid,
			    (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN");
		}
		sfb_resetq(sp, ev);
		break;

	case CLASSQ_EV_LINK_LATENCY:
	case CLASSQ_EV_LINK_MTU:
	default:
		break;
	}
}
int
sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on)
{
	struct ifnet *ifp = sp->sfb_ifp;

	VERIFY(ifp != NULL);

	if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) ||
	    (!on && !(sp->sfb_flags & SFBF_SUSPENDED))) {
		return 0;
	}

	if (!(sp->sfb_flags & SFBF_FLOWCTL)) {
		log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since "
		    "flow-control is not enabled", if_name(ifp), sp->sfb_qid,
		    (on ? "suspend" : "resume"));
		return ENOTSUP;
	}

	if (classq_verbose) {
		log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s",
		    if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING"));
	}

	if (on) {
		sp->sfb_flags |= SFBF_SUSPENDED;
	} else {
		sp->sfb_flags &= ~SFBF_SUSPENDED;
		sfb_swap_bins(sp, qlen(q));
	}

	return 0;
}