/* bsd/net/classq/classq_sfb.c (apple/xnu, xnu-7195.101.1) */
/*
 * Copyright (c) 2011-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <kern/zalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/dlil.h>
#include <net/flowadv.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#include <net/classq/classq_sfb.h>
#include <net/flowhash.h>
#include <net/net_osdep.h>
#include <dev/random/randomdev.h>

/*
 * Stochastic Fair Blue
 *
 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
 *
 * Based on the NS code with the following parameters:
 *
 *   bytes: false
 *   decrement: 0.001
 *   increment: 0.005
 *   hold-time: 10ms-50ms (randomized)
 *   algorithm: 0
 *   pbox: 1
 *   pbox-time: 50-100ms (randomized)
 *   hinterval: 11-23 (randomized)
 *
 * This implementation uses L = 2 and N = 32 for 2 sets of:
 *
 *   B[L][N]: L x N array of bins (L levels, N bins per level)
 *
 * Each set effectively creates 32^2 virtual buckets (bin combinations)
 * while using only O(32*2) states.
 *
 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
 * used as index for the bins across the 2 levels, where level 1 uses [0,2]
 * and level 2 uses [1,3]. The 2 values per level correspond to the indices
 * for the current and warm-up sets (section 4.4 in the SFB paper regarding
 * Moving Hash Functions explains the purposes of these 2 sets.)
 */
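
/*
 * Illustrative example (assuming SFB_BINS_SHIFT is 5, i.e. N = 32 bins,
 * and little-endian byte order): if the 16-bit hash stored for a set is
 * 0x1234, its low octet 0x34 selects bin (0x34 & 0x1f) = 20 at level 0,
 * and its high octet 0x12 selects bin (0x12 & 0x1f) = 18 at level 1.
 */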

/*
 * Use Murmur3A_x86_32 as the hash function. It seems to perform
 * consistently across platforms for a 1-word key (32-bit flowhash value).
 * See flowhash.h for other alternatives. We only need 16-bit hash output.
 */
#define SFB_HASH net_flowhash_mh3_x86_32
#define SFB_HASHMASK HASHMASK(16)

#define SFB_BINMASK(_x) \
    ((_x) & HASHMASK(SFB_BINS_SHIFT))

#define SFB_BINST(_sp, _l, _n, _c) \
    (&(*(_sp)->sfb_bins)[_c].stats[_l][_n])

#define SFB_BINFT(_sp, _l, _n, _c) \
    (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])

#define SFB_FC_LIST(_sp, _n) \
    (&(*(_sp)->sfb_fc_lists)[_n])

/*
 * The holdtime parameter determines the minimum time interval between
 * two successive updates of the marking probability. In the event the
 * uplink speed is not known, a default value is chosen and is randomized
 * to be within the following range.
 */
#define HOLDTIME_BASE (100ULL * 1000 * 1000) /* 100ms */
#define HOLDTIME_MIN (10ULL * 1000 * 1000)   /* 10ms */
#define HOLDTIME_MAX (100ULL * 1000 * 1000)  /* 100ms */

/*
 * The pboxtime parameter determines the bandwidth allocated for rogue
 * flows, i.e. the rate limiting bandwidth. In the event the uplink speed
 * is not known, a default value is chosen and is randomized to be within
 * the following range.
 */
#define PBOXTIME_BASE (300ULL * 1000 * 1000) /* 300ms */
#define PBOXTIME_MIN (30ULL * 1000 * 1000)   /* 30ms */
#define PBOXTIME_MAX (300ULL * 1000 * 1000)  /* 300ms */

/*
 * The target queueing delay is the amount of extra delay that can be
 * added to accommodate variations in the link bandwidth. The queue
 * should be large enough to induce this much delay, but no more.
 */
#define TARGET_QDELAY_BASE (10ULL * 1000 * 1000)        /* 10ms */
#define TARGET_QDELAY_MIN (10ULL * 1000)                /* 10us */
#define TARGET_QDELAY_MAX (20ULL * 1000 * 1000 * 1000)  /* 20s */

/*
 * Update interval for checking the extra delay added by the queue. This
 * should be the 90-95th percentile of the RTT experienced by any TCP
 * connection, so that it accommodates bursty traffic.
 */
#define UPDATE_INTERVAL_BASE (100ULL * 1000 * 1000)       /* 100ms */
#define UPDATE_INTERVAL_MIN (100ULL * 1000 * 1000)        /* 100ms */
#define UPDATE_INTERVAL_MAX (10ULL * 1000 * 1000 * 1000)  /* 10s */

#define SFB_RANDOM(sp, tmin, tmax) ((sfb_random(sp) % (tmax)) + (tmin))
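
/*
 * Note: (x % tmax) + tmin yields a value in [tmin, tmin + tmax - 1],
 * which is slightly wider than [tmin, tmax]; the same holds for
 * SFB_HINTERVAL() below. Presumably acceptable for randomized timers.
 */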

#define SFB_PKT_PBOX 0x1 /* in penalty box */

/* The following mantissa values are in SFB_FP_SHIFT Q format */
#define SFB_MAX_PMARK (1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */

/*
 * These are the d1 (increment) and d2 (decrement) parameters, used to
 * determine the amount by which the marking probability is incremented
 * when the queue overflows, or decremented when the link is idle. d1 is
 * set higher than d2, because link underutilization can occur when
 * congestion management is either too conservative or too aggressive,
 * whereas packet loss occurs only when congestion management is too
 * conservative. By weighing heavily against packet loss, the algorithm
 * can quickly react to a substantial increase in traffic load.
 */
#define SFB_INCREMENT 82 /* Q14 representation of 0.005 */
#define SFB_DECREMENT 16 /* Q14 representation of 0.001 */

#define SFB_PMARK_TH 16056 /* Q14 representation of 0.98 */
#define SFB_PMARK_WARM 3276 /* Q14 representation of 0.2 */
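
/*
 * Q14 arithmetic, for reference: a probability p is stored as p * 2^14,
 * so 0.005 * 16384 ~= 82 (SFB_INCREMENT), 0.001 * 16384 ~= 16
 * (SFB_DECREMENT), 0.98 * 16384 ~= 16056 (SFB_PMARK_TH) and
 * 0.2 * 16384 ~= 3276 (SFB_PMARK_WARM).
 */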

#define SFB_PMARK_INC(_bin) do {                        \
        (_bin)->pmark += sfb_increment;                 \
        if ((_bin)->pmark > SFB_MAX_PMARK)              \
                (_bin)->pmark = SFB_MAX_PMARK;          \
} while (0)

#define SFB_PMARK_DEC(_bin) do {                        \
        if ((_bin)->pmark > 0) {                        \
                (_bin)->pmark -= sfb_decrement;         \
                if ((_bin)->pmark < 0)                  \
                        (_bin)->pmark = 0;              \
        }                                               \
} while (0)

/* Minimum number of bytes in queue to get flow controlled */
#define SFB_MIN_FC_THRESHOLD_BYTES 7500

#define SFB_SET_DELAY_HIGH(_sp_, _q_) do {                              \
        (_sp_)->sfb_flags |= SFBF_DELAYHIGH;                            \
        (_sp_)->sfb_fc_threshold = ulmax(SFB_MIN_FC_THRESHOLD_BYTES,    \
            (uint32_t)(qsize((_q_)) >> 3));                             \
} while (0)

#define SFB_QUEUE_DELAYBASED(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYBASED)
#define SFB_IS_DELAYHIGH(_sp_) ((_sp_)->sfb_flags & SFBF_DELAYHIGH)
#define SFB_QUEUE_DELAYBASED_MAXSIZE 2048 /* max pkts */

#define HINTERVAL_MIN (10) /* 10 seconds */
#define HINTERVAL_MAX (20) /* 20 seconds */
#define SFB_HINTERVAL(sp) ((sfb_random(sp) % HINTERVAL_MAX) + HINTERVAL_MIN)

#define DEQUEUE_DECAY 7 /* ilog2 of EWMA decay rate, (128) */
#define DEQUEUE_SPIKE(_new, _old) \
    ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
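
/*
 * For reference, the dequeue EWMA in sfb_getq_flow() computes
 * avg' = ((avg << decay) - avg + new) >> decay, i.e. each new
 * inter-dequeue sample is folded in with weight 1/2^decay (1/128 here).
 * DEQUEUE_SPIKE() flags a sample more than 2048x (1 << 11) the current
 * average, in which case the decay is raised to 12 so that the spike
 * perturbs the average less.
 */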

/* Place the flow control entries in current bin on level 0 */
#define SFB_FC_LEVEL 0

static ZONE_DECLARE(sfb_zone, "classq_sfb",
    sizeof(struct sfb), ZC_ZFREE_CLEARMEM);

static ZONE_DECLARE(sfb_bins_zone, "classq_sfb_bins",
    sizeof(struct sfb_bins), ZC_ZFREE_CLEARMEM);

static ZONE_DECLARE(sfb_fcl_zone, "classq_sfb_fcl",
    sizeof(struct sfb_fcl), ZC_ZFREE_CLEARMEM);

/* internal function prototypes */
static u_int32_t sfb_random(struct sfb *);
static void *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t, boolean_t,
    pktsched_pkt_t *);
static void sfb_resetq(struct sfb *, cqev_t);
static void sfb_calc_holdtime(struct sfb *, u_int64_t);
static void sfb_calc_pboxtime(struct sfb *, u_int64_t);
static void sfb_calc_hinterval(struct sfb *, u_int64_t *);
static void sfb_calc_update_interval(struct sfb *, u_int64_t);
static void sfb_swap_bins(struct sfb *, u_int32_t);
static inline int sfb_pcheck(struct sfb *, uint32_t);
static int sfb_penalize(struct sfb *, uint32_t, uint32_t *, struct timespec *);
static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *, boolean_t);
static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *);
static void sfb_increment_bin(struct sfb *, struct sfbbinstats *,
    struct timespec *, struct timespec *);
static inline void sfb_dq_update_bins(struct sfb *, uint32_t, uint32_t,
    struct timespec *, u_int64_t qsize);
static inline void sfb_eq_update_bins(struct sfb *, uint32_t, uint32_t);
static int sfb_drop_early(struct sfb *, uint32_t, u_int16_t *,
    struct timespec *);
static boolean_t sfb_bin_addfcentry(struct sfb *, pktsched_pkt_t *,
    uint32_t, uint8_t, uint32_t);
static void sfb_fclist_append(struct sfb *, struct sfb_fcl *);
static void sfb_fclists_clean(struct sfb *sp);
static int sfb_bin_mark_or_drop(struct sfb *sp, struct sfbbinstats *bin);
static void sfb_detect_dequeue_stall(struct sfb *sp, class_queue_t *,
    struct timespec *);

SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "SFB");

static u_int64_t sfb_holdtime = 0; /* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_holdtime, "SFB freeze time in nanoseconds");

static u_int64_t sfb_pboxtime = 0; /* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_pboxtime, "SFB penalty box time in nanoseconds");

static u_int64_t sfb_hinterval;
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_hinterval, "SFB hash interval in nanoseconds");

static u_int32_t sfb_increment = SFB_INCREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_increment, SFB_INCREMENT, "SFB increment [d1]");

static u_int32_t sfb_decrement = SFB_DECREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]");

static u_int32_t sfb_allocation = 0; /* 0 means "automatic" */
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_allocation, 0, "SFB bin allocation");

static u_int32_t sfb_ratelimit = 0;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sfb_ratelimit, 0, "SFB rate limit");

#define KBPS (1ULL * 1000)        /* 1 Kbits per second */
#define MBPS (1ULL * 1000 * 1000) /* 1 Mbits per second */
#define GBPS (MBPS * 1000)        /* 1 Gbits per second */

struct sfb_time_tbl {
        u_int64_t speed;    /* uplink speed */
        u_int64_t holdtime; /* hold time */
        u_int64_t pboxtime; /* penalty box time */
};

static struct sfb_time_tbl sfb_ttbl[] = {
        { .speed = 1 * MBPS,   .holdtime = HOLDTIME_BASE * 1000, .pboxtime = PBOXTIME_BASE * 1000 },
        { .speed = 10 * MBPS,  .holdtime = HOLDTIME_BASE * 100,  .pboxtime = PBOXTIME_BASE * 100 },
        { .speed = 100 * MBPS, .holdtime = HOLDTIME_BASE * 10,   .pboxtime = PBOXTIME_BASE * 10 },
        { .speed = 1 * GBPS,   .holdtime = HOLDTIME_BASE,        .pboxtime = PBOXTIME_BASE },
        { .speed = 10 * GBPS,  .holdtime = HOLDTIME_BASE / 10,   .pboxtime = PBOXTIME_BASE / 10 },
        { .speed = 100 * GBPS, .holdtime = HOLDTIME_BASE / 100,  .pboxtime = PBOXTIME_BASE / 100 },
        { .speed = 0,          .holdtime = 0,                    .pboxtime = 0 }
};
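
/*
 * sfb_calc_holdtime() and sfb_calc_pboxtime() walk this table and pick
 * the entry with the largest speed not exceeding the link rate; e.g. a
 * 300 Mbps link falls in the 100 Mbps row, giving a holdtime of
 * HOLDTIME_BASE * 10 (1 sec). Both times scale inversely with speed.
 */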

static_assert(SFBF_ECN4 == CLASSQF_ECN4);
static_assert(SFBF_ECN6 == CLASSQF_ECN6);

static u_int32_t
sfb_random(struct sfb *sp)
{
        IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
        return RandomULong();
}

static void
sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw)
{
        u_int64_t holdtime;

        if (sfb_holdtime != 0) {
                holdtime = sfb_holdtime;
        } else if (outbw == 0) {
                holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
        } else {
                uint64_t n, i;

                n = sfb_ttbl[0].holdtime;
                for (i = 0; sfb_ttbl[i].speed != 0; i++) {
                        if (outbw < sfb_ttbl[i].speed) {
                                break;
                        }
                        n = sfb_ttbl[i].holdtime;
                }
                holdtime = n;
        }
        net_nsectimer(&holdtime, &sp->sfb_holdtime);
}

static void
sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw)
{
        u_int64_t pboxtime;

        if (sfb_pboxtime != 0) {
                pboxtime = sfb_pboxtime;
        } else if (outbw == 0) {
                pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX);
        } else {
                uint64_t n, i;

                n = sfb_ttbl[0].pboxtime;
                for (i = 0; sfb_ttbl[i].speed != 0; i++) {
                        if (outbw < sfb_ttbl[i].speed) {
                                break;
                        }
                        n = sfb_ttbl[i].pboxtime;
                }
                pboxtime = n;
        }
        net_nsectimer(&pboxtime, &sp->sfb_pboxtime);
        net_timerclear(&sp->sfb_pboxfreeze);
}

static void
sfb_calc_hinterval(struct sfb *sp, u_int64_t *t)
{
        u_int64_t hinterval = 0;
        struct timespec now;

        if (t != NULL) {
                /*
                 * TODO adi@apple.com: use dq_avg to derive hinterval.
                 */
                hinterval = *t;
        }

        if (sfb_hinterval != 0) {
                hinterval = sfb_hinterval;
        } else if (t == NULL || hinterval == 0) {
                hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC);
        }

        net_nsectimer(&hinterval, &sp->sfb_hinterval);

        nanouptime(&now);
        net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
}

static void
sfb_calc_update_interval(struct sfb *sp, u_int64_t out_bw)
{
#pragma unused(out_bw)
        u_int64_t update_interval = 0;
        ifclassq_calc_update_interval(&update_interval);
        net_nsectimer(&update_interval, &sp->sfb_update_interval);
}

/*
 * sfb support routines
 */
struct sfb *
sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags)
{
        struct sfb *sp;
        int i;

        VERIFY(ifp != NULL && qlim > 0);

        sp = zalloc_flags(sfb_zone, Z_WAITOK | Z_ZERO);
        sp->sfb_bins = zalloc_flags(sfb_bins_zone, Z_WAITOK | Z_ZERO);
        sp->sfb_fc_lists = zalloc_flags(sfb_fcl_zone, Z_WAITOK | Z_ZERO);

        for (i = 0; i < SFB_BINS; ++i) {
                STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist);
        }

        sp->sfb_ifp = ifp;
        sp->sfb_qlim = qlim;
        sp->sfb_qid = qid;
        sp->sfb_flags = (flags & SFBF_USERFLAGS);
#if !PF_ECN
        if (sp->sfb_flags & SFBF_ECN) {
                sp->sfb_flags &= ~SFBF_ECN;
                log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring "
                    "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid);
        }
#endif /* !PF_ECN */

        sfb_resetq(sp, CLASSQ_EV_INIT);

        return sp;
}

static void
sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl)
{
        IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
        VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0);
        sp->sfb_stats.flow_feedback += fcl->cnt;
        fcl->cnt = 0;

        flowadv_add(&fcl->fclist);
        VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist));
}

static void
sfb_fclists_clean(struct sfb *sp)
{
        int i;

        /* Move all the flow control entries to the flowadv list */
        for (i = 0; i < SFB_BINS; ++i) {
                struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
                if (!STAILQ_EMPTY(&fcl->fclist)) {
                        sfb_fclist_append(sp, fcl);
                }
        }
}

void
sfb_destroy(struct sfb *sp)
{
        sfb_fclists_clean(sp);
        if (sp->sfb_bins != NULL) {
                zfree(sfb_bins_zone, sp->sfb_bins);
                sp->sfb_bins = NULL;
        }
        if (sp->sfb_fc_lists != NULL) {
                zfree(sfb_fcl_zone, sp->sfb_fc_lists);
                sp->sfb_fc_lists = NULL;
        }
        zfree(sfb_zone, sp);
}

static void
sfb_resetq(struct sfb *sp, cqev_t ev)
{
        struct ifnet *ifp = sp->sfb_ifp;
        u_int64_t eff_rate;

        VERIFY(ifp != NULL);

        if (ev != CLASSQ_EV_LINK_DOWN) {
                (*sp->sfb_bins)[0].fudge = sfb_random(sp);
                (*sp->sfb_bins)[1].fudge = sfb_random(sp);
                sp->sfb_allocation = sfb_allocation == 0 ?
                    (uint16_t)(sp->sfb_qlim / 3) :
                    (uint16_t)sfb_allocation;
                sp->sfb_drop_thresh = sp->sfb_allocation +
                    (sp->sfb_allocation >> 1);
        }

        sp->sfb_clearpkts = 0;
        sp->sfb_current = 0;

        eff_rate = ifnet_output_linkrate(ifp);
        sp->sfb_eff_rate = eff_rate;

        sfb_calc_holdtime(sp, eff_rate);
        sfb_calc_pboxtime(sp, eff_rate);
        sfb_calc_hinterval(sp, NULL);
        ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay);
        sfb_calc_update_interval(sp, eff_rate);

        if (ev == CLASSQ_EV_LINK_DOWN ||
            ev == CLASSQ_EV_LINK_UP) {
                sfb_fclists_clean(sp);
        }

        bzero(sp->sfb_bins, sizeof(*sp->sfb_bins));
        bzero(&sp->sfb_stats, sizeof(sp->sfb_stats));

        if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose) {
                return;
        }

        log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, "
            "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
            "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps, "
            "target_qdelay=%llu nsec, "
            "update_interval=%llu sec %llu nsec, flags=0x%x\n",
            if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec,
            (u_int64_t)sp->sfb_pboxtime.tv_nsec,
            (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh,
            (int)sp->sfb_hinterval.tv_sec, (int)sizeof(*sp->sfb_bins),
            eff_rate, (u_int64_t)sp->sfb_target_qdelay,
            (u_int64_t)sp->sfb_update_interval.tv_sec,
            (u_int64_t)sp->sfb_update_interval.tv_nsec, sp->sfb_flags);
}

void
sfb_getstats(struct sfb *sp, struct sfb_stats *sps)
{
        sps->allocation = sp->sfb_allocation;
        sps->dropthresh = sp->sfb_drop_thresh;
        sps->clearpkts = sp->sfb_clearpkts;
        sps->current = sp->sfb_current;
        sps->target_qdelay = sp->sfb_target_qdelay;
        sps->min_estdelay = sp->sfb_min_qdelay;
        sps->delay_fcthreshold = (uint32_t)sp->sfb_fc_threshold;
        sps->flags = sp->sfb_flags;

        net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time);
        net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time);
        net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval);
        net_timernsec(&sp->sfb_update_interval, &sps->update_interval);
        *(&(sps->sfbstats)) = *(&(sp->sfb_stats)); /* struct copy */

        _CASSERT(sizeof((*sp->sfb_bins)[0].stats) ==
            sizeof(sps->binstats[0].stats));

        bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats,
            sizeof(sps->binstats[0].stats));
        bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats,
            sizeof(sps->binstats[1].stats));
}

static void
sfb_swap_bins(struct sfb *sp, u_int32_t len)
{
        int i, j, s;

        if (sp->sfb_flags & SFBF_SUSPENDED) {
                return;
        }

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1); /* s must be either 0 or 1 */

        (*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */
        sp->sfb_clearpkts = len;
        sp->sfb_stats.num_rehash++;

        s = (sp->sfb_current ^= 1); /* flip the bit (swap current) */

        if (classq_verbose) {
                log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, "
                    "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len);
        }

        /* clear freezetime for all current bins */
        bzero(&(*sp->sfb_bins)[s].freezetime,
            sizeof((*sp->sfb_bins)[s].freezetime));

        /* clear/adjust bin statistics and flow control lists */
        for (i = 0; i < SFB_BINS; i++) {
                struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);

                if (!STAILQ_EMPTY(&fcl->fclist)) {
                        sfb_fclist_append(sp, fcl);
                }

                for (j = 0; j < SFB_LEVELS; j++) {
                        struct sfbbinstats *cbin, *wbin;

                        cbin = SFB_BINST(sp, j, i, s);     /* current */
                        wbin = SFB_BINST(sp, j, i, s ^ 1); /* warm-up */

                        cbin->pkts = 0;
                        cbin->bytes = 0;
                        if (cbin->pmark > SFB_MAX_PMARK) {
                                cbin->pmark = SFB_MAX_PMARK;
                        }
                        if (cbin->pmark < 0) {
                                cbin->pmark = 0;
                        }

                        /*
                         * Keep pmark from before to identify
                         * non-responsive flows immediately.
                         */
                        if (wbin->pmark > SFB_PMARK_WARM) {
                                wbin->pmark = SFB_PMARK_WARM;
                        }
                }
        }
}

static inline int
sfb_pcheck(struct sfb *sp, uint32_t pkt_sfb_hash)
{
#if SFB_LEVELS != 2
        int i, n;
#endif /* SFB_LEVELS != 2 */
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;
        int s;

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);

        /*
         * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
         * 0 otherwise; optimize for SFB_LEVELS=2.
         */
#if SFB_LEVELS == 2
        /*
         * Level 0: bin index at [0] for set 0; [2] for set 1
         * Level 1: bin index at [1] for set 0; [3] for set 1
         */
        if (SFB_BINST(sp, 0, SFB_BINMASK(pkt_sfb_hash8[(s << 1)]),
            s)->pmark < SFB_PMARK_TH ||
            SFB_BINST(sp, 1, SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]),
            s)->pmark < SFB_PMARK_TH) {
                return 0;
        }
#else /* SFB_LEVELS != 2 */
        for (i = 0; i < SFB_LEVELS; i++) {
                if (s == 0) { /* set 0, bin index [0,1] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i]);
                } else { /* set 1, bin index [2,3] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
                }

                if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH) {
                        return 0;
                }
        }
#endif /* SFB_LEVELS != 2 */
        return 1;
}
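
/*
 * Note that a flow is treated as non-responsive only when *every* one of
 * its current bins is saturated (pmark >= SFB_PMARK_TH); a well-behaved
 * flow that shares one bin with a heavy hitter will, with high
 * probability, map to a lightly marked bin at the other level and thus
 * escape the penalty box.
 */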

static int
sfb_penalize(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t *pkt_sfb_flags,
    struct timespec *now)
{
        struct timespec delta = { .tv_sec = 0, .tv_nsec = 0 };
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

        /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
        if (!sfb_ratelimit || !sfb_pcheck(sp, pkt_sfb_hash)) {
                return 0;
        }

        net_timersub(now, &sp->sfb_pboxfreeze, &delta);
        if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) {
#if SFB_LEVELS != 2
                int i;
#endif /* SFB_LEVELS != 2 */
                struct sfbbinstats *bin;
                int n, w;

                w = sp->sfb_current ^ 1;
                VERIFY((w + (w ^ 1)) == 1);

                /*
                 * Update warm-up bins; optimize for SFB_LEVELS=2
                 */
#if SFB_LEVELS == 2
                /* Level 0: bin index at [0] for set 0; [2] for set 1 */
                n = SFB_BINMASK(pkt_sfb_hash8[(w << 1)]);
                bin = SFB_BINST(sp, 0, n, w);
                if (bin->pkts >= sp->sfb_allocation) {
                        sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now);
                }

                /* Level 1: bin index at [1] for set 0; [3] for set 1 */
                n = SFB_BINMASK(pkt_sfb_hash8[(w << 1) + 1]);
                bin = SFB_BINST(sp, 1, n, w);
                if (bin->pkts >= sp->sfb_allocation) {
                        sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now);
                }
#else /* SFB_LEVELS != 2 */
                for (i = 0; i < SFB_LEVELS; i++) {
                        if (w == 0) { /* set 0, bin index [0,1] */
                                n = SFB_BINMASK(pkt_sfb_hash8[i]);
                        } else { /* set 1, bin index [2,3] */
                                n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
                        }

                        bin = SFB_BINST(sp, i, n, w);
                        if (bin->pkts >= sp->sfb_allocation) {
                                sfb_increment_bin(sp, bin,
                                    SFB_BINFT(sp, i, n, w), now);
                        }
                }
#endif /* SFB_LEVELS != 2 */
                return 1;
        }

        /* non-conformant or else misclassified flow; queue it anyway */
        *pkt_sfb_flags |= SFB_PKT_PBOX;
        *(&sp->sfb_pboxfreeze) = *now;

        return 0;
}
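
/*
 * Note on the two outcomes above: within sfb_pboxtime of the last
 * freeze, packets of flagged flows are dropped (return 1) and their
 * warm-up bins are charged; once that window expires, one packet is
 * tagged SFB_PKT_PBOX, queued anyway, and the freeze time restarts.
 * The penalty box thus rate-limits non-responsive flows rather than
 * starving them outright.
 */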

static void
sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now, boolean_t inc)
{
        struct timespec delta;

        net_timersub(now, ft, &delta);
        if (net_timercmp(&delta, &sp->sfb_holdtime, <)) {
                if (classq_verbose > 1) {
                        log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen "
                            "(delta=%llu nsec)\n", if_name(sp->sfb_ifp),
                            sp->sfb_qid, inc ? "increment" : "decrement",
                            (u_int64_t)delta.tv_nsec);
                }
                return;
        }

        /* increment/decrement marking probability */
        *ft = *now;
        if (inc) {
                SFB_PMARK_INC(bin);
        } else {
                SFB_PMARK_DEC(bin);
        }
}

static void
sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now)
{
        return sfb_adjust_bin(sp, bin, ft, now, FALSE);
}

static void
sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now)
{
        return sfb_adjust_bin(sp, bin, ft, now, TRUE);
}

static inline void
sfb_dq_update_bins(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t pkt_len,
    struct timespec *now, u_int64_t qsize)
{
#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
        int i;
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
        struct sfbbinstats *bin;
        int s, n;
        struct sfb_fcl *fcl = NULL;
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);

        /*
         * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
         */
#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
        /* Level 0: bin index at [0] for set 0; [2] for set 1 */
        n = SFB_BINMASK(pkt_sfb_hash8[(s << 1)]);
        bin = SFB_BINST(sp, 0, n, s);

        VERIFY(bin->pkts > 0 && bin->bytes >= pkt_len);
        bin->pkts--;
        bin->bytes -= pkt_len;

        if (bin->pkts == 0) {
                sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
        }

        /* Deliver flow control feedback to the sockets */
        if (SFB_QUEUE_DELAYBASED(sp)) {
                if (!(SFB_IS_DELAYHIGH(sp)) ||
                    bin->bytes <= sp->sfb_fc_threshold ||
                    bin->pkts == 0 || qsize == 0) {
                        fcl = SFB_FC_LIST(sp, n);
                }
        } else if (bin->pkts <= (sp->sfb_allocation >> 2)) {
                fcl = SFB_FC_LIST(sp, n);
        }

        if (fcl != NULL && !STAILQ_EMPTY(&fcl->fclist)) {
                sfb_fclist_append(sp, fcl);
        }
        fcl = NULL;

        /* Level 1: bin index at [1] for set 0; [3] for set 1 */
        n = SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]);
        bin = SFB_BINST(sp, 1, n, s);

        VERIFY(bin->pkts > 0 && bin->bytes >= (u_int64_t)pkt_len);
        bin->pkts--;
        bin->bytes -= pkt_len;
        if (bin->pkts == 0) {
                sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
        }
#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
        for (i = 0; i < SFB_LEVELS; i++) {
                if (s == 0) { /* set 0, bin index [0,1] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i]);
                } else { /* set 1, bin index [2,3] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
                }

                bin = SFB_BINST(sp, i, n, s);

                VERIFY(bin->pkts > 0 && bin->bytes >= pkt_len);
                bin->pkts--;
                bin->bytes -= pkt_len;
                if (bin->pkts == 0) {
                        sfb_decrement_bin(sp, bin,
                            SFB_BINFT(sp, i, n, s), now);
                }
                if (i != SFB_FC_LEVEL) {
                        continue;
                }
                if (SFB_QUEUE_DELAYBASED(sp)) {
                        if (!(SFB_IS_DELAYHIGH(sp)) ||
                            bin->bytes <= sp->sfb_fc_threshold) {
                                fcl = SFB_FC_LIST(sp, n);
                        }
                } else if (bin->pkts <= (sp->sfb_allocation >> 2)) {
                        fcl = SFB_FC_LIST(sp, n);
                }
                if (fcl != NULL && !STAILQ_EMPTY(&fcl->fclist)) {
                        sfb_fclist_append(sp, fcl);
                }
                fcl = NULL;
        }
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
}

static inline void
sfb_eq_update_bins(struct sfb *sp, uint32_t pkt_sfb_hash, uint32_t pkt_len)
{
#if SFB_LEVELS != 2
        int i, n;
#endif /* SFB_LEVELS != 2 */
        int s;
        struct sfbbinstats *bin;
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);

        /*
         * Update current bins; optimize for SFB_LEVELS=2
         */
#if SFB_LEVELS == 2
        /* Level 0: bin index at [0] for set 0; [2] for set 1 */
        bin = SFB_BINST(sp, 0,
            SFB_BINMASK(pkt_sfb_hash8[(s << 1)]), s);
        bin->pkts++;
        bin->bytes += pkt_len;

        /* Level 1: bin index at [1] for set 0; [3] for set 1 */
        bin = SFB_BINST(sp, 1,
            SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]), s);
        bin->pkts++;
        bin->bytes += pkt_len;

#else /* SFB_LEVELS != 2 */
        for (i = 0; i < SFB_LEVELS; i++) {
                if (s == 0) { /* set 0, bin index [0,1] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i]);
                } else { /* set 1, bin index [2,3] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
                }

                bin = SFB_BINST(sp, i, n, s);
                bin->pkts++;
                bin->bytes += pkt_len;
        }
#endif /* SFB_LEVELS != 2 */
}

static boolean_t
sfb_bin_addfcentry(struct sfb *sp, pktsched_pkt_t *pkt, uint32_t pkt_sfb_hash,
    uint8_t flowsrc, uint32_t flowid)
{
        struct flowadv_fcentry *fce;
        struct sfb_fcl *fcl;
        int s;
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);

        if (flowid == 0) {
                sp->sfb_stats.null_flowid++;
                return FALSE;
        }

        /*
         * Use value at index 0 for set 0 and
         * value at index 2 for set 1
         */
        fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt_sfb_hash8[(s << 1)]));
        STAILQ_FOREACH(fce, &fcl->fclist, fce_link) {
                if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
                    fce->fce_flowid == flowid) {
                        /* Already on flow control list; just return */
                        return TRUE;
                }
        }

        IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
        fce = pktsched_alloc_fcentry(pkt, sp->sfb_ifp, M_WAITOK);
        if (fce != NULL) {
                STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link);
                fcl->cnt++;
                sp->sfb_stats.flow_controlled++;
        }

        return fce != NULL;
}

/*
 * Check if this flow needs to be flow-controlled or if this
 * packet needs to be dropped.
 */
static int
sfb_bin_mark_or_drop(struct sfb *sp, struct sfbbinstats *bin)
{
        int ret = 0;
        if (SFB_QUEUE_DELAYBASED(sp)) {
                /*
                 * Mark or drop if this bin has more
                 * bytes than the flowcontrol threshold.
                 */
                if (SFB_IS_DELAYHIGH(sp) &&
                    bin->bytes >= (sp->sfb_fc_threshold << 1)) {
                        ret = 1;
                }
        } else {
                if (bin->pkts >= sp->sfb_allocation &&
                    bin->pkts >= sp->sfb_drop_thresh) {
                        ret = 1; /* drop or mark */
                }
        }
        return ret;
}
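
/*
 * Together with sfb_dq_update_bins() above, this gives hysteresis in
 * the delay-based mode: a flow is marked, dropped or flow-controlled
 * once its level-0 bin holds at least twice sfb_fc_threshold bytes,
 * and flow-controlled flows are resumed only after the bin drains
 * back below the threshold.
 */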

/*
 * early-drop probability is kept in pmark of each bin of the flow
 */
static int
sfb_drop_early(struct sfb *sp, uint32_t pkt_sfb_hash, u_int16_t *pmin,
    struct timespec *now)
{
#if SFB_LEVELS != 2
        int i;
#endif /* SFB_LEVELS != 2 */
        struct sfbbinstats *bin;
        int s, n, ret = 0;
        uint8_t *pkt_sfb_hash8 = (uint8_t *)&pkt_sfb_hash;

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);

        *pmin = (u_int16_t)-1;

        /*
         * Update current bins; optimize for SFB_LEVELS=2
         */
#if SFB_LEVELS == 2
        /* Level 0: bin index at [0] for set 0; [2] for set 1 */
        n = SFB_BINMASK(pkt_sfb_hash8[(s << 1)]);
        bin = SFB_BINST(sp, 0, n, s);
        if (*pmin > (u_int16_t)bin->pmark) {
                *pmin = (u_int16_t)bin->pmark;
        }

        /* Update SFB probability */
        if (bin->pkts >= sp->sfb_allocation) {
                sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
        }

        ret = sfb_bin_mark_or_drop(sp, bin);

        /* Level 1: bin index at [1] for set 0; [3] for set 1 */
        n = SFB_BINMASK(pkt_sfb_hash8[(s << 1) + 1]);
        bin = SFB_BINST(sp, 1, n, s);
        if (*pmin > (u_int16_t)bin->pmark) {
                *pmin = (u_int16_t)bin->pmark;
        }

        if (bin->pkts >= sp->sfb_allocation) {
                sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
        }
#else /* SFB_LEVELS != 2 */
        for (i = 0; i < SFB_LEVELS; i++) {
                if (s == 0) { /* set 0, bin index [0,1] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i]);
                } else { /* set 1, bin index [2,3] */
                        n = SFB_BINMASK(pkt_sfb_hash8[i + 2]);
                }

                bin = SFB_BINST(sp, i, n, s);
                if (*pmin > (u_int16_t)bin->pmark) {
                        *pmin = (u_int16_t)bin->pmark;
                }

                if (bin->pkts >= sp->sfb_allocation) {
                        sfb_increment_bin(sp, bin,
                            SFB_BINFT(sp, i, n, s), now);
                }
                if (i == SFB_FC_LEVEL) {
                        ret = sfb_bin_mark_or_drop(sp, bin);
                }
        }
#endif /* SFB_LEVELS != 2 */

        if (sp->sfb_flags & SFBF_SUSPENDED) {
                ret = 1; /* drop or mark */
        }
        return ret;
}

void
sfb_detect_dequeue_stall(struct sfb *sp, class_queue_t *q,
    struct timespec *now)
{
        struct timespec max_getqtime;

        if (!SFB_QUEUE_DELAYBASED(sp) || SFB_IS_DELAYHIGH(sp) ||
            qsize(q) <= SFB_MIN_FC_THRESHOLD_BYTES ||
            !net_timerisset(&sp->sfb_getqtime)) {
                return;
        }

        net_timeradd(&sp->sfb_getqtime, &sp->sfb_update_interval,
            &max_getqtime);
        if (net_timercmp(now, &max_getqtime, >)) {
                /*
                 * No packets have been dequeued in an update interval's
                 * worth of time; the queue is considered stalled.
                 */
                SFB_SET_DELAY_HIGH(sp, q);
                sp->sfb_stats.dequeue_stall++;
        }
}

#define DTYPE_NODROP 0 /* no drop */
#define DTYPE_FORCED 1 /* a "forced" drop */
#define DTYPE_EARLY 2  /* an "unforced" (early) drop */

int
sfb_addq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt,
    struct pf_mtag *t)
{
#if !PF_ECN
#pragma unused(t)
#endif /* !PF_ECN */
        struct timespec now;
        int droptype, s;
        uint16_t pmin;
        int fc_adv = 0;
        int ret = CLASSQEQ_SUCCESS;
        uint32_t maxqsize = 0;
        uint64_t *pkt_timestamp;
        uint32_t *pkt_sfb_hash;
        uint16_t *pkt_sfb_hash16;
        uint32_t *pkt_sfb_flags;
        uint32_t pkt_flowid;
        volatile uint32_t *pkt_flags;
        uint8_t pkt_proto, pkt_flowsrc;

        s = sp->sfb_current;
        VERIFY((s + (s ^ 1)) == 1);

        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
            &pkt_flowsrc, &pkt_proto, NULL);
        pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);
        pkt_sfb_hash16 = (uint16_t *)pkt_sfb_hash;

        switch (pkt->pktsched_ptype) {
        case QP_MBUF:
                /* See comments in <rdar://problem/14040693> */
                VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
                *pkt_flags |= PKTF_PRIV_GUARDED;
                break;
        default:
                VERIFY(0);
                /* NOTREACHED */
                __builtin_unreachable();
        }

        if (*pkt_timestamp > 0) {
                net_nsectimer(pkt_timestamp, &now);
        } else {
                nanouptime(&now);
                net_timernsec(&now, pkt_timestamp);
        }

        /* time to swap the bins? */
        if (net_timercmp(&now, &sp->sfb_nextreset, >=)) {
                net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
                sfb_swap_bins(sp, qlen(q));
                s = sp->sfb_current;
                VERIFY((s + (s ^ 1)) == 1);
        }

        if (!net_timerisset(&sp->sfb_update_time)) {
                net_timeradd(&now, &sp->sfb_update_interval,
                    &sp->sfb_update_time);
        }

        /*
         * If getq time is not set because this is the first packet
         * or after idle time, set it now so that we can detect a stall.
         */
        if (qsize(q) == 0 && !net_timerisset(&sp->sfb_getqtime)) {
                *(&sp->sfb_getqtime) = *(&now);
        }

        *pkt_sfb_flags = 0;
        pkt_sfb_hash16[s] =
            (SFB_HASH(&pkt_flowid, sizeof(pkt_flowid),
            (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
        pkt_sfb_hash16[s ^ 1] =
            (SFB_HASH(&pkt_flowid, sizeof(pkt_flowid),
            (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK);
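
        /*
         * The packet carries a 16-bit hash for each set: the current
         * set's hash drives bin accounting and the early-drop decision,
         * while the warm-up set's hash lets sfb_penalize() pre-warm the
         * next set's marking probabilities (the moving hash functions
         * of section 4.4 of the SFB paper).
         */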

        /* check if the queue has been stalled */
        sfb_detect_dequeue_stall(sp, q, &now);

        /* see if we drop early */
        droptype = DTYPE_NODROP;
        if (sfb_drop_early(sp, *pkt_sfb_hash, &pmin, &now)) {
                /* flow control, mark or drop by sfb */
                if ((sp->sfb_flags & SFBF_FLOWCTL) &&
                    (*pkt_flags & PKTF_FLOW_ADV)) {
                        fc_adv = 1;
                        /* drop all during suspension or for non-TCP */
                        if ((sp->sfb_flags & SFBF_SUSPENDED) ||
                            pkt_proto != IPPROTO_TCP) {
                                droptype = DTYPE_EARLY;
                                sp->sfb_stats.drop_early++;
                        }
                }
#if PF_ECN
                /* XXX: only supported for mbuf */
                else if ((sp->sfb_flags & SFBF_ECN) &&
                    (pkt->pktsched_ptype == QP_MBUF) &&
                    (pkt_proto == IPPROTO_TCP) && /* only for TCP */
                    ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) &&
                    mark_ecn(m, t, sp->sfb_flags) &&
                    !(sp->sfb_flags & SFBF_SUSPENDED)) {
                        /* successfully marked; do not drop. */
                        sp->sfb_stats.marked_packets++;
                }
#endif /* PF_ECN */
                else {
                        /* unforced drop by sfb */
                        droptype = DTYPE_EARLY;
                        sp->sfb_stats.drop_early++;
                }
        }

        /* non-responsive flow penalty? */
        if (droptype == DTYPE_NODROP && sfb_penalize(sp, *pkt_sfb_hash,
            pkt_sfb_flags, &now)) {
                droptype = DTYPE_FORCED;
                sp->sfb_stats.drop_pbox++;
        }

        if (SFB_QUEUE_DELAYBASED(sp)) {
                maxqsize = SFB_QUEUE_DELAYBASED_MAXSIZE;
        } else {
                maxqsize = qlimit(q);
        }

        /*
         * When the queue length hits the queue limit, make it a forced
         * drop
         */
        if (droptype == DTYPE_NODROP && qlen(q) >= maxqsize) {
                if (pkt_proto == IPPROTO_TCP &&
                    qlen(q) < (maxqsize + (maxqsize >> 1)) &&
                    ((*pkt_flags & PKTF_TCP_REXMT) ||
                    (sp->sfb_flags & SFBF_LAST_PKT_DROPPED))) {
                        /*
                         * At some level, dropping packets will make the
                         * flows back off and will keep memory requirements
                         * under control. But we should not cause a tail
                         * drop because it can take a long time for a
                         * TCP flow to recover. We should try to drop
                         * alternate packets instead.
                         */
                        sp->sfb_flags &= ~SFBF_LAST_PKT_DROPPED;
                } else {
                        droptype = DTYPE_FORCED;
                        sp->sfb_stats.drop_queue++;
                        sp->sfb_flags |= SFBF_LAST_PKT_DROPPED;
                }
        }

        if (fc_adv == 1 && droptype != DTYPE_FORCED &&
            sfb_bin_addfcentry(sp, pkt, *pkt_sfb_hash, pkt_flowsrc,
            pkt_flowid)) {
                /* deliver flow control advisory error */
                if (droptype == DTYPE_NODROP) {
                        ret = CLASSQEQ_SUCCESS_FC;
                        VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED));
                } else if (sp->sfb_flags & SFBF_SUSPENDED) {
                        /* drop due to suspension */
                        ret = CLASSQEQ_DROP_SP;
                } else {
                        /* drop due to flow-control */
                        ret = CLASSQEQ_DROP_FC;
                }
        }
        /* if successful, enqueue this packet; else drop it */
        if (droptype == DTYPE_NODROP) {
                VERIFY(pkt->pktsched_ptype == qptype(q));
                _addq(q, &pkt->pktsched_pkt);
        } else {
                IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
                return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
        }

        if (!(*pkt_sfb_flags & SFB_PKT_PBOX)) {
                sfb_eq_update_bins(sp, *pkt_sfb_hash,
                    pktsched_get_pkt_len(pkt));
        } else {
                sp->sfb_stats.pbox_packets++;
        }

        /* successfully queued */
        return ret;
}

static void *
sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge,
    pktsched_pkt_t *pkt)
{
        struct timespec now;
        uint64_t *pkt_timestamp;
        volatile uint32_t *pkt_flags;
        uint32_t *pkt_sfb_flags;
        uint32_t *pkt_sfb_hash;
        classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);

        if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) {
                return NULL;
        }

        nanouptime(&now);

        /* flow of 0 means head of queue */
        if (flow == 0) {
                _getq(q, &p);
        } else {
                _getq_flow(q, &p, flow);
        }

        if (p.cp_ptype == QP_INVALID) {
                if (!purge) {
                        net_timerclear(&sp->sfb_getqtime);
                }
                return NULL;
        }

        pktsched_pkt_encap(pkt, &p);
        pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL,
            NULL, NULL, NULL);
        pkt_sfb_hash = pktsched_get_pkt_sfb_vars(pkt, &pkt_sfb_flags);

        /* See comments in <rdar://problem/14040693> */
        switch (p.cp_ptype) {
        case QP_MBUF:
                VERIFY(*pkt_flags & PKTF_PRIV_GUARDED);
                break;
        default:
                VERIFY(0);
                /* NOTREACHED */
                __builtin_unreachable();
        }

        if (!purge) {
                /* calculate EWMA of dequeues */
                if (net_timerisset(&sp->sfb_getqtime)) {
                        struct timespec delta;
                        u_int64_t avg, new;
                        net_timersub(&now, &sp->sfb_getqtime, &delta);
                        net_timernsec(&delta, &new);
                        avg = sp->sfb_stats.dequeue_avg;
                        if (avg > 0) {
                                int decay = DEQUEUE_DECAY;
                                /*
                                 * If the time since the last dequeue is
                                 * significantly greater than the current
                                 * average, weigh the average more against
                                 * the old value.
                                 */
                                if (DEQUEUE_SPIKE(new, avg)) {
                                        decay += 5;
                                }
                                avg = (((avg << decay) - avg) + new) >> decay;
                        } else {
                                avg = new;
                        }
                        sp->sfb_stats.dequeue_avg = avg;
                }
                *(&sp->sfb_getqtime) = *(&now);
        }

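        /*
         * Delay-based queue management (similar in spirit to CoDel):
         * track the minimum queue delay observed during each update
         * interval, and if even that minimum exceeds the target delay,
         * declare the delay high and start flow-controlling the bins.
         */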
        if (!purge && SFB_QUEUE_DELAYBASED(sp)) {
                u_int64_t dequeue_ns, queue_delay = 0;
                net_timernsec(&now, &dequeue_ns);
                if (dequeue_ns > *pkt_timestamp) {
                        queue_delay = dequeue_ns - *pkt_timestamp;
                }

                if (sp->sfb_min_qdelay == 0 ||
                    (queue_delay > 0 && queue_delay < sp->sfb_min_qdelay)) {
                        sp->sfb_min_qdelay = queue_delay;
                }
                if (net_timercmp(&now, &sp->sfb_update_time, >=)) {
                        if (sp->sfb_min_qdelay > sp->sfb_target_qdelay) {
                                if (!SFB_IS_DELAYHIGH(sp)) {
                                        SFB_SET_DELAY_HIGH(sp, q);
                                }
                        } else {
                                sp->sfb_flags &= ~(SFBF_DELAYHIGH);
                                sp->sfb_fc_threshold = 0;
                        }
                        net_timeradd(&now, &sp->sfb_update_interval,
                            &sp->sfb_update_time);
                        sp->sfb_min_qdelay = 0;
                }
        }
        *pkt_timestamp = 0;

        /*
         * Clearpkts are the ones which were in the queue when the hash
         * function was perturbed. Since the perturbation value (fudge),
         * and thus the bin information for these packets, is not known,
         * we do not change accounting information while dequeuing these
         * packets. It is important not to set the hash interval too small
         * for this reason. A rule of thumb is to set it to K*D, where D is
         * the time taken to drain the queue.
         */
        if (*pkt_sfb_flags & SFB_PKT_PBOX) {
                *pkt_sfb_flags &= ~SFB_PKT_PBOX;
                if (sp->sfb_clearpkts > 0) {
                        sp->sfb_clearpkts--;
                }
        } else if (sp->sfb_clearpkts > 0) {
                sp->sfb_clearpkts--;
        } else {
                sfb_dq_update_bins(sp, *pkt_sfb_hash, pktsched_get_pkt_len(pkt),
                    &now, qsize(q));
        }

        switch (p.cp_ptype) {
        case QP_MBUF:
                /* See comments in <rdar://problem/14040693> */
                *pkt_flags &= ~PKTF_PRIV_GUARDED;
                break;
        default:
                VERIFY(0);
                /* NOTREACHED */
                __builtin_unreachable();
        }

        /*
         * If the queue becomes empty before the update interval, reset
         * the flow control threshold
         */
        if (qsize(q) == 0) {
                sp->sfb_flags &= ~SFBF_DELAYHIGH;
                sp->sfb_min_qdelay = 0;
                sp->sfb_fc_threshold = 0;
                net_timerclear(&sp->sfb_update_time);
                net_timerclear(&sp->sfb_getqtime);
        }
        return pkt->pktsched_pkt_mbuf;
}

void
sfb_getq(struct sfb *sp, class_queue_t *q, pktsched_pkt_t *pkt)
{
        sfb_getq_flow(sp, q, 0, FALSE, pkt);
}

void
sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
    u_int32_t *bytes)
{
        u_int32_t cnt = 0, len = 0;
        pktsched_pkt_t pkt;

        IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
        while (sfb_getq_flow(sp, q, flow, TRUE, &pkt) != NULL) {
                cnt++;
                len += pktsched_get_pkt_len(&pkt);
                pktsched_free_pkt(&pkt);
        }

        if (packets != NULL) {
                *packets = cnt;
        }
        if (bytes != NULL) {
                *bytes = len;
        }
}

void
sfb_updateq(struct sfb *sp, cqev_t ev)
{
        struct ifnet *ifp = sp->sfb_ifp;

        VERIFY(ifp != NULL);

        switch (ev) {
        case CLASSQ_EV_LINK_BANDWIDTH: {
                u_int64_t eff_rate = ifnet_output_linkrate(ifp);

                /* update parameters only if the rate has changed */
                if (eff_rate == sp->sfb_eff_rate) {
                        break;
                }

                if (classq_verbose) {
                        log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new "
                            "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid,
                            eff_rate);
                }
                sfb_calc_holdtime(sp, eff_rate);
                sfb_calc_pboxtime(sp, eff_rate);
                ifclassq_calc_target_qdelay(ifp, &sp->sfb_target_qdelay);
                sfb_calc_update_interval(sp, eff_rate);
                break;
        }

        case CLASSQ_EV_LINK_UP:
        case CLASSQ_EV_LINK_DOWN:
                if (classq_verbose) {
                        log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to "
                            "link %s\n", if_name(ifp), sp->sfb_qid,
                            (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN");
                }
                sfb_resetq(sp, ev);
                break;

        case CLASSQ_EV_LINK_LATENCY:
        case CLASSQ_EV_LINK_MTU:
        default:
                break;
        }
}

int
sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on)
{
#pragma unused(q)
        struct ifnet *ifp = sp->sfb_ifp;

        VERIFY(ifp != NULL);

        if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) ||
            (!on && !(sp->sfb_flags & SFBF_SUSPENDED))) {
                return 0;
        }

        if (!(sp->sfb_flags & SFBF_FLOWCTL)) {
                log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since "
                    "flow-control is not enabled", if_name(ifp), sp->sfb_qid,
                    (on ? "suspend" : "resume"));
                return ENOTSUP;
        }

        if (classq_verbose) {
                log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s",
                    if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING"));
        }

        if (on) {
                sp->sfb_flags |= SFBF_SUSPENDED;
        } else {
                sp->sfb_flags &= ~SFBF_SUSPENDED;
                sfb_swap_bins(sp, qlen(q));
        }

        return 0;
}