]> git.saurik.com Git - apple/xnu.git/blob - bsd/net/classq/classq_sfb.c
014870ac74c3bad3c9377d8186fb2acfba073a1e
[apple/xnu.git] / bsd / net / classq / classq_sfb.c
1 /*
2 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/mbuf.h>
32 #include <sys/socket.h>
33 #include <sys/sockio.h>
34 #include <sys/systm.h>
35 #include <sys/sysctl.h>
36 #include <sys/syslog.h>
37 #include <sys/proc.h>
38 #include <sys/errno.h>
39 #include <sys/kernel.h>
40 #include <sys/kauth.h>
41
42 #include <kern/zalloc.h>
43
44 #include <net/if.h>
45 #include <net/if_var.h>
46 #include <net/if_types.h>
47 #include <net/dlil.h>
48 #include <net/flowadv.h>
49
50 #include <netinet/in.h>
51 #include <netinet/in_systm.h>
52 #include <netinet/ip.h>
53 #if INET6
54 #include <netinet/ip6.h>
55 #endif
56
57 #include <net/classq/classq_sfb.h>
58 #include <net/flowhash.h>
59 #include <net/net_osdep.h>
60 #include <dev/random/randomdev.h>
61
62 /*
63 * Stochastic Fair Blue
64 *
65 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
66 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
67 *
68 * Based on the NS code with the following parameters:
69 *
70 * bytes: false
71 * decrement: 0.001
72 * increment: 0.005
73 * hold-time: 10ms-50ms (randomized)
74 * algorithm: 0
75 * pbox: 1
76 * pbox-time: 50-100ms (randomized)
77 * hinterval: 11-23 (randomized)
78 *
79 * This implementation uses L = 2 and N = 32 for 2 sets of:
80 *
81 * B[L][N]: L x N array of bins (L levels, N bins per level)
82 *
83 * Each set effectively creates 32^2 virtual buckets (bin combinations)
84 * while using only O(32*2) states.
85 *
86 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
87 * used as index for the bins across the 2 levels, where level 1 uses [0,2]
88 * and level 2 uses [1,3]. The 2 values per level correspond to the indices
89 * for the current and warm-up sets (section 4.4. in the SFB paper regarding
90 * Moving Hash Functions explains the purposes of these 2 sets.)
91 */
92
93 /*
94 * Use Murmur3A_x86_32 for hash function. It seems to perform consistently
95 * across platforms for 1-word key (32-bit flowhash value). See flowhash.h
96 * for other alternatives. We only need 16-bit hash output.
97 */
98 #define SFB_HASH net_flowhash_mh3_x86_32
99 #define SFB_HASHMASK HASHMASK(16)
100
101 #define SFB_BINMASK(_x) \
102 ((_x) & HASHMASK(SFB_BINS_SHIFT))
103
104 #define SFB_BINST(_sp, _l, _n, _c) \
105 (&(*(_sp)->sfb_bins)[_c].stats[_l][_n])
106
107 #define SFB_BINFT(_sp, _l, _n, _c) \
108 (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])
109
110 #define SFB_FC_LIST(_sp, _n) \
111 (&(*(_sp)->sfb_fc_lists)[_n])
112
113 /*
114 * The holdtime parameter determines the minimum time interval between
115 * two successive updates of the marking probability. In the event the
116 * uplink speed is not known, a default value is chosen and is randomized
117 * to be within the following range.
118 */
119 #define HOLDTIME_BASE (100ULL * 1000 * 1000) /* 100ms */
120 #define HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10ms */
121 #define HOLDTIME_MAX (100ULL * 1000 * 1000) /* 100ms */
122
123 /*
124 * The pboxtime parameter determines the bandwidth allocated for rogue
125 * flows, i.e. the rate limiting bandwidth. In the event the uplink speed
126 * is not known, a default value is chosen and is randomized to be within
127 * the following range.
128 */
129 #define PBOXTIME_BASE (300ULL * 1000 * 1000) /* 300ms */
130 #define PBOXTIME_MIN (30ULL * 1000 * 1000) /* 30ms */
131 #define PBOXTIME_MAX (300ULL * 1000 * 1000) /* 300ms */
132
/*
 * Pick a pseudo-random value within the inclusive range [tmin, tmax];
 * callers must pass tmin <= tmax.  The modulus is the width of the range
 * rather than tmax itself, so that adding tmin back can never push the
 * result past tmax.
 */
#define	SFB_RANDOM(sp, tmin, tmax)					\
	((sfb_random(sp) % ((tmax) - (tmin) + 1)) + (tmin))
134
135 #define SFB_PKT_PBOX 0x1 /* in penalty box */
136
137 /* The following mantissa values are in SFB_FP_SHIFT Q format */
138 #define SFB_MAX_PMARK (1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */
139
140 /*
141 * These are d1 (increment) and d2 (decrement) parameters, used to determine
142 * the amount by which the marking probability is incremented when the queue
143 * overflows, or is decremented when the link is idle. d1 is set higher than
144 * d2, because link underutilization can occur when congestion management is
145 * either too conservative or too aggressive, but packet loss occurs only
146 * when congestion management is too conservative. By weighing heavily
147 * against packet loss, it can quickly reach to a substantial increase in
148 * traffic load.
149 */
150 #define SFB_INCREMENT 82 /* Q14 representation of 0.005 */
151 #define SFB_DECREMENT 16 /* Q14 representation of 0.001 */
152
153 #define SFB_PMARK_TH 16056 /* Q14 representation of 0.98 */
154 #define SFB_PMARK_WARM 3276 /* Q14 representation of 0.2 */
155
156 #define SFB_PMARK_INC(_bin) do { \
157 (_bin)->pmark += sfb_increment; \
158 if ((_bin)->pmark > SFB_MAX_PMARK) \
159 (_bin)->pmark = SFB_MAX_PMARK; \
160 } while (0)
161
162 #define SFB_PMARK_DEC(_bin) do { \
163 if ((_bin)->pmark > 0) { \
164 (_bin)->pmark -= sfb_decrement; \
165 if ((_bin)->pmark < 0) \
166 (_bin)->pmark = 0; \
167 } \
168 } while (0)
169
#define	HINTERVAL_MIN	(10)	/* 10 seconds */
#define	HINTERVAL_MAX	(20)	/* 20 seconds */
/*
 * Randomized rehash interval, in seconds, within the inclusive range
 * [HINTERVAL_MIN, HINTERVAL_MAX].  The modulus is the width of the range
 * so that the result cannot exceed HINTERVAL_MAX.
 */
#define	SFB_HINTERVAL(sp)						\
	((sfb_random(sp) % (HINTERVAL_MAX - HINTERVAL_MIN + 1)) +	\
	HINTERVAL_MIN)
173
174 #define DEQUEUE_DECAY 7 /* ilog2 of EWMA decay rate, (128) */
175 #define DEQUEUE_SPIKE(_new, _old) \
176 ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
177
178 #define ABS(v) (((v) > 0) ? (v) : -(v))
179
180 #define SFB_ZONE_MAX 32 /* maximum elements in zone */
181 #define SFB_ZONE_NAME "classq_sfb" /* zone name */
182
183 #define SFB_BINS_ZONE_MAX 32 /* maximum elements in zone */
184 #define SFB_BINS_ZONE_NAME "classq_sfb_bins" /* zone name */
185
186 #define SFB_FCL_ZONE_MAX 32 /* maximum elements in zone */
187 #define SFB_FCL_ZONE_NAME "classq_sfb_fcl" /* zone name */
188
189 /* Place the flow control entries in current bin on level 0 */
190 #define SFB_FC_LEVEL 0
191
192 /* Store SFB hash and flags in the module private scratch space */
193 #define pkt_sfb_hash8 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val8
194 #define pkt_sfb_hash16 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val16
195 #define pkt_sfb_hash32 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val32
196 #define pkt_sfb_flags pkt_mpriv.__mpriv_u.__mpriv32[1].__mpriv32_u.__val32
197
198 static unsigned int sfb_size; /* size of zone element */
199 static struct zone *sfb_zone; /* zone for sfb */
200
201 static unsigned int sfb_bins_size; /* size of zone element */
202 static struct zone *sfb_bins_zone; /* zone for sfb_bins */
203
204 static unsigned int sfb_fcl_size; /* size of zone element */
205 static struct zone *sfb_fcl_zone; /* zone for sfb_fc_lists */
206
207 /* internal function prototypes */
208 static u_int32_t sfb_random(struct sfb *);
209 static struct mbuf *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t,
210 boolean_t);
211 static void sfb_resetq(struct sfb *, cqev_t);
212 static void sfb_calc_holdtime(struct sfb *, u_int64_t);
213 static void sfb_calc_pboxtime(struct sfb *, u_int64_t);
214 static void sfb_calc_hinterval(struct sfb *, u_int64_t *);
215 static void sfb_swap_bins(struct sfb *, u_int32_t);
216 static inline int sfb_pcheck(struct sfb *, struct pkthdr *);
217 static int sfb_penalize(struct sfb *, struct pkthdr *, struct timespec *);
218 static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *,
219 struct timespec *, struct timespec *, boolean_t);
220 static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *,
221 struct timespec *, struct timespec *);
222 static void sfb_increment_bin(struct sfb *, struct sfbbinstats *,
223 struct timespec *, struct timespec *);
224 static inline void sfb_dq_update_bins(struct sfb *, struct pkthdr *,
225 struct timespec *);
226 static inline void sfb_eq_update_bins(struct sfb *, struct pkthdr *);
227 static int sfb_drop_early(struct sfb *, struct pkthdr *, u_int16_t *,
228 struct timespec *);
229 static boolean_t sfb_bin_addfcentry(struct sfb *, struct pkthdr *);
230 static void sfb_fclist_append(struct sfb *, struct sfb_fcl *);
231 static void sfb_fclists_clean(struct sfb *sp);
232
233 SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SFB");
234
235 static u_int64_t sfb_holdtime = 0; /* 0 indicates "automatic" */
236 SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW|CTLFLAG_LOCKED,
237 &sfb_holdtime, "SFB freeze time in nanoseconds");
238
239 static u_int64_t sfb_pboxtime = 0; /* 0 indicates "automatic" */
240 SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW|CTLFLAG_LOCKED,
241 &sfb_pboxtime, "SFB penalty box time in nanoseconds");
242
243 static u_int64_t sfb_hinterval;
244 SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW|CTLFLAG_LOCKED,
245 &sfb_hinterval, "SFB hash interval in nanoseconds");
246
247 static u_int32_t sfb_increment = SFB_INCREMENT;
248 SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW|CTLFLAG_LOCKED,
249 &sfb_increment, SFB_INCREMENT, "SFB increment [d1]");
250
251 static u_int32_t sfb_decrement = SFB_DECREMENT;
252 SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW|CTLFLAG_LOCKED,
253 &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]");
254
255 static u_int32_t sfb_allocation = 0; /* 0 means "automatic" */
256 SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED,
257 &sfb_allocation, 0, "SFB bin allocation");
258
259 static u_int32_t sfb_ratelimit = 0;
260 SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW|CTLFLAG_LOCKED,
261 &sfb_ratelimit, 0, "SFB rate limit");
262
263 #define MBPS (1ULL * 1000 * 1000)
264 #define GBPS (MBPS * 1000)
265
266 struct sfb_time_tbl {
267 u_int64_t speed; /* uplink speed */
268 u_int64_t holdtime; /* hold time */
269 u_int64_t pboxtime; /* penalty box time */
270 };
271
272 static struct sfb_time_tbl sfb_ttbl[] = {
273 { 1 * MBPS, HOLDTIME_BASE * 1000, PBOXTIME_BASE * 1000 },
274 { 10 * MBPS, HOLDTIME_BASE * 100, PBOXTIME_BASE * 100 },
275 { 100 * MBPS, HOLDTIME_BASE * 10, PBOXTIME_BASE * 10 },
276 { 1 * GBPS, HOLDTIME_BASE, PBOXTIME_BASE },
277 { 10 * GBPS, HOLDTIME_BASE / 10, PBOXTIME_BASE / 10 },
278 { 100 * GBPS, HOLDTIME_BASE / 100, PBOXTIME_BASE / 100 },
279 { 0, 0, 0 }
280 };
281
282 void
283 sfb_init(void)
284 {
285 _CASSERT(SFBF_ECN4 == CLASSQF_ECN4);
286 _CASSERT(SFBF_ECN6 == CLASSQF_ECN6);
287
288 sfb_size = sizeof (struct sfb);
289 sfb_zone = zinit(sfb_size, SFB_ZONE_MAX * sfb_size,
290 0, SFB_ZONE_NAME);
291 if (sfb_zone == NULL) {
292 panic("%s: failed allocating %s", __func__, SFB_ZONE_NAME);
293 /* NOTREACHED */
294 }
295 zone_change(sfb_zone, Z_EXPAND, TRUE);
296 zone_change(sfb_zone, Z_CALLERACCT, TRUE);
297
298 sfb_bins_size = sizeof (*((struct sfb *)0)->sfb_bins);
299 sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size,
300 0, SFB_BINS_ZONE_NAME);
301 if (sfb_bins_zone == NULL) {
302 panic("%s: failed allocating %s", __func__, SFB_BINS_ZONE_NAME);
303 /* NOTREACHED */
304 }
305 zone_change(sfb_bins_zone, Z_EXPAND, TRUE);
306 zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE);
307
308 sfb_fcl_size = sizeof (*((struct sfb *)0)->sfb_fc_lists);
309 sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size,
310 0, SFB_FCL_ZONE_NAME);
311 if (sfb_fcl_zone == NULL) {
312 panic("%s: failed allocating %s", __func__, SFB_FCL_ZONE_NAME);
313 /* NOTREACHED */
314 }
315 zone_change(sfb_fcl_zone, Z_EXPAND, TRUE);
316 zone_change(sfb_fcl_zone, Z_CALLERACCT, TRUE);
317 }
318
319 static u_int32_t
320 sfb_random(struct sfb *sp)
321 {
322 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
323 return (RandomULong());
324 }
325
326 static void
327 sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw)
328 {
329 u_int64_t holdtime;
330
331 if (sfb_holdtime != 0) {
332 holdtime = sfb_holdtime;
333 } else if (outbw == 0) {
334 holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
335 } else {
336 unsigned int n, i;
337
338 n = sfb_ttbl[0].holdtime;
339 for (i = 0; sfb_ttbl[i].speed != 0; i++) {
340 if (outbw < sfb_ttbl[i].speed)
341 break;
342 n = sfb_ttbl[i].holdtime;
343 }
344 holdtime = n;
345 }
346 net_nsectimer(&holdtime, &sp->sfb_holdtime);
347 }
348
349 static void
350 sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw)
351 {
352 u_int64_t pboxtime;
353
354 if (sfb_pboxtime != 0) {
355 pboxtime = sfb_pboxtime;
356 } else if (outbw == 0) {
357 pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX);
358 } else {
359 unsigned int n, i;
360
361 n = sfb_ttbl[0].pboxtime;
362 for (i = 0; sfb_ttbl[i].speed != 0; i++) {
363 if (outbw < sfb_ttbl[i].speed)
364 break;
365 n = sfb_ttbl[i].pboxtime;
366 }
367 pboxtime = n;
368 }
369 net_nsectimer(&pboxtime, &sp->sfb_pboxtime);
370 net_timerclear(&sp->sfb_pboxfreeze);
371 }
372
373 static void
374 sfb_calc_hinterval(struct sfb *sp, u_int64_t *t)
375 {
376 u_int64_t hinterval;
377 struct timespec now;
378
379 if (t != NULL) {
380 /*
381 * TODO adi@apple.com: use dq_avg to derive hinterval.
382 */
383 hinterval = *t;
384 }
385
386 if (sfb_hinterval != 0)
387 hinterval = sfb_hinterval;
388 else if (t == NULL || hinterval == 0)
389 hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC);
390
391 net_nsectimer(&hinterval, &sp->sfb_hinterval);
392
393 nanouptime(&now);
394 net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
395 }
396
397 /*
398 * sfb support routines
399 */
400 struct sfb *
401 sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags)
402 {
403 struct sfb *sp;
404 int i;
405
406 VERIFY(ifp != NULL && qlim > 0);
407
408 sp = zalloc(sfb_zone);
409 if (sp == NULL) {
410 log(LOG_ERR, "%s: SFB unable to allocate\n", if_name(ifp));
411 return (NULL);
412 }
413 bzero(sp, sfb_size);
414
415 if ((sp->sfb_bins = zalloc(sfb_bins_zone)) == NULL) {
416 log(LOG_ERR, "%s: SFB unable to allocate bins\n", if_name(ifp));
417 sfb_destroy(sp);
418 return (NULL);
419 }
420 bzero(sp->sfb_bins, sfb_bins_size);
421
422 if ((sp->sfb_fc_lists = zalloc(sfb_fcl_zone)) == NULL) {
423 log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n",
424 if_name(ifp));
425 sfb_destroy(sp);
426 return(NULL);
427 }
428 bzero(sp->sfb_fc_lists, sfb_fcl_size);
429
430 for (i = 0; i < SFB_BINS; ++i)
431 STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist);
432
433 sp->sfb_ifp = ifp;
434 sp->sfb_qlim = qlim;
435 sp->sfb_qid = qid;
436 sp->sfb_flags = (flags & SFBF_USERFLAGS);
437 #if !PF_ECN
438 if (sp->sfb_flags & SFBF_ECN) {
439 sp->sfb_flags &= ~SFBF_ECN;
440 log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring "
441 "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid);
442 }
443 #endif /* !PF_ECN */
444
445 sfb_resetq(sp, -1);
446
447 return (sp);
448 }
449
450 static void
451 sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl)
452 {
453 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
454
455 VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0);
456 sp->sfb_stats.flow_feedback += fcl->cnt;
457 fcl->cnt = 0;
458
459 flowadv_add(&fcl->fclist);
460 VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist));
461 }
462
463 static void
464 sfb_fclists_clean(struct sfb *sp)
465 {
466 int i;
467
468 /* Move all the flow control entries to the flowadv list */
469 for (i = 0; i < SFB_BINS; ++i) {
470 struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
471 if (!STAILQ_EMPTY(&fcl->fclist))
472 sfb_fclist_append(sp, fcl);
473 }
474 }
475
476 void
477 sfb_destroy(struct sfb *sp)
478 {
479 sfb_fclists_clean(sp);
480 if (sp->sfb_bins != NULL) {
481 zfree(sfb_bins_zone, sp->sfb_bins);
482 sp->sfb_bins = NULL;
483 }
484 if (sp->sfb_fc_lists != NULL) {
485 zfree(sfb_fcl_zone, sp->sfb_fc_lists);
486 sp->sfb_fc_lists = NULL;
487 }
488 zfree(sfb_zone, sp);
489 }
490
491 static void
492 sfb_resetq(struct sfb *sp, cqev_t ev)
493 {
494 struct ifnet *ifp = sp->sfb_ifp;
495 u_int64_t eff_rate;
496
497 VERIFY(ifp != NULL);
498
499 if (ev != CLASSQ_EV_LINK_DOWN) {
500 (*sp->sfb_bins)[0].fudge = sfb_random(sp);
501 (*sp->sfb_bins)[1].fudge = sfb_random(sp);
502 sp->sfb_allocation = ((sfb_allocation == 0) ?
503 (sp->sfb_qlim / 3) : sfb_allocation);
504 sp->sfb_drop_thresh = sp->sfb_allocation +
505 (sp->sfb_allocation >> 1);
506 }
507
508 sp->sfb_clearpkts = 0;
509 sp->sfb_current = 0;
510
511 eff_rate = ifnet_output_linkrate(ifp);
512 sp->sfb_eff_rate = eff_rate;
513
514 sfb_calc_holdtime(sp, eff_rate);
515 sfb_calc_pboxtime(sp, eff_rate);
516 sfb_calc_hinterval(sp, NULL);
517
518 if (ev == CLASSQ_EV_LINK_DOWN ||
519 ev == CLASSQ_EV_LINK_UP)
520 sfb_fclists_clean(sp);
521
522 bzero(sp->sfb_bins, sizeof (*sp->sfb_bins));
523 bzero(&sp->sfb_stats, sizeof (sp->sfb_stats));
524
525 if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose)
526 return;
527
528 log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, "
529 "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
530 "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps\n",
531 if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec,
532 (u_int64_t)sp->sfb_pboxtime.tv_nsec,
533 (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh,
534 (int)sp->sfb_hinterval.tv_sec, (int)sizeof (*sp->sfb_bins),
535 eff_rate);
536 }
537
538 void
539 sfb_getstats(struct sfb *sp, struct sfb_stats *sps)
540 {
541 sps->allocation = sp->sfb_allocation;
542 sps->dropthresh = sp->sfb_drop_thresh;
543 sps->clearpkts = sp->sfb_clearpkts;
544 sps->current = sp->sfb_current;
545
546 net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time);
547 net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time);
548 net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval);
549 *(&(sps->sfbstats)) = *(&(sp->sfb_stats));
550
551 _CASSERT(sizeof ((*sp->sfb_bins)[0].stats) ==
552 sizeof (sps->binstats[0].stats));
553
554 bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats,
555 sizeof (sps->binstats[0].stats));
556 bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats,
557 sizeof (sps->binstats[1].stats));
558 }
559
560 static void
561 sfb_swap_bins(struct sfb *sp, u_int32_t len)
562 {
563 int i, j, s;
564
565 if (sp->sfb_flags & SFBF_SUSPENDED)
566 return;
567
568 s = sp->sfb_current;
569 VERIFY((s + (s ^ 1)) == 1);
570
571 (*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */
572 sp->sfb_clearpkts = len;
573 sp->sfb_stats.num_rehash++;
574
575 s = (sp->sfb_current ^= 1); /* flip the bit (swap current) */
576
577 if (classq_verbose) {
578 log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, "
579 "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len);
580 }
581
582 /* clear freezetime for all current bins */
583 bzero(&(*sp->sfb_bins)[s].freezetime,
584 sizeof ((*sp->sfb_bins)[s].freezetime));
585
586 /* clear/adjust bin statistics and flow control lists */
587 for (i = 0; i < SFB_BINS; i++) {
588 struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
589
590 if (!STAILQ_EMPTY(&fcl->fclist))
591 sfb_fclist_append(sp, fcl);
592
593 for (j = 0; j < SFB_LEVELS; j++) {
594 struct sfbbinstats *cbin, *wbin;
595
596 cbin = SFB_BINST(sp, j, i, s); /* current */
597 wbin = SFB_BINST(sp, j, i, s ^ 1); /* warm-up */
598
599 cbin->pkts = 0;
600 if (cbin->pmark > SFB_MAX_PMARK)
601 cbin->pmark = SFB_MAX_PMARK;
602 if (cbin->pmark < 0)
603 cbin->pmark = 0;
604
605 /*
606 * Keep pmark from before to identify
607 * non-responsives immediately.
608 */
609 if (wbin->pmark > SFB_PMARK_WARM)
610 wbin->pmark = SFB_PMARK_WARM;
611 }
612 }
613 }
614
615 static inline int
616 sfb_pcheck(struct sfb *sp, struct pkthdr *pkt)
617 {
618 #if SFB_LEVELS != 2
619 int i, n;
620 #endif /* SFB_LEVELS != 2 */
621 int s;
622
623 s = sp->sfb_current;
624 VERIFY((s + (s ^ 1)) == 1);
625
626 /*
627 * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
628 * 0 otherwise; optimize for SFB_LEVELS=2.
629 */
630 #if SFB_LEVELS == 2
631 /*
632 * Level 0: bin index at [0] for set 0; [2] for set 1
633 * Level 1: bin index at [1] for set 0; [3] for set 1
634 */
635 if (SFB_BINST(sp, 0, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]),
636 s)->pmark < SFB_PMARK_TH ||
637 SFB_BINST(sp, 1, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]),
638 s)->pmark < SFB_PMARK_TH)
639 return (0);
640 #else /* SFB_LEVELS != 2 */
641 for (i = 0; i < SFB_LEVELS; i++) {
642 if (s == 0) /* set 0, bin index [0,1] */
643 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
644 else /* set 1, bin index [2,3] */
645 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
646
647 if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH)
648 return (0);
649 }
650 #endif /* SFB_LEVELS != 2 */
651 return (1);
652 }
653
654 static int
655 sfb_penalize(struct sfb *sp, struct pkthdr *pkt, struct timespec *now)
656 {
657 struct timespec delta = { 0, 0 };
658
659 /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
660 if (!sfb_ratelimit || !sfb_pcheck(sp, pkt))
661 return (0);
662
663 net_timersub(now, &sp->sfb_pboxfreeze, &delta);
664 if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) {
665 #if SFB_LEVELS != 2
666 int i;
667 #endif /* SFB_LEVELS != 2 */
668 struct sfbbinstats *bin;
669 int n, w;
670
671 w = sp->sfb_current ^ 1;
672 VERIFY((w + (w ^ 1)) == 1);
673
674 /*
675 * Update warm-up bins; optimize for SFB_LEVELS=2
676 */
677 #if SFB_LEVELS == 2
678 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
679 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1)]);
680 bin = SFB_BINST(sp, 0, n, w);
681 if (bin->pkts >= sp->sfb_allocation)
682 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now);
683
684 /* Level 0: bin index at [1] for set 0; [3] for set 1 */
685 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1) + 1]);
686 bin = SFB_BINST(sp, 1, n, w);
687 if (bin->pkts >= sp->sfb_allocation)
688 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now);
689 #else /* SFB_LEVELS != 2 */
690 for (i = 0; i < SFB_LEVELS; i++) {
691 if (w == 0) /* set 0, bin index [0,1] */
692 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
693 else /* set 1, bin index [2,3] */
694 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
695
696 bin = SFB_BINST(sp, i, n, w);
697 if (bin->pkts >= sp->sfb_allocation) {
698 sfb_increment_bin(sp, bin,
699 SFB_BINFT(sp, i, n, w), now);
700 }
701 }
702 #endif /* SFB_LEVELS != 2 */
703 return (1);
704 }
705
706 /* non-conformant or else misclassified flow; queue it anyway */
707 pkt->pkt_sfb_flags |= SFB_PKT_PBOX;
708 *(&sp->sfb_pboxfreeze) = *now;
709
710 return (0);
711 }
712
713 static void
714 sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
715 struct timespec *now, boolean_t inc)
716 {
717 struct timespec delta;
718
719 net_timersub(now, ft, &delta);
720 if (net_timercmp(&delta, &sp->sfb_holdtime, <)) {
721 if (classq_verbose > 1) {
722 log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen "
723 "(delta=%llu nsec)\n", if_name(sp->sfb_ifp),
724 sp->sfb_qid, inc ? "increment" : "decrement",
725 (u_int64_t)delta.tv_nsec);
726 }
727 return;
728 }
729
730 /* increment/decrement marking probability */
731 *ft = *now;
732 if (inc)
733 SFB_PMARK_INC(bin);
734 else
735 SFB_PMARK_DEC(bin);
736 }
737
738 static void
739 sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
740 struct timespec *now)
741 {
742 return (sfb_adjust_bin(sp, bin, ft, now, FALSE));
743 }
744
745 static void
746 sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
747 struct timespec *now)
748 {
749 return (sfb_adjust_bin(sp, bin, ft, now, TRUE));
750 }
751
752 static inline void
753 sfb_dq_update_bins(struct sfb *sp, struct pkthdr *pkt, struct timespec *now)
754 {
755 #if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
756 int i;
757 #endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
758 struct sfbbinstats *bin;
759 int s, n;
760 struct sfb_fcl *fcl = NULL;
761
762 s = sp->sfb_current;
763 VERIFY((s + (s ^ 1)) == 1);
764
765 /*
766 * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
767 */
768 #if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
769 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
770 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]);
771 bin = SFB_BINST(sp, 0, n, s);
772
773 VERIFY(bin->pkts > 0);
774 if (--bin->pkts == 0) {
775 sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
776 }
777 if (bin->pkts <= (sp->sfb_allocation >> 2)) {
778 /* deliver flow control feedback to the sockets */
779 fcl = SFB_FC_LIST(sp, n);
780 if (!STAILQ_EMPTY(&fcl->fclist))
781 sfb_fclist_append(sp, fcl);
782 }
783
784 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
785 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]);
786 bin = SFB_BINST(sp, 1, n, s);
787
788 VERIFY(bin->pkts > 0);
789 if (--bin->pkts == 0)
790 sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
791 #else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
792 for (i = 0; i < SFB_LEVELS; i++) {
793 if (s == 0) /* set 0, bin index [0,1] */
794 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
795 else /* set 1, bin index [2,3] */
796 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
797
798 bin = SFB_BINST(sp, i, n, s);
799
800 VERIFY(bin->pkts > 0);
801 if (--bin->pkts == 0) {
802 sfb_decrement_bin(sp, bin,
803 SFB_BINFT(sp, i, n, s), now);
804 }
805 if (bin->pkts <= (sp->sfb_allocation >> 2)) {
806 /* deliver flow control feedback to the sockets */
807 if (i == SFB_FC_LEVEL) {
808 fcl = SFB_FC_LIST(sp, n);
809 if (!STAILQ_EMPTY(&fcl->fclist))
810 sfb_fclist_append(sp, fcl);
811 }
812 }
813 }
814 #endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
815 }
816
817 static inline void
818 sfb_eq_update_bins(struct sfb *sp, struct pkthdr *pkt)
819 {
820 #if SFB_LEVELS != 2
821 int i, n;
822 #endif /* SFB_LEVELS != 2 */
823 int s;
824
825 s = sp->sfb_current;
826 VERIFY((s + (s ^ 1)) == 1);
827
828 /*
829 * Update current bins; optimize for SFB_LEVELS=2
830 */
831 #if SFB_LEVELS == 2
832 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
833 SFB_BINST(sp, 0,
834 SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]), s)->pkts++;
835
836 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
837 SFB_BINST(sp, 1,
838 SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]), s)->pkts++;
839 #else /* SFB_LEVELS != 2 */
840 for (i = 0; i < SFB_LEVELS; i++) {
841 if (s == 0) /* set 0, bin index [0,1] */
842 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
843 else /* set 1, bin index [2,3] */
844 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
845
846 SFB_BINST(sp, i, n, s)->pkts++;
847 }
848 #endif /* SFB_LEVELS != 2 */
849 }
850
851 static boolean_t
852 sfb_bin_addfcentry(struct sfb *sp, struct pkthdr *pkt)
853 {
854 struct flowadv_fcentry *fce;
855 u_int32_t flowsrc, flowid;
856 struct sfb_fcl *fcl;
857 int s;
858
859 s = sp->sfb_current;
860 VERIFY((s + (s ^ 1)) == 1);
861
862 flowsrc = pkt->pkt_flowsrc;
863 flowid = pkt->pkt_flowid;
864
865 if (flowid == 0) {
866 sp->sfb_stats.null_flowid++;
867 return (FALSE);
868 }
869
870 /*
871 * Use value at index 0 for set 0 and
872 * value at index 2 for set 1
873 */
874 fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]));
875 STAILQ_FOREACH(fce, &fcl->fclist, fce_link) {
876 if (fce->fce_flowsrc == flowsrc &&
877 fce->fce_flowid == flowid) {
878 /* Already on flow control list; just return */
879 return (TRUE);
880 }
881 }
882
883 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
884 fce = flowadv_alloc_entry(M_WAITOK);
885 if (fce != NULL) {
886 fce->fce_flowsrc = flowsrc;
887 fce->fce_flowid = flowid;
888 STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link);
889 fcl->cnt++;
890 sp->sfb_stats.flow_controlled++;
891 }
892
893 return (fce != NULL);
894 }
895
896 /*
897 * early-drop probability is kept in pmark of each bin of the flow
898 */
899 static int
900 sfb_drop_early(struct sfb *sp, struct pkthdr *pkt, u_int16_t *pmin,
901 struct timespec *now)
902 {
903 #if SFB_LEVELS != 2
904 int i;
905 #endif /* SFB_LEVELS != 2 */
906 struct sfbbinstats *bin;
907 int s, n, ret = 0;
908
909 s = sp->sfb_current;
910 VERIFY((s + (s ^ 1)) == 1);
911
912 *pmin = (u_int16_t)-1;
913
914 /*
915 * Update current bins; optimize for SFB_LEVELS=2
916 */
917 #if SFB_LEVELS == 2
918 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
919 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]);
920 bin = SFB_BINST(sp, 0, n, s);
921 if (*pmin > (u_int16_t)bin->pmark)
922 *pmin = (u_int16_t)bin->pmark;
923
924 if (bin->pkts >= sp->sfb_allocation) {
925 if (bin->pkts >= sp->sfb_drop_thresh)
926 ret = 1; /* drop or mark */
927 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
928 }
929
930 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
931 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]);
932 bin = SFB_BINST(sp, 1, n, s);
933 if (*pmin > (u_int16_t)bin->pmark)
934 *pmin = (u_int16_t)bin->pmark;
935
936 if (bin->pkts >= sp->sfb_allocation) {
937 if (bin->pkts >= sp->sfb_drop_thresh)
938 ret = 1; /* drop or mark */
939 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
940 }
941 #else /* SFB_LEVELS != 2 */
942 for (i = 0; i < SFB_LEVELS; i++) {
943 if (s == 0) /* set 0, bin index [0,1] */
944 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
945 else /* set 1, bin index [2,3] */
946 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
947
948 bin = SFB_BINST(sp, i, n, s);
949 if (*pmin > (u_int16_t)bin->pmark)
950 *pmin = (u_int16_t)bin->pmark;
951
952 if (bin->pkts >= sp->sfb_allocation) {
953 if (bin->pkts >= sp->sfb_drop_thresh)
954 ret = 1; /* drop or mark */
955 sfb_increment_bin(sp, bin,
956 SFB_BINFT(sp, i, n, s), now);
957 }
958 }
959 #endif /* SFB_LEVELS != 2 */
960
961 if (sp->sfb_flags & SFBF_SUSPENDED)
962 ret = 1; /* drop or mark */
963
964 return (ret);
965 }
966
967 #define DTYPE_NODROP 0 /* no drop */
968 #define DTYPE_FORCED 1 /* a "forced" drop */
969 #define DTYPE_EARLY 2 /* an "unforced" (early) drop */
970
971 int
972 sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t)
973 {
974 #if !PF_ECN
975 #pragma unused(t)
976 #endif /* !PF_ECN */
977 struct pkthdr *pkt = &m->m_pkthdr;
978 struct timespec now;
979 int droptype, s;
980 u_int16_t pmin;
981 int fc_adv = 0;
982 int ret = CLASSQEQ_SUCCESS;
983
984 nanouptime(&now);
985
986 s = sp->sfb_current;
987 VERIFY((s + (s ^ 1)) == 1);
988
989 /* time to swap the bins? */
990 if (net_timercmp(&now, &sp->sfb_nextreset, >=)) {
991 net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
992 sfb_swap_bins(sp, qlen(q));
993 s = sp->sfb_current;
994 VERIFY((s + (s ^ 1)) == 1);
995 }
996
997 pkt->pkt_sfb_flags = 0;
998 pkt->pkt_sfb_hash16[s] =
999 (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
1000 (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
1001 pkt->pkt_sfb_hash16[s ^ 1] =
1002 (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
1003 (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK);
1004
1005 /* see if we drop early */
1006 droptype = DTYPE_NODROP;
1007 if (sfb_drop_early(sp, pkt, &pmin, &now)) {
1008 /* flow control, mark or drop by sfb */
1009 if ((sp->sfb_flags & SFBF_FLOWCTL) &&
1010 (pkt->pkt_flags & PKTF_FLOW_ADV)) {
1011 fc_adv = 1;
1012 /* drop all during suspension or for non-TCP */
1013 if ((sp->sfb_flags & SFBF_SUSPENDED) ||
1014 pkt->pkt_proto != IPPROTO_TCP) {
1015 droptype = DTYPE_EARLY;
1016 sp->sfb_stats.drop_early++;
1017 }
1018 }
1019 #if PF_ECN
1020 else if ((sp->sfb_flags & SFBF_ECN) &&
1021 (pkt->pkt_proto == IPPROTO_TCP) && /* only for TCP */
1022 ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) &&
1023 mark_ecn(m, t, sp->sfb_flags) &&
1024 !(sp->sfb_flags & SFBF_SUSPENDED)) {
1025 /* successfully marked; do not drop. */
1026 sp->sfb_stats.marked_packets++;
1027 }
1028 #endif /* PF_ECN */
1029 else {
1030 /* unforced drop by sfb */
1031 droptype = DTYPE_EARLY;
1032 sp->sfb_stats.drop_early++;
1033 }
1034 }
1035
1036 /* non-responsive flow penalty? */
1037 if (droptype == DTYPE_NODROP && sfb_penalize(sp, pkt, &now)) {
1038 droptype = DTYPE_FORCED;
1039 sp->sfb_stats.drop_pbox++;
1040 }
1041
1042 /* if the queue length hits the hard limit, it's a forced drop */
1043 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) {
1044 droptype = DTYPE_FORCED;
1045 sp->sfb_stats.drop_queue++;
1046 }
1047
1048 if (fc_adv == 1 && droptype != DTYPE_FORCED &&
1049 sfb_bin_addfcentry(sp, pkt)) {
1050 /* deliver flow control advisory error */
1051 if (droptype == DTYPE_NODROP) {
1052 ret = CLASSQEQ_SUCCESS_FC;
1053 VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED));
1054 } else if (sp->sfb_flags & SFBF_SUSPENDED) {
1055 /* dropped due to suspension */
1056 ret = CLASSQEQ_DROPPED_SP;
1057 } else {
1058 /* dropped due to flow-control */
1059 ret = CLASSQEQ_DROPPED_FC;
1060 }
1061 }
1062
1063 /* if successful enqueue this packet, else drop it */
1064 if (droptype == DTYPE_NODROP) {
1065 _addq(q, m);
1066 } else {
1067 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
1068 m_freem(m);
1069 return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED);
1070 }
1071
1072 if (!(pkt->pkt_sfb_flags & SFB_PKT_PBOX))
1073 sfb_eq_update_bins(sp, pkt);
1074 else
1075 sp->sfb_stats.pbox_packets++;
1076
1077 /* successfully queued */
1078 return (ret);
1079 }
1080
1081 static struct mbuf *
1082 sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge)
1083 {
1084 struct timespec now;
1085 struct mbuf *m;
1086 struct pkthdr *pkt;
1087
1088 if (!purge && (sp->sfb_flags & SFBF_SUSPENDED))
1089 return (NULL);
1090
1091 nanouptime(&now);
1092
1093 /* flow of 0 means head of queue */
1094 if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
1095 if (!purge)
1096 net_timerclear(&sp->sfb_getqtime);
1097 return (NULL);
1098 }
1099
1100 VERIFY(m->m_flags & M_PKTHDR);
1101
1102 pkt = &m->m_pkthdr;
1103
1104 if (!purge) {
1105 /* calculate EWMA of dequeues */
1106 if (net_timerisset(&sp->sfb_getqtime)) {
1107 struct timespec delta;
1108 u_int64_t avg, new;
1109
1110 net_timersub(&now, &sp->sfb_getqtime, &delta);
1111 net_timernsec(&delta, &new);
1112 avg = sp->sfb_stats.dequeue_avg;
1113 if (avg > 0) {
1114 int decay = DEQUEUE_DECAY;
1115 /*
1116 * If the time since last dequeue is
1117 * significantly greater than the current
1118 * average, weight the average more against
1119 * the old value.
1120 */
1121 if (DEQUEUE_SPIKE(new, avg))
1122 decay += 5;
1123 avg = (((avg << decay) - avg) + new) >> decay;
1124 } else {
1125 avg = new;
1126 }
1127 sp->sfb_stats.dequeue_avg = avg;
1128 }
1129 *(&sp->sfb_getqtime) = *(&now);
1130 }
1131
1132 /*
1133 * Clearpkts are the ones which were in the queue when the hash
1134 * function was perturbed. Since the perturbation value (fudge),
1135 * and thus bin information for these packets is not known, we do
1136 * not change accounting information while dequeuing these packets.
1137 * It is important not to set the hash interval too small due to
1138 * this reason. A rule of thumb is to set it to K*D, where D is
1139 * the time taken to drain queue.
1140 */
1141 if (pkt->pkt_sfb_flags & SFB_PKT_PBOX) {
1142 pkt->pkt_sfb_flags &= ~SFB_PKT_PBOX;
1143 if (sp->sfb_clearpkts > 0)
1144 sp->sfb_clearpkts--;
1145 } else if (sp->sfb_clearpkts > 0) {
1146 sp->sfb_clearpkts--;
1147 } else {
1148 sfb_dq_update_bins(sp, pkt, &now);
1149 }
1150
1151 return (m);
1152 }
1153
1154 struct mbuf *
1155 sfb_getq(struct sfb *sp, class_queue_t *q)
1156 {
1157 return (sfb_getq_flow(sp, q, 0, FALSE));
1158 }
1159
1160 void
1161 sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
1162 u_int32_t *bytes)
1163 {
1164 u_int32_t cnt = 0, len = 0;
1165 struct mbuf *m;
1166
1167 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
1168
1169 while ((m = sfb_getq_flow(sp, q, flow, TRUE)) != NULL) {
1170 cnt++;
1171 len += m_pktlen(m);
1172 m_freem(m);
1173 }
1174
1175 if (packets != NULL)
1176 *packets = cnt;
1177 if (bytes != NULL)
1178 *bytes = len;
1179 }
1180
1181 void
1182 sfb_updateq(struct sfb *sp, cqev_t ev)
1183 {
1184 struct ifnet *ifp = sp->sfb_ifp;
1185
1186 VERIFY(ifp != NULL);
1187
1188 switch (ev) {
1189 case CLASSQ_EV_LINK_BANDWIDTH: {
1190 u_int64_t eff_rate = ifnet_output_linkrate(ifp);
1191
1192 /* update parameters only if rate has changed */
1193 if (eff_rate == sp->sfb_eff_rate)
1194 break;
1195
1196 if (classq_verbose) {
1197 log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new "
1198 "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid,
1199 eff_rate);
1200 }
1201 sfb_calc_holdtime(sp, eff_rate);
1202 sfb_calc_pboxtime(sp, eff_rate);
1203 break;
1204 }
1205
1206 case CLASSQ_EV_LINK_UP:
1207 case CLASSQ_EV_LINK_DOWN:
1208 if (classq_verbose) {
1209 log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to "
1210 "link %s\n", if_name(ifp), sp->sfb_qid,
1211 (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN");
1212 }
1213 sfb_resetq(sp, ev);
1214 break;
1215
1216 case CLASSQ_EV_LINK_LATENCY:
1217 case CLASSQ_EV_LINK_MTU:
1218 default:
1219 break;
1220 }
1221 }
1222
1223 int
1224 sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on)
1225 {
1226 #pragma unused(q)
1227 struct ifnet *ifp = sp->sfb_ifp;
1228
1229 VERIFY(ifp != NULL);
1230
1231 if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) ||
1232 (!on && !(sp->sfb_flags & SFBF_SUSPENDED)))
1233 return (0);
1234
1235 if (!(sp->sfb_flags & SFBF_FLOWCTL)) {
1236 log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since "
1237 "flow-control is not enabled", if_name(ifp), sp->sfb_qid,
1238 (on ? "suspend" : "resume"));
1239 return (ENOTSUP);
1240 }
1241
1242 if (classq_verbose) {
1243 log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s",
1244 if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING"));
1245 }
1246
1247 if (on) {
1248 sp->sfb_flags |= SFBF_SUSPENDED;
1249 } else {
1250 sp->sfb_flags &= ~SFBF_SUSPENDED;
1251 sfb_swap_bins(sp, qlen(q));
1252 }
1253
1254 return (0);
1255 }