bsd/net/classq/classq_sfb.c (apple/xnu, xnu-2422.115.4)

/*
 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/cdefs.h>
30#include <sys/param.h>
31#include <sys/mbuf.h>
32#include <sys/socket.h>
33#include <sys/sockio.h>
34#include <sys/systm.h>
35#include <sys/sysctl.h>
36#include <sys/syslog.h>
37#include <sys/proc.h>
38#include <sys/errno.h>
39#include <sys/kernel.h>
40#include <sys/kauth.h>
41
42#include <kern/zalloc.h>
43
44#include <net/if.h>
45#include <net/if_var.h>
46#include <net/if_types.h>
47#include <net/dlil.h>
39236c6e 48#include <net/flowadv.h>
49
50#include <netinet/in.h>
51#include <netinet/in_systm.h>
52#include <netinet/ip.h>
53#if INET6
54#include <netinet/ip6.h>
55#endif
56
57#include <net/classq/classq_sfb.h>
58#include <net/flowhash.h>
59#include <net/net_osdep.h>
39236c6e 60#include <dev/random/randomdev.h>
61
62/*
63 * Stochastic Fair Blue
64 *
65 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
66 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
67 *
68 * Based on the NS code with the following parameters:
69 *
70 * bytes: false
71 * decrement: 0.001
72 * increment: 0.005
73 * hold-time: 10ms-50ms (randomized)
74 * algorithm: 0
75 * pbox: 1
76 * pbox-time: 50-100ms (randomized)
77 * hinterval: 11-23 (randomized)
78 *
79 * This implementation uses L = 2 and N = 32 for 2 sets of:
80 *
81 * B[L][N]: L x N array of bins (L levels, N bins per level)
82 *
 * Each set effectively creates 32^2 (= 1024) virtual buckets (bin
 * combinations) while using only 2 x 32 = 64 bins' worth of state.
 *
 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
 * used as indices for the bins across the 2 levels, where level 1 uses
 * octets [0,2] and level 2 uses octets [1,3].  The 2 values per level
 * correspond to the indices for the current and warm-up sets (section 4.4
 * of the SFB paper, on Moving Hash Functions, explains the purpose of
 * these 2 sets).
 */
92
/*
 * Use Murmur3A_x86_32 for the hash function.  It seems to perform
 * consistently across platforms for a 1-word key (the 32-bit flowhash
 * value).  See flowhash.h for other alternatives.  We only need a 16-bit
 * hash output.
 */
98#define SFB_HASH net_flowhash_mh3_x86_32
99#define SFB_HASHMASK HASHMASK(16)
100
101#define SFB_BINMASK(_x) \
102 ((_x) & HASHMASK(SFB_BINS_SHIFT))
103
104#define SFB_BINST(_sp, _l, _n, _c) \
105 (&(*(_sp)->sfb_bins)[_c].stats[_l][_n])
106
107#define SFB_BINFT(_sp, _l, _n, _c) \
108 (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])
109
110#define SFB_FC_LIST(_sp, _n) \
111 (&(*(_sp)->sfb_fc_lists)[_n])
112
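/*
 * Illustrative sketch (hypothetical helper, not used elsewhere in this
 * file): how the combined 32-bit value formed by the two per-set 16-bit
 * hashes maps to bin indices, per the block comment at the top of this
 * file.  For set s (0 or 1), level 0 uses octet (2s) and level 1 uses
 * octet (2s + 1); the real code indexes the per-packet pkt_sfb_hash8[]
 * scratch bytes directly with SFB_BINMASK().
 */
static inline u_int32_t
sfb_example_bin_index(u_int32_t hash32, int level, int set)
{
	u_int8_t octet = (hash32 >> (((set << 1) + level) << 3)) & 0xff;

	return (SFB_BINMASK(octet));	/* one of SFB_BINS bins per level */
}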
113/*
114 * The holdtime parameter determines the minimum time interval between
115 * two successive updates of the marking probability. In the event the
116 * uplink speed is not known, a default value is chosen and is randomized
117 * to be within the following range.
118 */
119#define HOLDTIME_BASE (100ULL * 1000 * 1000) /* 100ms */
120#define HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10ms */
121#define HOLDTIME_MAX (100ULL * 1000 * 1000) /* 100ms */
122
123/*
124 * The pboxtime parameter determines the bandwidth allocated for rogue
125 * flows, i.e. the rate limiting bandwidth. In the event the uplink speed
126 * is not known, a default value is chosen and is randomized to be within
127 * the following range.
128 */
129#define PBOXTIME_BASE (300ULL * 1000 * 1000) /* 300ms */
130#define PBOXTIME_MIN (30ULL * 1000 * 1000) /* 30ms */
131#define PBOXTIME_MAX (300ULL * 1000 * 1000) /* 300ms */
132
133#define SFB_RANDOM(sp, tmin, tmax) ((sfb_random(sp) % (tmax)) + (tmin))
134
39236c6e 135#define SFB_PKT_PBOX 0x1 /* in penalty box */
136
137/* The following mantissa values are in SFB_FP_SHIFT Q format */
138#define SFB_MAX_PMARK (1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */
139
/*
 * These are the d1 (increment) and d2 (decrement) parameters, used to
 * determine the amount by which the marking probability is incremented
 * when the queue overflows, or decremented when the link is idle.  d1 is
 * set higher than d2 because link underutilization can occur when
 * congestion management is either too conservative or too aggressive, but
 * packet loss occurs only when congestion management is too conservative.
 * By weighing heavily against packet loss, the algorithm can react quickly
 * to a substantial increase in traffic load.
 */
150#define SFB_INCREMENT 82 /* Q14 representation of 0.005 */
151#define SFB_DECREMENT 16 /* Q14 representation of 0.001 */
152
153#define SFB_PMARK_TH 16056 /* Q14 representation of 0.98 */
154#define SFB_PMARK_WARM 3276 /* Q14 representation of 0.2 */
155
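/*
 * Worked example of the Q14 encoding used above: a probability p is stored
 * as p * 2^SFB_FP_SHIFT, i.e. p * 16384.  Thus 0.005 * 16384 ~= 82
 * (SFB_INCREMENT), 0.001 * 16384 ~= 16 (SFB_DECREMENT), 0.98 * 16384 ~=
 * 16056 (SFB_PMARK_TH) and 0.2 * 16384 ~= 3276 (SFB_PMARK_WARM).  Starting
 * from zero, a bin therefore needs roughly 16056 / 82 ~= 196 increments
 * before its pmark crosses SFB_PMARK_TH.
 */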
156#define SFB_PMARK_INC(_bin) do { \
157 (_bin)->pmark += sfb_increment; \
158 if ((_bin)->pmark > SFB_MAX_PMARK) \
159 (_bin)->pmark = SFB_MAX_PMARK; \
160} while (0)
161
162#define SFB_PMARK_DEC(_bin) do { \
163 if ((_bin)->pmark > 0) { \
164 (_bin)->pmark -= sfb_decrement; \
165 if ((_bin)->pmark < 0) \
166 (_bin)->pmark = 0; \
167 } \
168} while (0)
169
170#define HINTERVAL_MIN (10) /* 10 seconds */
171#define HINTERVAL_MAX (20) /* 20 seconds */
172#define SFB_HINTERVAL(sp) ((sfb_random(sp) % HINTERVAL_MAX) + HINTERVAL_MIN)
173
174#define DEQUEUE_DECAY 7 /* ilog2 of EWMA decay rate, (128) */
175#define DEQUEUE_SPIKE(_new, _old) \
176 ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
177
178#define ABS(v) (((v) > 0) ? (v) : -(v))
179
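/*
 * Illustration of the dequeue-time EWMA maintained in sfb_getq_flow():
 * with DEQUEUE_DECAY = 7, each new inter-dequeue sample is folded in as
 *
 *	avg = ((2^7 - 1) * avg + new) / 2^7
 *
 * i.e. a new sample carries a 1/128 weight.  DEQUEUE_SPIKE() flags a
 * sample more than ~2048x the current average; in that case the decay
 * is raised to 12 and the spike only carries a 1/4096 weight.
 */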
180#define SFB_ZONE_MAX 32 /* maximum elements in zone */
181#define SFB_ZONE_NAME "classq_sfb" /* zone name */
182
183#define SFB_BINS_ZONE_MAX 32 /* maximum elements in zone */
184#define SFB_BINS_ZONE_NAME "classq_sfb_bins" /* zone name */
185
186#define SFB_FCL_ZONE_MAX 32 /* maximum elements in zone */
187#define SFB_FCL_ZONE_NAME "classq_sfb_fcl" /* zone name */
188
189/* Place the flow control entries in current bin on level 0 */
190#define SFB_FC_LEVEL 0
191
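/*
 * Flow-controlled flows are parked on the level-0 flow control list of
 * their current bin (sfb_bin_addfcentry()).  A flow advisory releasing
 * them is issued once that bin drains to a quarter of the bin allocation
 * (see sfb_dq_update_bins()), or when the bins are swapped or reset.
 */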
192/* Store SFB hash and flags in the module private scratch space */
193#define pkt_sfb_hash8 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val8
194#define pkt_sfb_hash16 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val16
195#define pkt_sfb_hash32 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val32
196#define pkt_sfb_flags pkt_mpriv.__mpriv_u.__mpriv32[1].__mpriv32_u.__val32
197
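/*
 * Illustrative note: the scratch space above is populated on enqueue
 * (see sfb_addq() below); each set gets its own 16-bit hash of the flow
 * ID, computed with that set's perturbation ("fudge") value, so rehashing
 * amounts to picking a new fudge for the warm-up set:
 *
 *	pkt->pkt_sfb_hash16[s] =
 *	    (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
 *	    (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
 */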
198static unsigned int sfb_size; /* size of zone element */
199static struct zone *sfb_zone; /* zone for sfb */
200
201static unsigned int sfb_bins_size; /* size of zone element */
202static struct zone *sfb_bins_zone; /* zone for sfb_bins */
203
204static unsigned int sfb_fcl_size; /* size of zone element */
205static struct zone *sfb_fcl_zone; /* zone for sfb_fc_lists */
206
207/* internal function prototypes */
208static u_int32_t sfb_random(struct sfb *);
209static struct mbuf *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t,
210 boolean_t);
211static void sfb_resetq(struct sfb *, cqev_t);
212static void sfb_calc_holdtime(struct sfb *, u_int64_t);
213static void sfb_calc_pboxtime(struct sfb *, u_int64_t);
214static void sfb_calc_hinterval(struct sfb *, u_int64_t *);
215static void sfb_swap_bins(struct sfb *, u_int32_t);
216static inline int sfb_pcheck(struct sfb *, struct pkthdr *);
217static int sfb_penalize(struct sfb *, struct pkthdr *, struct timespec *);
218static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *,
219 struct timespec *, struct timespec *, boolean_t);
220static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *,
221 struct timespec *, struct timespec *);
222static void sfb_increment_bin(struct sfb *, struct sfbbinstats *,
223 struct timespec *, struct timespec *);
39236c6e 224static inline void sfb_dq_update_bins(struct sfb *, struct pkthdr *,
316670eb 225 struct timespec *);
226static inline void sfb_eq_update_bins(struct sfb *, struct pkthdr *);
227static int sfb_drop_early(struct sfb *, struct pkthdr *, u_int16_t *,
316670eb 228 struct timespec *);
229static boolean_t sfb_bin_addfcentry(struct sfb *, struct pkthdr *);
230static void sfb_fclist_append(struct sfb *, struct sfb_fcl *);
231static void sfb_fclists_clean(struct sfb *sp);
232
233SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SFB");
234
235static u_int64_t sfb_holdtime = 0; /* 0 indicates "automatic" */
236SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW|CTLFLAG_LOCKED,
237 &sfb_holdtime, "SFB freeze time in nanoseconds");
238
239static u_int64_t sfb_pboxtime = 0; /* 0 indicates "automatic" */
240SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW|CTLFLAG_LOCKED,
241 &sfb_pboxtime, "SFB penalty box time in nanoseconds");
242
243static u_int64_t sfb_hinterval;
244SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW|CTLFLAG_LOCKED,
245 &sfb_hinterval, "SFB hash interval in nanoseconds");
246
247static u_int32_t sfb_increment = SFB_INCREMENT;
248SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW|CTLFLAG_LOCKED,
249 &sfb_increment, SFB_INCREMENT, "SFB increment [d1]");
250
251static u_int32_t sfb_decrement = SFB_DECREMENT;
252SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW|CTLFLAG_LOCKED,
253 &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]");
254
255static u_int32_t sfb_allocation = 0; /* 0 means "automatic" */
256SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED,
257 &sfb_allocation, 0, "SFB bin allocation");
258
259static u_int32_t sfb_ratelimit = 0;
260SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW|CTLFLAG_LOCKED,
261 &sfb_ratelimit, 0, "SFB rate limit");
262
263#define MBPS (1ULL * 1000 * 1000)
264#define GBPS (MBPS * 1000)
265
266struct sfb_time_tbl {
267 u_int64_t speed; /* uplink speed */
268 u_int64_t holdtime; /* hold time */
269 u_int64_t pboxtime; /* penalty box time */
270};
271
272static struct sfb_time_tbl sfb_ttbl[] = {
273 { 1 * MBPS, HOLDTIME_BASE * 1000, PBOXTIME_BASE * 1000 },
274 { 10 * MBPS, HOLDTIME_BASE * 100, PBOXTIME_BASE * 100 },
275 { 100 * MBPS, HOLDTIME_BASE * 10, PBOXTIME_BASE * 10 },
276 { 1 * GBPS, HOLDTIME_BASE, PBOXTIME_BASE },
277 { 10 * GBPS, HOLDTIME_BASE / 10, PBOXTIME_BASE / 10 },
278 { 100 * GBPS, HOLDTIME_BASE / 100, PBOXTIME_BASE / 100 },
279 { 0, 0, 0 }
280};
281
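/*
 * Illustration of how sfb_calc_holdtime()/sfb_calc_pboxtime() use the
 * table above: the entry chosen is the highest row whose speed does not
 * exceed the effective link rate.  For example, a 500 Mbps link selects
 * the 100 Mbps row, giving holdtime = 10 * HOLDTIME_BASE (1 sec) and
 * pboxtime = 10 * PBOXTIME_BASE (3 sec); a 10 Gbps link gives
 * HOLDTIME_BASE / 10 (10 msec) and PBOXTIME_BASE / 10 (30 msec).
 */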
282void
283sfb_init(void)
284{
285 _CASSERT(SFBF_ECN4 == CLASSQF_ECN4);
286 _CASSERT(SFBF_ECN6 == CLASSQF_ECN6);
287
288 sfb_size = sizeof (struct sfb);
289 sfb_zone = zinit(sfb_size, SFB_ZONE_MAX * sfb_size,
290 0, SFB_ZONE_NAME);
291 if (sfb_zone == NULL) {
292 panic("%s: failed allocating %s", __func__, SFB_ZONE_NAME);
293 /* NOTREACHED */
294 }
295 zone_change(sfb_zone, Z_EXPAND, TRUE);
296 zone_change(sfb_zone, Z_CALLERACCT, TRUE);
297
298 sfb_bins_size = sizeof (*((struct sfb *)0)->sfb_bins);
299 sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size,
300 0, SFB_BINS_ZONE_NAME);
301 if (sfb_bins_zone == NULL) {
302 panic("%s: failed allocating %s", __func__, SFB_BINS_ZONE_NAME);
303 /* NOTREACHED */
304 }
305 zone_change(sfb_bins_zone, Z_EXPAND, TRUE);
306 zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE);
307
308 sfb_fcl_size = sizeof (*((struct sfb *)0)->sfb_fc_lists);
309 sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size,
310 0, SFB_FCL_ZONE_NAME);
311 if (sfb_fcl_zone == NULL) {
312 panic("%s: failed allocating %s", __func__, SFB_FCL_ZONE_NAME);
313 /* NOTREACHED */
314 }
315 zone_change(sfb_fcl_zone, Z_EXPAND, TRUE);
316 zone_change(sfb_fcl_zone, Z_CALLERACCT, TRUE);
317}
318
319static u_int32_t
320sfb_random(struct sfb *sp)
321{
322 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
39236c6e 323 return (RandomULong());
324}
325
326static void
327sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw)
328{
329 u_int64_t holdtime;
330
331 if (sfb_holdtime != 0) {
332 holdtime = sfb_holdtime;
333 } else if (outbw == 0) {
334 holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
335 } else {
336 unsigned int n, i;
337
338 n = sfb_ttbl[0].holdtime;
339 for (i = 0; sfb_ttbl[i].speed != 0; i++) {
340 if (outbw < sfb_ttbl[i].speed)
341 break;
342 n = sfb_ttbl[i].holdtime;
343 }
344 holdtime = n;
345 }
346 net_nsectimer(&holdtime, &sp->sfb_holdtime);
347}
348
349static void
350sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw)
351{
352 u_int64_t pboxtime;
353
354 if (sfb_pboxtime != 0) {
355 pboxtime = sfb_pboxtime;
356 } else if (outbw == 0) {
357 pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX);
358 } else {
359 unsigned int n, i;
360
361 n = sfb_ttbl[0].pboxtime;
362 for (i = 0; sfb_ttbl[i].speed != 0; i++) {
363 if (outbw < sfb_ttbl[i].speed)
364 break;
365 n = sfb_ttbl[i].pboxtime;
366 }
367 pboxtime = n;
368 }
369 net_nsectimer(&pboxtime, &sp->sfb_pboxtime);
370 net_timerclear(&sp->sfb_pboxfreeze);
371}
372
373static void
374sfb_calc_hinterval(struct sfb *sp, u_int64_t *t)
375{
376 u_int64_t hinterval;
377 struct timespec now;
378
379 if (t != NULL) {
380 /*
381 * TODO adi@apple.com: use dq_avg to derive hinterval.
382 */
383 hinterval = *t;
384 }
385
386 if (sfb_hinterval != 0)
387 hinterval = sfb_hinterval;
388 else if (t == NULL || hinterval == 0)
389 hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC);
390
391 net_nsectimer(&hinterval, &sp->sfb_hinterval);
392
393 nanouptime(&now);
394 net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
395}
396
397/*
398 * sfb support routines
399 */
400struct sfb *
401sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags)
402{
403 struct sfb *sp;
39236c6e 404 int i;
405
406 VERIFY(ifp != NULL && qlim > 0);
407
408 sp = zalloc(sfb_zone);
409 if (sp == NULL) {
410 log(LOG_ERR, "%s: SFB unable to allocate\n", if_name(ifp));
411 return (NULL);
412 }
316670eb 413 bzero(sp, sfb_size);
414
415 if ((sp->sfb_bins = zalloc(sfb_bins_zone)) == NULL) {
416 log(LOG_ERR, "%s: SFB unable to allocate bins\n", if_name(ifp));
417 sfb_destroy(sp);
418 return (NULL);
419 }
39236c6e 420 bzero(sp->sfb_bins, sfb_bins_size);
316670eb 421
39236c6e 422 if ((sp->sfb_fc_lists = zalloc(sfb_fcl_zone)) == NULL) {
423 log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n",
424 if_name(ifp));
425 sfb_destroy(sp);
		return (NULL);
427 }
428 bzero(sp->sfb_fc_lists, sfb_fcl_size);
429
430 for (i = 0; i < SFB_BINS; ++i)
431 STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist);
316670eb 432
433 sp->sfb_ifp = ifp;
434 sp->sfb_qlim = qlim;
435 sp->sfb_qid = qid;
436 sp->sfb_flags = (flags & SFBF_USERFLAGS);
437#if !PF_ECN
438 if (sp->sfb_flags & SFBF_ECN) {
439 sp->sfb_flags &= ~SFBF_ECN;
440 log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring "
441 "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid);
442 }
443#endif /* !PF_ECN */
444
445 sfb_resetq(sp, -1);
446
447 return (sp);
448}
449
450static void
39236c6e 451sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl)
452{
453 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
454
455 VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0);
456 sp->sfb_stats.flow_feedback += fcl->cnt;
457 fcl->cnt = 0;
458
459 flowadv_add(&fcl->fclist);
460 VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist));
461}
462
463static void
464sfb_fclists_clean(struct sfb *sp)
465{
466 int i;
467
39236c6e 468 /* Move all the flow control entries to the flowadv list */
316670eb 469 for (i = 0; i < SFB_BINS; ++i) {
470 struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
471 if (!STAILQ_EMPTY(&fcl->fclist))
472 sfb_fclist_append(sp, fcl);
473 }
474}
475
476void
477sfb_destroy(struct sfb *sp)
478{
479 sfb_fclists_clean(sp);
480 if (sp->sfb_bins != NULL) {
39236c6e 481 zfree(sfb_bins_zone, sp->sfb_bins);
482 sp->sfb_bins = NULL;
483 }
484 if (sp->sfb_fc_lists != NULL) {
39236c6e 485 zfree(sfb_fcl_zone, sp->sfb_fc_lists);
486 sp->sfb_fc_lists = NULL;
487 }
488 zfree(sfb_zone, sp);
489}
490
491static void
492sfb_resetq(struct sfb *sp, cqev_t ev)
493{
494 struct ifnet *ifp = sp->sfb_ifp;
495 u_int64_t eff_rate;
496
497 VERIFY(ifp != NULL);
498
499 if (ev != CLASSQ_EV_LINK_DOWN) {
500 (*sp->sfb_bins)[0].fudge = sfb_random(sp);
501 (*sp->sfb_bins)[1].fudge = sfb_random(sp);
502 sp->sfb_allocation = ((sfb_allocation == 0) ?
503 (sp->sfb_qlim / 3) : sfb_allocation);
504 sp->sfb_drop_thresh = sp->sfb_allocation +
505 (sp->sfb_allocation >> 1);
506 }
507
508 sp->sfb_clearpkts = 0;
509 sp->sfb_current = 0;
510
511 eff_rate = ifnet_output_linkrate(ifp);
512 sp->sfb_eff_rate = eff_rate;
513
514 sfb_calc_holdtime(sp, eff_rate);
515 sfb_calc_pboxtime(sp, eff_rate);
516 sfb_calc_hinterval(sp, NULL);
517
518 if (ev == CLASSQ_EV_LINK_DOWN ||
519 ev == CLASSQ_EV_LINK_UP)
520 sfb_fclists_clean(sp);
521
522 bzero(sp->sfb_bins, sizeof (*sp->sfb_bins));
523 bzero(&sp->sfb_stats, sizeof (sp->sfb_stats));
524
525 if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose)
526 return;
527
528 log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, "
529 "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
530 "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps\n",
531 if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec,
532 (u_int64_t)sp->sfb_pboxtime.tv_nsec,
533 (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh,
534 (int)sp->sfb_hinterval.tv_sec, (int)sizeof (*sp->sfb_bins),
535 eff_rate);
536}
537
538void
539sfb_getstats(struct sfb *sp, struct sfb_stats *sps)
540{
541 sps->allocation = sp->sfb_allocation;
542 sps->dropthresh = sp->sfb_drop_thresh;
543 sps->clearpkts = sp->sfb_clearpkts;
544 sps->current = sp->sfb_current;
545
546 net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time);
547 net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time);
548 net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval);
549 *(&(sps->sfbstats)) = *(&(sp->sfb_stats));
550
551 _CASSERT(sizeof ((*sp->sfb_bins)[0].stats) ==
552 sizeof (sps->binstats[0].stats));
553
554 bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats,
555 sizeof (sps->binstats[0].stats));
556 bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats,
557 sizeof (sps->binstats[1].stats));
558}
559
560static void
561sfb_swap_bins(struct sfb *sp, u_int32_t len)
562{
563 int i, j, s;
564
565 if (sp->sfb_flags & SFBF_SUSPENDED)
566 return;
567
568 s = sp->sfb_current;
569 VERIFY((s + (s ^ 1)) == 1);
570
571 (*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */
572 sp->sfb_clearpkts = len;
573 sp->sfb_stats.num_rehash++;
574
575 s = (sp->sfb_current ^= 1); /* flip the bit (swap current) */
576
577 if (classq_verbose) {
578 log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, "
579 "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len);
580 }
581
582 /* clear freezetime for all current bins */
583 bzero(&(*sp->sfb_bins)[s].freezetime,
584 sizeof ((*sp->sfb_bins)[s].freezetime));
585
586 /* clear/adjust bin statistics and flow control lists */
587 for (i = 0; i < SFB_BINS; i++) {
39236c6e 588 struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
316670eb 589
39236c6e 590 if (!STAILQ_EMPTY(&fcl->fclist))
591 sfb_fclist_append(sp, fcl);
592
593 for (j = 0; j < SFB_LEVELS; j++) {
594 struct sfbbinstats *cbin, *wbin;
595
596 cbin = SFB_BINST(sp, j, i, s); /* current */
597 wbin = SFB_BINST(sp, j, i, s ^ 1); /* warm-up */
598
599 cbin->pkts = 0;
600 if (cbin->pmark > SFB_MAX_PMARK)
601 cbin->pmark = SFB_MAX_PMARK;
602 if (cbin->pmark < 0)
603 cbin->pmark = 0;
604
			/*
			 * Keep the pmark from before the swap so that
			 * non-responsive flows are identified immediately.
			 */
609 if (wbin->pmark > SFB_PMARK_WARM)
610 wbin->pmark = SFB_PMARK_WARM;
611 }
612 }
613}
614
615static inline int
39236c6e 616sfb_pcheck(struct sfb *sp, struct pkthdr *pkt)
617{
618#if SFB_LEVELS != 2
619 int i, n;
620#endif /* SFB_LEVELS != 2 */
621 int s;
622
623 s = sp->sfb_current;
624 VERIFY((s + (s ^ 1)) == 1);
625
626 /*
627 * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
628 * 0 otherwise; optimize for SFB_LEVELS=2.
629 */
630#if SFB_LEVELS == 2
631 /*
632 * Level 0: bin index at [0] for set 0; [2] for set 1
633 * Level 1: bin index at [1] for set 0; [3] for set 1
634 */
39236c6e 635 if (SFB_BINST(sp, 0, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]),
316670eb 636 s)->pmark < SFB_PMARK_TH ||
39236c6e 637 SFB_BINST(sp, 1, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]),
638 s)->pmark < SFB_PMARK_TH)
639 return (0);
640#else /* SFB_LEVELS != 2 */
641 for (i = 0; i < SFB_LEVELS; i++) {
642 if (s == 0) /* set 0, bin index [0,1] */
39236c6e 643 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
316670eb 644 else /* set 1, bin index [2,3] */
39236c6e 645 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
646
647 if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH)
648 return (0);
649 }
650#endif /* SFB_LEVELS != 2 */
651 return (1);
652}
653
654static int
39236c6e 655sfb_penalize(struct sfb *sp, struct pkthdr *pkt, struct timespec *now)
656{
657 struct timespec delta = { 0, 0 };
658
659 /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
39236c6e 660 if (!sfb_ratelimit || !sfb_pcheck(sp, pkt))
661 return (0);
662
663 net_timersub(now, &sp->sfb_pboxfreeze, &delta);
664 if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) {
665#if SFB_LEVELS != 2
666 int i;
667#endif /* SFB_LEVELS != 2 */
668 struct sfbbinstats *bin;
669 int n, w;
670
671 w = sp->sfb_current ^ 1;
672 VERIFY((w + (w ^ 1)) == 1);
673
674 /*
675 * Update warm-up bins; optimize for SFB_LEVELS=2
676 */
677#if SFB_LEVELS == 2
678 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
39236c6e 679 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1)]);
680 bin = SFB_BINST(sp, 0, n, w);
681 if (bin->pkts >= sp->sfb_allocation)
682 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now);
683
		/* Level 1: bin index at [1] for set 0; [3] for set 1 */
39236c6e 685 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1) + 1]);
686 bin = SFB_BINST(sp, 1, n, w);
687 if (bin->pkts >= sp->sfb_allocation)
688 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now);
689#else /* SFB_LEVELS != 2 */
690 for (i = 0; i < SFB_LEVELS; i++) {
691 if (w == 0) /* set 0, bin index [0,1] */
39236c6e 692 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
316670eb 693 else /* set 1, bin index [2,3] */
39236c6e 694 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
695
696 bin = SFB_BINST(sp, i, n, w);
697 if (bin->pkts >= sp->sfb_allocation) {
698 sfb_increment_bin(sp, bin,
699 SFB_BINFT(sp, i, n, w), now);
700 }
701 }
702#endif /* SFB_LEVELS != 2 */
703 return (1);
704 }
705
706 /* non-conformant or else misclassified flow; queue it anyway */
39236c6e 707 pkt->pkt_sfb_flags |= SFB_PKT_PBOX;
708 *(&sp->sfb_pboxfreeze) = *now;
709
710 return (0);
711}
712
713static void
714sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
715 struct timespec *now, boolean_t inc)
716{
717 struct timespec delta;
718
719 net_timersub(now, ft, &delta);
720 if (net_timercmp(&delta, &sp->sfb_holdtime, <)) {
721 if (classq_verbose > 1) {
722 log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen "
723 "(delta=%llu nsec)\n", if_name(sp->sfb_ifp),
724 sp->sfb_qid, inc ? "increment" : "decrement",
725 (u_int64_t)delta.tv_nsec);
726 }
727 return;
728 }
729
730 /* increment/decrement marking probability */
731 *ft = *now;
732 if (inc)
733 SFB_PMARK_INC(bin);
734 else
735 SFB_PMARK_DEC(bin);
736}
737
738static void
739sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
740 struct timespec *now)
741{
742 return (sfb_adjust_bin(sp, bin, ft, now, FALSE));
743}
744
745static void
746sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
747 struct timespec *now)
748{
749 return (sfb_adjust_bin(sp, bin, ft, now, TRUE));
750}
751
752static inline void
39236c6e 753sfb_dq_update_bins(struct sfb *sp, struct pkthdr *pkt, struct timespec *now)
754{
755#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
756 int i;
757#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
758 struct sfbbinstats *bin;
759 int s, n;
39236c6e 760 struct sfb_fcl *fcl = NULL;
761
762 s = sp->sfb_current;
763 VERIFY((s + (s ^ 1)) == 1);
764
765 /*
766 * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
767 */
768#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
769 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
39236c6e 770 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]);
771 bin = SFB_BINST(sp, 0, n, s);
772
773 VERIFY(bin->pkts > 0);
774 if (--bin->pkts == 0) {
775 sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
776 }
777 if (bin->pkts <= (sp->sfb_allocation >> 2)) {
778 /* deliver flow control feedback to the sockets */
779 fcl = SFB_FC_LIST(sp, n);
39236c6e 780 if (!STAILQ_EMPTY(&fcl->fclist))
781 sfb_fclist_append(sp, fcl);
782 }
783
784 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
39236c6e 785 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]);
786 bin = SFB_BINST(sp, 1, n, s);
787
788 VERIFY(bin->pkts > 0);
789 if (--bin->pkts == 0)
790 sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
791#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
792 for (i = 0; i < SFB_LEVELS; i++) {
793 if (s == 0) /* set 0, bin index [0,1] */
39236c6e 794 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
316670eb 795 else /* set 1, bin index [2,3] */
39236c6e 796 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
797
798 bin = SFB_BINST(sp, i, n, s);
799
800 VERIFY(bin->pkts > 0);
801 if (--bin->pkts == 0) {
802 sfb_decrement_bin(sp, bin,
803 SFB_BINFT(sp, i, n, s), now);
804 }
805 if (bin->pkts <= (sp->sfb_allocation >> 2)) {
806 /* deliver flow control feedback to the sockets */
807 if (i == SFB_FC_LEVEL) {
808 fcl = SFB_FC_LIST(sp, n);
39236c6e 809 if (!STAILQ_EMPTY(&fcl->fclist))
810 sfb_fclist_append(sp, fcl);
811 }
812 }
813 }
814#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
815}
816
817static inline void
39236c6e 818sfb_eq_update_bins(struct sfb *sp, struct pkthdr *pkt)
819{
820#if SFB_LEVELS != 2
821 int i, n;
822#endif /* SFB_LEVELS != 2 */
823 int s;
824
825 s = sp->sfb_current;
826 VERIFY((s + (s ^ 1)) == 1);
827
828 /*
829 * Update current bins; optimize for SFB_LEVELS=2
830 */
831#if SFB_LEVELS == 2
832 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
833 SFB_BINST(sp, 0,
834 SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]), s)->pkts++;
835
836 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
837 SFB_BINST(sp, 1,
838 SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]), s)->pkts++;
839#else /* SFB_LEVELS != 2 */
840 for (i = 0; i < SFB_LEVELS; i++) {
841 if (s == 0) /* set 0, bin index [0,1] */
39236c6e 842 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
316670eb 843 else /* set 1, bin index [2,3] */
39236c6e 844 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
845
846 SFB_BINST(sp, i, n, s)->pkts++;
847 }
848#endif /* SFB_LEVELS != 2 */
849}
850
851static boolean_t
39236c6e 852sfb_bin_addfcentry(struct sfb *sp, struct pkthdr *pkt)
316670eb 853{
854 struct flowadv_fcentry *fce;
855 u_int32_t flowsrc, flowid;
856 struct sfb_fcl *fcl;
857 int s;
858
859 s = sp->sfb_current;
860 VERIFY((s + (s ^ 1)) == 1);
861
862 flowsrc = pkt->pkt_flowsrc;
863 flowid = pkt->pkt_flowid;
316670eb 864
865 if (flowid == 0) {
866 sp->sfb_stats.null_flowid++;
867 return (FALSE);
868 }
869
870 /*
871 * Use value at index 0 for set 0 and
872 * value at index 2 for set 1
873 */
874 fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]));
875 STAILQ_FOREACH(fce, &fcl->fclist, fce_link) {
876 if (fce->fce_flowsrc == flowsrc &&
877 fce->fce_flowid == flowid) {
878 /* Already on flow control list; just return */
879 return (TRUE);
880 }
881 }
882
883 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
39236c6e 884 fce = flowadv_alloc_entry(M_WAITOK);
316670eb 885 if (fce != NULL) {
886 fce->fce_flowsrc = flowsrc;
887 fce->fce_flowid = flowid;
888 STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link);
889 fcl->cnt++;
890 sp->sfb_stats.flow_controlled++;
891 }
892
893 return (fce != NULL);
894}
895
896/*
897 * early-drop probability is kept in pmark of each bin of the flow
898 */
899static int
39236c6e 900sfb_drop_early(struct sfb *sp, struct pkthdr *pkt, u_int16_t *pmin,
901 struct timespec *now)
902{
903#if SFB_LEVELS != 2
904 int i;
905#endif /* SFB_LEVELS != 2 */
906 struct sfbbinstats *bin;
907 int s, n, ret = 0;
908
909 s = sp->sfb_current;
910 VERIFY((s + (s ^ 1)) == 1);
911
912 *pmin = (u_int16_t)-1;
913
914 /*
915 * Update current bins; optimize for SFB_LEVELS=2
916 */
917#if SFB_LEVELS == 2
918 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
39236c6e 919 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]);
920 bin = SFB_BINST(sp, 0, n, s);
921 if (*pmin > (u_int16_t)bin->pmark)
922 *pmin = (u_int16_t)bin->pmark;
923
924 if (bin->pkts >= sp->sfb_allocation) {
925 if (bin->pkts >= sp->sfb_drop_thresh)
926 ret = 1; /* drop or mark */
927 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
928 }
929
930 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
39236c6e 931 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]);
932 bin = SFB_BINST(sp, 1, n, s);
933 if (*pmin > (u_int16_t)bin->pmark)
934 *pmin = (u_int16_t)bin->pmark;
935
936 if (bin->pkts >= sp->sfb_allocation) {
937 if (bin->pkts >= sp->sfb_drop_thresh)
938 ret = 1; /* drop or mark */
939 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
940 }
941#else /* SFB_LEVELS != 2 */
942 for (i = 0; i < SFB_LEVELS; i++) {
943 if (s == 0) /* set 0, bin index [0,1] */
39236c6e 944 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
316670eb 945 else /* set 1, bin index [2,3] */
39236c6e 946 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);
947
948 bin = SFB_BINST(sp, i, n, s);
949 if (*pmin > (u_int16_t)bin->pmark)
950 *pmin = (u_int16_t)bin->pmark;
951
952 if (bin->pkts >= sp->sfb_allocation) {
953 if (bin->pkts >= sp->sfb_drop_thresh)
954 ret = 1; /* drop or mark */
955 sfb_increment_bin(sp, bin,
956 SFB_BINFT(sp, i, n, s), now);
957 }
958 }
959#endif /* SFB_LEVELS != 2 */
960
961 if (sp->sfb_flags & SFBF_SUSPENDED)
962 ret = 1; /* drop or mark */
963
964 return (ret);
965}
966
967#define DTYPE_NODROP 0 /* no drop */
968#define DTYPE_FORCED 1 /* a "forced" drop */
969#define DTYPE_EARLY 2 /* an "unforced" (early) drop */
970
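/*
 * Summary of the enqueue decision flow implemented by sfb_addq() below:
 *
 *	sfb_drop_early()	-> flow-control advisory, ECN mark, or an
 *				   unforced (DTYPE_EARLY) drop
 *	else sfb_penalize()	-> forced (DTYPE_FORCED) drop, penalty box
 *	else queue at limit	-> forced (DTYPE_FORCED) drop
 *	else			-> enqueue and update the current bins
 */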
971int
972sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t)
973{
974#if !PF_ECN
975#pragma unused(t)
976#endif /* !PF_ECN */
977 struct pkthdr *pkt = &m->m_pkthdr;
978 struct timespec now;
979 int droptype, s;
980 u_int16_t pmin;
981 int fc_adv = 0;
982 int ret = CLASSQEQ_SUCCESS;
983
984 nanouptime(&now);
985
986 s = sp->sfb_current;
987 VERIFY((s + (s ^ 1)) == 1);
988
989 /* time to swap the bins? */
990 if (net_timercmp(&now, &sp->sfb_nextreset, >=)) {
991 net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
992 sfb_swap_bins(sp, qlen(q));
993 s = sp->sfb_current;
994 VERIFY((s + (s ^ 1)) == 1);
995 }
996
997 pkt->pkt_sfb_flags = 0;
998 pkt->pkt_sfb_hash16[s] =
999 (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
316670eb 1000 (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
1001 pkt->pkt_sfb_hash16[s ^ 1] =
1002 (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
1003 (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK);
1004
1005 /* see if we drop early */
1006 droptype = DTYPE_NODROP;
39236c6e 1007 if (sfb_drop_early(sp, pkt, &pmin, &now)) {
1008 /* flow control, mark or drop by sfb */
1009 if ((sp->sfb_flags & SFBF_FLOWCTL) &&
39236c6e 1010 (pkt->pkt_flags & PKTF_FLOW_ADV)) {
1011 fc_adv = 1;
1012 /* drop all during suspension or for non-TCP */
1013 if ((sp->sfb_flags & SFBF_SUSPENDED) ||
39236c6e 1014 pkt->pkt_proto != IPPROTO_TCP) {
1015 droptype = DTYPE_EARLY;
1016 sp->sfb_stats.drop_early++;
1017 }
1018 }
1019#if PF_ECN
1020 else if ((sp->sfb_flags & SFBF_ECN) &&
1021 (pkt->pkt_proto == IPPROTO_TCP) && /* only for TCP */
1022 ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) &&
1023 mark_ecn(m, t, sp->sfb_flags) &&
1024 !(sp->sfb_flags & SFBF_SUSPENDED)) {
1025 /* successfully marked; do not drop. */
1026 sp->sfb_stats.marked_packets++;
1027 }
1028#endif /* PF_ECN */
1029 else {
1030 /* unforced drop by sfb */
1031 droptype = DTYPE_EARLY;
1032 sp->sfb_stats.drop_early++;
1033 }
1034 }
1035
1036 /* non-responsive flow penalty? */
39236c6e 1037 if (droptype == DTYPE_NODROP && sfb_penalize(sp, pkt, &now)) {
1038 droptype = DTYPE_FORCED;
1039 sp->sfb_stats.drop_pbox++;
1040 }
1041
1042 /* if the queue length hits the hard limit, it's a forced drop */
1043 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) {
1044 droptype = DTYPE_FORCED;
1045 sp->sfb_stats.drop_queue++;
1046 }
1047
1048 if (fc_adv == 1 && droptype != DTYPE_FORCED &&
39236c6e 1049 sfb_bin_addfcentry(sp, pkt)) {
1050 /* deliver flow control advisory error */
1051 if (droptype == DTYPE_NODROP) {
1052 ret = CLASSQEQ_SUCCESS_FC;
1053 VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED));
1054 } else if (sp->sfb_flags & SFBF_SUSPENDED) {
1055 /* dropped due to suspension */
1056 ret = CLASSQEQ_DROPPED_SP;
1057 } else {
1058 /* dropped due to flow-control */
1059 ret = CLASSQEQ_DROPPED_FC;
1060 }
1061 }
1062
	/* if successful, enqueue this packet; otherwise drop it */
1064 if (droptype == DTYPE_NODROP) {
1065 _addq(q, m);
1066 } else {
1067 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
1068 m_freem(m);
1069 return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED);
1070 }
1071
1072 if (!(pkt->pkt_sfb_flags & SFB_PKT_PBOX))
1073 sfb_eq_update_bins(sp, pkt);
1074 else
1075 sp->sfb_stats.pbox_packets++;
1076
1077 /* successfully queued */
1078 return (ret);
1079}
1080
1081static struct mbuf *
1082sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge)
1083{
1084 struct timespec now;
1085 struct mbuf *m;
39236c6e 1086 struct pkthdr *pkt;
1087
1088 if (!purge && (sp->sfb_flags & SFBF_SUSPENDED))
1089 return (NULL);
1090
1091 nanouptime(&now);
1092
1093 /* flow of 0 means head of queue */
1094 if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
1095 if (!purge)
1096 net_timerclear(&sp->sfb_getqtime);
1097 return (NULL);
1098 }
1099
1100 VERIFY(m->m_flags & M_PKTHDR);
1101
39236c6e 1102 pkt = &m->m_pkthdr;
1103
1104 if (!purge) {
1105 /* calculate EWMA of dequeues */
1106 if (net_timerisset(&sp->sfb_getqtime)) {
1107 struct timespec delta;
1108 u_int64_t avg, new;
1109
1110 net_timersub(&now, &sp->sfb_getqtime, &delta);
1111 net_timernsec(&delta, &new);
1112 avg = sp->sfb_stats.dequeue_avg;
1113 if (avg > 0) {
1114 int decay = DEQUEUE_DECAY;
				/*
				 * If the time since the last dequeue is
				 * significantly greater than the current
				 * average, weight the average more heavily
				 * towards the old value (discount the spike).
				 */
1121 if (DEQUEUE_SPIKE(new, avg))
1122 decay += 5;
1123 avg = (((avg << decay) - avg) + new) >> decay;
1124 } else {
1125 avg = new;
1126 }
1127 sp->sfb_stats.dequeue_avg = avg;
1128 }
1129 *(&sp->sfb_getqtime) = *(&now);
1130 }
1131
1132 /*
1133 * Clearpkts are the ones which were in the queue when the hash
1134 * function was perturbed. Since the perturbation value (fudge),
1135 * and thus bin information for these packets is not known, we do
1136 * not change accounting information while dequeuing these packets.
1137 * It is important not to set the hash interval too small due to
1138 * this reason. A rule of thumb is to set it to K*D, where D is
1139 * the time taken to drain queue.
1140 */
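	/*
	 * Illustrative numbers (not from the original sources): draining a
	 * 128-packet queue of 1500-byte packets over a 1 Gbps link takes
	 * D = 128 * 1500 * 8 / 1e9 ~= 1.5 msec, so the default 10-20 sec
	 * hash interval comfortably satisfies the K*D rule of thumb above.
	 */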
1141 if (pkt->pkt_sfb_flags & SFB_PKT_PBOX) {
1142 pkt->pkt_sfb_flags &= ~SFB_PKT_PBOX;
1143 if (sp->sfb_clearpkts > 0)
1144 sp->sfb_clearpkts--;
1145 } else if (sp->sfb_clearpkts > 0) {
1146 sp->sfb_clearpkts--;
1147 } else {
39236c6e 1148 sfb_dq_update_bins(sp, pkt, &now);
1149 }
1150
1151 return (m);
1152}
1153
1154struct mbuf *
1155sfb_getq(struct sfb *sp, class_queue_t *q)
1156{
1157 return (sfb_getq_flow(sp, q, 0, FALSE));
1158}
1159
1160void
1161sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
1162 u_int32_t *bytes)
1163{
1164 u_int32_t cnt = 0, len = 0;
1165 struct mbuf *m;
1166
1167 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
1168
1169 while ((m = sfb_getq_flow(sp, q, flow, TRUE)) != NULL) {
1170 cnt++;
1171 len += m_pktlen(m);
1172 m_freem(m);
1173 }
1174
1175 if (packets != NULL)
1176 *packets = cnt;
1177 if (bytes != NULL)
1178 *bytes = len;
1179}
1180
1181void
1182sfb_updateq(struct sfb *sp, cqev_t ev)
1183{
1184 struct ifnet *ifp = sp->sfb_ifp;
1185
1186 VERIFY(ifp != NULL);
1187
1188 switch (ev) {
39236c6e 1189 case CLASSQ_EV_LINK_BANDWIDTH: {
1190 u_int64_t eff_rate = ifnet_output_linkrate(ifp);
1191
1192 /* update parameters only if rate has changed */
1193 if (eff_rate == sp->sfb_eff_rate)
1194 break;
1195
1196 if (classq_verbose) {
1197 log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new "
1198 "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid,
1199 eff_rate);
1200 }
1201 sfb_calc_holdtime(sp, eff_rate);
1202 sfb_calc_pboxtime(sp, eff_rate);
1203 break;
1204 }
1205
1206 case CLASSQ_EV_LINK_UP:
1207 case CLASSQ_EV_LINK_DOWN:
1208 if (classq_verbose) {
1209 log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to "
1210 "link %s\n", if_name(ifp), sp->sfb_qid,
1211 (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN");
1212 }
1213 sfb_resetq(sp, ev);
1214 break;
1215
39236c6e 1216 case CLASSQ_EV_LINK_LATENCY:
1217 case CLASSQ_EV_LINK_MTU:
1218 default:
1219 break;
1220 }
1221}
1222
1223int
1224sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on)
1225{
1226#pragma unused(q)
1227 struct ifnet *ifp = sp->sfb_ifp;
1228
1229 VERIFY(ifp != NULL);
1230
1231 if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) ||
1232 (!on && !(sp->sfb_flags & SFBF_SUSPENDED)))
1233 return (0);
1234
1235 if (!(sp->sfb_flags & SFBF_FLOWCTL)) {
1236 log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since "
1237 "flow-control is not enabled", if_name(ifp), sp->sfb_qid,
1238 (on ? "suspend" : "resume"));
1239 return (ENOTSUP);
1240 }
1241
1242 if (classq_verbose) {
1243 log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s",
1244 if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING"));
1245 }
1246
1247 if (on) {
1248 sp->sfb_flags |= SFBF_SUSPENDED;
1249 } else {
1250 sp->sfb_flags &= ~SFBF_SUSPENDED;
1251 sfb_swap_bins(sp, qlen(q));
1252 }
1253
1254 return (0);
1255}