]>
git.saurik.com Git - apple/xnu.git/blob - bsd/net/classq/classq_sfb.c
2 * Copyright (c) 2011-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/malloc.h>
33 #include <sys/socket.h>
34 #include <sys/sockio.h>
35 #include <sys/systm.h>
36 #include <sys/sysctl.h>
37 #include <sys/syslog.h>
39 #include <sys/errno.h>
40 #include <sys/kernel.h>
41 #include <sys/kauth.h>
43 #include <kern/zalloc.h>
46 #include <net/if_var.h>
47 #include <net/if_types.h>
50 #include <netinet/in.h>
51 #include <netinet/in_systm.h>
52 #include <netinet/ip.h>
54 #include <netinet/ip6.h>
57 #include <net/classq/classq_sfb.h>
58 #include <net/flowhash.h>
59 #include <net/net_osdep.h>
62 * Stochastic Fair Blue
64 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
65 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
67 * Based on the NS code with the following parameters:
72 * hold-time: 10ms-50ms (randomized)
75 * pbox-time: 50-100ms (randomized)
76 * hinterval: 11-23 (randomized)
78 * This implementation uses L = 2 and N = 32 for 2 sets of:
80 * B[L][N]: L x N array of bins (L levels, N bins per level)
82 * Each set effectively creates 32^2 virtual buckets (bin combinations)
83 * while using only O(32*2) states.
85 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
86 * used as index for the bins across the 2 levels, where level 1 uses [0,2]
87 * and level 2 uses [1,3]. The 2 values per level correspond to the indices
88 * for the current and warm-up sets (section 4.4. in the SFB paper regarding
89 * Moving Hash Functions explains the purposes of these 2 sets.)
93 * Use Murmur3A_x86_32 for hash function. It seems to perform consistently
94 * across platforms for 1-word key (32-bit flowhash value). See flowhash.h
95 * for other alternatives. We only need 16-bit hash output.
97 #define SFB_HASH net_flowhash_mh3_x86_32
98 #define SFB_HASHMASK HASHMASK(16)
100 #define SFB_BINMASK(_x) \
101 ((_x) & HASHMASK(SFB_BINS_SHIFT))
103 #define SFB_BINST(_sp, _l, _n, _c) \
104 (&(*(_sp)->sfb_bins)[_c].stats[_l][_n])
106 #define SFB_BINFT(_sp, _l, _n, _c) \
107 (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])
109 #define SFB_FC_LIST(_sp, _n) \
110 (&(*(_sp)->sfb_fc_lists)[_n])
113 * The holdtime parameter determines the minimum time interval between
114 * two successive updates of the marking probability. In the event the
115 * uplink speed is not known, a default value is chosen and is randomized
116 * to be within the following range.
118 #define HOLDTIME_BASE (100ULL * 1000 * 1000) /* 100ms */
119 #define HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10ms */
120 #define HOLDTIME_MAX (100ULL * 1000 * 1000) /* 100ms */
123 * The pboxtime parameter determines the bandwidth allocated for rogue
124 * flows, i.e. the rate limiting bandwidth. In the event the uplink speed
125 * is not known, a default value is chosen and is randomized to be within
126 * the following range.
128 #define PBOXTIME_BASE (300ULL * 1000 * 1000) /* 300ms */
129 #define PBOXTIME_MIN (30ULL * 1000 * 1000) /* 30ms */
130 #define PBOXTIME_MAX (300ULL * 1000 * 1000) /* 300ms */
/*
 * SFB_RANDOM(sp, tmin, tmax): pick a pseudo-random value in the closed
 * range [tmin, tmax] using sfb_random(sp) as the entropy source.
 *
 * The modulus is taken over the span (tmax - tmin + 1) so that the result
 * never exceeds tmax; taking it over tmax alone and then adding tmin would
 * produce values as large as (tmin + tmax - 1), i.e. outside the documented
 * HOLDTIME_{MIN,MAX} / PBOXTIME_{MIN,MAX} ranges.
 *
 * Callers must pass tmin <= tmax.  Arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define SFB_RANDOM(sp, tmin, tmax) \
	((sfb_random(sp) % ((tmax) - (tmin) + 1)) + (tmin))
134 #define SFB_PKT_PBOX PF_TAG_QUEUE1 /* in penalty box */
136 /* The following mantissa values are in SFB_FP_SHIFT Q format */
137 #define SFB_MAX_PMARK (1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */
140 * These are d1 (increment) and d2 (decrement) parameters, used to determine
141 * the amount by which the marking probability is incremented when the queue
142 * overflows, or is decremented when the link is idle. d1 is set higher than
143 * d2, because link underutilization can occur when congestion management is
144 * either too conservative or too aggressive, but packet loss occurs only
145 * when congestion management is too conservative. By weighing heavily
146 * against packet loss, it can quickly react to a substantial increase in traffic load.
149 #define SFB_INCREMENT 82 /* Q14 representation of 0.005 */
150 #define SFB_DECREMENT 16 /* Q14 representation of 0.001 */
152 #define SFB_PMARK_TH 16056 /* Q14 representation of 0.98 */
153 #define SFB_PMARK_WARM 3276 /* Q14 representation of 0.2 */
155 #define SFB_PMARK_INC(_bin) do { \
156 (_bin)->pmark += sfb_increment; \
157 if ((_bin)->pmark > SFB_MAX_PMARK) \
158 (_bin)->pmark = SFB_MAX_PMARK; \
161 #define SFB_PMARK_DEC(_bin) do { \
162 if ((_bin)->pmark > 0) { \
163 (_bin)->pmark -= sfb_decrement; \
164 if ((_bin)->pmark < 0) \
/*
 * Bounds, in seconds, for the randomized hash (bin swap) interval.
 */
#define HINTERVAL_MIN (10) /* 10 seconds */
#define HINTERVAL_MAX (20) /* 20 seconds */

/*
 * SFB_HINTERVAL(sp): pick a pseudo-random hash interval, in seconds, in the
 * closed range [HINTERVAL_MIN, HINTERVAL_MAX] using sfb_random(sp).
 *
 * The modulus is taken over the span between the two bounds; taking it over
 * HINTERVAL_MAX and then adding HINTERVAL_MIN would yield values up to
 * (HINTERVAL_MIN + HINTERVAL_MAX - 1) seconds, exceeding HINTERVAL_MAX.
 */
#define SFB_HINTERVAL(sp) \
	((sfb_random(sp) % (HINTERVAL_MAX - HINTERVAL_MIN + 1)) + HINTERVAL_MIN)
173 #define DEQUEUE_DECAY 7 /* ilog2 of EWMA decay rate, (128) */
174 #define DEQUEUE_SPIKE(_new, _old) \
175 ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))
177 #define ABS(v) (((v) > 0) ? (v) : -(v))
179 #define SFB_ZONE_MAX 32 /* maximum elements in zone */
180 #define SFB_ZONE_NAME "classq_sfb" /* zone name */
182 /* Place the flow control entries in current bin on level 0 */
183 #define SFB_FC_LEVEL 0
185 static unsigned int sfb_size
; /* size of zone element */
186 static struct zone
*sfb_zone
; /* zone for sfb */
188 /* internal function prototypes */
189 static u_int32_t
sfb_random(struct sfb
*);
190 static struct mbuf
*sfb_getq_flow(struct sfb
*, class_queue_t
*, u_int32_t
,
192 static void sfb_resetq(struct sfb
*, cqev_t
);
193 static void sfb_calc_holdtime(struct sfb
*, u_int64_t
);
194 static void sfb_calc_pboxtime(struct sfb
*, u_int64_t
);
195 static void sfb_calc_hinterval(struct sfb
*, u_int64_t
*);
196 static void sfb_swap_bins(struct sfb
*, u_int32_t
);
197 static inline int sfb_pcheck(struct sfb
*, struct pf_mtag
*);
198 static int sfb_penalize(struct sfb
*, struct pf_mtag
*, struct timespec
*);
199 static void sfb_adjust_bin(struct sfb
*, struct sfbbinstats
*,
200 struct timespec
*, struct timespec
*, boolean_t
);
201 static void sfb_decrement_bin(struct sfb
*, struct sfbbinstats
*,
202 struct timespec
*, struct timespec
*);
203 static void sfb_increment_bin(struct sfb
*, struct sfbbinstats
*,
204 struct timespec
*, struct timespec
*);
205 static inline void sfb_dq_update_bins(struct sfb
*, struct pf_mtag
*,
207 static inline void sfb_eq_update_bins(struct sfb
*, struct pf_mtag
*);
208 static int sfb_drop_early(struct sfb
*, struct pf_mtag
*, u_int16_t
*,
210 static boolean_t
sfb_bin_addfcentry(struct sfb
*, struct pf_mtag
*);
211 static void sfb_fclist_append(struct sfb
*, struct sfb_fc_list
*);
212 static void sfb_fclists_clean(struct sfb
*sp
);
214 SYSCTL_NODE(_net_classq
, OID_AUTO
, sfb
, CTLFLAG_RW
|CTLFLAG_LOCKED
, 0, "SFB");
216 static u_int64_t sfb_holdtime
= 0; /* 0 indicates "automatic" */
217 SYSCTL_QUAD(_net_classq_sfb
, OID_AUTO
, holdtime
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
218 &sfb_holdtime
, "SFB freeze time in nanoseconds");
220 static u_int64_t sfb_pboxtime
= 0; /* 0 indicates "automatic" */
221 SYSCTL_QUAD(_net_classq_sfb
, OID_AUTO
, pboxtime
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
222 &sfb_pboxtime
, "SFB penalty box time in nanoseconds");
224 static u_int64_t sfb_hinterval
;
225 SYSCTL_QUAD(_net_classq_sfb
, OID_AUTO
, hinterval
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
226 &sfb_hinterval
, "SFB hash interval in nanoseconds");
228 static u_int32_t sfb_increment
= SFB_INCREMENT
;
229 SYSCTL_UINT(_net_classq_sfb
, OID_AUTO
, increment
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
230 &sfb_increment
, SFB_INCREMENT
, "SFB increment [d1]");
232 static u_int32_t sfb_decrement
= SFB_DECREMENT
;
233 SYSCTL_UINT(_net_classq_sfb
, OID_AUTO
, decrement
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
234 &sfb_decrement
, SFB_DECREMENT
, "SFB decrement [d2]");
236 static u_int32_t sfb_allocation
= 0; /* 0 means "automatic" */
237 SYSCTL_UINT(_net_classq_sfb
, OID_AUTO
, allocation
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
238 &sfb_allocation
, 0, "SFB bin allocation");
240 static u_int32_t sfb_ratelimit
= 0;
241 SYSCTL_UINT(_net_classq_sfb
, OID_AUTO
, ratelimit
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
242 &sfb_ratelimit
, 0, "SFB rate limit");
244 #define MBPS (1ULL * 1000 * 1000)
245 #define GBPS (MBPS * 1000)
247 struct sfb_time_tbl
{
248 u_int64_t speed
; /* uplink speed */
249 u_int64_t holdtime
; /* hold time */
250 u_int64_t pboxtime
; /* penalty box time */
253 static struct sfb_time_tbl sfb_ttbl
[] = {
254 { 1 * MBPS
, HOLDTIME_BASE
* 1000, PBOXTIME_BASE
* 1000 },
255 { 10 * MBPS
, HOLDTIME_BASE
* 100, PBOXTIME_BASE
* 100 },
256 { 100 * MBPS
, HOLDTIME_BASE
* 10, PBOXTIME_BASE
* 10 },
257 { 1 * GBPS
, HOLDTIME_BASE
, PBOXTIME_BASE
},
258 { 10 * GBPS
, HOLDTIME_BASE
/ 10, PBOXTIME_BASE
/ 10 },
259 { 100 * GBPS
, HOLDTIME_BASE
/ 100, PBOXTIME_BASE
/ 100 },
266 _CASSERT(SFBF_ECN4
== CLASSQF_ECN4
);
267 _CASSERT(SFBF_ECN6
== CLASSQF_ECN6
);
269 sfb_size
= sizeof (struct sfb
);
270 sfb_zone
= zinit(sfb_size
, SFB_ZONE_MAX
* sfb_size
,
272 if (sfb_zone
== NULL
) {
273 panic("%s: failed allocating %s", __func__
, SFB_ZONE_NAME
);
276 zone_change(sfb_zone
, Z_EXPAND
, TRUE
);
277 zone_change(sfb_zone
, Z_CALLERACCT
, TRUE
);
281 sfb_random(struct sfb
*sp
)
283 IFCQ_CONVERT_LOCK(&sp
->sfb_ifp
->if_snd
);
288 sfb_calc_holdtime(struct sfb
*sp
, u_int64_t outbw
)
292 if (sfb_holdtime
!= 0) {
293 holdtime
= sfb_holdtime
;
294 } else if (outbw
== 0) {
295 holdtime
= SFB_RANDOM(sp
, HOLDTIME_MIN
, HOLDTIME_MAX
);
299 n
= sfb_ttbl
[0].holdtime
;
300 for (i
= 0; sfb_ttbl
[i
].speed
!= 0; i
++) {
301 if (outbw
< sfb_ttbl
[i
].speed
)
303 n
= sfb_ttbl
[i
].holdtime
;
307 net_nsectimer(&holdtime
, &sp
->sfb_holdtime
);
311 sfb_calc_pboxtime(struct sfb
*sp
, u_int64_t outbw
)
315 if (sfb_pboxtime
!= 0) {
316 pboxtime
= sfb_pboxtime
;
317 } else if (outbw
== 0) {
318 pboxtime
= SFB_RANDOM(sp
, PBOXTIME_MIN
, PBOXTIME_MAX
);
322 n
= sfb_ttbl
[0].pboxtime
;
323 for (i
= 0; sfb_ttbl
[i
].speed
!= 0; i
++) {
324 if (outbw
< sfb_ttbl
[i
].speed
)
326 n
= sfb_ttbl
[i
].pboxtime
;
330 net_nsectimer(&pboxtime
, &sp
->sfb_pboxtime
);
331 net_timerclear(&sp
->sfb_pboxfreeze
);
335 sfb_calc_hinterval(struct sfb
*sp
, u_int64_t
*t
)
342 * TODO adi@apple.com: use dq_avg to derive hinterval.
347 if (sfb_hinterval
!= 0)
348 hinterval
= sfb_hinterval
;
349 else if (t
== NULL
|| hinterval
== 0)
350 hinterval
= ((u_int64_t
)SFB_HINTERVAL(sp
) * NSEC_PER_SEC
);
352 net_nsectimer(&hinterval
, &sp
->sfb_hinterval
);
355 net_timeradd(&now
, &sp
->sfb_hinterval
, &sp
->sfb_nextreset
);
359 * sfb support routines
362 sfb_alloc(struct ifnet
*ifp
, u_int32_t qid
, u_int32_t qlim
, u_int32_t flags
)
366 VERIFY(ifp
!= NULL
&& qlim
> 0);
368 sp
= zalloc(sfb_zone
);
370 log(LOG_ERR
, "%s: SFB unable to allocate\n", if_name(ifp
));
375 if ((sp
->sfb_bins
= _MALLOC(sizeof (*sp
->sfb_bins
), M_DEVBUF
,
376 M_WAITOK
|M_ZERO
)) == NULL
) {
377 log(LOG_ERR
, "%s: SFB unable to allocate bins\n", if_name(ifp
));
382 if ((sp
->sfb_fc_lists
= _MALLOC(sizeof (*sp
->sfb_fc_lists
), M_DEVBUF
,
383 M_WAITOK
|M_ZERO
)) == NULL
) {
384 log(LOG_ERR
, "%s: SFB unable to allocate flow control lists\n",
390 sp
->sfb_flags
= (flags
& SFBF_USERFLAGS
);
401 sfb_fclist_append(struct sfb
*sp
, struct sfb_fc_list
*fcl
)
403 IFCQ_CONVERT_LOCK(&sp
->sfb_ifp
->if_snd
);
404 ifnet_fclist_append(sp
, fcl
);
408 sfb_fclists_clean(struct sfb
*sp
)
412 /* Move all the flow control entries to the ifnet list */
413 for (i
= 0; i
< SFB_BINS
; ++i
) {
414 struct sfb_fc_list
*fcl
= SFB_FC_LIST(sp
, i
);
415 if (!SLIST_EMPTY(fcl
))
416 sfb_fclist_append(sp
, fcl
);
421 sfb_destroy(struct sfb
*sp
)
423 sfb_fclists_clean(sp
);
424 if (sp
->sfb_bins
!= NULL
) {
425 _FREE(sp
->sfb_bins
, M_DEVBUF
);
428 if (sp
->sfb_fc_lists
!= NULL
) {
429 _FREE(sp
->sfb_fc_lists
, M_DEVBUF
);
430 sp
->sfb_fc_lists
= NULL
;
436 sfb_resetq(struct sfb
*sp
, cqev_t ev
)
438 struct ifnet
*ifp
= sp
->sfb_ifp
;
443 if (ev
!= CLASSQ_EV_LINK_DOWN
) {
444 (*sp
->sfb_bins
)[0].fudge
= sfb_random(sp
);
445 (*sp
->sfb_bins
)[1].fudge
= sfb_random(sp
);
446 sp
->sfb_allocation
= ((sfb_allocation
== 0) ?
447 (sp
->sfb_qlim
/ 3) : sfb_allocation
);
448 sp
->sfb_drop_thresh
= sp
->sfb_allocation
+
449 (sp
->sfb_allocation
>> 1);
452 sp
->sfb_clearpkts
= 0;
455 eff_rate
= ifnet_output_linkrate(ifp
);
456 sp
->sfb_eff_rate
= eff_rate
;
458 sfb_calc_holdtime(sp
, eff_rate
);
459 sfb_calc_pboxtime(sp
, eff_rate
);
460 sfb_calc_hinterval(sp
, NULL
);
462 if (ev
== CLASSQ_EV_LINK_DOWN
||
463 ev
== CLASSQ_EV_LINK_UP
)
464 sfb_fclists_clean(sp
);
466 bzero(sp
->sfb_bins
, sizeof (*sp
->sfb_bins
));
467 bzero(&sp
->sfb_stats
, sizeof (sp
->sfb_stats
));
469 if (ev
== CLASSQ_EV_LINK_DOWN
|| !classq_verbose
)
472 log(LOG_DEBUG
, "%s: SFB qid=%d, holdtime=%llu nsec, "
473 "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
474 "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps\n",
475 if_name(ifp
), sp
->sfb_qid
, (u_int64_t
)sp
->sfb_holdtime
.tv_nsec
,
476 (u_int64_t
)sp
->sfb_pboxtime
.tv_nsec
,
477 (u_int32_t
)sp
->sfb_allocation
, (u_int32_t
)sp
->sfb_drop_thresh
,
478 (int)sp
->sfb_hinterval
.tv_sec
, (int)sizeof (*sp
->sfb_bins
),
483 sfb_getstats(struct sfb
*sp
, struct sfb_stats
*sps
)
485 sps
->allocation
= sp
->sfb_allocation
;
486 sps
->dropthresh
= sp
->sfb_drop_thresh
;
487 sps
->clearpkts
= sp
->sfb_clearpkts
;
488 sps
->current
= sp
->sfb_current
;
490 net_timernsec(&sp
->sfb_holdtime
, &sp
->sfb_stats
.hold_time
);
491 net_timernsec(&sp
->sfb_pboxtime
, &sp
->sfb_stats
.pbox_time
);
492 net_timernsec(&sp
->sfb_hinterval
, &sp
->sfb_stats
.rehash_intval
);
493 *(&(sps
->sfbstats
)) = *(&(sp
->sfb_stats
));
495 _CASSERT(sizeof ((*sp
->sfb_bins
)[0].stats
) ==
496 sizeof (sps
->binstats
[0].stats
));
498 bcopy(&(*sp
->sfb_bins
)[0].stats
, &sps
->binstats
[0].stats
,
499 sizeof (sps
->binstats
[0].stats
));
500 bcopy(&(*sp
->sfb_bins
)[1].stats
, &sps
->binstats
[1].stats
,
501 sizeof (sps
->binstats
[1].stats
));
505 sfb_swap_bins(struct sfb
*sp
, u_int32_t len
)
509 if (sp
->sfb_flags
& SFBF_SUSPENDED
)
513 VERIFY((s
+ (s
^ 1)) == 1);
515 (*sp
->sfb_bins
)[s
].fudge
= sfb_random(sp
); /* recompute perturbation */
516 sp
->sfb_clearpkts
= len
;
517 sp
->sfb_stats
.num_rehash
++;
519 s
= (sp
->sfb_current
^= 1); /* flip the bit (swap current) */
521 if (classq_verbose
) {
522 log(LOG_DEBUG
, "%s: SFB qid=%d, set %d is now current, "
523 "qlen=%d\n", if_name(sp
->sfb_ifp
), sp
->sfb_qid
, s
, len
);
526 /* clear freezetime for all current bins */
527 bzero(&(*sp
->sfb_bins
)[s
].freezetime
,
528 sizeof ((*sp
->sfb_bins
)[s
].freezetime
));
530 /* clear/adjust bin statistics and flow control lists */
531 for (i
= 0; i
< SFB_BINS
; i
++) {
532 struct sfb_fc_list
*fcl
= SFB_FC_LIST(sp
, i
);
534 if (!SLIST_EMPTY(fcl
))
535 sfb_fclist_append(sp
, fcl
);
537 for (j
= 0; j
< SFB_LEVELS
; j
++) {
538 struct sfbbinstats
*cbin
, *wbin
;
540 cbin
= SFB_BINST(sp
, j
, i
, s
); /* current */
541 wbin
= SFB_BINST(sp
, j
, i
, s
^ 1); /* warm-up */
544 if (cbin
->pmark
> SFB_MAX_PMARK
)
545 cbin
->pmark
= SFB_MAX_PMARK
;
550 * Keep pmark from before to identify
551 * non-responsives immediately.
553 if (wbin
->pmark
> SFB_PMARK_WARM
)
554 wbin
->pmark
= SFB_PMARK_WARM
;
560 sfb_pcheck(struct sfb
*sp
, struct pf_mtag
*t
)
564 #endif /* SFB_LEVELS != 2 */
568 VERIFY((s
+ (s
^ 1)) == 1);
571 * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
572 * 0 otherwise; optimize for SFB_LEVELS=2.
576 * Level 0: bin index at [0] for set 0; [2] for set 1
577 * Level 1: bin index at [1] for set 0; [3] for set 1
579 if (SFB_BINST(sp
, 0, SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1)]),
580 s
)->pmark
< SFB_PMARK_TH
||
581 SFB_BINST(sp
, 1, SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1) + 1]),
582 s
)->pmark
< SFB_PMARK_TH
)
584 #else /* SFB_LEVELS != 2 */
585 for (i
= 0; i
< SFB_LEVELS
; i
++) {
586 if (s
== 0) /* set 0, bin index [0,1] */
587 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
]);
588 else /* set 1, bin index [2,3] */
589 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
+ 2]);
591 if (SFB_BINST(sp
, i
, n
, s
)->pmark
< SFB_PMARK_TH
)
594 #endif /* SFB_LEVELS != 2 */
599 sfb_penalize(struct sfb
*sp
, struct pf_mtag
*t
, struct timespec
*now
)
601 struct timespec delta
= { 0, 0 };
603 /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
604 if (!sfb_ratelimit
|| !sfb_pcheck(sp
, t
))
607 net_timersub(now
, &sp
->sfb_pboxfreeze
, &delta
);
608 if (net_timercmp(&delta
, &sp
->sfb_pboxtime
, <)) {
611 #endif /* SFB_LEVELS != 2 */
612 struct sfbbinstats
*bin
;
615 w
= sp
->sfb_current
^ 1;
616 VERIFY((w
+ (w
^ 1)) == 1);
619 * Update warm-up bins; optimize for SFB_LEVELS=2
622 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
623 n
= SFB_BINMASK(t
->pftag_qpriv8
[(w
<< 1)]);
624 bin
= SFB_BINST(sp
, 0, n
, w
);
625 if (bin
->pkts
>= sp
->sfb_allocation
)
626 sfb_increment_bin(sp
, bin
, SFB_BINFT(sp
, 0, n
, w
), now
);
628 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
629 n
= SFB_BINMASK(t
->pftag_qpriv8
[(w
<< 1) + 1]);
630 bin
= SFB_BINST(sp
, 1, n
, w
);
631 if (bin
->pkts
>= sp
->sfb_allocation
)
632 sfb_increment_bin(sp
, bin
, SFB_BINFT(sp
, 1, n
, w
), now
);
633 #else /* SFB_LEVELS != 2 */
634 for (i
= 0; i
< SFB_LEVELS
; i
++) {
635 if (w
== 0) /* set 0, bin index [0,1] */
636 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
]);
637 else /* set 1, bin index [2,3] */
638 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
+ 2]);
640 bin
= SFB_BINST(sp
, i
, n
, w
);
641 if (bin
->pkts
>= sp
->sfb_allocation
) {
642 sfb_increment_bin(sp
, bin
,
643 SFB_BINFT(sp
, i
, n
, w
), now
);
646 #endif /* SFB_LEVELS != 2 */
650 /* non-conformant or else misclassified flow; queue it anyway */
651 t
->pftag_flags
|= SFB_PKT_PBOX
;
652 *(&sp
->sfb_pboxfreeze
) = *now
;
658 sfb_adjust_bin(struct sfb
*sp
, struct sfbbinstats
*bin
, struct timespec
*ft
,
659 struct timespec
*now
, boolean_t inc
)
661 struct timespec delta
;
663 net_timersub(now
, ft
, &delta
);
664 if (net_timercmp(&delta
, &sp
->sfb_holdtime
, <)) {
665 if (classq_verbose
> 1) {
666 log(LOG_DEBUG
, "%s: SFB qid=%d, %s update frozen "
667 "(delta=%llu nsec)\n", if_name(sp
->sfb_ifp
),
668 sp
->sfb_qid
, inc
? "increment" : "decrement",
669 (u_int64_t
)delta
.tv_nsec
);
674 /* increment/decrement marking probability */
683 sfb_decrement_bin(struct sfb
*sp
, struct sfbbinstats
*bin
, struct timespec
*ft
,
684 struct timespec
*now
)
686 return (sfb_adjust_bin(sp
, bin
, ft
, now
, FALSE
));
690 sfb_increment_bin(struct sfb
*sp
, struct sfbbinstats
*bin
, struct timespec
*ft
,
691 struct timespec
*now
)
693 return (sfb_adjust_bin(sp
, bin
, ft
, now
, TRUE
));
697 sfb_dq_update_bins(struct sfb
*sp
, struct pf_mtag
*t
, struct timespec
*now
)
699 #if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
701 #endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
702 struct sfbbinstats
*bin
;
704 struct sfb_fc_list
*fcl
= NULL
;
707 VERIFY((s
+ (s
^ 1)) == 1);
710 * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
712 #if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
713 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
714 n
= SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1)]);
715 bin
= SFB_BINST(sp
, 0, n
, s
);
717 VERIFY(bin
->pkts
> 0);
718 if (--bin
->pkts
== 0) {
719 sfb_decrement_bin(sp
, bin
, SFB_BINFT(sp
, 0, n
, s
), now
);
721 if (bin
->pkts
<= (sp
->sfb_allocation
>> 2)) {
722 /* deliver flow control feedback to the sockets */
723 fcl
= SFB_FC_LIST(sp
, n
);
724 if (!SLIST_EMPTY(fcl
))
725 sfb_fclist_append(sp
, fcl
);
728 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
729 n
= SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1) + 1]);
730 bin
= SFB_BINST(sp
, 1, n
, s
);
732 VERIFY(bin
->pkts
> 0);
733 if (--bin
->pkts
== 0)
734 sfb_decrement_bin(sp
, bin
, SFB_BINFT(sp
, 1, n
, s
), now
);
735 #else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
736 for (i
= 0; i
< SFB_LEVELS
; i
++) {
737 if (s
== 0) /* set 0, bin index [0,1] */
738 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
]);
739 else /* set 1, bin index [2,3] */
740 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
+ 2]);
742 bin
= SFB_BINST(sp
, i
, n
, s
);
744 VERIFY(bin
->pkts
> 0);
745 if (--bin
->pkts
== 0) {
746 sfb_decrement_bin(sp
, bin
,
747 SFB_BINFT(sp
, i
, n
, s
), now
);
749 if (bin
->pkts
<= (sp
->sfb_allocation
>> 2)) {
750 /* deliver flow control feedback to the sockets */
751 if (i
== SFB_FC_LEVEL
) {
752 fcl
= SFB_FC_LIST(sp
, n
);
753 if (!SLIST_EMPTY(fcl
))
754 sfb_fclist_append(sp
, fcl
);
758 #endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
762 sfb_eq_update_bins(struct sfb
*sp
, struct pf_mtag
*t
)
766 #endif /* SFB_LEVELS != 2 */
770 VERIFY((s
+ (s
^ 1)) == 1);
773 * Update current bins; optimize for SFB_LEVELS=2
776 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
777 SFB_BINST(sp
, 0, SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1)]), s
)->pkts
++;
779 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
780 SFB_BINST(sp
, 1, SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1) + 1]), s
)->pkts
++;
781 #else /* SFB_LEVELS != 2 */
782 for (i
= 0; i
< SFB_LEVELS
; i
++) {
783 if (s
== 0) /* set 0, bin index [0,1] */
784 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
]);
785 else /* set 1, bin index [2,3] */
786 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
+ 2]);
788 SFB_BINST(sp
, i
, n
, s
)->pkts
++;
790 #endif /* SFB_LEVELS != 2 */
794 sfb_bin_addfcentry(struct sfb
*sp
, struct pf_mtag
*t
)
796 struct sfb_bin_fcentry
*fce
;
798 struct sfb_fc_list
*fcl
;
802 VERIFY((s
+ (s
^ 1)) == 1);
804 flowhash
= t
->pftag_flowhash
;
807 sp
->sfb_stats
.null_flowhash
++;
812 * Use value at index 0 for set 0 and
813 * value at index 2 for set 1
815 fcl
= SFB_FC_LIST(sp
, SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1)]));
816 SLIST_FOREACH(fce
, fcl
, fce_link
) {
817 if (fce
->fce_flowhash
== flowhash
) {
818 /* Already on flow control list; just return */
823 IFCQ_CONVERT_LOCK(&sp
->sfb_ifp
->if_snd
);
824 fce
= ifnet_fce_alloc(M_WAITOK
);
826 fce
->fce_flowhash
= flowhash
;
827 SLIST_INSERT_HEAD(fcl
, fce
, fce_link
);
828 sp
->sfb_stats
.flow_controlled
++;
831 return (fce
!= NULL
);
835 * early-drop probability is kept in pmark of each bin of the flow
838 sfb_drop_early(struct sfb
*sp
, struct pf_mtag
*t
, u_int16_t
*pmin
,
839 struct timespec
*now
)
843 #endif /* SFB_LEVELS != 2 */
844 struct sfbbinstats
*bin
;
848 VERIFY((s
+ (s
^ 1)) == 1);
850 *pmin
= (u_int16_t
)-1;
853 * Update current bins; optimize for SFB_LEVELS=2
856 /* Level 0: bin index at [0] for set 0; [2] for set 1 */
857 n
= SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1)]);
858 bin
= SFB_BINST(sp
, 0, n
, s
);
859 if (*pmin
> (u_int16_t
)bin
->pmark
)
860 *pmin
= (u_int16_t
)bin
->pmark
;
862 if (bin
->pkts
>= sp
->sfb_allocation
) {
863 if (bin
->pkts
>= sp
->sfb_drop_thresh
)
864 ret
= 1; /* drop or mark */
865 sfb_increment_bin(sp
, bin
, SFB_BINFT(sp
, 0, n
, s
), now
);
868 /* Level 1: bin index at [1] for set 0; [3] for set 1 */
869 n
= SFB_BINMASK(t
->pftag_qpriv8
[(s
<< 1) + 1]);
870 bin
= SFB_BINST(sp
, 1, n
, s
);
871 if (*pmin
> (u_int16_t
)bin
->pmark
)
872 *pmin
= (u_int16_t
)bin
->pmark
;
874 if (bin
->pkts
>= sp
->sfb_allocation
) {
875 if (bin
->pkts
>= sp
->sfb_drop_thresh
)
876 ret
= 1; /* drop or mark */
877 sfb_increment_bin(sp
, bin
, SFB_BINFT(sp
, 1, n
, s
), now
);
879 #else /* SFB_LEVELS != 2 */
880 for (i
= 0; i
< SFB_LEVELS
; i
++) {
881 if (s
== 0) /* set 0, bin index [0,1] */
882 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
]);
883 else /* set 1, bin index [2,3] */
884 n
= SFB_BINMASK(t
->pftag_qpriv8
[i
+ 2]);
886 bin
= SFB_BINST(sp
, i
, n
, s
);
887 if (*pmin
> (u_int16_t
)bin
->pmark
)
888 *pmin
= (u_int16_t
)bin
->pmark
;
890 if (bin
->pkts
>= sp
->sfb_allocation
) {
891 if (bin
->pkts
>= sp
->sfb_drop_thresh
)
892 ret
= 1; /* drop or mark */
893 sfb_increment_bin(sp
, bin
,
894 SFB_BINFT(sp
, i
, n
, s
), now
);
897 #endif /* SFB_LEVELS != 2 */
899 if (sp
->sfb_flags
& SFBF_SUSPENDED
)
900 ret
= 1; /* drop or mark */
905 #define DTYPE_NODROP 0 /* no drop */
906 #define DTYPE_FORCED 1 /* a "forced" drop */
907 #define DTYPE_EARLY 2 /* an "unforced" (early) drop */
910 sfb_addq(struct sfb
*sp
, class_queue_t
*q
, struct mbuf
*m
, struct pf_mtag
*t
)
916 int ret
= CLASSQEQ_SUCCESS
;
921 VERIFY((s
+ (s
^ 1)) == 1);
923 /* time to swap the bins? */
924 if (net_timercmp(&now
, &sp
->sfb_nextreset
, >=)) {
925 net_timeradd(&now
, &sp
->sfb_hinterval
, &sp
->sfb_nextreset
);
926 sfb_swap_bins(sp
, qlen(q
));
928 VERIFY((s
+ (s
^ 1)) == 1);
931 t
->pftag_flags
&= ~SFB_PKT_PBOX
;
932 t
->pftag_qpriv16
[s
] =
933 (SFB_HASH(&t
->pftag_flowhash
, sizeof (t
->pftag_flowhash
),
934 (*sp
->sfb_bins
)[s
].fudge
) & SFB_HASHMASK
);
935 t
->pftag_qpriv16
[s
^ 1] =
936 (SFB_HASH(&t
->pftag_flowhash
, sizeof (t
->pftag_flowhash
),
937 (*sp
->sfb_bins
)[s
^ 1].fudge
) & SFB_HASHMASK
);
939 /* see if we drop early */
940 droptype
= DTYPE_NODROP
;
941 if (sfb_drop_early(sp
, t
, &pmin
, &now
)) {
942 /* flow control, mark or drop by sfb */
943 if ((sp
->sfb_flags
& SFBF_FLOWCTL
) &&
944 (t
->pftag_flags
& PF_TAG_FLOWADV
)) {
946 /* drop all during suspension or for non-TCP */
947 if ((sp
->sfb_flags
& SFBF_SUSPENDED
) ||
948 !(t
->pftag_flags
& PF_TAG_TCP
)) {
949 droptype
= DTYPE_EARLY
;
950 sp
->sfb_stats
.drop_early
++;
952 } else if ((sp
->sfb_flags
& SFBF_ECN
) &&
953 (t
->pftag_flags
& PF_TAG_TCP
) && /* only for TCP */
954 ((sfb_random(sp
) & SFB_MAX_PMARK
) <= pmin
) &&
955 mark_ecn(m
, t
, sp
->sfb_flags
) &&
956 !(sp
->sfb_flags
& SFBF_SUSPENDED
)) {
957 /* successfully marked; do not drop. */
958 sp
->sfb_stats
.marked_packets
++;
960 /* unforced drop by sfb */
961 droptype
= DTYPE_EARLY
;
962 sp
->sfb_stats
.drop_early
++;
966 /* non-responsive flow penalty? */
967 if (droptype
== DTYPE_NODROP
&& sfb_penalize(sp
, t
, &now
)) {
968 droptype
= DTYPE_FORCED
;
969 sp
->sfb_stats
.drop_pbox
++;
972 /* if the queue length hits the hard limit, it's a forced drop */
973 if (droptype
== DTYPE_NODROP
&& qlen(q
) >= qlimit(q
)) {
974 droptype
= DTYPE_FORCED
;
975 sp
->sfb_stats
.drop_queue
++;
978 if (fc_adv
== 1 && droptype
!= DTYPE_FORCED
&&
979 sfb_bin_addfcentry(sp
, t
)) {
980 /* deliver flow control advisory error */
981 if (droptype
== DTYPE_NODROP
) {
982 ret
= CLASSQEQ_SUCCESS_FC
;
983 VERIFY(!(sp
->sfb_flags
& SFBF_SUSPENDED
));
984 } else if (sp
->sfb_flags
& SFBF_SUSPENDED
) {
985 /* dropped due to suspension */
986 ret
= CLASSQEQ_DROPPED_SP
;
988 /* dropped due to flow-control */
989 ret
= CLASSQEQ_DROPPED_FC
;
993 /* if successful enqueue this packet, else drop it */
994 if (droptype
== DTYPE_NODROP
) {
997 IFCQ_CONVERT_LOCK(&sp
->sfb_ifp
->if_snd
);
999 return ((ret
!= CLASSQEQ_SUCCESS
) ? ret
: CLASSQEQ_DROPPED
);
1002 if (!(t
->pftag_flags
& SFB_PKT_PBOX
))
1003 sfb_eq_update_bins(sp
, t
);
1005 sp
->sfb_stats
.pbox_packets
++;
1007 /* successfully queued */
1011 static struct mbuf
*
1012 sfb_getq_flow(struct sfb
*sp
, class_queue_t
*q
, u_int32_t flow
, boolean_t purge
)
1014 struct timespec now
;
1018 if (!purge
&& (sp
->sfb_flags
& SFBF_SUSPENDED
))
1023 /* flow of 0 means head of queue */
1024 if ((m
= ((flow
== 0) ? _getq(q
) : _getq_flow(q
, flow
))) == NULL
) {
1026 net_timerclear(&sp
->sfb_getqtime
);
1030 VERIFY(m
->m_flags
& M_PKTHDR
);
1035 /* calculate EWMA of dequeues */
1036 if (net_timerisset(&sp
->sfb_getqtime
)) {
1037 struct timespec delta
;
1040 net_timersub(&now
, &sp
->sfb_getqtime
, &delta
);
1041 net_timernsec(&delta
, &new);
1042 avg
= sp
->sfb_stats
.dequeue_avg
;
1044 int decay
= DEQUEUE_DECAY
;
1046 * If the time since last dequeue is
1047 * significantly greater than the current
1048 * average, weight the average more against
1051 if (DEQUEUE_SPIKE(new, avg
))
1053 avg
= (((avg
<< decay
) - avg
) + new) >> decay
;
1057 sp
->sfb_stats
.dequeue_avg
= avg
;
1059 *(&sp
->sfb_getqtime
) = *(&now
);
1063 * Clearpkts are the ones which were in the queue when the hash
1064 * function was perturbed. Since the perturbation value (fudge),
1065 * and thus bin information for these packets is not known, we do
1066 * not change accounting information while dequeuing these packets.
1067 * It is important not to set the hash interval too small due to
1068 * this reason. A rule of thumb is to set it to K*D, where D is
1069 * the time taken to drain queue.
1071 if (t
->pftag_flags
& SFB_PKT_PBOX
) {
1072 t
->pftag_flags
&= ~SFB_PKT_PBOX
;
1073 if (sp
->sfb_clearpkts
> 0)
1074 sp
->sfb_clearpkts
--;
1075 } else if (sp
->sfb_clearpkts
> 0) {
1076 sp
->sfb_clearpkts
--;
1078 sfb_dq_update_bins(sp
, t
, &now
);
1085 sfb_getq(struct sfb
*sp
, class_queue_t
*q
)
1087 return (sfb_getq_flow(sp
, q
, 0, FALSE
));
1091 sfb_purgeq(struct sfb
*sp
, class_queue_t
*q
, u_int32_t flow
, u_int32_t
*packets
,
1094 u_int32_t cnt
= 0, len
= 0;
1097 IFCQ_CONVERT_LOCK(&sp
->sfb_ifp
->if_snd
);
1099 while ((m
= sfb_getq_flow(sp
, q
, flow
, TRUE
)) != NULL
) {
1105 if (packets
!= NULL
)
1112 sfb_updateq(struct sfb
*sp
, cqev_t ev
)
1114 struct ifnet
*ifp
= sp
->sfb_ifp
;
1116 VERIFY(ifp
!= NULL
);
1119 case CLASSQ_EV_LINK_SPEED
: {
1120 u_int64_t eff_rate
= ifnet_output_linkrate(ifp
);
1122 /* update parameters only if rate has changed */
1123 if (eff_rate
== sp
->sfb_eff_rate
)
1126 if (classq_verbose
) {
1127 log(LOG_DEBUG
, "%s: SFB qid=%d, adapting to new "
1128 "eff_rate=%llu bps\n", if_name(ifp
), sp
->sfb_qid
,
1131 sfb_calc_holdtime(sp
, eff_rate
);
1132 sfb_calc_pboxtime(sp
, eff_rate
);
1136 case CLASSQ_EV_LINK_UP
:
1137 case CLASSQ_EV_LINK_DOWN
:
1138 if (classq_verbose
) {
1139 log(LOG_DEBUG
, "%s: SFB qid=%d, resetting due to "
1140 "link %s\n", if_name(ifp
), sp
->sfb_qid
,
1141 (ev
== CLASSQ_EV_LINK_UP
) ? "UP" : "DOWN");
1146 case CLASSQ_EV_LINK_MTU
:
1153 sfb_suspendq(struct sfb
*sp
, class_queue_t
*q
, boolean_t on
)
1156 struct ifnet
*ifp
= sp
->sfb_ifp
;
1158 VERIFY(ifp
!= NULL
);
1160 if ((on
&& (sp
->sfb_flags
& SFBF_SUSPENDED
)) ||
1161 (!on
&& !(sp
->sfb_flags
& SFBF_SUSPENDED
)))
1164 if (!(sp
->sfb_flags
& SFBF_FLOWCTL
)) {
1165 log(LOG_ERR
, "%s: SFB qid=%d, unable to %s queue since "
1166 "flow-control is not enabled", if_name(ifp
), sp
->sfb_qid
,
1167 (on
? "suspend" : "resume"));
1171 if (classq_verbose
) {
1172 log(LOG_DEBUG
, "%s: SFB qid=%d, setting state to %s",
1173 if_name(ifp
), sp
->sfb_qid
, (on
? "SUSPENDED" : "RUNNING"));
1177 sp
->sfb_flags
|= SFBF_SUSPENDED
;
1179 sp
->sfb_flags
&= ~SFBF_SUSPENDED
;
1180 sfb_swap_bins(sp
, qlen(q
));