]>
git.saurik.com Git - apple/xnu.git/blob - bsd/net/classq/classq_red.c
2 * Copyright (c) 2007-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 /* $OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $ */
30 /* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */
33 * Copyright (C) 1997-2003
34 * Sony Computer Science Laboratories Inc. All rights reserved.
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
45 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * Copyright (c) 1990-1994 Regents of the University of California.
60 * All rights reserved.
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 * 3. All advertising materials mentioning features or use of this software
71 * must display the following acknowledgement:
72 * This product includes software developed by the Computer Systems
73 * Engineering Group at Lawrence Berkeley Laboratory.
74 * 4. Neither the name of the University nor of the Laboratory may be used
75 * to endorse or promote products derived from this software without
76 * specific prior written permission.
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
91 #include <sys/cdefs.h>
95 #include <sys/param.h>
96 #include <sys/malloc.h>
98 #include <sys/socket.h>
99 #include <sys/systm.h>
100 #include <sys/syslog.h>
101 #include <sys/errno.h>
102 #include <sys/kauth.h>
103 #include <dev/random/randomdev.h>
104 #include <kern/zalloc.h>
108 #include <netinet/in.h>
109 #include <netinet/in_systm.h>
110 #include <netinet/ip.h>
112 #include <netinet/ip6.h>
115 #include <net/classq/classq_red.h>
116 #include <net/net_osdep.h>
119 * ALTQ/RED (Random Early Detection) implementation using 32-bit
120 * fixed-point calculation.
122 * written by kjc using the ns code as a reference.
123 * you can learn more about red and ns from Sally's home page at
124 * http://www-nrg.ee.lbl.gov/floyd/
126 * most of the red parameter values are fixed in this implementation
127 * to prevent fixed-point overflow/underflow.
128 * if you change the parameters, watch out for overflow/underflow!
130 * the parameters used are recommended values by Sally.
131 * the corresponding ns config looks:
133 * minthresh=5 maxthresh=15 queue-size=60
136 * bytes=false (can't be handled by 32-bit fixed-point)
137 * doubleq=false dqthresh=false
141 * alternative red parameters for a slow link.
143 * assume the queue length becomes from zero to L and keeps L, it takes
144 * N packets for q_avg to reach 63% of L.
145 * when q_weight is 0.002, N is about 500 packets.
146 * for a slow link like dial-up, 500 packets takes more than 1 minute!
147 * when q_weight is 0.008, N is about 127 packets.
148 * when q_weight is 0.016, N is about 63 packets.
149 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
150 * are allowed for 0.016.
151 * see Sally's paper for more details.
153 /* normal red parameters */
154 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
155 /* q_weight = 0.00195 */
157 /* red parameters for a slow link */
158 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
159 /* q_weight = 0.0078125 */
161 /* red parameters for a very slow link (e.g., dialup) */
162 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
163 /* q_weight = 0.015625 */
165 /* fixed-point uses 12-bit decimal places */
166 #define FP_SHIFT 12 /* fixed-point shift */
168 /* red parameters for drop probability */
169 #define INV_P_MAX 10 /* inverse of max drop probability */
170 #define TH_MIN 5 /* min threshold */
171 #define TH_MAX 15 /* max threshold */
173 #define RED_LIMIT 60 /* default max queue lenght */
175 #define RED_ZONE_MAX 32 /* maximum elements in zone */
176 #define RED_ZONE_NAME "classq_red" /* zone name */
178 static unsigned int red_size
; /* size of zone element */
179 static struct zone
*red_zone
; /* zone for red */
182 * our default policy for forced-drop is drop-tail.
183 * (in altq-1.1.2 or earlier, the default was random-drop.
184 * but it makes more sense to punish the cause of the surge.)
185 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
188 /* default red parameter values */
189 static int default_th_min
= TH_MIN
;
190 static int default_th_max
= TH_MAX
;
191 static int default_inv_pmax
= INV_P_MAX
;
193 static struct mbuf
*red_getq_flow(struct red
*, class_queue_t
*,
194 u_int32_t
, boolean_t
);
199 _CASSERT(REDF_ECN4
== CLASSQF_ECN4
);
200 _CASSERT(REDF_ECN6
== CLASSQF_ECN6
);
202 red_size
= sizeof (red_t
);
203 red_zone
= zinit(red_size
, RED_ZONE_MAX
* red_size
,
205 if (red_zone
== NULL
) {
206 panic("%s: failed allocating %s", __func__
, RED_ZONE_NAME
);
209 zone_change(red_zone
, Z_EXPAND
, TRUE
);
210 zone_change(red_zone
, Z_CALLERACCT
, TRUE
);
214 * red support routines
217 red_alloc(struct ifnet
*ifp
, int weight
, int inv_pmax
, int th_min
,
218 int th_max
, int flags
, int pkttime
)
226 rp
= zalloc(red_zone
);
235 rp
->red_weight
= W_WEIGHT
;
237 rp
->red_weight
= weight
;
239 rp
->red_inv_pmax
= default_inv_pmax
;
241 rp
->red_inv_pmax
= inv_pmax
;
243 rp
->red_thmin
= default_th_min
;
245 rp
->red_thmin
= th_min
;
247 rp
->red_thmax
= default_th_max
;
249 rp
->red_thmax
= th_max
;
252 rp
->red_flags
= (flags
& REDF_USERFLAGS
);
254 if (rp
->red_flags
& REDF_ECN
) {
255 rp
->red_flags
&= ~REDF_ECN
;
256 log(LOG_ERR
, "%s: RED ECN not available; ignoring "
257 "REDF_ECN flag!\n", if_name(ifp
));
262 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
263 rp
->red_pkttime
= 800;
265 rp
->red_pkttime
= pkttime
;
268 /* when the link is very slow, adjust red parameters */
269 npkts_per_sec
= 1000000 / rp
->red_pkttime
;
270 if (npkts_per_sec
< 50) {
271 /* up to about 400Kbps */
272 rp
->red_weight
= W_WEIGHT_2
;
273 } else if (npkts_per_sec
< 300) {
274 /* up to about 2.4Mbps */
275 rp
->red_weight
= W_WEIGHT_1
;
279 /* calculate wshift. weight must be power of 2 */
281 for (i
= 0; w
> 1; i
++)
284 w
= 1 << rp
->red_wshift
;
285 if (w
!= rp
->red_weight
) {
286 printf("invalid weight value %d for red! use %d\n",
292 * thmin_s and thmax_s are scaled versions of th_min and th_max
293 * to be compared with avg.
295 rp
->red_thmin_s
= rp
->red_thmin
<< (rp
->red_wshift
+ FP_SHIFT
);
296 rp
->red_thmax_s
= rp
->red_thmax
<< (rp
->red_wshift
+ FP_SHIFT
);
299 * precompute probability denominator
300 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
302 rp
->red_probd
= (2 * (rp
->red_thmax
- rp
->red_thmin
) *
303 rp
->red_inv_pmax
) << FP_SHIFT
;
305 /* allocate weight table */
306 rp
->red_wtab
= wtab_alloc(rp
->red_weight
);
307 if (rp
->red_wtab
== NULL
) {
312 microuptime(&rp
->red_last
);
317 red_destroy(red_t
*rp
)
319 if (rp
->red_wtab
!= NULL
) {
320 wtab_destroy(rp
->red_wtab
);
327 red_getstats(red_t
*rp
, struct red_stats
*sp
)
329 sp
->q_avg
= rp
->red_avg
>> rp
->red_wshift
;
330 sp
->drop_forced
= rp
->red_stats
.drop_forced
;
331 sp
->drop_unforced
= rp
->red_stats
.drop_unforced
;
332 sp
->marked_packets
= rp
->red_stats
.marked_packets
;
336 red_addq(red_t
*rp
, class_queue_t
*q
, struct mbuf
*m
, struct pf_mtag
*tag
)
347 * if we were idle, we pretend that n packets arrived during
356 t
= (now
.tv_sec
- rp
->red_last
.tv_sec
);
359 * being idle for more than 1 minute, set avg to zero.
360 * this prevents t from overflow.
364 t
= t
* 1000000 + (now
.tv_usec
- rp
->red_last
.tv_usec
);
365 n
= t
/ rp
->red_pkttime
- 1;
367 /* the following line does (avg = (1 - Wq)^n * avg) */
369 avg
= (avg
>> FP_SHIFT
) *
370 pow_w(rp
->red_wtab
, n
);
374 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
375 avg
+= (qlen(q
) << FP_SHIFT
) - (avg
>> rp
->red_wshift
);
376 rp
->red_avg
= avg
; /* save the new value */
379 * red_count keeps a tally of arriving traffic that has not
384 /* see if we drop early */
385 droptype
= DTYPE_NODROP
;
386 if (avg
>= rp
->red_thmin_s
&& qlen(q
) > 1) {
387 if (avg
>= rp
->red_thmax_s
) {
388 /* avg >= th_max: forced drop */
389 droptype
= DTYPE_FORCED
;
390 } else if (rp
->red_old
== 0) {
391 /* first exceeds th_min */
394 } else if (drop_early((avg
- rp
->red_thmin_s
) >> rp
->red_wshift
,
395 rp
->red_probd
, rp
->red_count
)) {
396 /* mark or drop by red */
398 if ((rp
->red_flags
& REDF_ECN
) &&
399 (tag
->pftag_proto
== IPPROTO_TCP
) && /* only TCP */
400 mark_ecn(m
, tag
, rp
->red_flags
)) {
401 /* successfully marked. do not drop. */
403 rp
->red_stats
.marked_packets
++;
407 /* unforced drop by red */
408 droptype
= DTYPE_EARLY
;
417 * if the queue length hits the hard limit, it's a forced drop.
419 if (droptype
== DTYPE_NODROP
&& qlen(q
) >= qlimit(q
))
420 droptype
= DTYPE_FORCED
;
422 #ifdef RED_RANDOM_DROP
423 /* if successful or forced drop, enqueue this packet. */
424 if (droptype
!= DTYPE_EARLY
)
427 /* if successful, enqueue this packet. */
428 if (droptype
== DTYPE_NODROP
)
431 if (droptype
!= DTYPE_NODROP
) {
432 if (droptype
== DTYPE_EARLY
) {
433 /* drop the incoming packet */
434 rp
->red_stats
.drop_unforced
++;
436 /* forced drop, select a victim packet in the queue. */
437 #ifdef RED_RANDOM_DROP
440 rp
->red_stats
.drop_forced
++;
443 IFCQ_CONVERT_LOCK(&rp
->red_ifp
->if_snd
);
445 return (CLASSQEQ_DROPPED
);
447 /* successfully queued */
448 return (CLASSQEQ_SUCCESS
);
452 * early-drop probability is calculated as follows:
453 * prob = p_max * (avg - th_min) / (th_max - th_min)
454 * prob_a = prob / (2 - count*prob)
455 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
456 * here prob_a increases as successive undrop count increases.
457 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
458 * becomes 1 when (count >= (2 / prob))).
461 drop_early(int fp_len
, int fp_probd
, int count
)
463 int d
; /* denominator of drop-probability */
465 d
= fp_probd
- count
* fp_len
;
467 /* count exceeds the hard limit: drop or mark */
471 * now the range of d is [1..600] in fixed-point. (when
472 * th_max-th_min=10 and p_max=1/30)
473 * drop probability = (avg - TH_MIN) / d
476 if ((RandomULong() % d
) < (unsigned)fp_len
) {
485 red_getq_flow(struct red
*rp
, class_queue_t
*q
, u_int32_t flow
, boolean_t purge
)
487 #pragma unused(purge)
490 /* flow of 0 means head of queue */
491 if ((m
= ((flow
== 0) ? _getq(q
) : _getq_flow(q
, flow
))) == NULL
) {
492 if (rp
->red_idle
== 0) {
494 microuptime(&rp
->red_last
);
504 red_getq(red_t
*rp
, class_queue_t
*q
)
506 return (red_getq_flow(rp
, q
, 0, FALSE
));
510 red_purgeq(struct red
*rp
, class_queue_t
*q
, u_int32_t flow
, u_int32_t
*packets
,
513 u_int32_t cnt
= 0, len
= 0;
516 IFCQ_CONVERT_LOCK(&rp
->red_ifp
->if_snd
);
518 while ((m
= red_getq_flow(rp
, q
, flow
, TRUE
)) != NULL
) {
531 red_updateq(red_t
*rp
, cqev_t ev
)
533 #pragma unused(rp, ev)
534 /* nothing for now */
538 red_suspendq(red_t
*rp
, class_queue_t
*q
, boolean_t on
)
540 #pragma unused(rp, q, on)
545 * helper routine to calibrate avg during idle.
546 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
547 * here Wq = 1/weight and the code assumes Wq is close to zero.
549 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
551 static struct wtab
*wtab_list
= NULL
; /* pointer to wtab list */
554 wtab_alloc(int weight
)
559 for (w
= wtab_list
; w
!= NULL
; w
= w
->w_next
)
560 if (w
->w_weight
== weight
) {
565 w
= _MALLOC(sizeof (struct wtab
), M_DEVBUF
, M_WAITOK
|M_ZERO
);
569 w
->w_weight
= weight
;
571 w
->w_next
= wtab_list
;
574 /* initialize the weight table */
575 w
->w_tab
[0] = ((weight
- 1) << FP_SHIFT
) / weight
;
576 for (i
= 1; i
< 32; i
++) {
577 w
->w_tab
[i
] = (w
->w_tab
[i
-1] * w
->w_tab
[i
-1]) >> FP_SHIFT
;
578 if (w
->w_tab
[i
] == 0 && w
->w_param_max
== 0)
579 w
->w_param_max
= 1 << i
;
586 wtab_destroy(struct wtab
*w
)
590 if (--w
->w_refcount
> 0)
594 wtab_list
= w
->w_next
;
595 else for (prev
= wtab_list
; prev
->w_next
!= NULL
; prev
= prev
->w_next
)
596 if (prev
->w_next
== w
) {
597 prev
->w_next
= w
->w_next
;
605 pow_w(struct wtab
*w
, int n
)
610 if (n
>= w
->w_param_max
)
621 val
= (val
* w
->w_tab
[i
]) >> FP_SHIFT
;
630 #endif /* CLASSQ_RED */