]> git.saurik.com Git - apple/xnu.git/blob - bsd/net/classq/classq_red.c
xnu-3789.70.16.tar.gz
[apple/xnu.git] / bsd / net / classq / classq_red.c
1 /*
2 * Copyright (c) 2007-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /* $OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $ */
30 /* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */
31
32 /*
33 * Copyright (C) 1997-2003
34 * Sony Computer Science Laboratories Inc. All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 */
58 /*
59 * Copyright (c) 1990-1994 Regents of the University of California.
60 * All rights reserved.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 * 3. All advertising materials mentioning features or use of this software
71 * must display the following acknowledgement:
72 * This product includes software developed by the Computer Systems
73 * Engineering Group at Lawrence Berkeley Laboratory.
74 * 4. Neither the name of the University nor of the Laboratory may be used
75 * to endorse or promote products derived from this software without
76 * specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 */
90
91 #include <sys/cdefs.h>
92
93 #if CLASSQ_RED
94
95 #include <sys/param.h>
96 #include <sys/malloc.h>
97 #include <sys/mbuf.h>
98 #include <sys/socket.h>
99 #include <sys/systm.h>
100 #include <sys/syslog.h>
101 #include <sys/errno.h>
102 #include <sys/kauth.h>
103 #include <dev/random/randomdev.h>
104 #include <kern/zalloc.h>
105
106 #include <net/if.h>
107
108 #include <netinet/in.h>
109 #include <netinet/in_systm.h>
110 #include <netinet/ip.h>
111 #if INET6
112 #include <netinet/ip6.h>
113 #endif
114
115 #include <net/classq/classq_red.h>
116 #include <net/net_osdep.h>
117
118 /*
119 * ALTQ/RED (Random Early Detection) implementation using 32-bit
120 * fixed-point calculation.
121 *
122 * written by kjc using the ns code as a reference.
123 * you can learn more about red and ns from Sally's home page at
124 * http://www-nrg.ee.lbl.gov/floyd/
125 *
126 * most of the red parameter values are fixed in this implementation
127 * to prevent fixed-point overflow/underflow.
128 * if you change the parameters, watch out for overflow/underflow!
129 *
130 * the parameters used are recommended values by Sally.
131 * the corresponding ns config looks:
132 * q_weight=0.00195
133 * minthresh=5 maxthresh=15 queue-size=60
134 * linterm=30
135 * dropmech=drop-tail
136 * bytes=false (can't be handled by 32-bit fixed-point)
137 * doubleq=false dqthresh=false
138 * wait=true
139 */
140 /*
141 * alternative red parameters for a slow link.
142 *
143 * assume the queue length becomes from zero to L and keeps L, it takes
144 * N packets for q_avg to reach 63% of L.
145 * when q_weight is 0.002, N is about 500 packets.
146 * for a slow link like dial-up, 500 packets takes more than 1 minute!
147 * when q_weight is 0.008, N is about 127 packets.
148 * when q_weight is 0.016, N is about 63 packets.
149 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
150 * are allowed for 0.016.
151 * see Sally's paper for more details.
152 */
153 /* normal red parameters */
154 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
155 /* q_weight = 0.00195 */
156
157 /* red parameters for a slow link */
158 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
159 /* q_weight = 0.0078125 */
160
161 /* red parameters for a very slow link (e.g., dialup) */
162 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
163 /* q_weight = 0.015625 */
164
165 /* fixed-point uses 12-bit decimal places */
166 #define FP_SHIFT 12 /* fixed-point shift */
167
168 /* red parameters for drop probability */
169 #define INV_P_MAX 10 /* inverse of max drop probability */
170 #define TH_MIN 5 /* min threshold */
171 #define TH_MAX 15 /* max threshold */
172
173 #define RED_LIMIT 60 /* default max queue lenght */
174
175 #define RED_ZONE_MAX 32 /* maximum elements in zone */
176 #define RED_ZONE_NAME "classq_red" /* zone name */
177
178 static unsigned int red_size; /* size of zone element */
179 static struct zone *red_zone; /* zone for red */
180
181 /*
182 * our default policy for forced-drop is drop-tail.
183 * (in altq-1.1.2 or earlier, the default was random-drop.
184 * but it makes more sense to punish the cause of the surge.)
185 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
186 */
187
188 /* default red parameter values */
189 static int default_th_min = TH_MIN;
190 static int default_th_max = TH_MAX;
191 static int default_inv_pmax = INV_P_MAX;
192
193 static struct mbuf *red_getq_flow(struct red *, class_queue_t *,
194 u_int32_t, boolean_t);
195
196 void
197 red_init(void)
198 {
199 _CASSERT(REDF_ECN4 == CLASSQF_ECN4);
200 _CASSERT(REDF_ECN6 == CLASSQF_ECN6);
201
202 red_size = sizeof (red_t);
203 red_zone = zinit(red_size, RED_ZONE_MAX * red_size,
204 0, RED_ZONE_NAME);
205 if (red_zone == NULL) {
206 panic("%s: failed allocating %s", __func__, RED_ZONE_NAME);
207 /* NOTREACHED */
208 }
209 zone_change(red_zone, Z_EXPAND, TRUE);
210 zone_change(red_zone, Z_CALLERACCT, TRUE);
211 }
212
213 /*
214 * red support routines
215 */
216 red_t *
217 red_alloc(struct ifnet *ifp, int weight, int inv_pmax, int th_min,
218 int th_max, int flags, int pkttime)
219 {
220 red_t *rp;
221 int w, i;
222 int npkts_per_sec;
223
224 VERIFY(ifp != NULL);
225
226 rp = zalloc(red_zone);
227 if (rp == NULL)
228 return (NULL);
229
230 bzero(rp, red_size);
231 rp->red_avg = 0;
232 rp->red_idle = 1;
233
234 if (weight == 0)
235 rp->red_weight = W_WEIGHT;
236 else
237 rp->red_weight = weight;
238 if (inv_pmax == 0)
239 rp->red_inv_pmax = default_inv_pmax;
240 else
241 rp->red_inv_pmax = inv_pmax;
242 if (th_min == 0)
243 rp->red_thmin = default_th_min;
244 else
245 rp->red_thmin = th_min;
246 if (th_max == 0)
247 rp->red_thmax = default_th_max;
248 else
249 rp->red_thmax = th_max;
250
251 rp->red_ifp = ifp;
252 rp->red_flags = (flags & REDF_USERFLAGS);
253 #if !PF_ECN
254 if (rp->red_flags & REDF_ECN) {
255 rp->red_flags &= ~REDF_ECN;
256 log(LOG_ERR, "%s: RED ECN not available; ignoring "
257 "REDF_ECN flag!\n", if_name(ifp));
258 }
259 #endif /* !PF_ECN */
260
261 if (pkttime == 0)
262 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
263 rp->red_pkttime = 800;
264 else
265 rp->red_pkttime = pkttime;
266
267 if (weight == 0) {
268 /* when the link is very slow, adjust red parameters */
269 npkts_per_sec = 1000000 / rp->red_pkttime;
270 if (npkts_per_sec < 50) {
271 /* up to about 400Kbps */
272 rp->red_weight = W_WEIGHT_2;
273 } else if (npkts_per_sec < 300) {
274 /* up to about 2.4Mbps */
275 rp->red_weight = W_WEIGHT_1;
276 }
277 }
278
279 /* calculate wshift. weight must be power of 2 */
280 w = rp->red_weight;
281 for (i = 0; w > 1; i++)
282 w = w >> 1;
283 rp->red_wshift = i;
284 w = 1 << rp->red_wshift;
285 if (w != rp->red_weight) {
286 printf("invalid weight value %d for red! use %d\n",
287 rp->red_weight, w);
288 rp->red_weight = w;
289 }
290
291 /*
292 * thmin_s and thmax_s are scaled versions of th_min and th_max
293 * to be compared with avg.
294 */
295 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
296 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
297
298 /*
299 * precompute probability denominator
300 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
301 */
302 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) *
303 rp->red_inv_pmax) << FP_SHIFT;
304
305 /* allocate weight table */
306 rp->red_wtab = wtab_alloc(rp->red_weight);
307 if (rp->red_wtab == NULL) {
308 red_destroy(rp);
309 return (NULL);
310 }
311
312 microuptime(&rp->red_last);
313 return (rp);
314 }
315
316 void
317 red_destroy(red_t *rp)
318 {
319 if (rp->red_wtab != NULL) {
320 wtab_destroy(rp->red_wtab);
321 rp->red_wtab = NULL;
322 }
323 zfree(red_zone, rp);
324 }
325
326 void
327 red_getstats(red_t *rp, struct red_stats *sp)
328 {
329 sp->q_avg = rp->red_avg >> rp->red_wshift;
330 sp->drop_forced = rp->red_stats.drop_forced;
331 sp->drop_unforced = rp->red_stats.drop_unforced;
332 sp->marked_packets = rp->red_stats.marked_packets;
333 }
334
335 int
336 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag)
337 {
338 #if !PF_ECN
339 #pragma unused(tag)
340 #endif /* !PF_ECN */
341 int avg, droptype;
342 int n;
343
344 avg = rp->red_avg;
345
346 /*
347 * if we were idle, we pretend that n packets arrived during
348 * the idle period.
349 */
350 if (rp->red_idle) {
351 struct timeval now;
352 int t;
353
354 rp->red_idle = 0;
355 microuptime(&now);
356 t = (now.tv_sec - rp->red_last.tv_sec);
357 if (t > 60) {
358 /*
359 * being idle for more than 1 minute, set avg to zero.
360 * this prevents t from overflow.
361 */
362 avg = 0;
363 } else {
364 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
365 n = t / rp->red_pkttime - 1;
366
367 /* the following line does (avg = (1 - Wq)^n * avg) */
368 if (n > 0)
369 avg = (avg >> FP_SHIFT) *
370 pow_w(rp->red_wtab, n);
371 }
372 }
373
374 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
375 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
376 rp->red_avg = avg; /* save the new value */
377
378 /*
379 * red_count keeps a tally of arriving traffic that has not
380 * been dropped.
381 */
382 rp->red_count++;
383
384 /* see if we drop early */
385 droptype = DTYPE_NODROP;
386 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
387 if (avg >= rp->red_thmax_s) {
388 /* avg >= th_max: forced drop */
389 droptype = DTYPE_FORCED;
390 } else if (rp->red_old == 0) {
391 /* first exceeds th_min */
392 rp->red_count = 1;
393 rp->red_old = 1;
394 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
395 rp->red_probd, rp->red_count)) {
396 /* mark or drop by red */
397 #if PF_ECN
398 if ((rp->red_flags & REDF_ECN) &&
399 (tag->pftag_proto == IPPROTO_TCP) && /* only TCP */
400 mark_ecn(m, tag, rp->red_flags)) {
401 /* successfully marked. do not drop. */
402 rp->red_count = 0;
403 rp->red_stats.marked_packets++;
404 } else
405 #endif /* PF_ECN */
406 {
407 /* unforced drop by red */
408 droptype = DTYPE_EARLY;
409 }
410 }
411 } else {
412 /* avg < th_min */
413 rp->red_old = 0;
414 }
415
416 /*
417 * if the queue length hits the hard limit, it's a forced drop.
418 */
419 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
420 droptype = DTYPE_FORCED;
421
422 #ifdef RED_RANDOM_DROP
423 /* if successful or forced drop, enqueue this packet. */
424 if (droptype != DTYPE_EARLY)
425 _addq(q, m);
426 #else
427 /* if successful, enqueue this packet. */
428 if (droptype == DTYPE_NODROP)
429 _addq(q, m);
430 #endif
431 if (droptype != DTYPE_NODROP) {
432 if (droptype == DTYPE_EARLY) {
433 /* drop the incoming packet */
434 rp->red_stats.drop_unforced++;
435 } else {
436 /* forced drop, select a victim packet in the queue. */
437 #ifdef RED_RANDOM_DROP
438 m = _getq_random(q);
439 #endif
440 rp->red_stats.drop_forced++;
441 }
442 rp->red_count = 0;
443 IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd);
444 m_freem(m);
445 return (CLASSQEQ_DROPPED);
446 }
447 /* successfully queued */
448 return (CLASSQEQ_SUCCESS);
449 }
450
451 /*
452 * early-drop probability is calculated as follows:
453 * prob = p_max * (avg - th_min) / (th_max - th_min)
454 * prob_a = prob / (2 - count*prob)
455 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
456 * here prob_a increases as successive undrop count increases.
457 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
458 * becomes 1 when (count >= (2 / prob))).
459 */
460 int
461 drop_early(int fp_len, int fp_probd, int count)
462 {
463 int d; /* denominator of drop-probability */
464
465 d = fp_probd - count * fp_len;
466 if (d <= 0)
467 /* count exceeds the hard limit: drop or mark */
468 return (1);
469
470 /*
471 * now the range of d is [1..600] in fixed-point. (when
472 * th_max-th_min=10 and p_max=1/30)
473 * drop probability = (avg - TH_MIN) / d
474 */
475
476 if ((RandomULong() % d) < (unsigned)fp_len) {
477 /* drop or mark */
478 return (1);
479 }
480 /* no drop/mark */
481 return (0);
482 }
483
484 static struct mbuf *
485 red_getq_flow(struct red *rp, class_queue_t *q, u_int32_t flow, boolean_t purge)
486 {
487 #pragma unused(purge)
488 struct mbuf *m;
489
490 /* flow of 0 means head of queue */
491 if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
492 if (rp->red_idle == 0) {
493 rp->red_idle = 1;
494 microuptime(&rp->red_last);
495 }
496 return (NULL);
497 }
498
499 rp->red_idle = 0;
500 return (m);
501 }
502
503 struct mbuf *
504 red_getq(red_t *rp, class_queue_t *q)
505 {
506 return (red_getq_flow(rp, q, 0, FALSE));
507 }
508
509 void
510 red_purgeq(struct red *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
511 u_int32_t *bytes)
512 {
513 u_int32_t cnt = 0, len = 0;
514 struct mbuf *m;
515
516 IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd);
517
518 while ((m = red_getq_flow(rp, q, flow, TRUE)) != NULL) {
519 cnt++;
520 len += m_pktlen(m);
521 m_freem(m);
522 }
523
524 if (packets != NULL)
525 *packets = cnt;
526 if (bytes != NULL)
527 *bytes = len;
528 }
529
530 void
531 red_updateq(red_t *rp, cqev_t ev)
532 {
533 #pragma unused(rp, ev)
534 /* nothing for now */
535 }
536
537 int
538 red_suspendq(red_t *rp, class_queue_t *q, boolean_t on)
539 {
540 #pragma unused(rp, q, on)
541 return (ENOTSUP);
542 }
543
544 /*
545 * helper routine to calibrate avg during idle.
546 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
547 * here Wq = 1/weight and the code assumes Wq is close to zero.
548 *
549 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
550 */
551 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
552
553 struct wtab *
554 wtab_alloc(int weight)
555 {
556 struct wtab *w;
557 int i;
558
559 for (w = wtab_list; w != NULL; w = w->w_next)
560 if (w->w_weight == weight) {
561 w->w_refcount++;
562 return (w);
563 }
564
565 w = _MALLOC(sizeof (struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
566 if (w == NULL)
567 return (NULL);
568
569 w->w_weight = weight;
570 w->w_refcount = 1;
571 w->w_next = wtab_list;
572 wtab_list = w;
573
574 /* initialize the weight table */
575 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
576 for (i = 1; i < 32; i++) {
577 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
578 if (w->w_tab[i] == 0 && w->w_param_max == 0)
579 w->w_param_max = 1 << i;
580 }
581
582 return (w);
583 }
584
585 void
586 wtab_destroy(struct wtab *w)
587 {
588 struct wtab *prev;
589
590 if (--w->w_refcount > 0)
591 return;
592
593 if (wtab_list == w)
594 wtab_list = w->w_next;
595 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
596 if (prev->w_next == w) {
597 prev->w_next = w->w_next;
598 break;
599 }
600
601 _FREE(w, M_DEVBUF);
602 }
603
604 int32_t
605 pow_w(struct wtab *w, int n)
606 {
607 int i, bit;
608 int32_t val;
609
610 if (n >= w->w_param_max)
611 return (0);
612
613 val = 1 << FP_SHIFT;
614 if (n <= 0)
615 return (val);
616
617 bit = 1;
618 i = 0;
619 while (n) {
620 if (n & bit) {
621 val = (val * w->w_tab[i]) >> FP_SHIFT;
622 n &= ~bit;
623 }
624 i++;
625 bit <<= 1;
626 }
627 return (val);
628 }
629
630 #endif /* CLASSQ_RED */