/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
 */

#define DUMMYNET_DEBUG

/*
 * This module implements IP dummynet, a bandwidth limiter/delay emulator
 * used in conjunction with the ipfw package.
 * Description of the data structures used is in ip_dummynet.h
 * Here you mainly find the following blocks of code:
 *  + variable declarations;
 *  + heap management functions;
 *  + scheduler and dummynet functions;
 *  + configuration and initialization.
 *
 * NOTA BENE: critical sections are protected by the "dummynet lock".
 *
 * Most important Changes:
 *
 * 010124: Fixed WF2Q behaviour
 * 010122: Fixed spl protection.
 * 000601: WF2Q support
 * 000106: large rewrite, use heaps to handle very many pipes.
 * 980513: initial release
 *
 * include files marked with XXX are probably not needed
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>			/* XXX */
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>

#if BRIDGE
#include <netinet/if_ether.h> /* for struct arpcom */
#include <net/bridge.h>
#endif

/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
 */
static dn_key curr_time = 0 ; /* current simulation time */

/* this is for the timer that fires to call dummynet() - we only enable the
 * timer when there are packets to process, otherwise it's disabled */
static int timer_enabled = 0;

static int dn_hash_size = 64 ;	/* default hash size */

/* statistics on number of queue searches and search steps */
static int searches, search_steps ;
static int pipe_expire = 1 ;   /* expire queue if empty */
static int dn_max_ratio = 16 ; /* max queues/buckets ratio */

static int red_lookup_depth = 256;	/* RED - default lookup table depth */
static int red_avg_pkt_size = 512;	/* RED - default medium packet size */
static int red_max_pkt_size = 1500;	/* RED - default max packet size */

/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows
 *
 * extract_heap contains pipes associated with delay lines.
 *
 */
static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;

static int heap_init(struct dn_heap *h, int size) ;
static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

static void transmit_event(struct dn_pipe *pipe);
static void ready_event(struct dn_flow_queue *q);

static struct dn_pipe *all_pipes = NULL ;	/* list of all pipes */
static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */

#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
	    CTLFLAG_RW, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
	    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
	    CTLFLAG_RD, &curr_time, 0, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
	    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
	    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
	    CTLFLAG_RD, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
	    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
	    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
	    CTLFLAG_RW, &dn_max_ratio, 0,
	    "Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
	    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
	    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
	    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
#endif

#ifdef DUMMYNET_DEBUG
int dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
	    0, "control debugging printfs");
#endif
#define DPRINTF(X)	if (dummynet_debug) printf X
#else
#define DPRINTF(X)
#endif

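/*
 * Editor's note: with DUMMYNET_DEBUG defined above, the DPRINTF()s in this
 * file can be toggled at run time, e.g.
 *	sysctl -w net.inet.ip.dummynet.debug=1
 * The debug knob is read-write, unlike the RED tuning knobs above, which
 * are exported read-only.
 */
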
/* contrary to the comment above random(), it does not actually
 * return a value in [0, 2^31 - 1], which breaks plr amongst other
 * things. Masking it should work even if the behavior of
 * the function is fixed.
 */
#define MY_RANDOM (random() & 0x7FFFFFFF)

/* dummynet lock */
lck_grp_t *dn_mutex_grp;
lck_grp_attr_t *dn_mutex_grp_attr;
lck_attr_t *dn_mutex_attr;
lck_mtx_t *dn_mutex;

static int config_pipe(struct dn_pipe *p);
static int ip_dn_ctl(struct sockopt *sopt);

static void dummynet(void *);
static void dummynet_flush(void);
void dummynet_drain(void);
static ip_dn_io_t dummynet_io;
static void dn_rule_delete(void *);

int if_tx_rdy(struct ifnet *ifp);

/*
 * Heap management functions.
 *
 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
 * Some macros help finding parent/children so we can optimize them.
 *
 * heap_init() is called to expand the heap when needed.
 * Increment size in blocks of 16 entries.
 * XXX failure to allocate a new element is a pretty bad failure
 * as we basically stall a whole queue forever!!
 * Returns 1 on error, 0 on success
 */
#define HEAP_FATHER(x)	( ( (x) - 1 ) / 2 )
#define HEAP_LEFT(x)	( 2*(x) + 1 )
#define HEAP_IS_LEFT(x)	( (x) & 1 )
#define HEAP_RIGHT(x)	( 2*(x) + 2 )
#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
#define HEAP_INCREMENT	15

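/*
 * Editor's sketch - a concrete instance of the index arithmetic above:
 * for node 5, HEAP_FATHER(5) = (5-1)/2 = 2, HEAP_LEFT(5) = 11 and
 * HEAP_RIGHT(5) = 12, and HEAP_IS_LEFT(5) is true, since left children
 * always sit at odd indices. Rounding a requested size up with
 * (new_size + HEAP_INCREMENT) & ~HEAP_INCREMENT grows the array in
 * blocks of 16, e.g. a request for 17 entries allocates 32.
 */
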
static int
heap_init(struct dn_heap *h, int new_size)
{
    struct dn_heap_entry *p;

    if (h->size >= new_size ) {
	printf("dummynet: heap_init, Bogus call, have %d want %d\n",
		h->size, new_size);
	return 0 ;
    }
    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
    p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
    if (p == NULL) {
	printf("dummynet: heap_init, resize %d failed\n", new_size );
	return 1 ; /* error */
    }
    if (h->size > 0) {
	bcopy(h->p, p, h->size * sizeof(*p) );
	FREE(h->p, M_DUMMYNET);
    }
    h->p = p ;
    h->size = new_size ;
    return 0 ;
}

/*
 * Insert element in heap. Normally, p != NULL, we insert p in
 * a new position and bubble up. If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry)
 *
 * If offset > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
#define SET_OFFSET(heap, node) \
    if (heap->offset > 0) \
	*((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) \
    if (heap->offset > 0) \
	*((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
    int son = h->elements ;

    if (p == NULL)	/* data already there, set starting point */
	son = key1 ;
    else {		/* insert new element at the end, possibly resize */
	son = h->elements ;
	if (son == h->size) /* need resize... */
	    if (heap_init(h, h->elements+1) )
		return 1 ; /* failure... */
	h->p[son].object = p ;
	h->p[son].key = key1 ;
	h->elements++ ;
    }
    while (son > 0) {	/* bubble up */
	int father = HEAP_FATHER(son) ;
	struct dn_heap_entry tmp ;

	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
	    break ; /* found right position */
	/* son smaller than father, swap and repeat */
	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
	SET_OFFSET(h, son);
	son = father ;
    }
    SET_OFFSET(h, son);
    return 0 ;
}

/*
 * remove top element from heap, or obj if obj != NULL
 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
    int child, father, maxelt = h->elements - 1 ;

    if (maxelt < 0) {
	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
	return ;
    }
    father = 0 ; /* default: move up smallest child */
    if (obj != NULL) { /* extract specific element, index is at offset */
	if (h->offset <= 0)
	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
	father = *((int *)((char *)obj + h->offset)) ;
	if (father < 0 || father >= h->elements) {
	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
		father, h->elements);
	    panic("dummynet: heap_extract");
	}
    }
    RESET_OFFSET(h, father);
    child = HEAP_LEFT(father) ;		/* left child */
    while (child <= maxelt) {		/* valid entry */
	if (child != maxelt && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
	    child = child+1 ;		/* take right child, otherwise left */
	h->p[father] = h->p[child] ;
	SET_OFFSET(h, father);
	father = child ;
	child = HEAP_LEFT(child) ;	/* left child for next loop */
    }
    h->elements-- ;
    if (father != maxelt) {
	/*
	 * Fill hole with last entry and bubble up, reusing the insert code
	 */
	h->p[father] = h->p[maxelt] ;
	heap_insert(h, father, NULL); /* this one cannot fail */
    }
}

#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 */
static void
heap_move(struct dn_heap *h, dn_key new_key, void *object)
{
    int temp;
    int i ;
    int maxelt = h->elements-1 ;
    struct dn_heap_entry buf ;

    if (h->offset <= 0)
	panic("cannot move items on this heap");

    i = *((int *)((char *)object + h->offset));
    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
	h->p[i].key = new_key ;
	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
		i = temp ) { /* bubble up */
	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
	    SET_OFFSET(h, i);
	}
    } else {		/* must move down */
	h->p[i].key = new_key ;
	while ( (temp = HEAP_LEFT(i)) <= maxelt ) { /* found left child */
	    if ((temp != maxelt) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
		temp++ ; /* select child with min key */
	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
		SET_OFFSET(h, i);
	    } else
		break ;
	    i = temp ;
	}
    }
    SET_OFFSET(h, i);
}
#endif /* heap_move, unused */

/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property. It is needed when we delete a bunch of entries.
 */
static void
heapify(struct dn_heap *h)
{
    int i ;

    for (i = 0 ; i < h->elements ; i++ )
	heap_insert(h, i , NULL) ;
}

/*
 * cleanup the heap and free data structure
 */
static void
heap_free(struct dn_heap *h)
{
    if (h->size > 0 )
	FREE(h->p, M_DUMMYNET);
    bzero(h, sizeof(*h));
}

/*
 * --- end of heap management functions ---
 */

/*
 * Return the mbuf tag holding the dummynet state. As an optimization
 * this is assumed to be the first tag on the list. If this turns out
 * wrong we'll need to search the list.
 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
    struct m_tag *mtag = m_tag_first(m);
/*  KASSERT(mtag != NULL &&
	    mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
	    mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
	    ("packet on dummynet queue w/o dummynet tag!"));
*/
    return (struct dn_pkt_tag *)(mtag+1);
}

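#if 0
/*
 * Editor's sketch, not part of the original file: if the first-tag
 * assumption above ever breaks, the lookup would have to fall back to
 * m_tag_locate() with the same module/type pair used elsewhere in this
 * file (see DN_FREE_PKT below).
 */
static struct dn_pkt_tag *
dn_tag_get_slow(struct mbuf *m)
{
    struct m_tag *mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_DUMMYNET, NULL);

    return (mtag != NULL) ? (struct dn_pkt_tag *)(mtag+1) : NULL;
}
#endif /* editor's sketch */
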
/*
 * Scheduler functions:
 *
 * transmit_event() is called when the delay-line needs to enter
 * the scheduler, either because of existing pkts getting ready,
 * or new packets entering the queue. The event handled is the delivery
 * time of the packet.
 *
 * ready_event() does something similar with fixed-rate queues, and the
 * event handled is the finish time of the head pkt.
 *
 * wfq_ready_event() does something similar with WF2Q queues, and the
 * event handled is the start time of the head pkt.
 *
 * In all cases, we make sure that the data structures are consistent
 * before passing pkts out, because this might trigger recursive
 * invocations of the procedures.
 */
static void
transmit_event(struct dn_pipe *pipe)
{
    struct mbuf *m ;
    struct dn_pkt_tag *pkt ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    while ( (m = pipe->head) ) {
	pkt = dn_tag_get(m);
	if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
	    break;
	/*
	 * first unlink, then call procedures, since ip_input() can invoke
	 * ip_output() and vice versa, thus causing nested calls
	 */
	pipe->head = m->m_nextpkt ;
	m->m_nextpkt = NULL;

	/* XXX: drop the lock for now to avoid LOR's */
	lck_mtx_unlock(dn_mutex);
	switch (pkt->dn_dir) {
	case DN_TO_IP_OUT: {
	    struct route tmp_rt = pkt->ro;
	    (void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL);
	    if (tmp_rt.ro_rt) {
		rtfree(tmp_rt.ro_rt);
	    }
	    break ;
	}
	case DN_TO_IP_IN :
	    proto_inject(PF_INET, m);
	    break ;

#if BRIDGE
	case DN_TO_BDG_FWD :
	    /*
	     * The bridge requires/assumes the Ethernet header is
	     * contiguous in the first mbuf header. Ensure this is true.
	     */
	    if (BDG_LOADED) {
		if (m->m_len < ETHER_HDR_LEN &&
		    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
		    printf("dummynet/bridge: pullup fail, dropping pkt\n");
		    break;
		}
		m = bdg_forward_ptr(m, pkt->ifp);
	    } else {
		/* somebody unloaded the bridge module. Drop pkt */
		/* XXX rate limit */
		printf("dummynet: dropping bridged packet trapped in pipe\n");
	    }
	    if (m)
		m_freem(m);
	    break;
#endif
	default:
	    printf("dummynet: bad switch %d!\n", pkt->dn_dir);
	    m_freem(m);
	    break ;
	}
	lck_mtx_lock(dn_mutex);
    }
    /* if there are leftover packets, put into the heap for next event */
    if ( (m = pipe->head) ) {
	pkt = dn_tag_get(m);
	/* XXX should check errors on heap_insert, by draining the
	 * whole pipe p and hoping in the future we are more successful
	 */
	heap_insert(&extract_heap, pkt->output_time, pipe);
    }
}

/*
 * the following macro computes how many ticks we have to wait
 * before being able to transmit a packet. The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
 */

/* hz is 100, which gives a granularity of 10ms in the old timer.
 * The timer has been changed to fire every 1ms, so the use of
 * hz has been modified here. All instances of hz have been left
 * in place but adjusted by a factor of 10 so that hz is functionally
 * equal to 1000.
 */
#define SET_TICKS(_m, q, p) \
    ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
	    p->bandwidth ;

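/*
 * Editor's note - a worked instance of SET_TICKS, taking the comment above
 * at face value (hz = 100, so hz*10 = 1000 ticks/sec): with q->numbytes = 0,
 * a 1500-byte packet on a 12 Mbit/s pipe needs
 * ceil(1500*8*1000 / 12000000) = 1 tick (1 ms), while the same packet on a
 * 1 Mbit/s pipe needs ceil(12000000 / 1000000) = 12 ticks. The
 * "+ p->bandwidth - 1" term is what turns the integer division into a
 * ceiling.
 */
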
/*
 * extract pkt from queue, compute output time (could be now)
 * and put into delay line (p_queue)
 */
static void
move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
	struct dn_pipe *p, int len)
{
    struct dn_pkt_tag *dt = dn_tag_get(pkt);

    q->head = pkt->m_nextpkt ;
    q->len-- ;
    q->len_bytes -= len ;

    dt->output_time = curr_time + p->delay ;

    if (p->head == NULL)
	p->head = pkt;
    else
	p->tail->m_nextpkt = pkt;
    p->tail = pkt;
    p->tail->m_nextpkt = NULL;
}

/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invocation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q)
{
    struct mbuf *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    if (p == NULL) {
	printf("dummynet: ready_event- pipe is gone\n");
	return ;
    }
    p_was_empty = (p->head == NULL) ;

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->m_pkthdr.len;
	int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;
	if (len_scaled > q->numbytes )
	    break ;
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else {	/* RED needs to know when the queue becomes empty */
	q->q_time = curr_time;
	q->numbytes = 0;
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}

/*
 * Called when we can transmit packets on WF2Q queues. Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0. As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay. For the last packet, if p->numbytes<0,
 * there is an additional delay.
 */
static void
ready_event_wfq(struct dn_pipe *p)
{
    int p_was_empty = (p->head == NULL) ;
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap) ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name));
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >= 0 && (sch->elements > 0 || neh->elements > 0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct mbuf *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    p->V += (len<<MY_M) / p->sum ; /* update V */
	    q->S = q->F ; /* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;
	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ; /* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * if we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	dn_tag_get(p->tail)->output_time += t ;
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}

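/*
 * Editor's note - a worked instance of the WF2Q+ arithmetic above,
 * assuming MY_M is the 16-bit fixed-point shift defined in ip_dummynet.h:
 * two backlogged flows with weights 3 and 1 share a pipe, so p->sum = 4.
 * Transmitting a 1000-byte packet advances the virtual time V by
 * (1000<<16)/4 = 16384000 units, while the sender's finish time F grows
 * by (1000<<16)/weight, so the weight-3 flow accumulates F a third as
 * fast as the weight-1 flow and is scheduled three times as often - the
 * expected 3:1 bandwidth split.
 */
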
/*
 * This is called every 1ms. It is used to
 * increment the current tick counter and schedule expired events.
 */
static void
dummynet(__unused void * unused)
{
    void *p ; /* generic parameter to handler */
    struct dn_heap *h ;
    struct dn_heap *heaps[3];
    int i;
    struct dn_pipe *pe ;
    struct timespec ts;
    struct timeval tv;

    heaps[0] = &ready_heap ;		/* fixed-rate queues */
    heaps[1] = &wfq_ready_heap ;	/* wfq queues */
    heaps[2] = &extract_heap ;		/* delay line */

    lck_mtx_lock(dn_mutex);

    /* make all time measurements in milliseconds (ms) -
     * here we convert secs and usecs to msecs (the integer
     * division simply truncates to the millisecond).
     */
    microuptime(&tv);
    curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);

    for (i=0; i < 3 ; i++) {
	h = heaps[i];
	while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
	    if (h->p[0].key > curr_time)
		printf("dummynet: warning, heap %d is %d ticks late\n",
		    i, (int)(curr_time - h->p[0].key));
	    p = h->p[0].object ; /* store a copy before heap_extract */
	    heap_extract(h, NULL); /* need to extract before processing */
	    if (i == 0)
		ready_event(p) ;
	    else if (i == 1) {
		struct dn_pipe *pipe = p;
		if (pipe->if_name[0] != '\0')
		    printf("dummynet: bad ready_event_wfq for pipe %s\n",
			pipe->if_name);
		else
		    ready_event_wfq(p) ;
	    } else
		transmit_event(p);
	}
    }
    /* sweep pipes trying to expire idle flow_queues */
    for (pe = all_pipes; pe ; pe = pe->next )
	if (pe->idle_heap.elements > 0 &&
		DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
	    struct dn_flow_queue *q = pe->idle_heap.p[0].object ;

	    heap_extract(&(pe->idle_heap), NULL);
	    q->S = q->F + 1 ; /* mark timestamp as invalid */
	    pe->sum -= q->fs->weight ;
	}

    /* check the heaps to see if there's still stuff in there, and
     * only set the timer if there are packets to process
     */
    timer_enabled = 0;
    for (i=0; i < 3 ; i++) {
	h = heaps[i];
	if (h->elements > 0) { // set the timer
	    ts.tv_sec = 0;
	    ts.tv_nsec = 1 * 1000000;	// 1ms
	    timer_enabled = 1;
	    bsd_timeout(dummynet, NULL, &ts);
	    break;
	}
    }

    lck_mtx_unlock(dn_mutex);
}

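/*
 * Editor's note: the timeout above is deliberately one-shot - dummynet()
 * re-arms it only while at least one heap is non-empty, and dummynet_io()
 * arms it (guarded by the timer_enabled flag) when a packet is queued into
 * an idle system, so no 1ms tick runs while there is nothing to schedule.
 */
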
/*
 * called by an interface when tx_rdy occurs.
 */
int
if_tx_rdy(struct ifnet *ifp)
{
    struct dn_pipe *p;

    lck_mtx_lock(dn_mutex);
    for (p = all_pipes; p ; p = p->next )
	if (p->ifp == ifp)
	    break ;
    if (p == NULL) {
	char buf[32];
	snprintf(buf, sizeof(buf), "%s%d",ifp->if_name, ifp->if_unit);
	for (p = all_pipes; p ; p = p->next )
	    if (!strcmp(p->if_name, buf) ) {
		p->ifp = ifp ;
		DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf));
		break ;
	    }
    }
    if (p != NULL) {
	DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
		ifp->if_unit, ifp->if_snd.ifq_len));
	p->numbytes = 0 ; /* mark ready for I/O */
	ready_event_wfq(p);
    }
    /* release (not re-acquire) the lock before returning */
    lck_mtx_unlock(dn_mutex);

    return 0;
}

/*
 * Unconditionally expire empty queues in case of shortage.
 * Returns the number of queues freed.
 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;
    struct timeval timenow;

    getmicrotime(&timenow);

    if (fs->last_expired == timenow.tv_sec)
	return 0 ;
    fs->last_expired = timenow.tv_sec ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    if (q->head != NULL || q->S != q->F+1) {
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_DUMMYNET);
	    }
    return initial_elements - fs->rq_elements ;
}

/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
    struct dn_flow_queue *q ;

    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
	    expire_queues(fs) == 0) {
	/*
	 * No way to get room, use or create overflow queue.
	 */
	i = fs->rq_size ;
	if ( fs->rq[i] != NULL )
	    return fs->rq[i] ;
    }
    q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
    if (q == NULL) {
	printf("dummynet: sorry, cannot allocate queue for new flow\n");
	return NULL ;
    }
    q->fs = fs ;
    q->hash_slot = i ;
    q->next = fs->rq[i] ;
    q->S = q->F + 1;   /* hack - mark timestamp as invalid */
    fs->rq[i] = q ;
    fs->rq_elements++ ;
    return q ;
}

/*
 * Given a flow_set and a pkt in last_pkt, find a matching queue
 * after appropriate masking. The queue is moved to front
 * so that further searches take less time.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
{
    int i = 0 ; /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;

    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
	q = fs->rq[0] ;
    else {
	/* first, do the masking */
	id->dst_ip &= fs->flow_mask.dst_ip ;
	id->src_ip &= fs->flow_mask.src_ip ;
	id->dst_port &= fs->flow_mask.dst_port ;
	id->src_port &= fs->flow_mask.src_port ;
	id->proto &= fs->flow_mask.proto ;
	id->flags = 0 ; /* we don't care about this one */
	/* then, hash function */
	i = ( (id->dst_ip) & 0xffff ) ^
	    ( (id->dst_ip >> 15) & 0xffff ) ^
	    ( (id->src_ip << 1) & 0xffff ) ^
	    ( (id->src_ip >> 16 ) & 0xffff ) ^
	    (id->dst_port << 1) ^ (id->src_port) ^
	    (id->proto );
	i = i % fs->rq_size ;
	/* finally, scan the current list for a match */
	searches++ ;
	for (prev=NULL, q = fs->rq[i] ; q ; ) {
	    search_steps++;
	    if (id->dst_ip == q->id.dst_ip &&
		    id->src_ip == q->id.src_ip &&
		    id->dst_port == q->id.dst_port &&
		    id->src_port == q->id.src_port &&
		    id->proto == q->id.proto &&
		    id->flags == q->id.flags)
		break ; /* found */
	    else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
		/* entry is idle and not in any heap, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_DUMMYNET);
		continue ;
	    }
	    prev = q ;
	    q = q->next ;
	}
	if (q && prev != NULL) { /* found and not in front */
	    prev->next = q->next ;
	    q->next = fs->rq[i] ;
	    fs->rq[i] = q ;
	}
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
	q = create_queue(fs, i);
	if (q != NULL)
	    q->id = *id ;
    }
    return q ;
}

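/*
 * Editor's note - example of the masking above: a flow mask whose only
 * non-zero field is dst_port = 0xffff collapses every packet onto its
 * destination port before hashing, so each distinct destination port gets
 * its own dn_flow_queue (and its own share of the pipe) while all other
 * header fields are ignored.
 */
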
static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
     * RED algorithm
     *
     * RED calculates the average queue size (avg) using a low-pass filter
     * with an exponential weighted (w_q) moving average:
     *	avg  <-  (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or packets).
     *
     * If q_size == 0, we compute the idle time for the link, and set
     *	avg = (1 - w_q)^(idle/s)
     * where s is the time needed for transmitting a medium-sized packet.
     *
     * Now, if avg < min_th the packet is enqueued.
     * If avg > max_th the packet is dropped. Otherwise, the packet is
     * dropped with probability P function of avg.
     *
     */

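    /*
     * Editor's note - a worked instance of the filter above, using
     * w_q = 0.002 (the value suggested in the original RED paper) purely
     * as an example: starting from avg = 0, one update with q_size = 50
     * packets moves avg to 0.002 * 50 = 0.1, so on the order of 1/w_q
     * (here 500) consecutive samples are needed before avg approaches
     * q_size. This low-pass behaviour is what makes RED react to
     * persistent rather than transient congestion.
     */
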
    int64_t p_b = 0;
    /* queue in bytes or packets ? */
    u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;

    DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));

    /* average queue size estimation */
    if (q_size != 0) {
	/*
	 * queue is not empty, avg <- avg + (q_size - avg) * w_q
	 */
	int diff = SCALE(q_size) - q->avg;
	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);

	q->avg += (int) v;
    } else {
	/*
	 * queue is empty, find for how long the queue has been
	 * empty and use a lookup table for computing
	 * (1 - w_q)^(idle_time/s) where s is the time to send a
	 * (small) packet.
	 * XXX check wraps...
	 */
	if (q->avg) {
	    u_int t = (curr_time - q->q_time) / fs->lookup_step;

	    q->avg = (t < fs->lookup_depth) ?
		    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
	}
    }
    DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));

    /* should I drop? */

    if (q->avg < fs->min_th) {
	q->count = -1;
	return 0; /* accept packet ; */
    }
    if (q->avg >= fs->max_th) { /* average queue >= max threshold */
	if (fs->flags_fs & DN_IS_GENTLE_RED) {
	    /*
	     * According to Gentle-RED, if avg is greater than max_th the
	     * packet is dropped with a probability
	     *	p_b = c_3 * avg - c_4
	     * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
	     */
	    p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
	} else {
	    q->count = -1;
	    DPRINTF(("dummynet: - drop"));
	    return 1 ;
	}
    } else if (q->avg > fs->min_th) {
	/*
	 * we compute p_b using the linear dropping function p_b = c_1 *
	 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
	 * max_p * min_th / (max_th - min_th)
	 */
	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
    }
    if (fs->flags_fs & DN_QSIZE_IS_BYTES)
	p_b = (p_b * len) / fs->max_pkt_size;
    if (++q->count == 0)
	q->random = MY_RANDOM & 0xffff;
    else {
	/*
	 * q->count counts packets arrived since last drop, so a greater
	 * value of q->count means a greater packet drop probability.
	 */
	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
	    q->count = 0;
	    DPRINTF(("dummynet: - red drop"));
	    /* after a drop we calculate a new random value */
	    q->random = MY_RANDOM & 0xffff;
	    return 1;	/* drop */
	}
    }
    /* end of RED algorithm */
    return 0 ; /* accept */
}

static __inline
struct dn_flow_set *
locate_flowset(int pipe_nr, struct ip_fw *rule)
{
    struct dn_flow_set *fs;
    ipfw_insn *cmd = rule->cmd + rule->act_ofs;

    if (cmd->opcode == O_LOG)
	cmd += F_LEN(cmd);

    bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));

    if (fs != NULL)
	return fs;

    if (cmd->opcode == O_QUEUE) {
	for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
	    ;
    }
    else {
	struct dn_pipe *p1;
	for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
	    ;
	if (p1 != NULL)
	    fs = &(p1->fs) ;
    }
    /* record for the future */
    bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));

    return fs ;
}

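/*
 * Editor's note: the second bcopy() above caches the resolved dn_flow_set
 * pointer back into the ipfw instruction itself, so the linear list walk
 * is paid only by the first packet that hits a rule; dummynet_flush()
 * clears these cached pointers again via flush_pipe_ptrs(NULL).
 */
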
/*
 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
 * depending on whether WF2Q or fixed bw is used.
 *
 * pipe_nr	pipe or queue the packet is destined for.
 * dir		where shall we send the packet after dummynet.
 * m		the mbuf with the packet
 * ifp		the 'ifp' parameter from the caller.
 *		NULL in ip_input, destination interface in ip_output,
 *		real_dst in bdg_forward
 * ro		route parameter (only used in ip_output, NULL otherwise)
 * dst		destination address, only used by ip_output
 * rule		matching rule, in case of multiple passes
 * flags	flags from the caller, only used in ip_output
 *
 */
static int
dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
{
    struct dn_pkt_tag *pkt;
    struct m_tag *mtag;
    struct dn_flow_set *fs;
    struct dn_pipe *pipe ;
    u_int64_t len = m->m_pkthdr.len ;
    struct dn_flow_queue *q = NULL ;
    int is_pipe;
    struct timespec ts;
    struct timeval tv;

#if IPFW2
    ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs;

    if (cmd->opcode == O_LOG)
	cmd += F_LEN(cmd);
    is_pipe = (cmd->opcode == O_PIPE);
#else
    is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
#endif

    pipe_nr &= 0xffff ;

    lck_mtx_lock(dn_mutex);

    /* make all time measurements in milliseconds (ms) -
     * here we convert secs and usecs to msecs (the integer
     * division simply truncates to the millisecond).
     */
    microuptime(&tv);
    curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);

    /*
     * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
     */
    fs = locate_flowset(pipe_nr, fwa->rule);
    if (fs == NULL)
	goto dropit ;	/* this queue/pipe does not exist! */
    pipe = fs->pipe ;
    if (pipe == NULL) { /* must be a queue, try find a matching pipe */
	for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
		pipe = pipe->next)
	    ;
	if (pipe != NULL)
	    fs->pipe = pipe ;
	else {
	    printf("dummynet: no pipe %d for queue %d, drop pkt\n",
		fs->parent_nr, fs->fs_nr);
	    goto dropit ;
	}
    }
    q = find_queue(fs, &(fwa->f_id));
    if ( q == NULL )
	goto dropit ;		/* cannot allocate queue		*/
    /*
     * update statistics, then check reasons to drop pkt
     */
    q->tot_bytes += len ;
    q->tot_pkts++ ;
    if ( fs->plr && (MY_RANDOM < fs->plr) )
	goto dropit ;		/* random pkt drop			*/
    if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
	if (q->len_bytes > fs->qsize)
	    goto dropit ;	/* queue size overflow			*/
    } else {
	if (q->len >= fs->qsize)
	    goto dropit ;	/* queue count overflow			*/
    }
    if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
	goto dropit ;

    /* XXX expensive to zero, see if we can remove it */
    mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
		sizeof(struct dn_pkt_tag), M_NOWAIT);
    if ( mtag == NULL )
	goto dropit ;		/* cannot allocate packet header	*/
    m_tag_prepend(m, mtag);	/* attach to mbuf chain */

    pkt = (struct dn_pkt_tag *)(mtag+1);
    bzero(pkt, sizeof(struct dn_pkt_tag));
    /* ok, i can handle the pkt now... */
    /* build and enqueue packet + parameters */
    pkt->rule = fwa->rule ;
    pkt->dn_dir = dir ;

    pkt->ifp = fwa->oif;
    if (dir == DN_TO_IP_OUT) {
	/*
	 * We need to copy *ro because for ICMP pkts (and maybe others)
	 * the caller passed a pointer into the stack; dst might also be
	 * a pointer into *ro so it needs to be updated.
	 */
	lck_mtx_lock(rt_mtx);
	pkt->ro = *(fwa->ro);
	if (fwa->ro->ro_rt)
	    rtref(fwa->ro->ro_rt);
	if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */
	    fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
	lck_mtx_unlock(rt_mtx);

	pkt->dn_dst = fwa->dst;
	pkt->flags = fwa->flags;
    }
    if (q->head == NULL)
	q->head = m;
    else
	q->tail->m_nextpkt = m;
    q->tail = m;
    q->len++;
    q->len_bytes += len ;

    if ( q->head != m )		/* flow was not idle, we are done */
	goto done;
    /*
     * If we reach this point the flow was previously idle, so we need
     * to schedule it. This involves different actions for fixed-rate or
     * WF2Q queues.
     */
    if (is_pipe) {
	/*
	 * Fixed-rate queue: just insert into the ready_heap.
	 */
	dn_key t = 0 ;
	if (pipe->bandwidth)
	    t = SET_TICKS(m, q, pipe);
	q->sched_time = curr_time ;
	if (t == 0)	/* must process it now */
	    ready_event( q );
	else
	    heap_insert(&ready_heap, curr_time + t , q );
    } else {
	/*
	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
	 * set S to the virtual time V for the controlling pipe, and update
	 * the sum of weights for the pipe; otherwise, remove flow from
	 * idle_heap and set S to max(F,V).
	 * Second, compute finish time F = S + len/weight.
	 * Third, if pipe was idle, update V=max(S, V).
	 * Fourth, count one more backlogged flow.
	 */
	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
	    q->S = pipe->V ;
	    pipe->sum += fs->weight ; /* add weight of new queue */
	} else {
	    heap_extract(&(pipe->idle_heap), q);
	    q->S = MAX64(q->F, pipe->V ) ;
	}
	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;

	if (pipe->not_eligible_heap.elements == 0 &&
		pipe->scheduler_heap.elements == 0)
	    pipe->V = MAX64 ( q->S, pipe->V );
	fs->backlogged++ ;
	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the scheduler_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
	 * and possibly invoke ready_event_wfq() right now if there is
	 * leftover credit.
	 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
	 * and for all flows in not_eligible_heap (NEH), S_i > V .
	 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
	 * we only need to look into NEH.
	 */
	if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
	    if (pipe->scheduler_heap.elements == 0)
		printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
	    heap_insert(&(pipe->not_eligible_heap), q->S, q);
	} else {
	    heap_insert(&(pipe->scheduler_heap), q->F, q);
	    if (pipe->numbytes >= 0) { /* pipe is idle */
		if (pipe->scheduler_heap.elements != 1)
		    printf("dummynet: OUCH! pipe should have been idle!\n");
		DPRINTF(("dummynet: waking up pipe %d at %d\n",
			pipe->pipe_nr, (int)(q->F >> MY_M)));
		pipe->sched_time = curr_time ;
		ready_event_wfq(pipe);
	    }
	}
    }
done:
    /* start the timer and set global if not already set */
    if (!timer_enabled) {
	ts.tv_sec = 0;
	ts.tv_nsec = 1 * 1000000;	// 1ms
	timer_enabled = 1;
	bsd_timeout(dummynet, NULL, &ts);
    }

    lck_mtx_unlock(dn_mutex);
    return 0;

dropit:
    if (q)
	q->drops++ ;
    lck_mtx_unlock(dn_mutex);
    m_freem(m);
    return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
}

/*
 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT).
 * Doing this would probably save us the initial bzero of dn_pkt.
 */
#define DN_FREE_PKT(_m) do {					\
    struct m_tag *tag = m_tag_locate(_m, KERNEL_MODULE_TAG_ID,	\
	    KERNEL_TAG_TYPE_DUMMYNET, NULL);			\
    if (tag) {							\
	struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1);	\
	if (n->ro.ro_rt)					\
	    rtfree(n->ro.ro_rt);				\
	m_tag_delete(_m, tag);					\
    }								\
    m_freem(_m);						\
} while (0)

/*
 * Dispose all packets and flow_queues on a flow_set.
 * If all=1, also remove red lookup table and other storage,
 * including the descriptor itself.
 * For the one in dn_pipe MUST also cleanup ready_heap...
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
    struct dn_flow_queue *q, *qn ;
    int i ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    for (i = 0 ; i <= fs->rq_size ; i++ ) {
	for (q = fs->rq[i] ; q ; q = qn ) {
	    struct mbuf *m, *mnext;

	    mnext = q->head;
	    while ((m = mnext) != NULL) {
		mnext = m->m_nextpkt;
		DN_FREE_PKT(m);
	    }
	    qn = q->next ;
	    FREE(q, M_DUMMYNET);
	}
	fs->rq[i] = NULL ;
    }
    fs->rq_elements = 0 ;
    if (all) {
	/* RED - free lookup table */
	if (fs->w_q_lookup)
	    FREE(fs->w_q_lookup, M_DUMMYNET);
	if (fs->rq)
	    FREE(fs->rq, M_DUMMYNET);
	/* if this fs is not part of a pipe, free it */
	if (fs->pipe && fs != &(fs->pipe->fs) )
	    FREE(fs, M_DUMMYNET);
    }
}

/*
 * Dispose all packets queued on a pipe (not a flow_set).
 * Also free all resources associated to a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
    struct mbuf *m, *mnext;

    purge_flow_set( &(pipe->fs), 1 );

    mnext = pipe->head;
    while ((m = mnext) != NULL) {
	mnext = m->m_nextpkt;
	DN_FREE_PKT(m);
    }

    heap_free( &(pipe->scheduler_heap) );
    heap_free( &(pipe->not_eligible_heap) );
    heap_free( &(pipe->idle_heap) );
}

/*
 * Delete all pipes and heaps returning memory. Must also
 * remove references from all ipfw rules to all pipes.
 */
static void
dummynet_flush(void)
{
    struct dn_pipe *curr_p, *p ;
    struct dn_flow_set *fs, *curr_fs;

    lck_mtx_lock(dn_mutex);

    /* remove all references to pipes ...*/
    flush_pipe_ptrs(NULL);
    /* prevent future matches... */
    p = all_pipes ;
    all_pipes = NULL ;
    fs = all_flow_sets ;
    all_flow_sets = NULL ;
    /* and free heaps so we don't have unwanted events */
    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);

    /*
     * Now purge all queued pkts and delete all pipes
     */
    /* scan and purge all flow_sets. */
    for ( ; fs ; ) {
	curr_fs = fs ;
	fs = fs->next ;
	purge_flow_set(curr_fs, 1);
    }
    for ( ; p ; ) {
	purge_pipe(p);
	curr_p = p ;
	p = p->next ;
	FREE(curr_p, M_DUMMYNET);
    }
    lck_mtx_unlock(dn_mutex);
}


extern struct ip_fw *ip_fw_default_rule ;
static void
dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
{
    int i ;
    struct dn_flow_queue *q ;
    struct mbuf *m ;

    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (q = fs->rq[i] ; q ; q = q->next )
	    for (m = q->head ; m ; m = m->m_nextpkt ) {
		struct dn_pkt_tag *pkt = dn_tag_get(m) ;
		if (pkt->rule == r)
		    pkt->rule = ip_fw_default_rule ;
	    }
}
/*
 * when a firewall rule is deleted, scan all queues and remove the flow-id
 * from packets matching this rule.
 */
void
dn_rule_delete(void *r)
{
    struct dn_pipe *p ;
    struct dn_flow_set *fs ;
    struct dn_pkt_tag *pkt ;
    struct mbuf *m ;

    lck_mtx_lock(dn_mutex);

    /*
     * If the rule references a queue (dn_flow_set), then scan
     * the flow set, otherwise scan pipes. Should do either, but doing
     * both does not harm.
     */
    for ( fs = all_flow_sets ; fs ; fs = fs->next )
	dn_rule_delete_fs(fs, r);
    for ( p = all_pipes ; p ; p = p->next ) {
	fs = &(p->fs) ;
	dn_rule_delete_fs(fs, r);
	for (m = p->head ; m ; m = m->m_nextpkt ) {
	    pkt = dn_tag_get(m) ;
	    if (pkt->rule == r)
		pkt->rule = ip_fw_default_rule ;
	}
    }
    lck_mtx_unlock(dn_mutex);
}

/*
 * setup RED parameters
 */
static int
config_red(struct dn_flow_set *p, struct dn_flow_set * x)
{
    int i;

    x->w_q = p->w_q;
    x->min_th = SCALE(p->min_th);
    x->max_th = SCALE(p->max_th);
    x->max_p = p->max_p;

    x->c_1 = p->max_p / (p->max_th - p->min_th);
    x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
    if (x->flags_fs & DN_IS_GENTLE_RED) {
	x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
	x->c_4 = (SCALE(1) - 2 * p->max_p);
    }

    /* if the lookup table already exists, free it and create it again */
    if (x->w_q_lookup) {
	FREE(x->w_q_lookup, M_DUMMYNET);
	x->w_q_lookup = NULL ;
    }
    if (red_lookup_depth == 0) {
	printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
	FREE(x, M_DUMMYNET);
	return EINVAL;
    }
    x->lookup_depth = red_lookup_depth;
    x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
		M_DUMMYNET, M_DONTWAIT);
    if (x->w_q_lookup == NULL) {
	printf("dummynet: sorry, cannot allocate red lookup table\n");
	FREE(x, M_DUMMYNET);
	return ENOSPC;
    }

    /* fill the lookup table with (1 - w_q)^x */
    x->lookup_step = p->lookup_step ;
    x->lookup_weight = p->lookup_weight ;
    x->w_q_lookup[0] = SCALE(1) - x->w_q;
    for (i = 1; i < x->lookup_depth; i++)
	x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
    if (red_avg_pkt_size < 1)
	red_avg_pkt_size = 512 ;
    x->avg_pkt_size = red_avg_pkt_size ;
    if (red_max_pkt_size < 1)
	red_max_pkt_size = 1500 ;
    x->max_pkt_size = red_max_pkt_size ;
    return 0 ;
}

static int
alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
{
    if (x->flags_fs & DN_HAVE_FLOW_MASK) {	/* allocate some slots */
	int l = pfs->rq_size;

	if (l == 0)
	    l = dn_hash_size;
	if (l < 4)
	    l = 4;
	else if (l > DN_MAX_HASH_SIZE)
	    l = DN_MAX_HASH_SIZE;
	x->rq_size = l;
    } else			/* one is enough for null mask */
	x->rq_size = 1;
    x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
	    M_DUMMYNET, M_DONTWAIT | M_ZERO);
    if (x->rq == NULL) {
	printf("dummynet: sorry, cannot allocate queue\n");
	return ENOSPC;
    }
    x->rq_elements = 0;
    return 0 ;
}

static void
set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
{
    x->flags_fs = src->flags_fs;
    x->qsize = src->qsize;
    x->plr = src->plr;
    x->flow_mask = src->flow_mask;
    if (x->flags_fs & DN_QSIZE_IS_BYTES) {
	if (x->qsize > 1024*1024)
	    x->qsize = 1024*1024 ;
    } else {
	if (x->qsize == 0)
	    x->qsize = 50 ;
	if (x->qsize > 100)
	    x->qsize = 50 ;
    }
    /* configuring RED */
    if ( x->flags_fs & DN_IS_RED )
	config_red(src, x) ;	/* XXX should check errors */
}

1628/*
1629 * setup pipe or queue parameters.
1630 */
1631
91447636 1632static int
9bccf70c
A
1633config_pipe(struct dn_pipe *p)
1634{
91447636 1635 int i, r;
9bccf70c 1636 struct dn_flow_set *pfs = &(p->fs);
91447636 1637 struct dn_flow_queue *q;
9bccf70c 1638
91447636
A
1639 /*
1640 * The config program passes parameters as follows:
9bccf70c
A
1641 * bw = bits/second (0 means no limits),
1642 * delay = ms, must be translated into ticks.
1643 * qsize = slots/bytes
91447636 1644 */
0c530ab8 1645 p->delay = ( p->delay * (hz*10) ) / 1000 ;
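    /*
     * Worked example (hypothetical values): the pipe clock counts in
     * units of 1/(hz*10) s, so with hz = 100 a requested delay of 25 ms
     * becomes (25 * 1000) / 1000 = 25 ticks of 1 ms each.  The reverse
     * conversion in dummynet_get() turns the ticks back into ms.
     */
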
    /* We need either a pipe number or a flow_set number. */
    if (p->pipe_nr == 0 && pfs->fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && pfs->fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is a pipe */
	struct dn_pipe *x, *a, *b;

	lck_mtx_lock(dn_mutex);
	/* locate pipe */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
	    x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ;
	    if (x == NULL) {
		lck_mtx_unlock(dn_mutex);
		printf("dummynet: no memory for new pipe\n");
		return ENOSPC;
	    }
	    x->pipe_nr = p->pipe_nr;
	    x->fs.pipe = x ;
	    /* idle_heap is the only heap we ever extract from the middle of. */
	    x->idle_heap.size = x->idle_heap.elements = 0 ;
	    x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
	} else {
	    x = b;
	    /* Flush accumulated credit for all queues. */
	    for (i = 0; i <= x->fs.rq_size; i++)
		for (q = x->fs.rq[i]; q; q = q->next)
		    q->numbytes = 0;
	}

	x->bandwidth = p->bandwidth ;
	x->numbytes = 0; /* just in case... */
	bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
	x->ifp = NULL ;	/* reset interface ptr */
	x->delay = p->delay ;
	set_fs_parms(&(x->fs), pfs);

	if ( x->fs.rq == NULL ) {	/* a new pipe */
	    r = alloc_hash(&(x->fs), pfs) ;
	    if (r) {
		lck_mtx_unlock(dn_mutex);
		FREE(x, M_DUMMYNET);
		return r ;
	    }
	    x->next = b ;
	    if (a == NULL)
		all_pipes = x ;
	    else
		a->next = x ;
	}
	lck_mtx_unlock(dn_mutex);
    } else { /* config queue */
	struct dn_flow_set *x, *a, *b ;

	lck_mtx_lock(dn_mutex);
	/* locate flow_set */
	for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
	    if (pfs->parent_nr == 0) { /* need link to a pipe */
		lck_mtx_unlock(dn_mutex);
		return EINVAL ;
	    }
	    x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
	    if (x == NULL) {
		lck_mtx_unlock(dn_mutex);
		printf("dummynet: no memory for new flow_set\n");
		return ENOSPC;
	    }
	    x->fs_nr = pfs->fs_nr;
	    x->parent_nr = pfs->parent_nr;
	    x->weight = pfs->weight ;
	    if (x->weight == 0)
		x->weight = 1 ;
	    else if (x->weight > 100)
		x->weight = 100 ;
	} else {
	    /* Changing the parent pipe is not allowed; delete and recreate instead. */
	    if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
		lck_mtx_unlock(dn_mutex);
		return EINVAL ;
	    }
	    x = b;
	}
	set_fs_parms(x, pfs);

	if ( x->rq == NULL ) {	/* a new flow_set */
	    r = alloc_hash(x, pfs) ;
	    if (r) {
		lck_mtx_unlock(dn_mutex);
		FREE(x, M_DUMMYNET);
		return r ;
	    }
	    x->next = b;
	    if (a == NULL)
		all_flow_sets = x;
	    else
		a->next = x;
	}
	lck_mtx_unlock(dn_mutex);
    }
    return 0 ;
}
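
/*
 * For illustration only, a minimal (hypothetical) user-space sketch of
 * how a pipe reaches config_pipe(): fill in a struct dn_pipe and push it
 * down a raw IP socket with the IP_DUMMYNET_CONFIGURE option, the same
 * path ipfw uses:
 *
 *	struct dn_pipe pipe;
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	bzero(&pipe, sizeof(pipe));
 *	pipe.pipe_nr = 1;		// pipe number; fs_nr must stay 0
 *	pipe.bandwidth = 1000000;	// bits/second, 0 = unlimited
 *	pipe.delay = 25;		// ms, converted to ticks above
 *	setsockopt(s, IPPROTO_IP, IP_DUMMYNET_CONFIGURE,
 *	    &pipe, sizeof(pipe));
 */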

/*
 * Helper function to remove from a heap all the queues which are linked
 * to a flow_set about to be deleted.
 */
static void
fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
{
    int i = 0, found = 0 ;
    for (; i < h->elements ;)
	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
	    h->elements-- ;
	    h->p[i] = h->p[h->elements] ;
	    found++ ;
	} else
	    i++ ;
    if (found)
	heapify(h);
}
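
/*
 * The loop above removes matching entries by overwriting them with the
 * last element, which is O(1) per removal but breaks the heap ordering;
 * a single heapify() at the end rebuilds the invariant once.
 */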

/*
 * Helper function to remove a pipe from a heap (it can be there at most once).
 */
static void
pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
{
    if (h->elements > 0) {
	int i = 0 ;
	for (i=0; i < h->elements ; i++ ) {
	    if (h->p[i].object == p) { /* found it */
		h->elements-- ;
		h->p[i] = h->p[h->elements] ;
		heapify(h);
		break ;
	    }
	}
    }
}

/*
 * Drain all queues. Called in case of severe mbuf shortage.
 */
void
dummynet_drain(void)
{
    struct dn_flow_set *fs;
    struct dn_pipe *p;
    struct mbuf *m, *mnext;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);
    /* free the packets queued in every flow_set (arg 0 keeps the structures) */
    for (fs = all_flow_sets; fs; fs= fs->next )
	purge_flow_set(fs, 0);

    for (p = all_pipes; p; p= p->next ) {
	purge_flow_set(&(p->fs), 0);

	mnext = p->head;
	while ((m = mnext) != NULL) {
	    mnext = m->m_nextpkt;
	    DN_FREE_PKT(m);
	}
	p->head = p->tail = NULL ;
    }
}

/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 */
static int
delete_pipe(struct dn_pipe *p)
{
    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is an old-style pipe */
	struct dn_pipe *a, *b;
	struct dn_flow_set *fs;

	lck_mtx_lock(dn_mutex);
	/* locate pipe */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
	    lck_mtx_unlock(dn_mutex);
	    return EINVAL ; /* not found */
	}

	/* unlink from list of pipes */
	if (a == NULL)
	    all_pipes = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this pipe from the ip_fw rules */
	flush_pipe_ptrs(&(b->fs));

	/* remove all references to this pipe from flow_sets */
	for (fs = all_flow_sets; fs; fs= fs->next )
	    if (fs->pipe == b) {
		printf("dummynet: ++ ref to pipe %d from fs %d\n",
			p->pipe_nr, fs->fs_nr);
		fs->pipe = NULL ;
		purge_flow_set(fs, 0);
	    }
	fs_remove_from_heap(&ready_heap, &(b->fs));
	purge_pipe(b);	/* remove all data associated to this pipe */
	/* remove reference to here from extract_heap and wfq_ready_heap */
	pipe_remove_from_heap(&extract_heap, b);
	pipe_remove_from_heap(&wfq_ready_heap, b);
	lck_mtx_unlock(dn_mutex);

	FREE(b, M_DUMMYNET);
    } else { /* this is a WF2Q queue (dn_flow_set) */
	struct dn_flow_set *a, *b;

	lck_mtx_lock(dn_mutex);
	/* locate set */
	for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
	    lck_mtx_unlock(dn_mutex);
	    return EINVAL ; /* not found */
	}

	if (a == NULL)
	    all_flow_sets = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this flow_set from the ip_fw rules */
	flush_pipe_ptrs(b);

	if (b->pipe != NULL) {
	    /* Update total weight on parent pipe and cleanup parent heaps */
	    b->pipe->sum -= b->weight * b->backlogged ;
	    fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
	    fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
#if 1 /* XXX should i remove from idle_heap as well ? */
	    fs_remove_from_heap(&(b->pipe->idle_heap), b);
#endif
	}
	purge_flow_set(b, 1);
	lck_mtx_unlock(dn_mutex);
    }
    return 0 ;
}

/*
 * Helper function used to copy data from the kernel in DUMMYNET_GET.
 */
static char *
dn_copy_set(struct dn_flow_set *set, char *bp)
{
    int i, copied = 0 ;
    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    for (i = 0 ; i <= set->rq_size ; i++)
	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
	    if (q->hash_slot != i)
		printf("dummynet: ++ at %d: wrong slot (have %d, "
		    "should be %d)\n", copied, q->hash_slot, i);
	    if (q->fs != set)
		printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
		    i, q->fs, set);
	    copied++ ;
	    bcopy(q, qp, sizeof(*q));
	    /* cleanup pointers */
	    qp->next = NULL ;
	    qp->head = qp->tail = NULL ;
	    qp->fs = NULL ;
	}
    if (copied != set->rq_elements)
	printf("dummynet: ++ wrong count, have %d should be %d\n",
	    copied, set->rq_elements);
    return (char *)qp ;
}

static size_t
dn_calc_size(void)
{
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    size_t size ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    /*
     * Compute the size of the data structures: the lists of pipes and
     * flow_sets, plus one dn_flow_queue record per active queue.
     */
    for (p = all_pipes, size = 0 ; p ; p = p->next )
	size += sizeof(*p) +
	    p->fs.rq_elements * sizeof(struct dn_flow_queue);
    for (set = all_flow_sets ; set ; set = set->next )
	size += sizeof(*set) +
	    set->rq_elements * sizeof(struct dn_flow_queue);
    return size ;
}

static int
dummynet_get(struct sockopt *sopt)
{
    char *buf, *bp ;	/* bp is the "copy-pointer" */
    size_t size ;
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    int error=0, i ;

    /* XXX lock held too long */
    lck_mtx_lock(dn_mutex);
    /*
     * XXX: Ugly, but we need to allocate memory with the M_WAITOK flag
     * and we cannot use it while holding a mutex.  So drop the lock,
     * allocate, re-lock, and retry (up to 10 times) until the computed
     * size is stable across the unlocked window.
     */
    for (i = 0; i < 10; i++) {
	size = dn_calc_size();
	lck_mtx_unlock(dn_mutex);
	buf = _MALLOC(size, M_TEMP, M_WAITOK);
	lck_mtx_lock(dn_mutex);
	if (size == dn_calc_size())
	    break;
	FREE(buf, M_TEMP);
	buf = NULL;
    }
    if (buf == NULL) {
	lck_mtx_unlock(dn_mutex);
	return ENOBUFS ;
    }
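
    /*
     * Layout of the buffer built below (and sized by dn_calc_size):
     * each dn_pipe record is immediately followed by the dn_flow_queue
     * records of its flow_set, then each stand-alone dn_flow_set is
     * followed by its own queue records.  The ->next field is overloaded
     * with DN_IS_PIPE / DN_IS_QUEUE so userland can tell the two kinds
     * of records apart while parsing.
     */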
    for (p = all_pipes, bp = buf ; p ; p = p->next ) {
	struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;

	/*
	 * Copy the pipe descriptor into *bp, convert the delay back to ms,
	 * then copy the flow_set descriptor(s) one at a time.
	 * After each flow_set, copy the queue descriptors it owns.
	 */
	bcopy(p, bp, sizeof(*p));
	pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ;
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set. The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 */
	pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
	/* clean pointers */
	pipe_bp->head = pipe_bp->tail = NULL ;
	pipe_bp->fs.next = NULL ;
	pipe_bp->fs.pipe = NULL ;
	pipe_bp->fs.rq = NULL ;

	bp += sizeof(*p);
	bp = dn_copy_set( &(p->fs), bp );
    }
    for (set = all_flow_sets ; set ; set = set->next ) {
	struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
	bcopy(set, bp, sizeof(*set));
	/* XXX same hack as above */
	fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
	fs_bp->pipe = NULL ;
	fs_bp->rq = NULL ;
	bp += sizeof(*set);
	bp = dn_copy_set( set, bp );
    }
    lck_mtx_unlock(dn_mutex);

    error = sooptcopyout(sopt, buf, size);
    FREE(buf, M_TEMP);
    return error ;
}

/*
 * Handler for the various dummynet socket options (get, flush, config, del).
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
    int error = 0 ;
    struct dn_pipe *p, tmp_pipe;

    /* Disallow sets in really-really secure mode. */
    if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
	return (EPERM);

    switch (sopt->sopt_name) {
    default :
	printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
	return EINVAL ;

    case IP_DUMMYNET_GET :
	error = dummynet_get(sopt);
	break ;

    case IP_DUMMYNET_FLUSH :
	dummynet_flush() ;
	break ;

    case IP_DUMMYNET_CONFIGURE :
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof(*p), sizeof(*p));
	if (error)
	    break ;
	error = config_pipe(p);
	break ;

    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof(*p), sizeof(*p));
	if (error)
	    break ;
	error = delete_pipe(p);
	break ;
    }
    return error ;
}

void
ip_dn_init(void)
{
    /* setup locks */
    dn_mutex_grp_attr = lck_grp_attr_alloc_init();
    dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
    dn_mutex_attr = lck_attr_alloc_init();

    if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) {
	printf("ip_dn_init: can't alloc dn_mutex\n");
	return;
    }

    all_pipes = NULL ;
    all_flow_sets = NULL ;
    ready_heap.size = ready_heap.elements = 0 ;
    ready_heap.offset = 0 ;

    wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
    wfq_ready_heap.offset = 0 ;

    extract_heap.size = extract_heap.elements = 0 ;
    extract_heap.offset = 0 ;
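
    /*
     * Publish the entry points: these function pointers are how the
     * firewall and IP paths reach dummynet for control, packet I/O and
     * rule deletion while this code is present.
     */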
    ip_dn_ctl_ptr = ip_dn_ctl;
    ip_dn_io_ptr = dummynet_io;
    ip_dn_ruledel_ptr = dn_rule_delete;
}