1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
24 * Portions Copyright (c) 2000 Akamba Corp.
25 * All rights reserved
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
37 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
39 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
41 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
42 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
44 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
45 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 * SUCH DAMAGE.
47 *
48 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
49 */
50
51 #define DUMMYNET_DEBUG
52
53 /*
54 * This module implements IP dummynet, a bandwidth limiter/delay emulator
55 * used in conjunction with the ipfw package.
56 * Description of the data structures used is in ip_dummynet.h
57 * Here you mainly find the following blocks of code:
58 * + variable declarations;
59 * + heap management functions;
60 * + scheduler and dummynet functions;
61 * + configuration and initialization.
62 *
63 * NOTA BENE: critical sections are protected by the "dummynet lock".
64 *
65 * Most important Changes:
66 *
67 * 010124: Fixed WF2Q behaviour
68 * 010122: Fixed spl protection.
69 * 000601: WF2Q support
70 * 000106: large rewrite, use heaps to handle very many pipes.
71 * 980513: initial release
72 *
73 * include files marked with XXX are probably not needed
74 */
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/queue.h> /* XXX */
81 #include <sys/kernel.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/time.h>
85 #include <sys/sysctl.h>
86 #include <net/if.h>
87 #include <net/route.h>
88 #include <net/kpi_protocol.h>
89 #include <netinet/in.h>
90 #include <netinet/in_systm.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip.h>
93 #include <netinet/ip_fw.h>
94 #include <netinet/ip_dummynet.h>
95 #include <netinet/ip_var.h>
96
97 #if BRIDGE
98 #include <netinet/if_ether.h> /* for struct arpcom */
99 #include <net/bridge.h>
100 #endif
101
102 /*
103 * We keep a private variable for the simulation time, but we could
104 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
105 */
106 static dn_key curr_time = 0 ; /* current simulation time */
107
108 static int dn_hash_size = 64 ; /* default hash size */
109
110 /* statistics on number of queue searches and search steps */
111 static int searches, search_steps ;
112 static int pipe_expire = 1 ; /* expire queue if empty */
113 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
114
115 static int red_lookup_depth = 256; /* RED - default lookup table depth */
116 static int red_avg_pkt_size = 512; /* RED - default medium packet size */
117 static int red_max_pkt_size = 1500; /* RED - default max packet size */
118
119 /*
120 * Three heaps contain queues and pipes that the scheduler handles:
121 *
122 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
123 *
124 * wfq_ready_heap contains the pipes associated with WF2Q flows
125 *
126 * extract_heap contains pipes associated with delay lines.
127 *
128 */
129 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
130
131 static int heap_init(struct dn_heap *h, int size) ;
132 static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
133 static void heap_extract(struct dn_heap *h, void *obj);
134
135 static void transmit_event(struct dn_pipe *pipe);
136 static void ready_event(struct dn_flow_queue *q);
137
138 static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */
139 static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
140
141 #ifdef SYSCTL_NODE
142 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
143 CTLFLAG_RW, 0, "Dummynet");
144 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
145 CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
146 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
147 CTLFLAG_RD, &curr_time, 0, "Current tick");
148 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
149 CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
150 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
151 CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
152 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
153 CTLFLAG_RD, &searches, 0, "Number of queue searches");
154 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
155 CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
156 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
157 CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
158 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
159 CTLFLAG_RW, &dn_max_ratio, 0,
160 "Max ratio between dynamic queues and buckets");
161 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
162 CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
163 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
164 CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
165 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
166 CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
167 #endif
168
169 #ifdef DUMMYNET_DEBUG
170 int dummynet_debug = 0;
171 #ifdef SYSCTL_NODE
172 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
173 0, "control debugging printfs");
174 #endif
175 #define DPRINTF(X) if (dummynet_debug) printf X
176 #else
177 #define DPRINTF(X)
178 #endif
179
180 /* dummynet lock */
181 lck_grp_t *dn_mutex_grp;
182 lck_grp_attr_t *dn_mutex_grp_attr;
183 lck_attr_t *dn_mutex_attr;
184 lck_mtx_t *dn_mutex;
185
186 static int config_pipe(struct dn_pipe *p);
187 static int ip_dn_ctl(struct sockopt *sopt);
188
189 static void dummynet(void *);
190 static void dummynet_flush(void);
191 void dummynet_drain(void);
192 static ip_dn_io_t dummynet_io;
193 static void dn_rule_delete(void *);
194
195 int if_tx_rdy(struct ifnet *ifp);
196
197 extern lck_mtx_t *rt_mtx; /* route global lock */
198
199 /*
200 * Heap management functions.
201 *
202 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
203 * Some macros help find the parent/children so we can optimize them.
204 *
205 * heap_init() is called to expand the heap when needed.
206 * Increment size in blocks of 16 entries.
207 * XXX failure to allocate a new element is a pretty bad failure
208 * as we basically stall a whole queue forever!!
209 * Returns 1 on error, 0 on success
210 */
211 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
212 #define HEAP_LEFT(x) ( 2*(x) + 1 )
213 #define HEAP_IS_LEFT(x) ( (x) & 1 )
214 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
215 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
216 #define HEAP_INCREMENT 15
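/*
 * Worked example of the macros above (illustrative values only): for node
 * x == 5, HEAP_FATHER(5) == 2, HEAP_LEFT(5) == 11 and HEAP_RIGHT(5) == 12;
 * with HEAP_INCREMENT == 15, a request to grow the heap to 20 slots is
 * rounded up to (20 + 15) & ~15 == 32 entries.
 */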
217
218 static int
219 heap_init(struct dn_heap *h, int new_size)
220 {
221 struct dn_heap_entry *p;
222
223 if (h->size >= new_size ) {
224 printf("dummynet: heap_init, Bogus call, have %d want %d\n",
225 h->size, new_size);
226 return 0 ;
227 }
228 new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
229 p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
230 if (p == NULL) {
231 printf("dummynet: heap_init, resize %d failed\n", new_size );
232 return 1 ; /* error */
233 }
234 if (h->size > 0) {
235 bcopy(h->p, p, h->size * sizeof(*p) );
236 FREE(h->p, M_DUMMYNET);
237 }
238 h->p = p ;
239 h->size = new_size ;
240 return 0 ;
241 }
242
243 /*
244 * Insert element in heap. Normally, p != NULL, we insert p in
245 * a new position and bubble up. If p == NULL, then the element is
246 * already in place, and key is the position where to start the
247 * bubble-up.
248 * Returns 1 on failure (cannot allocate new heap entry)
249 *
250 * If offset > 0 the position (index, int) of the element in the heap is
251 * also stored in the element itself at the given offset in bytes.
252 */
253 #define SET_OFFSET(heap, node) \
254 if (heap->offset > 0) \
255 *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
256 /*
257 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
258 */
259 #define RESET_OFFSET(heap, node) \
260 if (heap->offset > 0) \
261 *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
262 static int
263 heap_insert(struct dn_heap *h, dn_key key1, void *p)
264 {
265 int son = h->elements ;
266
267 if (p == NULL) /* data already there, set starting point */
268 son = key1 ;
269 else { /* insert new element at the end, possibly resize */
270 son = h->elements ;
271 if (son == h->size) /* need resize... */
272 if (heap_init(h, h->elements+1) )
273 return 1 ; /* failure... */
274 h->p[son].object = p ;
275 h->p[son].key = key1 ;
276 h->elements++ ;
277 }
278 while (son > 0) { /* bubble up */
279 int father = HEAP_FATHER(son) ;
280 struct dn_heap_entry tmp ;
281
282 if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
283 break ; /* found right position */
284 /* son smaller than father, swap and repeat */
285 HEAP_SWAP(h->p[son], h->p[father], tmp) ;
286 SET_OFFSET(h, son);
287 son = father ;
288 }
289 SET_OFFSET(h, son);
290 return 0 ;
291 }
292
293 /*
294 * remove top element from heap, or obj if obj != NULL
295 */
296 static void
297 heap_extract(struct dn_heap *h, void *obj)
298 {
299 int child, father, max = h->elements - 1 ;
300
301 if (max < 0) {
302 printf("dummynet: warning, extract from empty heap 0x%p\n", h);
303 return ;
304 }
305 father = 0 ; /* default: move up smallest child */
306 if (obj != NULL) { /* extract specific element, index is at offset */
307 if (h->offset <= 0)
308 panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
309 father = *((int *)((char *)obj + h->offset)) ;
310 if (father < 0 || father >= h->elements) {
311 printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
312 father, h->elements);
313 panic("dummynet: heap_extract");
314 }
315 }
316 RESET_OFFSET(h, father);
317 child = HEAP_LEFT(father) ; /* left child */
318 while (child <= max) { /* valid entry */
319 if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
320 child = child+1 ; /* take right child, otherwise left */
321 h->p[father] = h->p[child] ;
322 SET_OFFSET(h, father);
323 father = child ;
324 child = HEAP_LEFT(child) ; /* left child for next loop */
325 }
326 h->elements-- ;
327 if (father != max) {
328 /*
329 * Fill hole with last entry and bubble up, reusing the insert code
330 */
331 h->p[father] = h->p[max] ;
332 heap_insert(h, father, NULL); /* this one cannot fail */
333 }
334 }
335
336 #if 0
337 /*
338 * change object position and update references
339 * XXX this one is never used!
340 */
341 static void
342 heap_move(struct dn_heap *h, dn_key new_key, void *object)
343 {
344 int temp;
345 int i ;
346 int max = h->elements-1 ;
347 struct dn_heap_entry buf ;
348
349 if (h->offset <= 0)
350 panic("cannot move items on this heap");
351
352 i = *((int *)((char *)object + h->offset));
353 if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
354 h->p[i].key = new_key ;
355 for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
356 i = temp ) { /* bubble up */
357 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
358 SET_OFFSET(h, i);
359 }
360 } else { /* must move down */
361 h->p[i].key = new_key ;
362 while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
363 if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
364 temp++ ; /* select child with min key */
365 if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
366 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
367 SET_OFFSET(h, i);
368 } else
369 break ;
370 i = temp ;
371 }
372 }
373 SET_OFFSET(h, i);
374 }
375 #endif /* heap_move, unused */
376
377 /*
378 * heapify() will reorganize data inside an array to maintain the
379 * heap property. It is needed when we delete a bunch of entries.
380 */
381 static void
382 heapify(struct dn_heap *h)
383 {
384 int i ;
385
386 for (i = 0 ; i < h->elements ; i++ )
387 heap_insert(h, i , NULL) ;
388 }
389
390 /*
391 * cleanup the heap and free data structure
392 */
393 static void
394 heap_free(struct dn_heap *h)
395 {
396 if (h->size >0 )
397 FREE(h->p, M_DUMMYNET);
398 bzero(h, sizeof(*h) );
399 }
400
401 /*
402 * --- end of heap management functions ---
403 */
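
/*
 * Minimal usage sketch of the heap API above (hypothetical caller, assuming
 * dn_mutex is held and "q" points at a struct dn_flow_queue):
 *
 *	struct dn_heap h;
 *	bzero(&h, sizeof(h));				(size, elements, offset all 0)
 *	heap_insert(&h, curr_time + 10, (void *)q);	(schedule 10 ticks ahead)
 *	heap_extract(&h, NULL);				(remove the smallest-key entry)
 *	heap_free(&h);					(release the backing array)
 */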
404
405 /*
406 * Return the mbuf tag holding the dummynet state. As an optimization
407 * this is assumed to be the first tag on the list. If this turns out
408 * wrong we'll need to search the list.
409 */
410 static struct dn_pkt_tag *
411 dn_tag_get(struct mbuf *m)
412 {
413 struct m_tag *mtag = m_tag_first(m);
414 /* KASSERT(mtag != NULL &&
415 mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
416 mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
417 ("packet on dummynet queue w/o dummynet tag!"));
418 */
419 return (struct dn_pkt_tag *)(mtag+1);
420 }
421
422 /*
423 * Scheduler functions:
424 *
425 * transmit_event() is called when the delay-line needs to enter
426 * the scheduler, either because of existing pkts getting ready,
427 * or new packets entering the queue. The event handled is the delivery
428 * time of the packet.
429 *
430 * ready_event() does something similar with fixed-rate queues, and the
431 * event handled is the finish time of the head pkt.
432 *
433 * wfq_ready_event() does something similar with WF2Q queues, and the
434 * event handled is the start time of the head pkt.
435 *
436 * In all cases, we make sure that the data structures are consistent
437 * before passing pkts out, because this might trigger recursive
438 * invocations of the procedures.
439 */
440 static void
441 transmit_event(struct dn_pipe *pipe)
442 {
443 struct mbuf *m ;
444 struct dn_pkt_tag *pkt ;
445 struct ip *ip;
446
447 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
448
449 while ( (m = pipe->head) ) {
450 pkt = dn_tag_get(m);
451 if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
452 break;
453 /*
454 * first unlink, then call procedures, since ip_input() can invoke
455 * ip_output() and vice versa, thus causing nested calls
456 */
457 pipe->head = m->m_nextpkt ;
458 m->m_nextpkt = NULL;
459
460 /* XXX: drop the lock for now to avoid LOR's */
461 lck_mtx_unlock(dn_mutex);
462 switch (pkt->dn_dir) {
463 case DN_TO_IP_OUT: {
464 struct route tmp_rt = pkt->ro;
465 (void)ip_output(m, NULL, NULL, pkt->flags, NULL);
466 if (tmp_rt.ro_rt) {
467 rtfree(tmp_rt.ro_rt);
468 }
469 break ;
470 }
471 case DN_TO_IP_IN :
472 ip = mtod(m, struct ip *);
473 ip->ip_len = htons(ip->ip_len);
474 ip->ip_off = htons(ip->ip_off);
475 proto_inject(PF_INET, m);
476 break ;
477
478 #if BRIDGE
479 case DN_TO_BDG_FWD :
480 /*
481 * The bridge requires/assumes the Ethernet header is
482 * contiguous in the first mbuf header. Insure this is true.
483 */
484 if (BDG_LOADED) {
485 if (m->m_len < ETHER_HDR_LEN &&
486 (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
487 printf("dummynet/bridge: pullup fail, dropping pkt\n");
488 break;
489 }
490 m = bdg_forward_ptr(m, pkt->ifp);
491 } else {
492 /* somebody unloaded the bridge module. Drop pkt */
493 /* XXX rate limit */
494 printf("dummynet: dropping bridged packet trapped in pipe\n");
495 }
496 if (m)
497 m_freem(m);
498 break;
499 #endif
500 default:
501 printf("dummynet: bad switch %d!\n", pkt->dn_dir);
502 m_freem(m);
503 break ;
504 }
505 lck_mtx_lock(dn_mutex);
506 }
507 /* if there are leftover packets, put into the heap for next event */
508 if ( (m = pipe->head) ) {
509 pkt = dn_tag_get(m);
510 /* XXX should check errors on heap_insert, by draining the
511 * whole pipe p and hoping in the future we are more successful
512 */
513 heap_insert(&extract_heap, pkt->output_time, pipe);
514 }
515 }
516
517 /*
518 * the following macro computes how many ticks we have to wait
519 * before being able to transmit a packet. The credit is taken from
520 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
521 */
522 #define SET_TICKS(_m, q, p) \
523 ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
524 p->bandwidth ;
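
/*
 * For example (hypothetical numbers): with hz == 100, p->bandwidth ==
 * 1000000 bit/s, a 1500-byte packet and q->numbytes == 0, SET_TICKS
 * computes (1500*8*100 - 0 + 1000000 - 1) / 1000000 == 2 ticks, i.e. the
 * transmission time rounded up to whole ticks.
 */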
525
526 /*
527 * extract pkt from queue, compute output time (could be now)
528 * and put into delay line (p_queue)
529 */
530 static void
531 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
532 struct dn_pipe *p, int len)
533 {
534 struct dn_pkt_tag *dt = dn_tag_get(pkt);
535
536 q->head = pkt->m_nextpkt ;
537 q->len-- ;
538 q->len_bytes -= len ;
539
540 dt->output_time = curr_time + p->delay ;
541
542 if (p->head == NULL)
543 p->head = pkt;
544 else
545 p->tail->m_nextpkt = pkt;
546 p->tail = pkt;
547 p->tail->m_nextpkt = NULL;
548 }
549
550 /*
551 * ready_event() is invoked every time the queue must enter the
552 * scheduler, either because the first packet arrives, or because
553 * a previously scheduled event fired.
554 * On invocation, drain as many pkts as possible (could be 0) and then
555 * if there are leftover packets reinsert the pkt in the scheduler.
556 */
557 static void
558 ready_event(struct dn_flow_queue *q)
559 {
560 struct mbuf *pkt;
561 struct dn_pipe *p = q->fs->pipe ;
562 int p_was_empty ;
563
564 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
565
566 if (p == NULL) {
567 printf("dummynet: ready_event- pipe is gone\n");
568 return ;
569 }
570 p_was_empty = (p->head == NULL) ;
571
572 /*
573 * schedule fixed-rate queues linked to this pipe:
574 * Account for the bw accumulated since last scheduling, then
575 * drain as many pkts as allowed by q->numbytes and move to
576 * the delay line (in p) computing output time.
577 * bandwidth==0 (no limit) means we can drain the whole queue,
578 * setting len_scaled = 0 does the job.
579 */
580 q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
581 while ( (pkt = q->head) != NULL ) {
582 int len = pkt->m_pkthdr.len;
583 int len_scaled = p->bandwidth ? len*8*hz : 0 ;
584 if (len_scaled > q->numbytes )
585 break ;
586 q->numbytes -= len_scaled ;
587 move_pkt(pkt, q, p, len);
588 }
589 /*
590 * If we have more packets queued, schedule next ready event
591 * (can only occur when bandwidth != 0, otherwise we would have
592 * flushed the whole queue in the previous loop).
593 * To this purpose we record the current time and compute how many
594 * ticks to go for the finish time of the packet.
595 */
596 if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
597 dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
598 q->sched_time = curr_time ;
599 heap_insert(&ready_heap, curr_time + t, (void *)q );
600 /* XXX should check errors on heap_insert, and drain the whole
601 * queue on error hoping next time we are luckier.
602 */
603 } else { /* RED needs to know when the queue becomes empty */
604 q->q_time = curr_time;
605 q->numbytes = 0;
606 }
607 /*
608 * If the delay line was empty call transmit_event(p) now.
609 * Otherwise, the scheduler will take care of it.
610 */
611 if (p_was_empty)
612 transmit_event(p);
613 }
614
615 /*
616 * Called when we can transmit packets on WF2Q queues. Take pkts out of
617 * the queues at their start time, and enqueue into the delay line.
618 * Packets are drained until p->numbytes < 0. As long as
619 * len_scaled >= p->numbytes, the packet goes into the delay line
620 * with a deadline p->delay. For the last packet, if p->numbytes<0,
621 * there is an additional delay.
622 */
623 static void
624 ready_event_wfq(struct dn_pipe *p)
625 {
626 int p_was_empty = (p->head == NULL) ;
627 struct dn_heap *sch = &(p->scheduler_heap);
628 struct dn_heap *neh = &(p->not_eligible_heap) ;
629
630 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
631
632 if (p->if_name[0] == 0) /* tx clock is simulated */
633 p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
634 else { /* tx clock is for real, the ifq must be empty or this is a NOP */
635 if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
636 return ;
637 else {
638 DPRINTF(("dummynet: pipe %d ready from %s --\n",
639 p->pipe_nr, p->if_name));
640 }
641 }
642
643 /*
644 * While we have backlogged traffic AND credit, we need to do
645 * something on the queue.
646 */
647 while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
648 if (sch->elements > 0) { /* have some eligible pkts to send out */
649 struct dn_flow_queue *q = sch->p[0].object ;
650 struct mbuf *pkt = q->head;
651 struct dn_flow_set *fs = q->fs;
652 u_int64_t len = pkt->m_pkthdr.len;
653 int len_scaled = p->bandwidth ? len*8*hz : 0 ;
654
655 heap_extract(sch, NULL); /* remove queue from heap */
656 p->numbytes -= len_scaled ;
657 move_pkt(pkt, q, p, len);
658
659 p->V += (len<<MY_M) / p->sum ; /* update V */
660 q->S = q->F ; /* update start time */
661 if (q->len == 0) { /* Flow not backlogged any more */
662 fs->backlogged-- ;
663 heap_insert(&(p->idle_heap), q->F, q);
664 } else { /* still backlogged */
665 /*
666 * update F and position in backlogged queue, then
667 * put flow in not_eligible_heap (we will fix this later).
668 */
669 len = (q->head)->m_pkthdr.len;
670 q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
671 if (DN_KEY_LEQ(q->S, p->V))
672 heap_insert(neh, q->S, q);
673 else
674 heap_insert(sch, q->F, q);
675 }
676 }
677 /*
678 * now compute V = max(V, min(S_i)). Remember that all elements in sch
679 * have by definition S_i <= V so if sch is not empty, V is surely
680 * the max and we must not update it. Conversely, if sch is empty
681 * we only need to look at neh.
682 */
683 if (sch->elements == 0 && neh->elements > 0)
684 p->V = MAX64 ( p->V, neh->p[0].key );
685 /* move from neh to sch any packets that have become eligible */
686 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
687 struct dn_flow_queue *q = neh->p[0].object ;
688 heap_extract(neh, NULL);
689 heap_insert(sch, q->F, q);
690 }
691
692 if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
693 p->numbytes = -1 ; /* mark not ready for I/O */
694 break ;
695 }
696 }
697 if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
698 && p->idle_heap.elements > 0) {
699 /*
700 * no traffic and no events scheduled. We can get rid of idle-heap.
701 */
702 int i ;
703
704 for (i = 0 ; i < p->idle_heap.elements ; i++) {
705 struct dn_flow_queue *q = p->idle_heap.p[i].object ;
706
707 q->F = 0 ;
708 q->S = q->F + 1 ;
709 }
710 p->sum = 0 ;
711 p->V = 0 ;
712 p->idle_heap.elements = 0 ;
713 }
714 /*
715 * If we are getting clocks from dummynet (not a real interface) and
716 * If we are under credit, schedule the next ready event.
717 * Also fix the delivery time of the last packet.
718 */
719 if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
720 dn_key t=0 ; /* number of ticks i have to wait */
721
722 if (p->bandwidth > 0)
723 t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
724 dn_tag_get(p->tail)->output_time += t ;
725 p->sched_time = curr_time ;
726 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
727 /* XXX should check errors on heap_insert, and drain the whole
728 * queue on error hoping next time we are luckier.
729 */
730 }
731 /*
732 * If the delay line was empty call transmit_event(p) now.
733 * Otherwise, the scheduler will take care of it.
734 */
735 if (p_was_empty)
736 transmit_event(p);
737 }
738
739 /*
740 * This is called once per tick, or HZ times per second. It is used to
741 * increment the current tick counter and schedule expired events.
742 */
743 static void
744 dummynet(void * __unused unused)
745 {
746 void *p ; /* generic parameter to handler */
747 struct dn_heap *h ;
748 struct dn_heap *heaps[3];
749 int i;
750 struct dn_pipe *pe ;
751
752 heaps[0] = &ready_heap ; /* fixed-rate queues */
753 heaps[1] = &wfq_ready_heap ; /* wfq queues */
754 heaps[2] = &extract_heap ; /* delay line */
755
756 lck_mtx_lock(dn_mutex);
757
758 curr_time++ ;
759 for (i=0; i < 3 ; i++) {
760 h = heaps[i];
761 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
762 if (h->p[0].key > curr_time)
763 printf("dummynet: warning, heap %d is %d ticks late\n",
764 i, (int)(curr_time - h->p[0].key));
765 p = h->p[0].object ; /* store a copy before heap_extract */
766 heap_extract(h, NULL); /* need to extract before processing */
767 if (i == 0)
768 ready_event(p) ;
769 else if (i == 1) {
770 struct dn_pipe *pipe = p;
771 if (pipe->if_name[0] != '\0')
772 printf("dummynet: bad ready_event_wfq for pipe %s\n",
773 pipe->if_name);
774 else
775 ready_event_wfq(p) ;
776 } else
777 transmit_event(p);
778 }
779 }
780 /* sweep pipes trying to expire idle flow_queues */
781 for (pe = all_pipes; pe ; pe = pe->next )
782 if (pe->idle_heap.elements > 0 &&
783 DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
784 struct dn_flow_queue *q = pe->idle_heap.p[0].object ;
785
786 heap_extract(&(pe->idle_heap), NULL);
787 q->S = q->F + 1 ; /* mark timestamp as invalid */
788 pe->sum -= q->fs->weight ;
789 }
790
791 lck_mtx_unlock(dn_mutex);
792
793 timeout(dummynet, NULL, 1);
794 }
795
796 /*
797 * called by an interface when tx_rdy occurs.
798 */
799 int
800 if_tx_rdy(struct ifnet *ifp)
801 {
802 struct dn_pipe *p;
803
804 lck_mtx_lock(dn_mutex);
805 for (p = all_pipes; p ; p = p->next )
806 if (p->ifp == ifp)
807 break ;
808 if (p == NULL) {
809 char buf[32];
810 sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit);
811 for (p = all_pipes; p ; p = p->next )
812 if (!strcmp(p->if_name, buf) ) {
813 p->ifp = ifp ;
814 DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf));
815 break ;
816 }
817 }
818 if (p != NULL) {
819 DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
820 ifp->if_unit, ifp->if_snd.ifq_len));
821 p->numbytes = 0 ; /* mark ready for I/O */
822 ready_event_wfq(p);
823 }
824 lck_mtx_unlock(dn_mutex);
825
826 return 0;
827 }
828
829 /*
830 * Unconditionally expire empty queues in case of shortage.
831 * Returns the number of queues freed.
832 */
833 static int
834 expire_queues(struct dn_flow_set *fs)
835 {
836 struct dn_flow_queue *q, *prev ;
837 int i, initial_elements = fs->rq_elements ;
838 struct timeval timenow;
839
840 getmicrotime(&timenow);
841
842 if (fs->last_expired == timenow.tv_sec)
843 return 0 ;
844 fs->last_expired = timenow.tv_sec ;
845 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
846 for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
847 if (q->head != NULL || q->S != q->F+1) {
848 prev = q ;
849 q = q->next ;
850 } else { /* entry is idle, expire it */
851 struct dn_flow_queue *old_q = q ;
852
853 if (prev != NULL)
854 prev->next = q = q->next ;
855 else
856 fs->rq[i] = q = q->next ;
857 fs->rq_elements-- ;
858 FREE(old_q, M_DUMMYNET);
859 }
860 return initial_elements - fs->rq_elements ;
861 }
862
863 /*
864 * If room, create a new queue and put at head of slot i;
865 * otherwise, create or use the default queue.
866 */
867 static struct dn_flow_queue *
868 create_queue(struct dn_flow_set *fs, int i)
869 {
870 struct dn_flow_queue *q ;
871
872 if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
873 expire_queues(fs) == 0) {
874 /*
875 * No way to get room, use or create overflow queue.
876 */
877 i = fs->rq_size ;
878 if ( fs->rq[i] != NULL )
879 return fs->rq[i] ;
880 }
881 q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
882 if (q == NULL) {
883 printf("dummynet: sorry, cannot allocate queue for new flow\n");
884 return NULL ;
885 }
886 q->fs = fs ;
887 q->hash_slot = i ;
888 q->next = fs->rq[i] ;
889 q->S = q->F + 1; /* hack - mark timestamp as invalid */
890 fs->rq[i] = q ;
891 fs->rq_elements++ ;
892 return q ;
893 }
894
895 /*
896 * Given a flow_set and a pkt in last_pkt, find a matching queue
897 * after appropriate masking. The queue is moved to front
898 * so that further searches take less time.
899 */
900 static struct dn_flow_queue *
901 find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
902 {
903 int i = 0 ; /* we need i and q for new allocations */
904 struct dn_flow_queue *q, *prev;
905
906 if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
907 q = fs->rq[0] ;
908 else {
909 /* first, do the masking */
910 id->dst_ip &= fs->flow_mask.dst_ip ;
911 id->src_ip &= fs->flow_mask.src_ip ;
912 id->dst_port &= fs->flow_mask.dst_port ;
913 id->src_port &= fs->flow_mask.src_port ;
914 id->proto &= fs->flow_mask.proto ;
915 id->flags = 0 ; /* we don't care about this one */
916 /* then, hash function */
917 i = ( (id->dst_ip) & 0xffff ) ^
918 ( (id->dst_ip >> 15) & 0xffff ) ^
919 ( (id->src_ip << 1) & 0xffff ) ^
920 ( (id->src_ip >> 16 ) & 0xffff ) ^
921 (id->dst_port << 1) ^ (id->src_port) ^
922 (id->proto );
923 i = i % fs->rq_size ;
924 /* finally, scan the current list for a match */
925 searches++ ;
926 for (prev=NULL, q = fs->rq[i] ; q ; ) {
927 search_steps++;
928 if (id->dst_ip == q->id.dst_ip &&
929 id->src_ip == q->id.src_ip &&
930 id->dst_port == q->id.dst_port &&
931 id->src_port == q->id.src_port &&
932 id->proto == q->id.proto &&
933 id->flags == q->id.flags)
934 break ; /* found */
935 else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
936 /* entry is idle and not in any heap, expire it */
937 struct dn_flow_queue *old_q = q ;
938
939 if (prev != NULL)
940 prev->next = q = q->next ;
941 else
942 fs->rq[i] = q = q->next ;
943 fs->rq_elements-- ;
944 FREE(old_q, M_DUMMYNET);
945 continue ;
946 }
947 prev = q ;
948 q = q->next ;
949 }
950 if (q && prev != NULL) { /* found and not in front */
951 prev->next = q->next ;
952 q->next = fs->rq[i] ;
953 fs->rq[i] = q ;
954 }
955 }
956 if (q == NULL) { /* no match, need to allocate a new entry */
957 q = create_queue(fs, i);
958 if (q != NULL)
959 q->id = *id ;
960 }
961 return q ;
962 }
963
964 static int
965 red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
966 {
967 /*
968 * RED algorithm
969 *
970 * RED calculates the average queue size (avg) using a low-pass filter
971 * with an exponential weighted (w_q) moving average:
972 * avg <- (1-w_q) * avg + w_q * q_size
973 * where q_size is the queue length (measured in bytes or packets).
974 *
975 * If q_size == 0, we compute the idle time for the link, and set
976 * avg <- avg * (1 - w_q)^(idle/s)
977 * where s is the time needed for transmitting a medium-sized packet.
978 *
979 * Now, if avg < min_th the packet is enqueued.
980 * If avg > max_th the packet is dropped. Otherwise, the packet is
981 * dropped with probability P function of avg.
982 *
983 */
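
    /*
     * Worked example with hypothetical parameters (not taken from any real
     * configuration): min_th = 10 pkts, max_th = 30 pkts, max_p = 0.1.
     * Then c_1 = max_p/(max_th - min_th) = 0.005 and
     * c_2 = max_p*min_th/(max_th - min_th) = 0.05, so an average queue of
     * 20 pkts gives p_b = c_1*20 - c_2 = 0.05, i.e. a 5% base drop
     * probability (further scaled by q->count below).
     */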
984
985 int64_t p_b = 0;
986 /* queue in bytes or packets ? */
987 u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
988
989 DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
990
991 /* average queue size estimation */
992 if (q_size != 0) {
993 /*
994 * queue is not empty, avg <- avg + (q_size - avg) * w_q
995 */
996 int diff = SCALE(q_size) - q->avg;
997 int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
998
999 q->avg += (int) v;
1000 } else {
1001 /*
1002 * queue is empty, find for how long the queue has been
1003 * empty and use a lookup table for computing
1004 * (1 - w_q)^(idle_time/s) where s is the time to send a
1005 * (small) packet.
1006 * XXX check wraps...
1007 */
1008 if (q->avg) {
1009 u_int t = (curr_time - q->q_time) / fs->lookup_step;
1010
1011 q->avg = (t < fs->lookup_depth) ?
1012 SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
1013 }
1014 }
1015 DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1016
1017 /* should i drop ? */
1018
1019 if (q->avg < fs->min_th) {
1020 q->count = -1;
1021 return 0; /* accept packet ; */
1022 }
1023 if (q->avg >= fs->max_th) { /* average queue >= max threshold */
1024 if (fs->flags_fs & DN_IS_GENTLE_RED) {
1025 /*
1026 * According to Gentle-RED, if avg is greater than max_th the
1027 * packet is dropped with a probability
1028 * p_b = c_3 * avg - c_4
1029 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1030 */
1031 p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1032 } else {
1033 q->count = -1;
1034 DPRINTF(("dummynet: - drop"));
1035 return 1 ;
1036 }
1037 } else if (q->avg > fs->min_th) {
1038 /*
1039 * we compute p_b using the linear dropping function p_b = c_1 *
1040 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1041 * max_p * min_th / (max_th - min_th)
1042 */
1043 p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1044 }
1045 if (fs->flags_fs & DN_QSIZE_IS_BYTES)
1046 p_b = (p_b * len) / fs->max_pkt_size;
1047 if (++q->count == 0)
1048 q->random = random() & 0xffff;
1049 else {
1050 /*
1051 * q->count counts packets arrived since last drop, so a greater
1052 * value of q->count means a greater packet drop probability.
1053 */
1054 if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1055 q->count = 0;
1056 DPRINTF(("dummynet: - red drop"));
1057 /* after a drop we calculate a new random value */
1058 q->random = random() & 0xffff;
1059 return 1; /* drop */
1060 }
1061 }
1062 /* end of RED algorithm */
1063 return 0 ; /* accept */
1064 }
1065
1066 static __inline
1067 struct dn_flow_set *
1068 locate_flowset(int pipe_nr, struct ip_fw *rule)
1069 {
1070 struct dn_flow_set *fs;
1071 ipfw_insn *cmd = rule->cmd + rule->act_ofs;
1072
1073 if (cmd->opcode == O_LOG)
1074 cmd += F_LEN(cmd);
1075
1076 bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));
1077
1078 if (fs != NULL)
1079 return fs;
1080
1081 if (cmd->opcode == O_QUEUE) {
1082 for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
1083 ;
1084 }
1085 else {
1086 struct dn_pipe *p1;
1087 for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
1088 ;
1089 if (p1 != NULL)
1090 fs = &(p1->fs) ;
1091 }
1092 /* record for the future */
1093 bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));
1094
1095 return fs ;
1096 }
1097
1098 /*
1099 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1100 * depending on whether WF2Q or fixed bw is used.
1101 *
1102 * pipe_nr pipe or queue the packet is destined for.
1103 * dir where shall we send the packet after dummynet.
1104 * m the mbuf with the packet
1105 * ifp the 'ifp' parameter from the caller.
1106 * NULL in ip_input, destination interface in ip_output,
1107 * real_dst in bdg_forward
1108 * ro route parameter (only used in ip_output, NULL otherwise)
1109 * dst destination address, only used by ip_output
1110 * rule matching rule, in case of multiple passes
1111 * flags flags from the caller, only used in ip_output
1112 *
1113 */
1114 static int
1115 dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
1116 {
1117 struct dn_pkt_tag *pkt;
1118 struct m_tag *mtag;
1119 struct dn_flow_set *fs;
1120 struct dn_pipe *pipe ;
1121 u_int64_t len = m->m_pkthdr.len ;
1122 struct dn_flow_queue *q = NULL ;
1123 int is_pipe;
1124
1125 #if IPFW2
1126 ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs;
1127
1128 if (cmd->opcode == O_LOG)
1129 cmd += F_LEN(cmd);
1130 is_pipe = (cmd->opcode == O_PIPE);
1131 #else
1132 is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
1133 #endif
1134
1135 pipe_nr &= 0xffff ;
1136
1137 lck_mtx_lock(dn_mutex);
1138
1139 /*
1140 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1141 */
1142 fs = locate_flowset(pipe_nr, fwa->rule);
1143 if (fs == NULL)
1144 goto dropit ; /* this queue/pipe does not exist! */
1145 pipe = fs->pipe ;
1146 if (pipe == NULL) { /* must be a queue, try find a matching pipe */
1147 for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
1148 pipe = pipe->next)
1149 ;
1150 if (pipe != NULL)
1151 fs->pipe = pipe ;
1152 else {
1153 printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1154 fs->parent_nr, fs->fs_nr);
1155 goto dropit ;
1156 }
1157 }
1158 q = find_queue(fs, &(fwa->f_id));
1159 if ( q == NULL )
1160 goto dropit ; /* cannot allocate queue */
1161 /*
1162 * update statistics, then check reasons to drop pkt
1163 */
1164 q->tot_bytes += len ;
1165 q->tot_pkts++ ;
1166 if ( fs->plr && random() < fs->plr )
1167 goto dropit ; /* random pkt drop */
1168 if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1169 if (q->len_bytes > fs->qsize)
1170 goto dropit ; /* queue size overflow */
1171 } else {
1172 if (q->len >= fs->qsize)
1173 goto dropit ; /* queue count overflow */
1174 }
1175 if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1176 goto dropit ;
1177
1178 /* XXX expensive to zero, see if we can remove it*/
1179 mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1180 sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO);
1181 if ( mtag == NULL )
1182 goto dropit ; /* cannot allocate packet header */
1183 m_tag_prepend(m, mtag); /* attach to mbuf chain */
1184
1185 pkt = (struct dn_pkt_tag *)(mtag+1);
1186 /* ok, i can handle the pkt now... */
1187 /* build and enqueue packet + parameters */
1188 pkt->rule = fwa->rule ;
1189 pkt->dn_dir = dir ;
1190
1191 pkt->ifp = fwa->oif;
1192 if (dir == DN_TO_IP_OUT) {
1193 /*
1194 * We need to copy *ro because for ICMP pkts (and maybe others)
1195 * the caller passed a pointer into the stack; dst might also be
1196 * a pointer into *ro so it needs to be updated.
1197 */
1198 lck_mtx_lock(rt_mtx);
1199 pkt->ro = *(fwa->ro);
1200 if (fwa->ro->ro_rt)
1201 fwa->ro->ro_rt->rt_refcnt++ ;
1202 if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */
1203 fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
1204 lck_mtx_unlock(rt_mtx);
1205
1206 pkt->dn_dst = fwa->dst;
1207 pkt->flags = fwa->flags;
1208 }
1209 if (q->head == NULL)
1210 q->head = m;
1211 else
1212 q->tail->m_nextpkt = m;
1213 q->tail = m;
1214 q->len++;
1215 q->len_bytes += len ;
1216
1217 if ( q->head != m ) /* flow was not idle, we are done */
1218 goto done;
1219 /*
1220 * If we reach this point the flow was previously idle, so we need
1221 * to schedule it. This involves different actions for fixed-rate or
1222 * WF2Q queues.
1223 */
1224 if (is_pipe) {
1225 /*
1226 * Fixed-rate queue: just insert into the ready_heap.
1227 */
1228 dn_key t = 0 ;
1229 if (pipe->bandwidth)
1230 t = SET_TICKS(m, q, pipe);
1231 q->sched_time = curr_time ;
1232 if (t == 0) /* must process it now */
1233 ready_event( q );
1234 else
1235 heap_insert(&ready_heap, curr_time + t , q );
1236 } else {
1237 /*
1238 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1239 * set S to the virtual time V for the controlling pipe, and update
1240 * the sum of weights for the pipe; otherwise, remove flow from
1241 * idle_heap and set S to max(F,V).
1242 * Second, compute finish time F = S + len/weight.
1243 * Third, if pipe was idle, update V=max(S, V).
1244 * Fourth, count one more backlogged flow.
1245 */
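    /*
     * For example (hypothetical values): if the flow was idle, q->S is set
     * to pipe->V; with a 1000-byte packet and fs->weight == 50 the finish
     * time becomes q->F = q->S + (1000<<MY_M)/50, so doubling the weight
     * halves the increment and lets the flow drain proportionally faster
     * than its competitors.
     */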
1246 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1247 q->S = pipe->V ;
1248 pipe->sum += fs->weight ; /* add weight of new queue */
1249 } else {
1250 heap_extract(&(pipe->idle_heap), q);
1251 q->S = MAX64(q->F, pipe->V ) ;
1252 }
1253 q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1254
1255 if (pipe->not_eligible_heap.elements == 0 &&
1256 pipe->scheduler_heap.elements == 0)
1257 pipe->V = MAX64 ( q->S, pipe->V );
1258 fs->backlogged++ ;
1259 /*
1260 * Look at eligibility. A flow is not eligible if S>V (when
1261 * this happens, it means that there is some other flow already
1262 * scheduled for the same pipe, so the scheduler_heap cannot be
1263 * empty). If the flow is not eligible we just store it in the
1264 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1265 * and possibly invoke ready_event_wfq() right now if there is
1266 * leftover credit.
1267 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1268 * and for all flows in not_eligible_heap (NEH), S_i > V .
1269 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1270 * we only need to look into NEH.
1271 */
1272 if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
1273 if (pipe->scheduler_heap.elements == 0)
1274 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1275 heap_insert(&(pipe->not_eligible_heap), q->S, q);
1276 } else {
1277 heap_insert(&(pipe->scheduler_heap), q->F, q);
1278 if (pipe->numbytes >= 0) { /* pipe is idle */
1279 if (pipe->scheduler_heap.elements != 1)
1280 printf("dummynet: OUCH! pipe should have been idle!\n");
1281 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1282 pipe->pipe_nr, (int)(q->F >> MY_M)));
1283 pipe->sched_time = curr_time ;
1284 ready_event_wfq(pipe);
1285 }
1286 }
1287 }
1288 done:
1289 lck_mtx_unlock(dn_mutex);
1290 return 0;
1291
1292 dropit:
1293 if (q)
1294 q->drops++ ;
1295 lck_mtx_unlock(dn_mutex);
1296 m_freem(m);
1297 return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
1298 }
1299
1300 /*
1301 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1302 * Doing this would probably save us the initial bzero of dn_pkt
1303 */
1304 #define DN_FREE_PKT(_m) do { \
1305 struct m_tag *tag = m_tag_locate(_m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
1306 if (tag) { \
1307 struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \
1308 if (n->ro.ro_rt) \
1309 rtfree(n->ro.ro_rt); \
1310 } \
1311 m_tag_delete(_m, tag); \
1312 m_freem(_m); \
1313 } while (0)
1314
1315 /*
1316 * Dispose all packets and flow_queues on a flow_set.
1317 * If all=1, also remove red lookup table and other storage,
1318 * including the descriptor itself.
1319 * For the flow_set embedded in a dn_pipe, the caller MUST also clean up ready_heap...
1320 */
1321 static void
1322 purge_flow_set(struct dn_flow_set *fs, int all)
1323 {
1324 struct dn_flow_queue *q, *qn ;
1325 int i ;
1326
1327 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1328
1329 for (i = 0 ; i <= fs->rq_size ; i++ ) {
1330 for (q = fs->rq[i] ; q ; q = qn ) {
1331 struct mbuf *m, *mnext;
1332
1333 mnext = q->head;
1334 while ((m = mnext) != NULL) {
1335 mnext = m->m_nextpkt;
1336 DN_FREE_PKT(m);
1337 }
1338 qn = q->next ;
1339 FREE(q, M_DUMMYNET);
1340 }
1341 fs->rq[i] = NULL ;
1342 }
1343 fs->rq_elements = 0 ;
1344 if (all) {
1345 /* RED - free lookup table */
1346 if (fs->w_q_lookup)
1347 FREE(fs->w_q_lookup, M_DUMMYNET);
1348 if (fs->rq)
1349 FREE(fs->rq, M_DUMMYNET);
1350 /* if this fs is not part of a pipe, free it */
1351 if (fs->pipe && fs != &(fs->pipe->fs) )
1352 FREE(fs, M_DUMMYNET);
1353 }
1354 }
1355
1356 /*
1357 * Dispose all packets queued on a pipe (not a flow_set).
1358 * Also free all resources associated with a pipe, which is about
1359 * to be deleted.
1360 */
1361 static void
1362 purge_pipe(struct dn_pipe *pipe)
1363 {
1364 struct mbuf *m, *mnext;
1365
1366 purge_flow_set( &(pipe->fs), 1 );
1367
1368 mnext = pipe->head;
1369 while ((m = mnext) != NULL) {
1370 mnext = m->m_nextpkt;
1371 DN_FREE_PKT(m);
1372 }
1373
1374 heap_free( &(pipe->scheduler_heap) );
1375 heap_free( &(pipe->not_eligible_heap) );
1376 heap_free( &(pipe->idle_heap) );
1377 }
1378
1379 /*
1380 * Delete all pipes and heaps returning memory. Must also
1381 * remove references from all ipfw rules to all pipes.
1382 */
1383 static void
1384 dummynet_flush()
1385 {
1386 struct dn_pipe *curr_p, *p ;
1387 struct dn_flow_set *fs, *curr_fs;
1388
1389 lck_mtx_lock(dn_mutex);
1390
1391 /* remove all references to pipes ...*/
1392 flush_pipe_ptrs(NULL);
1393 /* prevent future matches... */
1394 p = all_pipes ;
1395 all_pipes = NULL ;
1396 fs = all_flow_sets ;
1397 all_flow_sets = NULL ;
1398 /* and free heaps so we don't have unwanted events */
1399 heap_free(&ready_heap);
1400 heap_free(&wfq_ready_heap);
1401 heap_free(&extract_heap);
1402
1403 /*
1404 * Now purge all queued pkts and delete all pipes
1405 */
1406 /* scan and purge all flow_sets. */
1407 for ( ; fs ; ) {
1408 curr_fs = fs ;
1409 fs = fs->next ;
1410 purge_flow_set(curr_fs, 1);
1411 }
1412 for ( ; p ; ) {
1413 purge_pipe(p);
1414 curr_p = p ;
1415 p = p->next ;
1416 FREE(curr_p, M_DUMMYNET);
1417 }
1418 lck_mtx_unlock(dn_mutex);
1419 }
1420
1421
1422 extern struct ip_fw *ip_fw_default_rule ;
1423 static void
1424 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
1425 {
1426 int i ;
1427 struct dn_flow_queue *q ;
1428 struct mbuf *m ;
1429
1430 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
1431 for (q = fs->rq[i] ; q ; q = q->next )
1432 for (m = q->head ; m ; m = m->m_nextpkt ) {
1433 struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1434 if (pkt->rule == r)
1435 pkt->rule = ip_fw_default_rule ;
1436 }
1437 }
1438 /*
1439 * when a firewall rule is deleted, scan all queues and re-point packets
1440 * that match this rule to the default rule.
1441 */
1442 void
1443 dn_rule_delete(void *r)
1444 {
1445 struct dn_pipe *p ;
1446 struct dn_flow_set *fs ;
1447 struct dn_pkt_tag *pkt ;
1448 struct mbuf *m ;
1449
1450 lck_mtx_lock(dn_mutex);
1451
1452 /*
1453 * If the rule references a queue (dn_flow_set), then scan
1454 * the flow set, otherwise scan pipes. Should do either, but doing
1455 * both does not harm.
1456 */
1457 for ( fs = all_flow_sets ; fs ; fs = fs->next )
1458 dn_rule_delete_fs(fs, r);
1459 for ( p = all_pipes ; p ; p = p->next ) {
1460 fs = &(p->fs) ;
1461 dn_rule_delete_fs(fs, r);
1462 for (m = p->head ; m ; m = m->m_nextpkt ) {
1463 pkt = dn_tag_get(m) ;
1464 if (pkt->rule == r)
1465 pkt->rule = ip_fw_default_rule ;
1466 }
1467 }
1468 lck_mtx_unlock(dn_mutex);
1469 }
1470
1471 /*
1472 * setup RED parameters
1473 */
1474 static int
1475 config_red(struct dn_flow_set *p, struct dn_flow_set * x)
1476 {
1477 int i;
1478
1479 x->w_q = p->w_q;
1480 x->min_th = SCALE(p->min_th);
1481 x->max_th = SCALE(p->max_th);
1482 x->max_p = p->max_p;
1483
1484 x->c_1 = p->max_p / (p->max_th - p->min_th);
1485 x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1486 if (x->flags_fs & DN_IS_GENTLE_RED) {
1487 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1488 x->c_4 = (SCALE(1) - 2 * p->max_p);
1489 }
1490
1491 /* if the lookup table already exists, free and create it again */
1492 if (x->w_q_lookup) {
1493 FREE(x->w_q_lookup, M_DUMMYNET);
1494 x->w_q_lookup = NULL ;
1495 }
1496 if (red_lookup_depth == 0) {
1497 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1498 FREE(x, M_DUMMYNET);
1499 return EINVAL;
1500 }
1501 x->lookup_depth = red_lookup_depth;
1502 x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
1503 M_DUMMYNET, M_DONTWAIT);
1504 if (x->w_q_lookup == NULL) {
1505 printf("dummynet: sorry, cannot allocate red lookup table\n");
1506 FREE(x, M_DUMMYNET);
1507 return ENOSPC;
1508 }
1509
1510 /* fill the lookup table with (1 - w_q)^x */
1511 x->lookup_step = p->lookup_step ;
1512 x->lookup_weight = p->lookup_weight ;
1513 x->w_q_lookup[0] = SCALE(1) - x->w_q;
1514 for (i = 1; i < x->lookup_depth; i++)
1515 x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1516 if (red_avg_pkt_size < 1)
1517 red_avg_pkt_size = 512 ;
1518 x->avg_pkt_size = red_avg_pkt_size ;
1519 if (red_max_pkt_size < 1)
1520 red_max_pkt_size = 1500 ;
1521 x->max_pkt_size = red_max_pkt_size ;
1522 return 0 ;
1523 }
1524
1525 static int
1526 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1527 {
1528 if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
1529 int l = pfs->rq_size;
1530
1531 if (l == 0)
1532 l = dn_hash_size;
1533 if (l < 4)
1534 l = 4;
1535 else if (l > DN_MAX_HASH_SIZE)
1536 l = DN_MAX_HASH_SIZE;
1537 x->rq_size = l;
1538 } else /* one is enough for null mask */
1539 x->rq_size = 1;
1540 x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1541 M_DUMMYNET, M_DONTWAIT | M_ZERO);
1542 if (x->rq == NULL) {
1543 printf("dummynet: sorry, cannot allocate queue\n");
1544 return ENOSPC;
1545 }
1546 x->rq_elements = 0;
1547 return 0 ;
1548 }
1549
1550 static void
1551 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1552 {
1553 x->flags_fs = src->flags_fs;
1554 x->qsize = src->qsize;
1555 x->plr = src->plr;
1556 x->flow_mask = src->flow_mask;
1557 if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1558 if (x->qsize > 1024*1024)
1559 x->qsize = 1024*1024 ;
1560 } else {
1561 if (x->qsize == 0)
1562 x->qsize = 50 ;
1563 if (x->qsize > 100)
1564 x->qsize = 50 ;
1565 }
1566 /* configuring RED */
1567 if ( x->flags_fs & DN_IS_RED )
1568 config_red(src, x) ; /* XXX should check errors */
1569 }
1570
1571 /*
1572 * setup pipe or queue parameters.
1573 */
1574
1575 static int
1576 config_pipe(struct dn_pipe *p)
1577 {
1578 int i, r;
1579 struct dn_flow_set *pfs = &(p->fs);
1580 struct dn_flow_queue *q;
1581
1582 /*
1583 * The config program passes parameters as follows:
1584 * bw = bits/second (0 means no limits),
1585 * delay = ms, must be translated into ticks.
1586 * qsize = slots/bytes
1587 */
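    /*
     * For example, with hz == 100 a requested delay of 250 ms becomes
     * (250 * 100) / 1000 == 25 ticks; delays shorter than one tick
     * (10 ms in that case) are truncated to 0.
     */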
1588 p->delay = ( p->delay * hz ) / 1000 ;
1589 /* We need either a pipe number or a flow_set number */
1590 if (p->pipe_nr == 0 && pfs->fs_nr == 0)
1591 return EINVAL ;
1592 if (p->pipe_nr != 0 && pfs->fs_nr != 0)
1593 return EINVAL ;
1594 if (p->pipe_nr != 0) { /* this is a pipe */
1595 struct dn_pipe *x, *a, *b;
1596
1597 lck_mtx_lock(dn_mutex);
1598 /* locate pipe */
1599 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1600 a = b , b = b->next) ;
1601
1602 if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
1603 x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ;
1604 if (x == NULL) {
1605 lck_mtx_unlock(dn_mutex);
1606 printf("dummynet: no memory for new pipe\n");
1607 return ENOSPC;
1608 }
1609 x->pipe_nr = p->pipe_nr;
1610 x->fs.pipe = x ;
1611 /* idle_heap is the only one from which we extract from the middle.
1612 */
1613 x->idle_heap.size = x->idle_heap.elements = 0 ;
1614 x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
1615 } else {
1616 x = b;
1617 /* Flush accumulated credit for all queues */
1618 for (i = 0; i <= x->fs.rq_size; i++)
1619 for (q = x->fs.rq[i]; q; q = q->next)
1620 q->numbytes = 0;
1621 }
1622
1623 x->bandwidth = p->bandwidth ;
1624 x->numbytes = 0; /* just in case... */
1625 bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
1626 x->ifp = NULL ; /* reset interface ptr */
1627 x->delay = p->delay ;
1628 set_fs_parms(&(x->fs), pfs);
1629
1630
1631 if ( x->fs.rq == NULL ) { /* a new pipe */
1632 r = alloc_hash(&(x->fs), pfs) ;
1633 if (r) {
1634 lck_mtx_unlock(dn_mutex);
1635 FREE(x, M_DUMMYNET);
1636 return r ;
1637 }
1638 x->next = b ;
1639 if (a == NULL)
1640 all_pipes = x ;
1641 else
1642 a->next = x ;
1643 }
1644 lck_mtx_unlock(dn_mutex);
1645 } else { /* config queue */
1646 struct dn_flow_set *x, *a, *b ;
1647
1648 lck_mtx_lock(dn_mutex);
1649 /* locate flow_set */
1650 for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
1651 a = b , b = b->next) ;
1652
1653 if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
1654 if (pfs->parent_nr == 0) { /* need link to a pipe */
1655 lck_mtx_unlock(dn_mutex);
1656 return EINVAL ;
1657 }
1658 x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
1659 if (x == NULL) {
1660 lck_mtx_unlock(dn_mutex);
1661 printf("dummynet: no memory for new flow_set\n");
1662 return ENOSPC;
1663 }
1664 x->fs_nr = pfs->fs_nr;
1665 x->parent_nr = pfs->parent_nr;
1666 x->weight = pfs->weight ;
1667 if (x->weight == 0)
1668 x->weight = 1 ;
1669 else if (x->weight > 100)
1670 x->weight = 100 ;
1671 } else {
1672 /* Change parent pipe not allowed; must delete and recreate */
1673 if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
1674 lck_mtx_unlock(dn_mutex);
1675 return EINVAL ;
1676 }
1677 x = b;
1678 }
1679 set_fs_parms(x, pfs);
1680
1681 if ( x->rq == NULL ) { /* a new flow_set */
1682 r = alloc_hash(x, pfs) ;
1683 if (r) {
1684 lck_mtx_unlock(dn_mutex);
1685 FREE(x, M_DUMMYNET);
1686 return r ;
1687 }
1688 x->next = b;
1689 if (a == NULL)
1690 all_flow_sets = x;
1691 else
1692 a->next = x;
1693 }
1694 lck_mtx_unlock(dn_mutex);
1695 }
1696 return 0 ;
1697 }
1698
1699 /*
1700 * Helper function to remove from a heap queues which are linked to
1701 * a flow_set about to be deleted.
1702 */
1703 static void
1704 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
1705 {
1706 int i = 0, found = 0 ;
1707 for (; i < h->elements ;)
1708 if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
1709 h->elements-- ;
1710 h->p[i] = h->p[h->elements] ;
1711 found++ ;
1712 } else
1713 i++ ;
1714 if (found)
1715 heapify(h);
1716 }
1717
1718 /*
1719 * helper function to remove a pipe from a heap (can be there at most once)
1720 */
1721 static void
1722 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
1723 {
1724 if (h->elements > 0) {
1725 int i = 0 ;
1726 for (i=0; i < h->elements ; i++ ) {
1727 if (h->p[i].object == p) { /* found it */
1728 h->elements-- ;
1729 h->p[i] = h->p[h->elements] ;
1730 heapify(h);
1731 break ;
1732 }
1733 }
1734 }
1735 }
1736
1737 /*
1738 * drain all queues. Called in case of severe mbuf shortage.
1739 */
1740 void
1741 dummynet_drain()
1742 {
1743 struct dn_flow_set *fs;
1744 struct dn_pipe *p;
1745 struct mbuf *m, *mnext;
1746
1747 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1748
1749 heap_free(&ready_heap);
1750 heap_free(&wfq_ready_heap);
1751 heap_free(&extract_heap);
1752 /* remove all references to this pipe from flow_sets */
1753 for (fs = all_flow_sets; fs; fs= fs->next )
1754 purge_flow_set(fs, 0);
1755
1756 for (p = all_pipes; p; p= p->next ) {
1757 purge_flow_set(&(p->fs), 0);
1758
1759 mnext = p->head;
1760 while ((m = mnext) != NULL) {
1761 mnext = m->m_nextpkt;
1762 DN_FREE_PKT(m);
1763 }
1764 p->head = p->tail = NULL ;
1765 }
1766 }
1767
1768 /*
1769 * Fully delete a pipe or a queue, cleaning up associated info.
1770 */
1771 static int
1772 delete_pipe(struct dn_pipe *p)
1773 {
1774 if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
1775 return EINVAL ;
1776 if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
1777 return EINVAL ;
1778 if (p->pipe_nr != 0) { /* this is an old-style pipe */
1779 struct dn_pipe *a, *b;
1780 struct dn_flow_set *fs;
1781
1782 lck_mtx_lock(dn_mutex);
1783 /* locate pipe */
1784 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1785 a = b , b = b->next) ;
1786 if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
1787 lck_mtx_unlock(dn_mutex);
1788 return EINVAL ; /* not found */
1789 }
1790
1791 /* unlink from list of pipes */
1792 if (a == NULL)
1793 all_pipes = b->next ;
1794 else
1795 a->next = b->next ;
1796 /* remove references to this pipe from the ip_fw rules. */
1797 flush_pipe_ptrs(&(b->fs));
1798
1799 /* remove all references to this pipe from flow_sets */
1800 for (fs = all_flow_sets; fs; fs= fs->next )
1801 if (fs->pipe == b) {
1802 printf("dummynet: ++ ref to pipe %d from fs %d\n",
1803 p->pipe_nr, fs->fs_nr);
1804 fs->pipe = NULL ;
1805 purge_flow_set(fs, 0);
1806 }
1807 fs_remove_from_heap(&ready_heap, &(b->fs));
1808 purge_pipe(b); /* remove all data associated to this pipe */
1809 /* remove reference to here from extract_heap and wfq_ready_heap */
1810 pipe_remove_from_heap(&extract_heap, b);
1811 pipe_remove_from_heap(&wfq_ready_heap, b);
1812 lck_mtx_unlock(dn_mutex);
1813
1814 FREE(b, M_DUMMYNET);
1815 } else { /* this is a WF2Q queue (dn_flow_set) */
1816 struct dn_flow_set *a, *b;
1817
1818 lck_mtx_lock(dn_mutex);
1819 /* locate set */
1820 for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
1821 a = b , b = b->next) ;
1822 if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
1823 lck_mtx_unlock(dn_mutex);
1824 return EINVAL ; /* not found */
1825 }
1826
1827 if (a == NULL)
1828 all_flow_sets = b->next ;
1829 else
1830 a->next = b->next ;
1831 /* remove references to this flow_set from the ip_fw rules. */
1832 flush_pipe_ptrs(b);
1833
1834 if (b->pipe != NULL) {
1835 /* Update total weight on parent pipe and cleanup parent heaps */
1836 b->pipe->sum -= b->weight * b->backlogged ;
1837 fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
1838 fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
1839 #if 1 /* XXX should i remove from idle_heap as well ? */
1840 fs_remove_from_heap(&(b->pipe->idle_heap), b);
1841 #endif
1842 }
1843 purge_flow_set(b, 1);
1844 lck_mtx_unlock(dn_mutex);
1845 }
1846 return 0 ;
1847 }
1848
1849 /*
1850 * helper function used to copy data from kernel in DUMMYNET_GET
1851 */
1852 static char *
1853 dn_copy_set(struct dn_flow_set *set, char *bp)
1854 {
1855 int i, copied = 0 ;
1856 struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
1857
1858 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1859
1860 for (i = 0 ; i <= set->rq_size ; i++)
1861 for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
1862 if (q->hash_slot != i)
1863 printf("dummynet: ++ at %d: wrong slot (have %d, "
1864 "should be %d)\n", copied, q->hash_slot, i);
1865 if (q->fs != set)
1866 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1867 i, q->fs, set);
1868 copied++ ;
1869 bcopy(q, qp, sizeof( *q ) );
1870 /* cleanup pointers */
1871 qp->next = NULL ;
1872 qp->head = qp->tail = NULL ;
1873 qp->fs = NULL ;
1874 }
1875 if (copied != set->rq_elements)
1876 printf("dummynet: ++ wrong count, have %d should be %d\n",
1877 copied, set->rq_elements);
1878 return (char *)qp ;
1879 }
1880
1881 static size_t
1882 dn_calc_size(void)
1883 {
1884 struct dn_flow_set *set ;
1885 struct dn_pipe *p ;
1886 size_t size ;
1887
1888 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1889
1890 /*
1891 * compute size of data structures: list of pipes and flow_sets.
1892 */
1893 for (p = all_pipes, size = 0 ; p ; p = p->next )
1894 size += sizeof( *p ) +
1895 p->fs.rq_elements * sizeof(struct dn_flow_queue);
1896 for (set = all_flow_sets ; set ; set = set->next )
1897 size += sizeof ( *set ) +
1898 set->rq_elements * sizeof(struct dn_flow_queue);
1899 return size ;
1900 }
1901
1902 static int
1903 dummynet_get(struct sockopt *sopt)
1904 {
1905 char *buf, *bp ; /* bp is the "copy-pointer" */
1906 size_t size ;
1907 struct dn_flow_set *set ;
1908 struct dn_pipe *p ;
1909 int error=0, i ;
1910
1911 /* XXX lock held too long */
1912 lck_mtx_lock(dn_mutex);
1913 /*
1914 * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
1915 * cannot use this flag while holding a mutex.
1916 */
1917 for (i = 0; i < 10; i++) {
1918 size = dn_calc_size();
1919 lck_mtx_unlock(dn_mutex);
1920 buf = _MALLOC(size, M_TEMP, M_WAITOK);
1921 lck_mtx_lock(dn_mutex);
1922 if (size == dn_calc_size())
1923 break;
1924 FREE(buf, M_TEMP);
1925 buf = NULL;
1926 }
1927 if (buf == NULL) {
1928 lck_mtx_unlock(dn_mutex);
1929 return ENOBUFS ;
1930 }
1931 for (p = all_pipes, bp = buf ; p ; p = p->next ) {
1932 struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;
1933
1934 /*
1935 * copy pipe descriptor into *bp, convert delay back to ms,
1936 * then copy the flow_set descriptor(s) one at a time.
1937 * After each flow_set, copy the queue descriptor it owns.
1938 */
1939 bcopy(p, bp, sizeof( *p ) );
1940 pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
1941 /*
1942 * XXX the following is a hack based on ->next being the
1943 * first field in dn_pipe and dn_flow_set. The correct
1944 * solution would be to move the dn_flow_set to the beginning
1945 * of struct dn_pipe.
1946 */
1947 pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
1948 /* clean pointers */
1949 pipe_bp->head = pipe_bp->tail = NULL ;
1950 pipe_bp->fs.next = NULL ;
1951 pipe_bp->fs.pipe = NULL ;
1952 pipe_bp->fs.rq = NULL ;
1953
1954 bp += sizeof( *p ) ;
1955 bp = dn_copy_set( &(p->fs), bp );
1956 }
1957 for (set = all_flow_sets ; set ; set = set->next ) {
1958 struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
1959 bcopy(set, bp, sizeof( *set ) );
1960 /* XXX same hack as above */
1961 fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
1962 fs_bp->pipe = NULL ;
1963 fs_bp->rq = NULL ;
1964 bp += sizeof( *set ) ;
1965 bp = dn_copy_set( set, bp );
1966 }
1967 lck_mtx_unlock(dn_mutex);
1968
1969 error = sooptcopyout(sopt, buf, size);
1970 FREE(buf, M_TEMP);
1971 return error ;
1972 }
1973
1974 /*
1975 * Handler for the various dummynet socket options (get, flush, config, del)
1976 */
1977 static int
1978 ip_dn_ctl(struct sockopt *sopt)
1979 {
1980 int error = 0 ;
1981 struct dn_pipe *p, tmp_pipe;
1982
1983 /* Disallow sets in really-really secure mode. */
1984 if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
1985 return (EPERM);
1986
1987 switch (sopt->sopt_name) {
1988 default :
1989 printf("dummynet: -- unknown option %d", sopt->sopt_name);
1990 return EINVAL ;
1991
1992 case IP_DUMMYNET_GET :
1993 error = dummynet_get(sopt);
1994 break ;
1995
1996 case IP_DUMMYNET_FLUSH :
1997 dummynet_flush() ;
1998 break ;
1999
2000 case IP_DUMMYNET_CONFIGURE :
2001 p = &tmp_pipe ;
2002 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2003 if (error)
2004 break ;
2005 error = config_pipe(p);
2006 break ;
2007
2008 case IP_DUMMYNET_DEL : /* remove a pipe or queue */
2009 p = &tmp_pipe ;
2010 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2011 if (error)
2012 break ;
2013
2014 error = delete_pipe(p);
2015 break ;
2016 }
2017 return error ;
2018 }
2019
2020 void
2021 ip_dn_init(void)
2022 {
2023 /* setup locks */
2024 dn_mutex_grp_attr = lck_grp_attr_alloc_init();
2025 dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
2026 dn_mutex_attr = lck_attr_alloc_init();
2027 lck_attr_setdefault(dn_mutex_attr);
2028
2029 if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) {
2030 printf("ip_dn_init: can't alloc dn_mutex\n");
2031 return;
2032 }
2033
2034 all_pipes = NULL ;
2035 all_flow_sets = NULL ;
2036 ready_heap.size = ready_heap.elements = 0 ;
2037 ready_heap.offset = 0 ;
2038
2039 wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
2040 wfq_ready_heap.offset = 0 ;
2041
2042 extract_heap.size = extract_heap.elements = 0 ;
2043 extract_heap.offset = 0 ;
2044 ip_dn_ctl_ptr = ip_dn_ctl;
2045 ip_dn_io_ptr = dummynet_io;
2046 ip_dn_ruledel_ptr = dn_rule_delete;
2047
2048 timeout(dummynet, NULL, 1);
2049 }