1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
24 * Portions Copyright (c) 2000 Akamba Corp.
25 * All rights reserved
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
37 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
39 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
41 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
42 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
44 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
45 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 * SUCH DAMAGE.
47 *
48 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
49 */
50
51 #define DUMMYNET_DEBUG
52
53 /*
54 * This module implements IP dummynet, a bandwidth limiter/delay emulator
55 * used in conjunction with the ipfw package.
56 * Description of the data structures used is in ip_dummynet.h
57 * Here you mainly find the following blocks of code:
58 * + variable declarations;
59 * + heap management functions;
60 * + scheduler and dummynet functions;
61 * + configuration and initialization.
62 *
63 * NOTA BENE: critical sections are protected by the "dummynet lock".
64 *
65 * Most important Changes:
66 *
67 * 010124: Fixed WF2Q behaviour
68 * 010122: Fixed spl protection.
69 * 000601: WF2Q support
70 * 000106: large rewrite, use heaps to handle very many pipes.
71 * 980513: initial release
72 *
73 * include files marked with XXX are probably not needed
74 */
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/queue.h> /* XXX */
81 #include <sys/kernel.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/time.h>
85 #include <sys/sysctl.h>
86 #include <net/if.h>
87 #include <net/route.h>
88 #include <net/kpi_protocol.h>
89 #include <netinet/in.h>
90 #include <netinet/in_systm.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip.h>
93 #include <netinet/ip_fw.h>
94 #include <netinet/ip_dummynet.h>
95 #include <netinet/ip_var.h>
96
97 #if BRIDGE
98 #include <netinet/if_ether.h> /* for struct arpcom */
99 #include <net/bridge.h>
100 #endif
101
102 /*
103 * We keep a private variable for the simulation time, but we could
104 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
105 */
106 static dn_key curr_time = 0 ; /* current simulation time */
107
108 /* this is for the timer that fires to call dummynet() - we only enable the timer when
109 there are packets to process, otherwise it's disabled */
110 static int timer_enabled = 0;
111
112 static int dn_hash_size = 64 ; /* default hash size */
113
114 /* statistics on number of queue searches and search steps */
115 static int searches, search_steps ;
116 static int pipe_expire = 1 ; /* expire queue if empty */
117 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
118
119 static int red_lookup_depth = 256; /* RED - default lookup table depth */
120 static int red_avg_pkt_size = 512; /* RED - default medium packet size */
121 static int red_max_pkt_size = 1500; /* RED - default max packet size */
122
123 /*
124 * Three heaps contain queues and pipes that the scheduler handles:
125 *
126 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
127 *
128 * wfq_ready_heap contains the pipes associated with WF2Q flows
129 *
130 * extract_heap contains pipes associated with delay lines.
131 *
132 */
133 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
134
135 static int heap_init(struct dn_heap *h, int size) ;
136 static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
137 static void heap_extract(struct dn_heap *h, void *obj);
138
139 static void transmit_event(struct dn_pipe *pipe);
140 static void ready_event(struct dn_flow_queue *q);
141
142 static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */
143 static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
144
145 #ifdef SYSCTL_NODE
146 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
147 CTLFLAG_RW, 0, "Dummynet");
148 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
149 CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
150 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
151 CTLFLAG_RD, &curr_time, 0, "Current tick");
152 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
153 CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
154 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
155 CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
156 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
157 CTLFLAG_RD, &searches, 0, "Number of queue searches");
158 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
159 CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
160 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
161 CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
162 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
163 CTLFLAG_RW, &dn_max_ratio, 0,
164 "Max ratio between dynamic queues and buckets");
165 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
166 CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
167 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
168 CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
169 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
170 CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
171 #endif
172
173 #ifdef DUMMYNET_DEBUG
174 int dummynet_debug = 0;
175 #ifdef SYSCTL_NODE
176 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
177 0, "control debugging printfs");
178 #endif
179 #define DPRINTF(X) if (dummynet_debug) printf X
180 #else
181 #define DPRINTF(X)
182 #endif
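/*
 * Note on usage: DPRINTF() takes its whole printf() argument list as a single
 * macro argument, so call sites use double parentheses, e.g.
 *     DPRINTF(("dummynet: pipe %d ready from %s --\n", p->pipe_nr, p->if_name));
 * which only prints when the net.inet.ip.dummynet.debug sysctl is non-zero.
 */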
183
184 /* dummynet lock */
185 lck_grp_t *dn_mutex_grp;
186 lck_grp_attr_t *dn_mutex_grp_attr;
187 lck_attr_t *dn_mutex_attr;
188 lck_mtx_t *dn_mutex;
189
190 static int config_pipe(struct dn_pipe *p);
191 static int ip_dn_ctl(struct sockopt *sopt);
192
193 static void dummynet(void *);
194 static void dummynet_flush(void);
195 void dummynet_drain(void);
196 static ip_dn_io_t dummynet_io;
197 static void dn_rule_delete(void *);
198
199 int if_tx_rdy(struct ifnet *ifp);
200
201 extern lck_mtx_t *rt_mtx; /* route global lock */
202
203 /*
204 * Heap management functions.
205 *
 206  * In the heap, the first node is element 0. Children of i are 2i+1 and 2i+2.
 207  * Some macros help find parent/children so we can optimize them.
208 *
209 * heap_init() is called to expand the heap when needed.
210 * Increment size in blocks of 16 entries.
211 * XXX failure to allocate a new element is a pretty bad failure
212 * as we basically stall a whole queue forever!!
213 * Returns 1 on error, 0 on success
214 */
215 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
216 #define HEAP_LEFT(x) ( 2*(x) + 1 )
217 #define HEAP_IS_LEFT(x) ( (x) & 1 )
218 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
219 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
220 #define HEAP_INCREMENT 15
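/*
 * Example of the index arithmetic above: node 3 has father (3-1)/2 = 1 and
 * children 2*3+1 = 7 and 2*3+2 = 8; HEAP_IS_LEFT(3) is true because left
 * children always sit at odd indices.
 */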
221
222 static int
223 heap_init(struct dn_heap *h, int new_size)
224 {
225 struct dn_heap_entry *p;
226
227 if (h->size >= new_size ) {
228 printf("dummynet: heap_init, Bogus call, have %d want %d\n",
229 h->size, new_size);
230 return 0 ;
231 }
232 new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
233 p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
234 if (p == NULL) {
235 printf("dummynet: heap_init, resize %d failed\n", new_size );
236 return 1 ; /* error */
237 }
238 if (h->size > 0) {
239 bcopy(h->p, p, h->size * sizeof(*p) );
240 FREE(h->p, M_DUMMYNET);
241 }
242 h->p = p ;
243 h->size = new_size ;
244 return 0 ;
245 }
246
247 /*
 248  * Insert element in heap. Normally p != NULL; we insert p in
 249  * a new position and bubble up. If p == NULL, then the element is
 250  * already in place, and key1 is the position where to start the
251 * bubble-up.
252 * Returns 1 on failure (cannot allocate new heap entry)
253 *
254 * If offset > 0 the position (index, int) of the element in the heap is
255 * also stored in the element itself at the given offset in bytes.
256 */
257 #define SET_OFFSET(heap, node) \
258 if (heap->offset > 0) \
259 *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
260 /*
261 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
262 */
263 #define RESET_OFFSET(heap, node) \
264 if (heap->offset > 0) \
265 *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
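/*
 * In this file only the per-pipe idle_heap relies on the offset mechanism:
 * config_pipe() sets idle_heap.offset to OFFSET_OF(struct dn_flow_queue,
 * heap_pos), so each queued object records its own heap index and can be
 * pulled out of the middle of the heap by heap_extract(h, obj).
 */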
266 static int
267 heap_insert(struct dn_heap *h, dn_key key1, void *p)
268 {
269 int son = h->elements ;
270
271 if (p == NULL) /* data already there, set starting point */
272 son = key1 ;
273 else { /* insert new element at the end, possibly resize */
274 son = h->elements ;
275 if (son == h->size) /* need resize... */
276 if (heap_init(h, h->elements+1) )
277 return 1 ; /* failure... */
278 h->p[son].object = p ;
279 h->p[son].key = key1 ;
280 h->elements++ ;
281 }
282 while (son > 0) { /* bubble up */
283 int father = HEAP_FATHER(son) ;
284 struct dn_heap_entry tmp ;
285
286 if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
287 break ; /* found right position */
288 /* son smaller than father, swap and repeat */
289 HEAP_SWAP(h->p[son], h->p[father], tmp) ;
290 SET_OFFSET(h, son);
291 son = father ;
292 }
293 SET_OFFSET(h, son);
294 return 0 ;
295 }
296
297 /*
298 * remove top element from heap, or obj if obj != NULL
299 */
300 static void
301 heap_extract(struct dn_heap *h, void *obj)
302 {
303 int child, father, max = h->elements - 1 ;
304
305 if (max < 0) {
306 printf("dummynet: warning, extract from empty heap 0x%p\n", h);
307 return ;
308 }
309 father = 0 ; /* default: move up smallest child */
310 if (obj != NULL) { /* extract specific element, index is at offset */
311 if (h->offset <= 0)
312 panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
313 father = *((int *)((char *)obj + h->offset)) ;
314 if (father < 0 || father >= h->elements) {
315 printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
316 father, h->elements);
317 panic("dummynet: heap_extract");
318 }
319 }
320 RESET_OFFSET(h, father);
321 child = HEAP_LEFT(father) ; /* left child */
322 while (child <= max) { /* valid entry */
323 if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
324 child = child+1 ; /* take right child, otherwise left */
325 h->p[father] = h->p[child] ;
326 SET_OFFSET(h, father);
327 father = child ;
328 child = HEAP_LEFT(child) ; /* left child for next loop */
329 }
330 h->elements-- ;
331 if (father != max) {
332 /*
333 * Fill hole with last entry and bubble up, reusing the insert code
334 */
335 h->p[father] = h->p[max] ;
336 heap_insert(h, father, NULL); /* this one cannot fail */
337 }
338 }
339
340 #if 0
341 /*
342 * change object position and update references
343 * XXX this one is never used!
344 */
345 static void
346 heap_move(struct dn_heap *h, dn_key new_key, void *object)
347 {
348 int temp;
349 int i ;
350 int max = h->elements-1 ;
351 struct dn_heap_entry buf ;
352
353 if (h->offset <= 0)
354 panic("cannot move items on this heap");
355
356 i = *((int *)((char *)object + h->offset));
357 if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
358 h->p[i].key = new_key ;
359 for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
360 i = temp ) { /* bubble up */
361 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
362 SET_OFFSET(h, i);
363 }
364 } else { /* must move down */
365 h->p[i].key = new_key ;
366 while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
367 if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
368 temp++ ; /* select child with min key */
369 if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
370 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
371 SET_OFFSET(h, i);
372 } else
373 break ;
374 i = temp ;
375 }
376 }
377 SET_OFFSET(h, i);
378 }
379 #endif /* heap_move, unused */
380
381 /*
382 * heapify() will reorganize data inside an array to maintain the
383 * heap property. It is needed when we delete a bunch of entries.
384 */
385 static void
386 heapify(struct dn_heap *h)
387 {
388 int i ;
389
390 for (i = 0 ; i < h->elements ; i++ )
391 heap_insert(h, i , NULL) ;
392 }
393
394 /*
395 * cleanup the heap and free data structure
396 */
397 static void
398 heap_free(struct dn_heap *h)
399 {
400 if (h->size >0 )
401 FREE(h->p, M_DUMMYNET);
402 bzero(h, sizeof(*h) );
403 }
404
405 /*
406 * --- end of heap management functions ---
407 */
408
409 /*
410 * Return the mbuf tag holding the dummynet state. As an optimization
411 * this is assumed to be the first tag on the list. If this turns out
412 * wrong we'll need to search the list.
413 */
414 static struct dn_pkt_tag *
415 dn_tag_get(struct mbuf *m)
416 {
417 struct m_tag *mtag = m_tag_first(m);
418 /* KASSERT(mtag != NULL &&
419 mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
420 mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
421 ("packet on dummynet queue w/o dummynet tag!"));
422 */
423 return (struct dn_pkt_tag *)(mtag+1);
424 }
425
426 /*
427 * Scheduler functions:
428 *
429 * transmit_event() is called when the delay-line needs to enter
430 * the scheduler, either because of existing pkts getting ready,
431 * or new packets entering the queue. The event handled is the delivery
432 * time of the packet.
433 *
434 * ready_event() does something similar with fixed-rate queues, and the
435 * event handled is the finish time of the head pkt.
436 *
437 * wfq_ready_event() does something similar with WF2Q queues, and the
438 * event handled is the start time of the head pkt.
439 *
440 * In all cases, we make sure that the data structures are consistent
441 * before passing pkts out, because this might trigger recursive
442 * invocations of the procedures.
443 */
444 static void
445 transmit_event(struct dn_pipe *pipe)
446 {
447 struct mbuf *m ;
448 struct dn_pkt_tag *pkt ;
449 struct ip *ip;
450
451 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
452
453 while ( (m = pipe->head) ) {
454 pkt = dn_tag_get(m);
455 if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
456 break;
457 /*
458 * first unlink, then call procedures, since ip_input() can invoke
 459  * ip_output() and vice versa, thus causing nested calls
460 */
461 pipe->head = m->m_nextpkt ;
462 m->m_nextpkt = NULL;
463
464 /* XXX: drop the lock for now to avoid LOR's */
465 lck_mtx_unlock(dn_mutex);
466 switch (pkt->dn_dir) {
467 case DN_TO_IP_OUT: {
468 struct route tmp_rt = pkt->ro;
469 (void)ip_output(m, NULL, NULL, pkt->flags, NULL);
470 if (tmp_rt.ro_rt) {
471 rtfree(tmp_rt.ro_rt);
472 }
473 break ;
474 }
475 case DN_TO_IP_IN :
476 proto_inject(PF_INET, m);
477 break ;
478
479 #if BRIDGE
480 case DN_TO_BDG_FWD :
481 /*
482 * The bridge requires/assumes the Ethernet header is
 483  * contiguous in the first mbuf header. Ensure this is true.
484 */
485 if (BDG_LOADED) {
486 if (m->m_len < ETHER_HDR_LEN &&
487 (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
488 printf("dummynet/bridge: pullup fail, dropping pkt\n");
489 break;
490 }
491 m = bdg_forward_ptr(m, pkt->ifp);
492 } else {
493 /* somebody unloaded the bridge module. Drop pkt */
494 /* XXX rate limit */
495 printf("dummynet: dropping bridged packet trapped in pipe\n");
496 }
497 if (m)
498 m_freem(m);
499 break;
500 #endif
501 default:
502 printf("dummynet: bad switch %d!\n", pkt->dn_dir);
503 m_freem(m);
504 break ;
505 }
506 lck_mtx_lock(dn_mutex);
507 }
508 /* if there are leftover packets, put into the heap for next event */
509 if ( (m = pipe->head) ) {
510 pkt = dn_tag_get(m);
511 /* XXX should check errors on heap_insert, by draining the
512 * whole pipe p and hoping in the future we are more successful
513 */
514 heap_insert(&extract_heap, pkt->output_time, pipe);
515 }
516 }
517
518 /*
519 * the following macro computes how many ticks we have to wait
520 * before being able to transmit a packet. The credit is taken from
521 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
522 */
523
524 /* hz is 100, which gives a granularity of 10ms in the old timer.
525 * The timer has been changed to fire every 1ms, so the use of
526 * hz has been modified here. All instances of hz have been left
527 * in place but adjusted by a factor of 10 so that hz is functionally
528 * equal to 1000.
529 */
530 #define SET_TICKS(_m, q, p) \
531 ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
532 p->bandwidth ;
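/*
 * For example, with hz == 100 the effective clock above is hz*10 == 1000
 * ticks/sec, so a 1500-byte packet on a 1 Mbit/s pipe with no leftover
 * credit (numbytes == 0) needs
 *     (1500*8*1000 + 1000000 - 1) / 1000000 = 12 ticks, i.e. roughly 12 ms.
 */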
533
534 /*
535 * extract pkt from queue, compute output time (could be now)
536 * and put into delay line (p_queue)
537 */
538 static void
539 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
540 struct dn_pipe *p, int len)
541 {
542 struct dn_pkt_tag *dt = dn_tag_get(pkt);
543
544 q->head = pkt->m_nextpkt ;
545 q->len-- ;
546 q->len_bytes -= len ;
547
548 dt->output_time = curr_time + p->delay ;
549
550 if (p->head == NULL)
551 p->head = pkt;
552 else
553 p->tail->m_nextpkt = pkt;
554 p->tail = pkt;
555 p->tail->m_nextpkt = NULL;
556 }
557
558 /*
559 * ready_event() is invoked every time the queue must enter the
560 * scheduler, either because the first packet arrives, or because
561 * a previously scheduled event fired.
 562  * On invocation, drain as many pkts as possible (could be 0) and then
563 * if there are leftover packets reinsert the pkt in the scheduler.
564 */
565 static void
566 ready_event(struct dn_flow_queue *q)
567 {
568 struct mbuf *pkt;
569 struct dn_pipe *p = q->fs->pipe ;
570 int p_was_empty ;
571
572 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
573
574 if (p == NULL) {
575 printf("dummynet: ready_event- pipe is gone\n");
576 return ;
577 }
578 p_was_empty = (p->head == NULL) ;
579
580 /*
581 * schedule fixed-rate queues linked to this pipe:
582 * Account for the bw accumulated since last scheduling, then
583 * drain as many pkts as allowed by q->numbytes and move to
584 * the delay line (in p) computing output time.
585 * bandwidth==0 (no limit) means we can drain the whole queue,
586 * setting len_scaled = 0 does the job.
587 */
588 q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
589 while ( (pkt = q->head) != NULL ) {
590 int len = pkt->m_pkthdr.len;
591 int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;
592 if (len_scaled > q->numbytes )
593 break ;
594 q->numbytes -= len_scaled ;
595 move_pkt(pkt, q, p, len);
596 }
597 /*
598 * If we have more packets queued, schedule next ready event
599 * (can only occur when bandwidth != 0, otherwise we would have
600 * flushed the whole queue in the previous loop).
601 * To this purpose we record the current time and compute how many
602 * ticks to go for the finish time of the packet.
603 */
604 if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
605 dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
606 q->sched_time = curr_time ;
607 heap_insert(&ready_heap, curr_time + t, (void *)q );
608 /* XXX should check errors on heap_insert, and drain the whole
609 * queue on error hoping next time we are luckier.
610 */
611 } else { /* RED needs to know when the queue becomes empty */
612 q->q_time = curr_time;
613 q->numbytes = 0;
614 }
615 /*
616 * If the delay line was empty call transmit_event(p) now.
617 * Otherwise, the scheduler will take care of it.
618 */
619 if (p_was_empty)
620 transmit_event(p);
621 }
622
623 /*
624 * Called when we can transmit packets on WF2Q queues. Take pkts out of
625 * the queues at their start time, and enqueue into the delay line.
626 * Packets are drained until p->numbytes < 0. As long as
627 * len_scaled >= p->numbytes, the packet goes into the delay line
628 * with a deadline p->delay. For the last packet, if p->numbytes<0,
629 * there is an additional delay.
630 */
631 static void
632 ready_event_wfq(struct dn_pipe *p)
633 {
634 int p_was_empty = (p->head == NULL) ;
635 struct dn_heap *sch = &(p->scheduler_heap);
636 struct dn_heap *neh = &(p->not_eligible_heap) ;
637
638 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
639
640 if (p->if_name[0] == 0) /* tx clock is simulated */
641 p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
642 else { /* tx clock is for real, the ifq must be empty or this is a NOP */
643 if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
644 return ;
645 else {
646 DPRINTF(("dummynet: pipe %d ready from %s --\n",
647 p->pipe_nr, p->if_name));
648 }
649 }
650
651 /*
652 * While we have backlogged traffic AND credit, we need to do
653 * something on the queue.
654 */
655 while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
656 if (sch->elements > 0) { /* have some eligible pkts to send out */
657 struct dn_flow_queue *q = sch->p[0].object ;
658 struct mbuf *pkt = q->head;
659 struct dn_flow_set *fs = q->fs;
660 u_int64_t len = pkt->m_pkthdr.len;
661 int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;
662
663 heap_extract(sch, NULL); /* remove queue from heap */
664 p->numbytes -= len_scaled ;
665 move_pkt(pkt, q, p, len);
666
667 p->V += (len<<MY_M) / p->sum ; /* update V */
668 q->S = q->F ; /* update start time */
669 if (q->len == 0) { /* Flow not backlogged any more */
670 fs->backlogged-- ;
671 heap_insert(&(p->idle_heap), q->F, q);
672 } else { /* still backlogged */
673 /*
674 * update F and position in backlogged queue, then
675 * put flow in not_eligible_heap (we will fix this later).
676 */
677 len = (q->head)->m_pkthdr.len;
678 q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
679 if (DN_KEY_LEQ(q->S, p->V))
680 heap_insert(neh, q->S, q);
681 else
682 heap_insert(sch, q->F, q);
683 }
684 }
685 /*
686 * now compute V = max(V, min(S_i)). Remember that all elements in sch
687 * have by definition S_i <= V so if sch is not empty, V is surely
688 * the max and we must not update it. Conversely, if sch is empty
689 * we only need to look at neh.
690 */
691 if (sch->elements == 0 && neh->elements > 0)
692 p->V = MAX64 ( p->V, neh->p[0].key );
693 /* move from neh to sch any packets that have become eligible */
694 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
695 struct dn_flow_queue *q = neh->p[0].object ;
696 heap_extract(neh, NULL);
697 heap_insert(sch, q->F, q);
698 }
699
700 if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
701 p->numbytes = -1 ; /* mark not ready for I/O */
702 break ;
703 }
704 }
705 if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
706 && p->idle_heap.elements > 0) {
707 /*
708 * no traffic and no events scheduled. We can get rid of idle-heap.
709 */
710 int i ;
711
712 for (i = 0 ; i < p->idle_heap.elements ; i++) {
713 struct dn_flow_queue *q = p->idle_heap.p[i].object ;
714
715 q->F = 0 ;
716 q->S = q->F + 1 ;
717 }
718 p->sum = 0 ;
719 p->V = 0 ;
720 p->idle_heap.elements = 0 ;
721 }
722 /*
723 * If we are getting clocks from dummynet (not a real interface) and
 724  * if we are under credit, schedule the next ready event.
725 * Also fix the delivery time of the last packet.
726 */
727 if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
728 dn_key t=0 ; /* number of ticks i have to wait */
729
730 if (p->bandwidth > 0)
731 t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
732 dn_tag_get(p->tail)->output_time += t ;
733 p->sched_time = curr_time ;
734 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
735 /* XXX should check errors on heap_insert, and drain the whole
736 * queue on error hoping next time we are luckier.
737 */
738 }
739 /*
740 * If the delay line was empty call transmit_event(p) now.
741 * Otherwise, the scheduler will take care of it.
742 */
743 if (p_was_empty)
744 transmit_event(p);
745 }
746
747 /*
748 * This is called every 1ms. It is used to
749 * increment the current tick counter and schedule expired events.
750 */
751 static void
752 dummynet(void * __unused unused)
753 {
754 void *p ; /* generic parameter to handler */
755 struct dn_heap *h ;
756 struct dn_heap *heaps[3];
757 int i;
758 struct dn_pipe *pe ;
759 struct timespec ts;
760 struct timeval tv;
761
762 heaps[0] = &ready_heap ; /* fixed-rate queues */
763 heaps[1] = &wfq_ready_heap ; /* wfq queues */
764 heaps[2] = &extract_heap ; /* delay line */
765
766 lck_mtx_lock(dn_mutex);
767
768 /* make all time measurements in milliseconds (ms) -
769 * here we convert secs and usecs to msecs (just divide the
770 * usecs and take the closest whole number).
771 */
772 microuptime(&tv);
773 curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
774
775 for (i=0; i < 3 ; i++) {
776 h = heaps[i];
777 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
778 if (h->p[0].key > curr_time)
779 printf("dummynet: warning, heap %d is %d ticks late\n",
780 i, (int)(curr_time - h->p[0].key));
781 p = h->p[0].object ; /* store a copy before heap_extract */
782 heap_extract(h, NULL); /* need to extract before processing */
783 if (i == 0)
784 ready_event(p) ;
785 else if (i == 1) {
786 struct dn_pipe *pipe = p;
787 if (pipe->if_name[0] != '\0')
788 printf("dummynet: bad ready_event_wfq for pipe %s\n",
789 pipe->if_name);
790 else
791 ready_event_wfq(p) ;
792 } else
793 transmit_event(p);
794 }
795 }
796 /* sweep pipes trying to expire idle flow_queues */
797 for (pe = all_pipes; pe ; pe = pe->next )
798 if (pe->idle_heap.elements > 0 &&
799 DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
800 struct dn_flow_queue *q = pe->idle_heap.p[0].object ;
801
802 heap_extract(&(pe->idle_heap), NULL);
803 q->S = q->F + 1 ; /* mark timestamp as invalid */
804 pe->sum -= q->fs->weight ;
805 }
806
807 /* check the heaps to see if there's still stuff in there, and
808 * only set the timer if there are packets to process
809 */
810 timer_enabled = 0;
811 for (i=0; i < 3 ; i++) {
812 h = heaps[i];
813 if (h->elements > 0) { // set the timer
814 ts.tv_sec = 0;
815 ts.tv_nsec = 1 * 1000000; // 1ms
816 timer_enabled = 1;
817 bsd_timeout(dummynet, NULL, &ts);
818 break;
819 }
820 }
821
822 lck_mtx_unlock(dn_mutex);
823 }
824
825 /*
826 * called by an interface when tx_rdy occurs.
827 */
828 int
829 if_tx_rdy(struct ifnet *ifp)
830 {
831 struct dn_pipe *p;
832
833 lck_mtx_lock(dn_mutex);
834 for (p = all_pipes; p ; p = p->next )
835 if (p->ifp == ifp)
836 break ;
837 if (p == NULL) {
838 char buf[32];
839 sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit);
840 for (p = all_pipes; p ; p = p->next )
841 if (!strcmp(p->if_name, buf) ) {
842 p->ifp = ifp ;
843 DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf));
844 break ;
845 }
846 }
847 if (p != NULL) {
848 DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
849 ifp->if_unit, ifp->if_snd.ifq_len));
850 p->numbytes = 0 ; /* mark ready for I/O */
851 ready_event_wfq(p);
852 }
 853         lck_mtx_unlock(dn_mutex);
854
855 return 0;
856 }
857
858 /*
859 * Unconditionally expire empty queues in case of shortage.
860 * Returns the number of queues freed.
861 */
862 static int
863 expire_queues(struct dn_flow_set *fs)
864 {
865 struct dn_flow_queue *q, *prev ;
866 int i, initial_elements = fs->rq_elements ;
867 struct timeval timenow;
868
869 getmicrotime(&timenow);
870
871 if (fs->last_expired == timenow.tv_sec)
872 return 0 ;
873 fs->last_expired = timenow.tv_sec ;
874 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
875 for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
876 if (q->head != NULL || q->S != q->F+1) {
877 prev = q ;
878 q = q->next ;
879 } else { /* entry is idle, expire it */
880 struct dn_flow_queue *old_q = q ;
881
882 if (prev != NULL)
883 prev->next = q = q->next ;
884 else
885 fs->rq[i] = q = q->next ;
886 fs->rq_elements-- ;
887 FREE(old_q, M_DUMMYNET);
888 }
889 return initial_elements - fs->rq_elements ;
890 }
891
892 /*
893 * If room, create a new queue and put at head of slot i;
894 * otherwise, create or use the default queue.
895 */
896 static struct dn_flow_queue *
897 create_queue(struct dn_flow_set *fs, int i)
898 {
899 struct dn_flow_queue *q ;
900
901 if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
902 expire_queues(fs) == 0) {
903 /*
904 * No way to get room, use or create overflow queue.
905 */
906 i = fs->rq_size ;
907 if ( fs->rq[i] != NULL )
908 return fs->rq[i] ;
909 }
910 q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
911 if (q == NULL) {
912 printf("dummynet: sorry, cannot allocate queue for new flow\n");
913 return NULL ;
914 }
915 q->fs = fs ;
916 q->hash_slot = i ;
917 q->next = fs->rq[i] ;
918 q->S = q->F + 1; /* hack - mark timestamp as invalid */
919 fs->rq[i] = q ;
920 fs->rq_elements++ ;
921 return q ;
922 }
923
924 /*
 925  * Given a flow_set and a flow_id, find a matching queue
926 * after appropriate masking. The queue is moved to front
927 * so that further searches take less time.
928 */
929 static struct dn_flow_queue *
930 find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
931 {
932 int i = 0 ; /* we need i and q for new allocations */
933 struct dn_flow_queue *q, *prev;
934
935 if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
936 q = fs->rq[0] ;
937 else {
938 /* first, do the masking */
939 id->dst_ip &= fs->flow_mask.dst_ip ;
940 id->src_ip &= fs->flow_mask.src_ip ;
941 id->dst_port &= fs->flow_mask.dst_port ;
942 id->src_port &= fs->flow_mask.src_port ;
943 id->proto &= fs->flow_mask.proto ;
944 id->flags = 0 ; /* we don't care about this one */
945 /* then, hash function */
946 i = ( (id->dst_ip) & 0xffff ) ^
947 ( (id->dst_ip >> 15) & 0xffff ) ^
948 ( (id->src_ip << 1) & 0xffff ) ^
949 ( (id->src_ip >> 16 ) & 0xffff ) ^
950 (id->dst_port << 1) ^ (id->src_port) ^
951 (id->proto );
952 i = i % fs->rq_size ;
953 /* finally, scan the current list for a match */
954 searches++ ;
955 for (prev=NULL, q = fs->rq[i] ; q ; ) {
956 search_steps++;
957 if (id->dst_ip == q->id.dst_ip &&
958 id->src_ip == q->id.src_ip &&
959 id->dst_port == q->id.dst_port &&
960 id->src_port == q->id.src_port &&
961 id->proto == q->id.proto &&
962 id->flags == q->id.flags)
963 break ; /* found */
964 else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
965 /* entry is idle and not in any heap, expire it */
966 struct dn_flow_queue *old_q = q ;
967
968 if (prev != NULL)
969 prev->next = q = q->next ;
970 else
971 fs->rq[i] = q = q->next ;
972 fs->rq_elements-- ;
973 FREE(old_q, M_DUMMYNET);
974 continue ;
975 }
976 prev = q ;
977 q = q->next ;
978 }
979 if (q && prev != NULL) { /* found and not in front */
980 prev->next = q->next ;
981 q->next = fs->rq[i] ;
982 fs->rq[i] = q ;
983 }
984 }
985 if (q == NULL) { /* no match, need to allocate a new entry */
986 q = create_queue(fs, i);
987 if (q != NULL)
988 q->id = *id ;
989 }
990 return q ;
991 }
992
993 static int
994 red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
995 {
996 /*
997 * RED algorithm
998 *
999 * RED calculates the average queue size (avg) using a low-pass filter
1000 * with an exponential weighted (w_q) moving average:
1001 * avg <- (1-w_q) * avg + w_q * q_size
1002  * where q_size is the queue length (measured in bytes or packets).
1003 *
1004 * If q_size == 0, we compute the idle time for the link, and set
1005 * avg = (1 - w_q)^(idle/s)
1006 * where s is the time needed for transmitting a medium-sized packet.
1007 *
1008 * Now, if avg < min_th the packet is enqueued.
1009 * If avg > max_th the packet is dropped. Otherwise, the packet is
1010  * dropped with a probability P that is a function of avg.
1011 *
1012 */
1013
1014 int64_t p_b = 0;
1015 /* queue in bytes or packets ? */
1016 u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
1017
1018 DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
1019
1020 /* average queue size estimation */
1021 if (q_size != 0) {
1022 /*
1023 * queue is not empty, avg <- avg + (q_size - avg) * w_q
1024 */
1025 int diff = SCALE(q_size) - q->avg;
1026 int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
1027
1028 q->avg += (int) v;
1029 } else {
1030 /*
1031      * queue is empty, find how long the queue has been
1032      * empty and use a lookup table for computing
1033      * (1 - w_q)^(idle_time/s) where s is the time to send a
1034 * (small) packet.
1035 * XXX check wraps...
1036 */
1037 if (q->avg) {
1038 u_int t = (curr_time - q->q_time) / fs->lookup_step;
1039
1040 q->avg = (t < fs->lookup_depth) ?
1041 SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
1042 }
1043 }
1044 DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1045
1046 /* should i drop ? */
1047
1048 if (q->avg < fs->min_th) {
1049 q->count = -1;
1050 return 0; /* accept packet ; */
1051 }
1052 if (q->avg >= fs->max_th) { /* average queue >= max threshold */
1053 if (fs->flags_fs & DN_IS_GENTLE_RED) {
1054 /*
1055 * According to Gentle-RED, if avg is greater than max_th the
1056 * packet is dropped with a probability
1057 * p_b = c_3 * avg - c_4
1058 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1059 */
1060 p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1061 } else {
1062 q->count = -1;
1063 DPRINTF(("dummynet: - drop"));
1064 return 1 ;
1065 }
1066 } else if (q->avg > fs->min_th) {
1067 /*
1068 * we compute p_b using the linear dropping function p_b = c_1 *
1069 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1070 * max_p * min_th / (max_th - min_th)
1071 */
1072 p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1073 }
1074 if (fs->flags_fs & DN_QSIZE_IS_BYTES)
1075 p_b = (p_b * len) / fs->max_pkt_size;
1076 if (++q->count == 0)
1077 q->random = random() & 0xffff;
1078 else {
1079 /*
1080 * q->count counts packets arrived since last drop, so a greater
1081 * value of q->count means a greater packet drop probability.
1082 */
1083 if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1084 q->count = 0;
1085 DPRINTF(("dummynet: - red drop"));
1086 /* after a drop we calculate a new random value */
1087 q->random = random() & 0xffff;
1088 return 1; /* drop */
1089 }
1090 }
1091 /* end of RED algorithm */
1092 return 0 ; /* accept */
1093 }
1094
1095 static __inline
1096 struct dn_flow_set *
1097 locate_flowset(int pipe_nr, struct ip_fw *rule)
1098 {
1099 struct dn_flow_set *fs;
1100 ipfw_insn *cmd = rule->cmd + rule->act_ofs;
1101
1102 if (cmd->opcode == O_LOG)
1103 cmd += F_LEN(cmd);
1104
1105 bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));
1106
1107 if (fs != NULL)
1108 return fs;
1109
1110 if (cmd->opcode == O_QUEUE) {
1111 for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
1112 ;
1113 }
1114 else {
1115 struct dn_pipe *p1;
1116 for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
1117 ;
1118 if (p1 != NULL)
1119 fs = &(p1->fs) ;
1120 }
1121 /* record for the future */
1122 bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));
1123
1124 return fs ;
1125 }
1126
1127 /*
1128 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1129 * depending on whether WF2Q or fixed bw is used.
1130 *
1131 * pipe_nr pipe or queue the packet is destined for.
1132 * dir where shall we send the packet after dummynet.
1133 * m the mbuf with the packet
1134 * ifp the 'ifp' parameter from the caller.
1135 * NULL in ip_input, destination interface in ip_output,
1136 * real_dst in bdg_forward
1137 * ro route parameter (only used in ip_output, NULL otherwise)
1138 * dst destination address, only used by ip_output
1139 * rule matching rule, in case of multiple passes
1140 * flags flags from the caller, only used in ip_output
1141 *
1142 */
1143 static int
1144 dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
1145 {
1146 struct dn_pkt_tag *pkt;
1147 struct m_tag *mtag;
1148 struct dn_flow_set *fs;
1149 struct dn_pipe *pipe ;
1150 u_int64_t len = m->m_pkthdr.len ;
1151 struct dn_flow_queue *q = NULL ;
1152 int is_pipe;
1153 struct timespec ts;
1154 struct timeval tv;
1155
1156 #if IPFW2
1157 ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs;
1158
1159 if (cmd->opcode == O_LOG)
1160 cmd += F_LEN(cmd);
1161 is_pipe = (cmd->opcode == O_PIPE);
1162 #else
1163 is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
1164 #endif
1165
1166 pipe_nr &= 0xffff ;
1167
1168 lck_mtx_lock(dn_mutex);
1169
1170 /* make all time measurements in milliseconds (ms) -
1171 * here we convert secs and usecs to msecs (just divide the
1172 * usecs and take the closest whole number).
1173 */
1174 microuptime(&tv);
1175 curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
1176
1177 /*
1178 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1179 */
1180 fs = locate_flowset(pipe_nr, fwa->rule);
1181 if (fs == NULL)
1182 goto dropit ; /* this queue/pipe does not exist! */
1183 pipe = fs->pipe ;
1184 if (pipe == NULL) { /* must be a queue, try find a matching pipe */
1185 for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
1186 pipe = pipe->next)
1187 ;
1188 if (pipe != NULL)
1189 fs->pipe = pipe ;
1190 else {
1191 printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1192 fs->parent_nr, fs->fs_nr);
1193 goto dropit ;
1194 }
1195 }
1196 q = find_queue(fs, &(fwa->f_id));
1197 if ( q == NULL )
1198 goto dropit ; /* cannot allocate queue */
1199 /*
1200 * update statistics, then check reasons to drop pkt
1201 */
1202 q->tot_bytes += len ;
1203 q->tot_pkts++ ;
1204 if ( fs->plr && random() < fs->plr )
1205 goto dropit ; /* random pkt drop */
1206 if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1207 if (q->len_bytes > fs->qsize)
1208 goto dropit ; /* queue size overflow */
1209 } else {
1210 if (q->len >= fs->qsize)
1211 goto dropit ; /* queue count overflow */
1212 }
1213 if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1214 goto dropit ;
1215
1216 /* XXX expensive to zero, see if we can remove it*/
1217 mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1218 sizeof(struct dn_pkt_tag), M_NOWAIT);
1219 if ( mtag == NULL )
1220 goto dropit ; /* cannot allocate packet header */
1221 m_tag_prepend(m, mtag); /* attach to mbuf chain */
1222
1223 pkt = (struct dn_pkt_tag *)(mtag+1);
1224 bzero(pkt, sizeof(struct dn_pkt_tag));
1225 /* ok, i can handle the pkt now... */
1226 /* build and enqueue packet + parameters */
1227 pkt->rule = fwa->rule ;
1228 pkt->dn_dir = dir ;
1229
1230 pkt->ifp = fwa->oif;
1231 if (dir == DN_TO_IP_OUT) {
1232 /*
1233 * We need to copy *ro because for ICMP pkts (and maybe others)
1234 * the caller passed a pointer into the stack; dst might also be
1235 * a pointer into *ro so it needs to be updated.
1236 */
1237 lck_mtx_lock(rt_mtx);
1238 pkt->ro = *(fwa->ro);
1239 if (fwa->ro->ro_rt)
1240 fwa->ro->ro_rt->rt_refcnt++ ;
1241 if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */
1242 fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
1243 lck_mtx_unlock(rt_mtx);
1244
1245 pkt->dn_dst = fwa->dst;
1246 pkt->flags = fwa->flags;
1247 }
1248 if (q->head == NULL)
1249 q->head = m;
1250 else
1251 q->tail->m_nextpkt = m;
1252 q->tail = m;
1253 q->len++;
1254 q->len_bytes += len ;
1255
1256 if ( q->head != m ) /* flow was not idle, we are done */
1257 goto done;
1258 /*
1259 * If we reach this point the flow was previously idle, so we need
1260 * to schedule it. This involves different actions for fixed-rate or
1261 * WF2Q queues.
1262 */
1263 if (is_pipe) {
1264 /*
1265 * Fixed-rate queue: just insert into the ready_heap.
1266 */
1267 dn_key t = 0 ;
1268 if (pipe->bandwidth)
1269 t = SET_TICKS(m, q, pipe);
1270 q->sched_time = curr_time ;
1271 if (t == 0) /* must process it now */
1272 ready_event( q );
1273 else
1274 heap_insert(&ready_heap, curr_time + t , q );
1275 } else {
1276 /*
1277 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1278 * set S to the virtual time V for the controlling pipe, and update
1279 * the sum of weights for the pipe; otherwise, remove flow from
1280 * idle_heap and set S to max(F,V).
1281 * Second, compute finish time F = S + len/weight.
1282 * Third, if pipe was idle, update V=max(S, V).
1283 * Fourth, count one more backlogged flow.
1284 */
1285 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1286 q->S = pipe->V ;
1287 pipe->sum += fs->weight ; /* add weight of new queue */
1288 } else {
1289 heap_extract(&(pipe->idle_heap), q);
1290 q->S = MAX64(q->F, pipe->V ) ;
1291 }
1292 q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1293
1294 if (pipe->not_eligible_heap.elements == 0 &&
1295 pipe->scheduler_heap.elements == 0)
1296 pipe->V = MAX64 ( q->S, pipe->V );
1297 fs->backlogged++ ;
1298 /*
1299  * Look at eligibility. A flow is not eligible if S>V (when
1300 * this happens, it means that there is some other flow already
1301 * scheduled for the same pipe, so the scheduler_heap cannot be
1302 * empty). If the flow is not eligible we just store it in the
1303 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1304 * and possibly invoke ready_event_wfq() right now if there is
1305 * leftover credit.
1306 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1307 * and for all flows in not_eligible_heap (NEH), S_i > V .
1308 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1309 * we only need to look into NEH.
1310 */
1311 if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
1312 if (pipe->scheduler_heap.elements == 0)
1313 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1314 heap_insert(&(pipe->not_eligible_heap), q->S, q);
1315 } else {
1316 heap_insert(&(pipe->scheduler_heap), q->F, q);
1317 if (pipe->numbytes >= 0) { /* pipe is idle */
1318 if (pipe->scheduler_heap.elements != 1)
1319 printf("dummynet: OUCH! pipe should have been idle!\n");
1320 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1321 pipe->pipe_nr, (int)(q->F >> MY_M)));
1322 pipe->sched_time = curr_time ;
1323 ready_event_wfq(pipe);
1324 }
1325 }
1326 }
1327 done:
1328 /* start the timer and set global if not already set */
1329 if (!timer_enabled) {
1330 ts.tv_sec = 0;
1331 ts.tv_nsec = 1 * 1000000; // 1ms
1332 timer_enabled = 1;
1333 bsd_timeout(dummynet, NULL, &ts);
1334 }
1335
1336 lck_mtx_unlock(dn_mutex);
1337 return 0;
1338
1339 dropit:
1340 if (q)
1341 q->drops++ ;
1342 lck_mtx_unlock(dn_mutex);
1343 m_freem(m);
1344 return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
1345 }
1346
1347 /*
1348  * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT).
1349  * Doing this would probably save us the initial bzero of dn_pkt.
1350 */
1351 #define DN_FREE_PKT(_m) do { \
1352     struct m_tag *tag = m_tag_locate(_m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
1353 if (tag) { \
1354 struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \
1355 if (n->ro.ro_rt) \
1356 rtfree(n->ro.ro_rt); \
1357 } \
1358 m_tag_delete(_m, tag); \
1359 m_freem(_m); \
1360 } while (0)
1361
1362 /*
1363  * Dispose of all packets and flow_queues on a flow_set.
1364  * If all=1, also remove the RED lookup table and other storage,
1365  * including the descriptor itself.
1366  * For the flow_set embedded in a dn_pipe, the caller MUST also clean up ready_heap...
1367 */
1368 static void
1369 purge_flow_set(struct dn_flow_set *fs, int all)
1370 {
1371 struct dn_flow_queue *q, *qn ;
1372 int i ;
1373
1374 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1375
1376 for (i = 0 ; i <= fs->rq_size ; i++ ) {
1377 for (q = fs->rq[i] ; q ; q = qn ) {
1378 struct mbuf *m, *mnext;
1379
1380 mnext = q->head;
1381 while ((m = mnext) != NULL) {
1382 mnext = m->m_nextpkt;
1383 DN_FREE_PKT(m);
1384 }
1385 qn = q->next ;
1386 FREE(q, M_DUMMYNET);
1387 }
1388 fs->rq[i] = NULL ;
1389 }
1390 fs->rq_elements = 0 ;
1391 if (all) {
1392 /* RED - free lookup table */
1393 if (fs->w_q_lookup)
1394 FREE(fs->w_q_lookup, M_DUMMYNET);
1395 if (fs->rq)
1396 FREE(fs->rq, M_DUMMYNET);
1397 /* if this fs is not part of a pipe, free it */
1398 if (fs->pipe && fs != &(fs->pipe->fs) )
1399 FREE(fs, M_DUMMYNET);
1400 }
1401 }
1402
1403 /*
1404  * Dispose of all packets queued on a pipe (not a flow_set).
1405  * Also free all resources associated with a pipe, which is about
1406  * to be deleted.
1407 */
1408 static void
1409 purge_pipe(struct dn_pipe *pipe)
1410 {
1411 struct mbuf *m, *mnext;
1412
1413 purge_flow_set( &(pipe->fs), 1 );
1414
1415 mnext = pipe->head;
1416 while ((m = mnext) != NULL) {
1417 mnext = m->m_nextpkt;
1418 DN_FREE_PKT(m);
1419 }
1420
1421 heap_free( &(pipe->scheduler_heap) );
1422 heap_free( &(pipe->not_eligible_heap) );
1423 heap_free( &(pipe->idle_heap) );
1424 }
1425
1426 /*
1427  * Delete all pipes and heaps, returning memory. Must also
1428 * remove references from all ipfw rules to all pipes.
1429 */
1430 static void
1431 dummynet_flush()
1432 {
1433 struct dn_pipe *curr_p, *p ;
1434 struct dn_flow_set *fs, *curr_fs;
1435
1436 lck_mtx_lock(dn_mutex);
1437
1438 /* remove all references to pipes ...*/
1439 flush_pipe_ptrs(NULL);
1440 /* prevent future matches... */
1441 p = all_pipes ;
1442 all_pipes = NULL ;
1443 fs = all_flow_sets ;
1444 all_flow_sets = NULL ;
1445 /* and free heaps so we don't have unwanted events */
1446 heap_free(&ready_heap);
1447 heap_free(&wfq_ready_heap);
1448 heap_free(&extract_heap);
1449
1450 /*
1451 * Now purge all queued pkts and delete all pipes
1452 */
1453 /* scan and purge all flow_sets. */
1454 for ( ; fs ; ) {
1455 curr_fs = fs ;
1456 fs = fs->next ;
1457 purge_flow_set(curr_fs, 1);
1458 }
1459 for ( ; p ; ) {
1460 purge_pipe(p);
1461 curr_p = p ;
1462 p = p->next ;
1463 FREE(curr_p, M_DUMMYNET);
1464 }
1465 lck_mtx_unlock(dn_mutex);
1466 }
1467
1468
1469 extern struct ip_fw *ip_fw_default_rule ;
1470 static void
1471 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
1472 {
1473 int i ;
1474 struct dn_flow_queue *q ;
1475 struct mbuf *m ;
1476
1477 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
1478 for (q = fs->rq[i] ; q ; q = q->next )
1479 for (m = q->head ; m ; m = m->m_nextpkt ) {
1480 struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1481 if (pkt->rule == r)
1482 pkt->rule = ip_fw_default_rule ;
1483 }
1484 }
1485 /*
1486  * when a firewall rule is deleted, scan all queues and re-point packets
1487  * that reference the deleted rule to the default rule.
1488 */
1489 void
1490 dn_rule_delete(void *r)
1491 {
1492 struct dn_pipe *p ;
1493 struct dn_flow_set *fs ;
1494 struct dn_pkt_tag *pkt ;
1495 struct mbuf *m ;
1496
1497 lck_mtx_lock(dn_mutex);
1498
1499 /*
1500 * If the rule references a queue (dn_flow_set), then scan
1501 * the flow set, otherwise scan pipes. Should do either, but doing
1502 * both does not harm.
1503 */
1504 for ( fs = all_flow_sets ; fs ; fs = fs->next )
1505 dn_rule_delete_fs(fs, r);
1506 for ( p = all_pipes ; p ; p = p->next ) {
1507 fs = &(p->fs) ;
1508 dn_rule_delete_fs(fs, r);
1509 for (m = p->head ; m ; m = m->m_nextpkt ) {
1510 pkt = dn_tag_get(m) ;
1511 if (pkt->rule == r)
1512 pkt->rule = ip_fw_default_rule ;
1513 }
1514 }
1515 lck_mtx_unlock(dn_mutex);
1516 }
1517
1518 /*
1519 * setup RED parameters
1520 */
1521 static int
1522 config_red(struct dn_flow_set *p, struct dn_flow_set * x)
1523 {
1524 int i;
1525
1526 x->w_q = p->w_q;
1527 x->min_th = SCALE(p->min_th);
1528 x->max_th = SCALE(p->max_th);
1529 x->max_p = p->max_p;
1530
1531 x->c_1 = p->max_p / (p->max_th - p->min_th);
1532 x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1533 if (x->flags_fs & DN_IS_GENTLE_RED) {
1534 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1535 x->c_4 = (SCALE(1) - 2 * p->max_p);
1536 }
1537
1538     /* if the lookup table already exists, free and create it again */
1539 if (x->w_q_lookup) {
1540 FREE(x->w_q_lookup, M_DUMMYNET);
1541 x->w_q_lookup = NULL ;
1542 }
1543 if (red_lookup_depth == 0) {
1544 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1545 FREE(x, M_DUMMYNET);
1546 return EINVAL;
1547 }
1548 x->lookup_depth = red_lookup_depth;
1549 x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
1550 M_DUMMYNET, M_DONTWAIT);
1551 if (x->w_q_lookup == NULL) {
1552 printf("dummynet: sorry, cannot allocate red lookup table\n");
1553 FREE(x, M_DUMMYNET);
1554 return ENOSPC;
1555 }
1556
1557 /* fill the lookup table with (1 - w_q)^x */
1558 x->lookup_step = p->lookup_step ;
1559 x->lookup_weight = p->lookup_weight ;
1560 x->w_q_lookup[0] = SCALE(1) - x->w_q;
1561 for (i = 1; i < x->lookup_depth; i++)
1562 x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1563 if (red_avg_pkt_size < 1)
1564 red_avg_pkt_size = 512 ;
1565 x->avg_pkt_size = red_avg_pkt_size ;
1566 if (red_max_pkt_size < 1)
1567 red_max_pkt_size = 1500 ;
1568 x->max_pkt_size = red_max_pkt_size ;
1569 return 0 ;
1570 }
1571
1572 static int
1573 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1574 {
1575 if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
1576 int l = pfs->rq_size;
1577
1578 if (l == 0)
1579 l = dn_hash_size;
1580 if (l < 4)
1581 l = 4;
1582 else if (l > DN_MAX_HASH_SIZE)
1583 l = DN_MAX_HASH_SIZE;
1584 x->rq_size = l;
1585 } else /* one is enough for null mask */
1586 x->rq_size = 1;
1587 x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1588 M_DUMMYNET, M_DONTWAIT | M_ZERO);
1589 if (x->rq == NULL) {
1590 printf("dummynet: sorry, cannot allocate queue\n");
1591 return ENOSPC;
1592 }
1593 x->rq_elements = 0;
1594 return 0 ;
1595 }
1596
1597 static void
1598 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1599 {
1600 x->flags_fs = src->flags_fs;
1601 x->qsize = src->qsize;
1602 x->plr = src->plr;
1603 x->flow_mask = src->flow_mask;
1604 if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1605 if (x->qsize > 1024*1024)
1606 x->qsize = 1024*1024 ;
1607 } else {
1608 if (x->qsize == 0)
1609 x->qsize = 50 ;
1610 if (x->qsize > 100)
1611 x->qsize = 50 ;
1612 }
1613 /* configuring RED */
1614 if ( x->flags_fs & DN_IS_RED )
1615 config_red(src, x) ; /* XXX should check errors */
1616 }
1617
1618 /*
1619 * setup pipe or queue parameters.
1620 */
1621
1622 static int
1623 config_pipe(struct dn_pipe *p)
1624 {
1625 int i, r;
1626 struct dn_flow_set *pfs = &(p->fs);
1627 struct dn_flow_queue *q;
1628
1629 /*
1630 * The config program passes parameters as follows:
1631 * bw = bits/second (0 means no limits),
1632 * delay = ms, must be translated into ticks.
1633 * qsize = slots/bytes
1634 */
1635 p->delay = ( p->delay * (hz*10) ) / 1000 ;
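    /* e.g. with hz == 100 a 50 ms delay becomes (50 * 1000) / 1000 = 50 ticks,
     * since the 1 ms timer makes one tick equal to 1 ms */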
1636 /* We need either a pipe number or a flow_set number */
1637 if (p->pipe_nr == 0 && pfs->fs_nr == 0)
1638 return EINVAL ;
1639 if (p->pipe_nr != 0 && pfs->fs_nr != 0)
1640 return EINVAL ;
1641 if (p->pipe_nr != 0) { /* this is a pipe */
1642 struct dn_pipe *x, *a, *b;
1643
1644 lck_mtx_lock(dn_mutex);
1645 /* locate pipe */
1646 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1647 a = b , b = b->next) ;
1648
1649 if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
1650 x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ;
1651 if (x == NULL) {
1652 lck_mtx_unlock(dn_mutex);
1653 printf("dummynet: no memory for new pipe\n");
1654 return ENOSPC;
1655 }
1656 x->pipe_nr = p->pipe_nr;
1657 x->fs.pipe = x ;
1658 /* idle_heap is the only one from which we extract from the middle.
1659 */
1660 x->idle_heap.size = x->idle_heap.elements = 0 ;
1661 x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
1662 } else {
1663 x = b;
1664 /* Flush accumulated credit for all queues */
1665 for (i = 0; i <= x->fs.rq_size; i++)
1666 for (q = x->fs.rq[i]; q; q = q->next)
1667 q->numbytes = 0;
1668 }
1669
1670 x->bandwidth = p->bandwidth ;
1671 x->numbytes = 0; /* just in case... */
1672 bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
1673 x->ifp = NULL ; /* reset interface ptr */
1674 x->delay = p->delay ;
1675 set_fs_parms(&(x->fs), pfs);
1676
1677
1678 if ( x->fs.rq == NULL ) { /* a new pipe */
1679 r = alloc_hash(&(x->fs), pfs) ;
1680 if (r) {
1681 lck_mtx_unlock(dn_mutex);
1682 FREE(x, M_DUMMYNET);
1683 return r ;
1684 }
1685 x->next = b ;
1686 if (a == NULL)
1687 all_pipes = x ;
1688 else
1689 a->next = x ;
1690 }
1691 lck_mtx_unlock(dn_mutex);
1692 } else { /* config queue */
1693 struct dn_flow_set *x, *a, *b ;
1694
1695 lck_mtx_lock(dn_mutex);
1696 /* locate flow_set */
1697 for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
1698 a = b , b = b->next) ;
1699
1700 if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
1701 if (pfs->parent_nr == 0) { /* need link to a pipe */
1702 lck_mtx_unlock(dn_mutex);
1703 return EINVAL ;
1704 }
1705 x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
1706 if (x == NULL) {
1707 lck_mtx_unlock(dn_mutex);
1708 printf("dummynet: no memory for new flow_set\n");
1709 return ENOSPC;
1710 }
1711 x->fs_nr = pfs->fs_nr;
1712 x->parent_nr = pfs->parent_nr;
1713 x->weight = pfs->weight ;
1714 if (x->weight == 0)
1715 x->weight = 1 ;
1716 else if (x->weight > 100)
1717 x->weight = 100 ;
1718 } else {
1719 /* Change parent pipe not allowed; must delete and recreate */
1720 if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
1721 lck_mtx_unlock(dn_mutex);
1722 return EINVAL ;
1723 }
1724 x = b;
1725 }
1726 set_fs_parms(x, pfs);
1727
1728 if ( x->rq == NULL ) { /* a new flow_set */
1729 r = alloc_hash(x, pfs) ;
1730 if (r) {
1731 lck_mtx_unlock(dn_mutex);
1732 FREE(x, M_DUMMYNET);
1733 return r ;
1734 }
1735 x->next = b;
1736 if (a == NULL)
1737 all_flow_sets = x;
1738 else
1739 a->next = x;
1740 }
1741 lck_mtx_unlock(dn_mutex);
1742 }
1743 return 0 ;
1744 }
1745
1746 /*
1747 * Helper function to remove from a heap queues which are linked to
1748 * a flow_set about to be deleted.
1749 */
1750 static void
1751 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
1752 {
1753 int i = 0, found = 0 ;
1754 for (; i < h->elements ;)
1755 if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
1756 h->elements-- ;
1757 h->p[i] = h->p[h->elements] ;
1758 found++ ;
1759 } else
1760 i++ ;
1761 if (found)
1762 heapify(h);
1763 }
1764
1765 /*
1766 * helper function to remove a pipe from a heap (can be there at most once)
1767 */
1768 static void
1769 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
1770 {
1771 if (h->elements > 0) {
1772 int i = 0 ;
1773 for (i=0; i < h->elements ; i++ ) {
1774 if (h->p[i].object == p) { /* found it */
1775 h->elements-- ;
1776 h->p[i] = h->p[h->elements] ;
1777 heapify(h);
1778 break ;
1779 }
1780 }
1781 }
1782 }
1783
1784 /*
1785 * drain all queues. Called in case of severe mbuf shortage.
1786 */
1787 void
1788 dummynet_drain()
1789 {
1790 struct dn_flow_set *fs;
1791 struct dn_pipe *p;
1792 struct mbuf *m, *mnext;
1793
1794 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1795
1796 heap_free(&ready_heap);
1797 heap_free(&wfq_ready_heap);
1798 heap_free(&extract_heap);
1799 /* remove all references to this pipe from flow_sets */
1800 for (fs = all_flow_sets; fs; fs= fs->next )
1801 purge_flow_set(fs, 0);
1802
1803 for (p = all_pipes; p; p= p->next ) {
1804 purge_flow_set(&(p->fs), 0);
1805
1806 mnext = p->head;
1807 while ((m = mnext) != NULL) {
1808 mnext = m->m_nextpkt;
1809 DN_FREE_PKT(m);
1810 }
1811 p->head = p->tail = NULL ;
1812 }
1813 }
1814
1815 /*
1816 * Fully delete a pipe or a queue, cleaning up associated info.
1817 */
1818 static int
1819 delete_pipe(struct dn_pipe *p)
1820 {
1821 if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
1822 return EINVAL ;
1823 if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
1824 return EINVAL ;
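/*
 * Exactly one of pipe_nr and fs_nr is set at this point: a non-zero
 * pipe_nr selects an old-style pipe, a non-zero fs_nr selects a WF2Q
 * queue (dn_flow_set).
 */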
1825 if (p->pipe_nr != 0) { /* this is an old-style pipe */
1826 struct dn_pipe *a, *b;
1827 struct dn_flow_set *fs;
1828
1829 lck_mtx_lock(dn_mutex);
1830 /* locate pipe */
1831 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1832 a = b , b = b->next) ;
1833 if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
1834 lck_mtx_unlock(dn_mutex);
1835 return EINVAL ; /* not found */
1836 }
1837
1838 /* unlink from list of pipes */
1839 if (a == NULL)
1840 all_pipes = b->next ;
1841 else
1842 a->next = b->next ;
1843 /* remove references to this pipe from the ip_fw rules. */
1844 flush_pipe_ptrs(&(b->fs));
1845
1846 /* remove all references to this pipe from flow_sets */
1847 for (fs = all_flow_sets; fs; fs= fs->next )
1848 if (fs->pipe == b) {
1849 printf("dummynet: ++ ref to pipe %d from fs %d\n",
1850 p->pipe_nr, fs->fs_nr);
1851 fs->pipe = NULL ;
1852 purge_flow_set(fs, 0);
1853 }
1854 fs_remove_from_heap(&ready_heap, &(b->fs));
1855 purge_pipe(b); /* remove all data associated to this pipe */
1856 /* remove reference to here from extract_heap and wfq_ready_heap */
1857 pipe_remove_from_heap(&extract_heap, b);
1858 pipe_remove_from_heap(&wfq_ready_heap, b);
1859 lck_mtx_unlock(dn_mutex);
1860
1861 FREE(b, M_DUMMYNET);
1862 } else { /* this is a WF2Q queue (dn_flow_set) */
1863 struct dn_flow_set *a, *b;
1864
1865 lck_mtx_lock(dn_mutex);
1866 /* locate set */
1867 for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
1868 a = b , b = b->next) ;
1869 if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
1870 lck_mtx_unlock(dn_mutex);
1871 return EINVAL ; /* not found */
1872 }
1873
1874 if (a == NULL)
1875 all_flow_sets = b->next ;
1876 else
1877 a->next = b->next ;
1878 /* remove references to this flow_set from the ip_fw rules. */
1879 flush_pipe_ptrs(b);
1880
1881 if (b->pipe != NULL) {
1882 /* Update total weight on parent pipe and cleanup parent heaps */
1883 b->pipe->sum -= b->weight * b->backlogged ;
1884 fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
1885 fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
1886 #if 1 /* XXX should i remove from idle_heap as well ? */
1887 fs_remove_from_heap(&(b->pipe->idle_heap), b);
1888 #endif
1889 }
1890 purge_flow_set(b, 1);
1891 lck_mtx_unlock(dn_mutex);
1892 }
1893 return 0 ;
1894 }
1895
1896 /*
1897 * Helper function used to copy flow queue data into the kernel buffer returned by DUMMYNET_GET.
1898 */
1899 static char *
1900 dn_copy_set(struct dn_flow_set *set, char *bp)
1901 {
1902 int i, copied = 0 ;
1903 struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
1904
1905 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
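/*
 * Walk every hash bucket of the flow_set and copy each queue descriptor
 * into the output buffer, sanity-checking the hash slot and the back
 * pointer to the flow_set as we go.
 */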
1906
1907 for (i = 0 ; i <= set->rq_size ; i++)
1908 for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
1909 if (q->hash_slot != i)
1910 printf("dummynet: ++ at %d: wrong slot (have %d, "
1911 "should be %d)\n", copied, q->hash_slot, i);
1912 if (q->fs != set)
1913 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1914 i, q->fs, set);
1915 copied++ ;
1916 bcopy(q, qp, sizeof( *q ) );
1917 /* cleanup pointers */
1918 qp->next = NULL ;
1919 qp->head = qp->tail = NULL ;
1920 qp->fs = NULL ;
1921 }
1922 if (copied != set->rq_elements)
1923 printf("dummynet: ++ wrong count, have %d should be %d\n",
1924 copied, set->rq_elements);
1925 return (char *)qp ;
1926 }
1927
1928 static size_t
1929 dn_calc_size(void)
1930 {
1931 struct dn_flow_set *set ;
1932 struct dn_pipe *p ;
1933 size_t size ;
1934
1935 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1936
1937 /*
1938 * compute size of data structures: list of pipes and flow_sets.
1939 */
1940 for (p = all_pipes, size = 0 ; p ; p = p->next )
1941 size += sizeof( *p ) +
1942 p->fs.rq_elements * sizeof(struct dn_flow_queue);
1943 for (set = all_flow_sets ; set ; set = set->next )
1944 size += sizeof ( *set ) +
1945 set->rq_elements * sizeof(struct dn_flow_queue);
1946 return size ;
1947 }
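/*
 * Note: the size computed above must match exactly the amount of data
 * laid out by dummynet_get() below: one descriptor per pipe or flow_set,
 * followed by rq_elements struct dn_flow_queue records for each of them.
 */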
1948
1949 static int
1950 dummynet_get(struct sockopt *sopt)
1951 {
1952 char *buf, *bp ; /* bp is the "copy-pointer" */
1953 size_t size ;
1954 struct dn_flow_set *set ;
1955 struct dn_pipe *p ;
1956 int error=0, i ;
1957
1958 /* XXX lock held too long */
1959 lck_mtx_lock(dn_mutex);
1960 /*
1961 * XXX: Ugly, but we need to allocate memory with the M_WAITOK flag,
1962 * and we cannot sleep for memory while holding the mutex.
1963 */
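/*
 * Retry up to 10 times: compute the required size under the lock, drop
 * the lock to allocate, then re-take the lock and accept the buffer only
 * if the size has not changed in the meantime.
 */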
1964 for (i = 0; i < 10; i++) {
1965 size = dn_calc_size();
1966 lck_mtx_unlock(dn_mutex);
1967 buf = _MALLOC(size, M_TEMP, M_WAITOK);
1968 lck_mtx_lock(dn_mutex);
1969 if (size == dn_calc_size())
1970 break;
1971 FREE(buf, M_TEMP);
1972 buf = NULL;
1973 }
1974 if (buf == NULL) {
1975 lck_mtx_unlock(dn_mutex);
1976 return ENOBUFS ;
1977 }
1978 for (p = all_pipes, bp = buf ; p ; p = p->next ) {
1979 struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;
1980
1981 /*
1982 * Copy the pipe descriptor into *bp, convert delay back to ms,
1983 * then copy the flow_set descriptor(s) one at a time.
1984 * After each flow_set, copy the queue descriptors it owns.
1985 */
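/*
 * The resulting buffer layout is, in order:
 *   [dn_pipe 1][queues of pipe 1] ... [dn_pipe N][queues of pipe N]
 *   [dn_flow_set 1][its queues] ... [dn_flow_set M][its queues]
 * with DN_IS_PIPE / DN_IS_QUEUE stored in the first field of each record
 * so that userland can tell the two kinds apart.
 */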
1986 bcopy(p, bp, sizeof( *p ) );
1987 pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ;
1988 /*
1989 * XXX the following is a hack based on ->next being the
1990 * first field in dn_pipe and dn_flow_set. The correct
1991 * solution would be to move the dn_flow_set to the beginning
1992 * of struct dn_pipe.
1993 */
1994 pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
1995 /* clean pointers */
1996 pipe_bp->head = pipe_bp->tail = NULL ;
1997 pipe_bp->fs.next = NULL ;
1998 pipe_bp->fs.pipe = NULL ;
1999 pipe_bp->fs.rq = NULL ;
2000
2001 bp += sizeof( *p ) ;
2002 bp = dn_copy_set( &(p->fs), bp );
2003 }
2004 for (set = all_flow_sets ; set ; set = set->next ) {
2005 struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
2006 bcopy(set, bp, sizeof( *set ) );
2007 /* XXX same hack as above */
2008 fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
2009 fs_bp->pipe = NULL ;
2010 fs_bp->rq = NULL ;
2011 bp += sizeof( *set ) ;
2012 bp = dn_copy_set( set, bp );
2013 }
2014 lck_mtx_unlock(dn_mutex);
2015
2016 error = sooptcopyout(sopt, buf, size);
2017 FREE(buf, M_TEMP);
2018 return error ;
2019 }
2020
2021 /*
2022 * Handler for the various dummynet socket options (get, flush, config, del)
2023 */
2024 static int
2025 ip_dn_ctl(struct sockopt *sopt)
2026 {
2027 int error = 0 ;
2028 struct dn_pipe *p, tmp_pipe;
2029
2030 /* Disallow sets in really-really secure mode. */
2031 if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
2032 return (EPERM);
2033
2034 switch (sopt->sopt_name) {
2035 default :
2036 printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
2037 return EINVAL ;
2038
2039 case IP_DUMMYNET_GET :
2040 error = dummynet_get(sopt);
2041 break ;
2042
2043 case IP_DUMMYNET_FLUSH :
2044 dummynet_flush() ;
2045 break ;
2046
2047 case IP_DUMMYNET_CONFIGURE :
2048 p = &tmp_pipe ;
2049 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2050 if (error)
2051 break ;
2052 error = config_pipe(p);
2053 break ;
2054
2055 case IP_DUMMYNET_DEL : /* remove a pipe or queue */
2056 p = &tmp_pipe ;
2057 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2058 if (error)
2059 break ;
2060
2061 error = delete_pipe(p);
2062 break ;
2063 }
2064 return error ;
2065 }
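/*
 * Userland reaches this handler through the IP_DUMMYNET_* options on a
 * raw IP socket.  A minimal configuration sketch (not part of this file;
 * the socket type and field values are illustrative assumptions, see
 * config_pipe() for what is actually honoured):
 *
 *	struct dn_pipe pipe;
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	bzero(&pipe, sizeof(pipe));
 *	pipe.pipe_nr = 1;		// pipe to create or reconfigure
 *	pipe.bandwidth = 500000;	// illustrative rate
 *	setsockopt(s, IPPROTO_IP, IP_DUMMYNET_CONFIGURE, &pipe, sizeof(pipe));
 *
 * IP_DUMMYNET_GET is the matching getsockopt() and returns the flattened
 * pipe/flow_set list built by dummynet_get() above.
 */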
2066
2067 void
2068 ip_dn_init(void)
2069 {
2070 /* setup locks */
2071 dn_mutex_grp_attr = lck_grp_attr_alloc_init();
2072 dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
2073 dn_mutex_attr = lck_attr_alloc_init();
2074
2075 if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) {
2076 printf("ip_dn_init: can't alloc dn_mutex\n");
2077 return;
2078 }
2079
2080 all_pipes = NULL ;
2081 all_flow_sets = NULL ;
2082 ready_heap.size = ready_heap.elements = 0 ;
2083 ready_heap.offset = 0 ;
2084
2085 wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
2086 wfq_ready_heap.offset = 0 ;
2087
2088 extract_heap.size = extract_heap.elements = 0 ;
2089 extract_heap.offset = 0 ;
2090 ip_dn_ctl_ptr = ip_dn_ctl;
2091 ip_dn_io_ptr = dummynet_io;
2092 ip_dn_ruledel_ptr = dn_rule_delete;
2093 }
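/*
 * Note on the hooks installed above: ip_dn_io_ptr is called from the IP
 * input/output paths when an ipfw rule sends a packet to a pipe or queue,
 * ip_dn_ctl_ptr serves the IP_DUMMYNET_* socket options (ip_dn_ctl()
 * above), and ip_dn_ruledel_ptr lets ipfw notify dummynet when a rule is
 * deleted so that queued packets no longer reference it.
 */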