1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
30 * Portions Copyright (c) 2000 Akamba Corp.
31 * All rights reserved
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
55 */
56
57 #define DUMMYNET_DEBUG
58
59 /*
60 * This module implements IP dummynet, a bandwidth limiter/delay emulator
61 * used in conjunction with the ipfw package.
62 * Description of the data structures used is in ip_dummynet.h
63 * Here you mainly find the following blocks of code:
64 * + variable declarations;
65 * + heap management functions;
66 * + scheduler and dummynet functions;
67 * + configuration and initialization.
68 *
69 * NOTA BENE: critical sections are protected by the "dummynet lock".
70 *
71 * Most important Changes:
72 *
73 * 010124: Fixed WF2Q behaviour
74 * 010122: Fixed spl protection.
75 * 000601: WF2Q support
76 * 000106: large rewrite, use heaps to handle very many pipes.
77 * 980513: initial release
78 *
79 * include files marked with XXX are probably not needed
80 */
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/malloc.h>
85 #include <sys/mbuf.h>
86 #include <sys/queue.h> /* XXX */
87 #include <sys/kernel.h>
88 #include <sys/socket.h>
89 #include <sys/socketvar.h>
90 #include <sys/time.h>
91 #include <sys/sysctl.h>
92 #include <net/if.h>
93 #include <net/route.h>
94 #include <net/kpi_protocol.h>
95 #include <netinet/in.h>
96 #include <netinet/in_systm.h>
97 #include <netinet/in_var.h>
98 #include <netinet/ip.h>
99 #include <netinet/ip_fw.h>
100 #include <netinet/ip_dummynet.h>
101 #include <netinet/ip_var.h>
102
103 #if BRIDGE
104 #include <netinet/if_ether.h> /* for struct arpcom */
105 #include <net/bridge.h>
106 #endif
107
108 /*
109 * We keep a private variable for the simulation time, but we could
110 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
111 */
112 static dn_key curr_time = 0 ; /* current simulation time */
113
114 static int dn_hash_size = 64 ; /* default hash size */
115
116 /* statistics on number of queue searches and search steps */
117 static int searches, search_steps ;
118 static int pipe_expire = 1 ; /* expire queue if empty */
119 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
120
121 static int red_lookup_depth = 256; /* RED - default lookup table depth */
122 static int red_avg_pkt_size = 512; /* RED - default medium packet size */
123 static int red_max_pkt_size = 1500; /* RED - default max packet size */
124
125 /*
126 * Three heaps contain queues and pipes that the scheduler handles:
127 *
128 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
129 *
130 * wfq_ready_heap contains the pipes associated with WF2Q flows
131 *
132 * extract_heap contains pipes associated with delay lines.
133 *
134 */
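/*
 * (Summary added for clarity; see the scheduler functions below:
 * ready_heap is keyed on the finish time of the head packet of each
 * fixed-rate queue, wfq_ready_heap on the time a WF2Q pipe next has
 * credit, and extract_heap on the delivery time of the head packet
 * sitting in a delay line.)
 */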
135 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
136
137 static int heap_init(struct dn_heap *h, int size) ;
138 static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
139 static void heap_extract(struct dn_heap *h, void *obj);
140
141 static void transmit_event(struct dn_pipe *pipe);
142 static void ready_event(struct dn_flow_queue *q);
143
144 static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */
145 static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
146
147 #ifdef SYSCTL_NODE
148 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
149 CTLFLAG_RW, 0, "Dummynet");
150 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
151 CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
152 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
153 CTLFLAG_RD, &curr_time, 0, "Current tick");
154 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
155 CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
156 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
157 CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
158 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
159 CTLFLAG_RD, &searches, 0, "Number of queue searches");
160 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
161 CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
162 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
163 CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
164 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
165 CTLFLAG_RW, &dn_max_ratio, 0,
166 "Max ratio between dynamic queues and buckets");
167 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
168 CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
169 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
170 CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
171 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
172 CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
173 #endif
174
175 #ifdef DUMMYNET_DEBUG
176 int dummynet_debug = 0;
177 #ifdef SYSCTL_NODE
178 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
179 0, "control debugging printfs");
180 #endif
181 #define DPRINTF(X) if (dummynet_debug) printf X
182 #else
183 #define DPRINTF(X)
184 #endif
185
186 /* dummynet lock */
187 lck_grp_t *dn_mutex_grp;
188 lck_grp_attr_t *dn_mutex_grp_attr;
189 lck_attr_t *dn_mutex_attr;
190 lck_mtx_t *dn_mutex;
191
192 static int config_pipe(struct dn_pipe *p);
193 static int ip_dn_ctl(struct sockopt *sopt);
194
195 static void dummynet(void *);
196 static void dummynet_flush(void);
197 void dummynet_drain(void);
198 static ip_dn_io_t dummynet_io;
199 static void dn_rule_delete(void *);
200
201 int if_tx_rdy(struct ifnet *ifp);
202
203 extern lck_mtx_t *rt_mtx; /* route global lock */
204
205 /*
206 * Heap management functions.
207 *
208 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
209 * Some macros help finding parent/children so we can optimize them.
210 *
211 * heap_init() is called to expand the heap when needed.
212 * Increment size in blocks of 16 entries.
213 * XXX failure to allocate a new element is a pretty bad failure
214 * as we basically stall a whole queue forever!!
215 * Returns 1 on error, 0 on success
216 */
217 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
218 #define HEAP_LEFT(x) ( 2*(x) + 1 )
219 #define HEAP_IS_LEFT(x) ( (x) & 1 )
220 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
221 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
222 #define HEAP_INCREMENT 15
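/*
 * Worked example (added for illustration, not in the original code):
 * with 0-based indexing, node 4 has father (4-1)/2 = 1 and children
 * 2*4+1 = 9 and 2*4+2 = 10; HEAP_IS_LEFT(9) is true since left children
 * always sit at odd indices.  heap_init() below rounds sizes up to a
 * multiple of 16, so a request for 17 entries allocates room for 32.
 */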
223
224 static int
225 heap_init(struct dn_heap *h, int new_size)
226 {
227 struct dn_heap_entry *p;
228
229 if (h->size >= new_size ) {
230 printf("dummynet: heap_init, Bogus call, have %d want %d\n",
231 h->size, new_size);
232 return 0 ;
233 }
234 new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
235 p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
236 if (p == NULL) {
237 printf("dummynet: heap_init, resize %d failed\n", new_size );
238 return 1 ; /* error */
239 }
240 if (h->size > 0) {
241 bcopy(h->p, p, h->size * sizeof(*p) );
242 FREE(h->p, M_DUMMYNET);
243 }
244 h->p = p ;
245 h->size = new_size ;
246 return 0 ;
247 }
248
249 /*
250 * Insert element in heap. Normally, p != NULL: we insert p at a
251 * new position and bubble up. If p == NULL, the element is already
252 * in place, and key1 is the position from which to start the
253 * bubble-up.
254 * Returns 1 on failure (cannot allocate new heap entry)
255 *
256 * If offset > 0 the position (index, int) of the element in the heap is
257 * also stored in the element itself at the given offset in bytes.
258 */
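/*
 * (Added note) In this file the offset mechanism is used by the per-pipe
 * idle_heap: config_pipe() sets its offset to
 * OFFSET_OF(struct dn_flow_queue, heap_pos), so heap_extract() can later
 * remove a specific queue from the middle of that heap.
 */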
259 #define SET_OFFSET(heap, node) \
260 if (heap->offset > 0) \
261 *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
262 /*
263 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
264 */
265 #define RESET_OFFSET(heap, node) \
266 if (heap->offset > 0) \
267 *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
268 static int
269 heap_insert(struct dn_heap *h, dn_key key1, void *p)
270 {
271 int son = h->elements ;
272
273 if (p == NULL) /* data already there, set starting point */
274 son = key1 ;
275 else { /* insert new element at the end, possibly resize */
276 son = h->elements ;
277 if (son == h->size) /* need resize... */
278 if (heap_init(h, h->elements+1) )
279 return 1 ; /* failure... */
280 h->p[son].object = p ;
281 h->p[son].key = key1 ;
282 h->elements++ ;
283 }
284 while (son > 0) { /* bubble up */
285 int father = HEAP_FATHER(son) ;
286 struct dn_heap_entry tmp ;
287
288 if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
289 break ; /* found right position */
290 /* son smaller than father, swap and repeat */
291 HEAP_SWAP(h->p[son], h->p[father], tmp) ;
292 SET_OFFSET(h, son);
293 son = father ;
294 }
295 SET_OFFSET(h, son);
296 return 0 ;
297 }
298
299 /*
300 * remove top element from heap, or obj if obj != NULL
301 */
302 static void
303 heap_extract(struct dn_heap *h, void *obj)
304 {
305 int child, father, max = h->elements - 1 ;
306
307 if (max < 0) {
308 printf("dummynet: warning, extract from empty heap 0x%p\n", h);
309 return ;
310 }
311 father = 0 ; /* default: move up smallest child */
312 if (obj != NULL) { /* extract specific element, index is at offset */
313 if (h->offset <= 0)
314 panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
315 father = *((int *)((char *)obj + h->offset)) ;
316 if (father < 0 || father >= h->elements) {
317 printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
318 father, h->elements);
319 panic("dummynet: heap_extract");
320 }
321 }
322 RESET_OFFSET(h, father);
323 child = HEAP_LEFT(father) ; /* left child */
324 while (child <= max) { /* valid entry */
325 if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
326 child = child+1 ; /* take right child, otherwise left */
327 h->p[father] = h->p[child] ;
328 SET_OFFSET(h, father);
329 father = child ;
330 child = HEAP_LEFT(child) ; /* left child for next loop */
331 }
332 h->elements-- ;
333 if (father != max) {
334 /*
335 * Fill hole with last entry and bubble up, reusing the insert code
336 */
337 h->p[father] = h->p[max] ;
338 heap_insert(h, father, NULL); /* this one cannot fail */
339 }
340 }
341
342 #if 0
343 /*
344 * change object position and update references
345 * XXX this one is never used!
346 */
347 static void
348 heap_move(struct dn_heap *h, dn_key new_key, void *object)
349 {
350 int temp;
351 int i ;
352 int max = h->elements-1 ;
353 struct dn_heap_entry buf ;
354
355 if (h->offset <= 0)
356 panic("cannot move items on this heap");
357
358 i = *((int *)((char *)object + h->offset));
359 if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
360 h->p[i].key = new_key ;
361 for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
362 i = temp ) { /* bubble up */
363 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
364 SET_OFFSET(h, i);
365 }
366 } else { /* must move down */
367 h->p[i].key = new_key ;
368 while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
369 if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
370 temp++ ; /* select child with min key */
371 if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
372 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
373 SET_OFFSET(h, i);
374 } else
375 break ;
376 i = temp ;
377 }
378 }
379 SET_OFFSET(h, i);
380 }
381 #endif /* heap_move, unused */
382
383 /*
384 * heapify() will reorganize data inside an array to maintain the
385 * heap property. It is needed when we delete a bunch of entries.
386 */
387 static void
388 heapify(struct dn_heap *h)
389 {
390 int i ;
391
392 for (i = 0 ; i < h->elements ; i++ )
393 heap_insert(h, i , NULL) ;
394 }
395
396 /*
397 * cleanup the heap and free data structure
398 */
399 static void
400 heap_free(struct dn_heap *h)
401 {
402 if (h->size >0 )
403 FREE(h->p, M_DUMMYNET);
404 bzero(h, sizeof(*h) );
405 }
406
407 /*
408 * --- end of heap management functions ---
409 */
410
411 /*
412 * Return the mbuf tag holding the dummynet state. As an optimization
413 * this is assumed to be the first tag on the list. If this turns out
414 * wrong we'll need to search the list.
415 */
416 static struct dn_pkt_tag *
417 dn_tag_get(struct mbuf *m)
418 {
419 struct m_tag *mtag = m_tag_first(m);
420 /* KASSERT(mtag != NULL &&
421 mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
422 mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
423 ("packet on dummynet queue w/o dummynet tag!"));
424 */
425 return (struct dn_pkt_tag *)(mtag+1);
426 }
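/*
 * (Added note) The dn_pkt_tag lives immediately after the struct m_tag
 * header, which is why (mtag+1) points at it; dummynet_io() allocates
 * the tag with m_tag_alloc(..., sizeof(struct dn_pkt_tag), ...) so that
 * exactly this much room follows the header.
 */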
427
428 /*
429 * Scheduler functions:
430 *
431 * transmit_event() is called when the delay-line needs to enter
432 * the scheduler, either because of existing pkts getting ready,
433 * or new packets entering the queue. The event handled is the delivery
434 * time of the packet.
435 *
436 * ready_event() does something similar with fixed-rate queues, and the
437 * event handled is the finish time of the head pkt.
438 *
439 * wfq_ready_event() does something similar with WF2Q queues, and the
440 * event handled is the start time of the head pkt.
441 *
442 * In all cases, we make sure that the data structures are consistent
443 * before passing pkts out, because this might trigger recursive
444 * invocations of the procedures.
445 */
446 static void
447 transmit_event(struct dn_pipe *pipe)
448 {
449 struct mbuf *m ;
450 struct dn_pkt_tag *pkt ;
451 struct ip *ip;
452
453 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
454
455 while ( (m = pipe->head) ) {
456 pkt = dn_tag_get(m);
457 if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
458 break;
459 /*
460 * first unlink, then call procedures, since ip_input() can invoke
461 * ip_output() and vice versa, thus causing nested calls
462 */
463 pipe->head = m->m_nextpkt ;
464 m->m_nextpkt = NULL;
465
466 /* XXX: drop the lock for now to avoid LOR's */
467 lck_mtx_unlock(dn_mutex);
468 switch (pkt->dn_dir) {
469 case DN_TO_IP_OUT: {
470 struct route tmp_rt = pkt->ro;
471 (void)ip_output(m, NULL, NULL, pkt->flags, NULL);
472 if (tmp_rt.ro_rt) {
473 rtfree(tmp_rt.ro_rt);
474 }
475 break ;
476 }
477 case DN_TO_IP_IN :
478 ip = mtod(m, struct ip *);
479 ip->ip_len = htons(ip->ip_len);
480 ip->ip_off = htons(ip->ip_off);
481 proto_inject(PF_INET, m);
482 break ;
483
484 #if BRIDGE
485 case DN_TO_BDG_FWD :
486 /*
487 * The bridge requires/assumes the Ethernet header is
488 * contiguous in the first mbuf header. Ensure this is true.
489 */
490 if (BDG_LOADED) {
491 if (m->m_len < ETHER_HDR_LEN &&
492 (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
493 printf("dummynet/bridge: pullup fail, dropping pkt\n");
494 break;
495 }
496 m = bdg_forward_ptr(m, pkt->ifp);
497 } else {
498 /* somebody unloaded the bridge module. Drop pkt */
499 /* XXX rate limit */
500 printf("dummynet: dropping bridged packet trapped in pipe\n");
501 }
502 if (m)
503 m_freem(m);
504 break;
505 #endif
506 default:
507 printf("dummynet: bad switch %d!\n", pkt->dn_dir);
508 m_freem(m);
509 break ;
510 }
511 lck_mtx_lock(dn_mutex);
512 }
513 /* if there are leftover packets, put into the heap for next event */
514 if ( (m = pipe->head) ) {
515 pkt = dn_tag_get(m);
516 /* XXX should check errors on heap_insert; on failure, drain the
517 * whole pipe p and hope to be more successful in the future
518 */
519 heap_insert(&extract_heap, pkt->output_time, pipe);
520 }
521 }
522
523 /*
524 * the following macro computes how many ticks we have to wait
525 * before being able to transmit a packet. The credit is taken from
526 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
527 */
528 #define SET_TICKS(_m, q, p) \
529 ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
530 p->bandwidth ;
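/*
 * Numeric illustration (added, not in the original code): credit and
 * packet cost are both expressed in bit*hz units.  With hz = 100, a
 * bandwidth of 1,000,000 bit/s, zero accumulated credit and a 1500-byte
 * packet, SET_TICKS gives ceil(1500*8*100 / 1000000) = 2 ticks.
 */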
531
532 /*
533 * extract pkt from queue, compute output time (could be now)
534 * and put into delay line (p_queue)
535 */
536 static void
537 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
538 struct dn_pipe *p, int len)
539 {
540 struct dn_pkt_tag *dt = dn_tag_get(pkt);
541
542 q->head = pkt->m_nextpkt ;
543 q->len-- ;
544 q->len_bytes -= len ;
545
546 dt->output_time = curr_time + p->delay ;
547
548 if (p->head == NULL)
549 p->head = pkt;
550 else
551 p->tail->m_nextpkt = pkt;
552 p->tail = pkt;
553 p->tail->m_nextpkt = NULL;
554 }
555
556 /*
557 * ready_event() is invoked every time the queue must enter the
558 * scheduler, either because the first packet arrives, or because
559 * a previously scheduled event fired.
560 * On invocation, drain as many pkts as possible (could be 0) and then
561 * if there are leftover packets reinsert the pkt in the scheduler.
562 */
563 static void
564 ready_event(struct dn_flow_queue *q)
565 {
566 struct mbuf *pkt;
567 struct dn_pipe *p = q->fs->pipe ;
568 int p_was_empty ;
569
570 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
571
572 if (p == NULL) {
573 printf("dummynet: ready_event- pipe is gone\n");
574 return ;
575 }
576 p_was_empty = (p->head == NULL) ;
577
578 /*
579 * schedule fixed-rate queues linked to this pipe:
580 * Account for the bw accumulated since last scheduling, then
581 * drain as many pkts as allowed by q->numbytes and move to
582 * the delay line (in p) computing output time.
583 * bandwidth==0 (no limit) means we can drain the whole queue,
584 * setting len_scaled = 0 does the job.
585 */
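/*
 * Units note (added for clarity): q->numbytes holds credit in bit*hz
 * units, so each elapsed tick adds p->bandwidth (bit/s) of credit while
 * a packet of len bytes costs len*8*hz, consistent with SET_TICKS above.
 */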
586 q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
587 while ( (pkt = q->head) != NULL ) {
588 int len = pkt->m_pkthdr.len;
589 int len_scaled = p->bandwidth ? len*8*hz : 0 ;
590 if (len_scaled > q->numbytes )
591 break ;
592 q->numbytes -= len_scaled ;
593 move_pkt(pkt, q, p, len);
594 }
595 /*
596 * If we have more packets queued, schedule next ready event
597 * (can only occur when bandwidth != 0, otherwise we would have
598 * flushed the whole queue in the previous loop).
599 * To this purpose we record the current time and compute how many
600 * ticks to go for the finish time of the packet.
601 */
602 if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
603 dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
604 q->sched_time = curr_time ;
605 heap_insert(&ready_heap, curr_time + t, (void *)q );
606 /* XXX should check errors on heap_insert, and drain the whole
607 * queue on error hoping next time we are luckier.
608 */
609 } else { /* RED needs to know when the queue becomes empty */
610 q->q_time = curr_time;
611 q->numbytes = 0;
612 }
613 /*
614 * If the delay line was empty call transmit_event(p) now.
615 * Otherwise, the scheduler will take care of it.
616 */
617 if (p_was_empty)
618 transmit_event(p);
619 }
620
621 /*
622 * Called when we can transmit packets on WF2Q queues. Take pkts out of
623 * the queues at their start time, and enqueue into the delay line.
624 * Packets are drained until p->numbytes < 0. As long as
625 * len_scaled >= p->numbytes, the packet goes into the delay line
626 * with a deadline p->delay. For the last packet, if p->numbytes<0,
627 * there is an additional delay.
628 */
629 static void
630 ready_event_wfq(struct dn_pipe *p)
631 {
632 int p_was_empty = (p->head == NULL) ;
633 struct dn_heap *sch = &(p->scheduler_heap);
634 struct dn_heap *neh = &(p->not_eligible_heap) ;
635
636 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
637
638 if (p->if_name[0] == 0) /* tx clock is simulated */
639 p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
640 else { /* tx clock is for real, the ifq must be empty or this is a NOP */
641 if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
642 return ;
643 else {
644 DPRINTF(("dummynet: pipe %d ready from %s --\n",
645 p->pipe_nr, p->if_name));
646 }
647 }
648
649 /*
650 * While we have backlogged traffic AND credit, we need to do
651 * something on the queue.
652 */
653 while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
654 if (sch->elements > 0) { /* have some eligible pkts to send out */
655 struct dn_flow_queue *q = sch->p[0].object ;
656 struct mbuf *pkt = q->head;
657 struct dn_flow_set *fs = q->fs;
658 u_int64_t len = pkt->m_pkthdr.len;
659 int len_scaled = p->bandwidth ? len*8*hz : 0 ;
660
661 heap_extract(sch, NULL); /* remove queue from heap */
662 p->numbytes -= len_scaled ;
663 move_pkt(pkt, q, p, len);
664
665 p->V += (len<<MY_M) / p->sum ; /* update V */
666 q->S = q->F ; /* update start time */
667 if (q->len == 0) { /* Flow not backlogged any more */
668 fs->backlogged-- ;
669 heap_insert(&(p->idle_heap), q->F, q);
670 } else { /* still backlogged */
671 /*
672 * update F and position in backlogged queue, then
673 * put flow in not_eligible_heap (we will fix this later).
674 */
675 len = (q->head)->m_pkthdr.len;
676 q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
677 if (DN_KEY_LEQ(q->S, p->V))
678 heap_insert(neh, q->S, q);
679 else
680 heap_insert(sch, q->F, q);
681 }
682 }
683 /*
684 * now compute V = max(V, min(S_i)). Remember that all elements in sch
685 * have by definition S_i <= V so if sch is not empty, V is surely
686 * the max and we must not update it. Conversely, if sch is empty
687 * we only need to look at neh.
688 */
689 if (sch->elements == 0 && neh->elements > 0)
690 p->V = MAX64 ( p->V, neh->p[0].key );
691 /* move from neh to sch any packets that have become eligible */
692 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
693 struct dn_flow_queue *q = neh->p[0].object ;
694 heap_extract(neh, NULL);
695 heap_insert(sch, q->F, q);
696 }
697
698 if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
699 p->numbytes = -1 ; /* mark not ready for I/O */
700 break ;
701 }
702 }
703 if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
704 && p->idle_heap.elements > 0) {
705 /*
706 * no traffic and no events scheduled. We can get rid of idle-heap.
707 */
708 int i ;
709
710 for (i = 0 ; i < p->idle_heap.elements ; i++) {
711 struct dn_flow_queue *q = p->idle_heap.p[i].object ;
712
713 q->F = 0 ;
714 q->S = q->F + 1 ;
715 }
716 p->sum = 0 ;
717 p->V = 0 ;
718 p->idle_heap.elements = 0 ;
719 }
720 /*
721 * If we are getting clocks from dummynet (not a real interface) and
722 * we are under credit, schedule the next ready event.
723 * Also fix the delivery time of the last packet.
724 */
725 if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
726 dn_key t=0 ; /* number of ticks i have to wait */
727
728 if (p->bandwidth > 0)
729 t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
730 dn_tag_get(p->tail)->output_time += t ;
731 p->sched_time = curr_time ;
732 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
733 /* XXX should check errors on heap_insert, and drain the whole
734 * queue on error hoping next time we are luckier.
735 */
736 }
737 /*
738 * If the delay line was empty call transmit_event(p) now.
739 * Otherwise, the scheduler will take care of it.
740 */
741 if (p_was_empty)
742 transmit_event(p);
743 }
744
745 /*
746 * This is called once per tick, or HZ times per second. It is used to
747 * increment the current tick counter and schedule expired events.
748 */
749 static void
750 dummynet(void * __unused unused)
751 {
752 void *p ; /* generic parameter to handler */
753 struct dn_heap *h ;
754 struct dn_heap *heaps[3];
755 int i;
756 struct dn_pipe *pe ;
757
758 heaps[0] = &ready_heap ; /* fixed-rate queues */
759 heaps[1] = &wfq_ready_heap ; /* wfq queues */
760 heaps[2] = &extract_heap ; /* delay line */
761
762 lck_mtx_lock(dn_mutex);
763
764 curr_time++ ;
765 for (i=0; i < 3 ; i++) {
766 h = heaps[i];
767 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
768 if (h->p[0].key > curr_time)
769 printf("dummynet: warning, heap %d is %d ticks late\n",
770 i, (int)(curr_time - h->p[0].key));
771 p = h->p[0].object ; /* store a copy before heap_extract */
772 heap_extract(h, NULL); /* need to extract before processing */
773 if (i == 0)
774 ready_event(p) ;
775 else if (i == 1) {
776 struct dn_pipe *pipe = p;
777 if (pipe->if_name[0] != '\0')
778 printf("dummynet: bad ready_event_wfq for pipe %s\n",
779 pipe->if_name);
780 else
781 ready_event_wfq(p) ;
782 } else
783 transmit_event(p);
784 }
785 }
786 /* sweep pipes trying to expire idle flow_queues */
787 for (pe = all_pipes; pe ; pe = pe->next )
788 if (pe->idle_heap.elements > 0 &&
789 DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
790 struct dn_flow_queue *q = pe->idle_heap.p[0].object ;
791
792 heap_extract(&(pe->idle_heap), NULL);
793 q->S = q->F + 1 ; /* mark timestamp as invalid */
794 pe->sum -= q->fs->weight ;
795 }
796
797 lck_mtx_unlock(dn_mutex);
798
799 timeout(dummynet, NULL, 1);
800 }
801
802 /*
803 * called by an interface when tx_rdy occurs.
804 */
805 int
806 if_tx_rdy(struct ifnet *ifp)
807 {
808 struct dn_pipe *p;
809
810 lck_mtx_lock(dn_mutex);
811 for (p = all_pipes; p ; p = p->next )
812 if (p->ifp == ifp)
813 break ;
814 if (p == NULL) {
815 char buf[32];
816 sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit);
817 for (p = all_pipes; p ; p = p->next )
818 if (!strcmp(p->if_name, buf) ) {
819 p->ifp = ifp ;
820 DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf));
821 break ;
822 }
823 }
824 if (p != NULL) {
825 DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
826 ifp->if_unit, ifp->if_snd.ifq_len));
827 p->numbytes = 0 ; /* mark ready for I/O */
828 ready_event_wfq(p);
829 }
830 lck_mtx_unlock(dn_mutex);
831
832 return 0;
833 }
834
835 /*
836 * Unconditionally expire empty queues in case of shortage.
837 * Returns the number of queues freed.
838 */
839 static int
840 expire_queues(struct dn_flow_set *fs)
841 {
842 struct dn_flow_queue *q, *prev ;
843 int i, initial_elements = fs->rq_elements ;
844 struct timeval timenow;
845
846 getmicrotime(&timenow);
847
848 if (fs->last_expired == timenow.tv_sec)
849 return 0 ;
850 fs->last_expired = timenow.tv_sec ;
851 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
852 for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
853 if (q->head != NULL || q->S != q->F+1) {
854 prev = q ;
855 q = q->next ;
856 } else { /* entry is idle, expire it */
857 struct dn_flow_queue *old_q = q ;
858
859 if (prev != NULL)
860 prev->next = q = q->next ;
861 else
862 fs->rq[i] = q = q->next ;
863 fs->rq_elements-- ;
864 FREE(old_q, M_DUMMYNET);
865 }
866 return initial_elements - fs->rq_elements ;
867 }
868
869 /*
870 * If room, create a new queue and put at head of slot i;
871 * otherwise, create or use the default queue.
872 */
873 static struct dn_flow_queue *
874 create_queue(struct dn_flow_set *fs, int i)
875 {
876 struct dn_flow_queue *q ;
877
878 if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
879 expire_queues(fs) == 0) {
880 /*
881 * No way to get room, use or create overflow queue.
882 */
883 i = fs->rq_size ;
884 if ( fs->rq[i] != NULL )
885 return fs->rq[i] ;
886 }
887 q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
888 if (q == NULL) {
889 printf("dummynet: sorry, cannot allocate queue for new flow\n");
890 return NULL ;
891 }
892 q->fs = fs ;
893 q->hash_slot = i ;
894 q->next = fs->rq[i] ;
895 q->S = q->F + 1; /* hack - mark timestamp as invalid */
896 fs->rq[i] = q ;
897 fs->rq_elements++ ;
898 return q ;
899 }
900
901 /*
902 * Given a flow_set and a flow id, find a matching queue
903 * after appropriate masking. The queue is moved to front
904 * so that further searches take less time.
905 */
906 static struct dn_flow_queue *
907 find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
908 {
909 int i = 0 ; /* we need i and q for new allocations */
910 struct dn_flow_queue *q, *prev;
911
912 if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
913 q = fs->rq[0] ;
914 else {
915 /* first, do the masking */
916 id->dst_ip &= fs->flow_mask.dst_ip ;
917 id->src_ip &= fs->flow_mask.src_ip ;
918 id->dst_port &= fs->flow_mask.dst_port ;
919 id->src_port &= fs->flow_mask.src_port ;
920 id->proto &= fs->flow_mask.proto ;
921 id->flags = 0 ; /* we don't care about this one */
922 /* then, hash function */
923 i = ( (id->dst_ip) & 0xffff ) ^
924 ( (id->dst_ip >> 15) & 0xffff ) ^
925 ( (id->src_ip << 1) & 0xffff ) ^
926 ( (id->src_ip >> 16 ) & 0xffff ) ^
927 (id->dst_port << 1) ^ (id->src_port) ^
928 (id->proto );
929 i = i % fs->rq_size ;
930 /* finally, scan the current list for a match */
931 searches++ ;
932 for (prev=NULL, q = fs->rq[i] ; q ; ) {
933 search_steps++;
934 if (id->dst_ip == q->id.dst_ip &&
935 id->src_ip == q->id.src_ip &&
936 id->dst_port == q->id.dst_port &&
937 id->src_port == q->id.src_port &&
938 id->proto == q->id.proto &&
939 id->flags == q->id.flags)
940 break ; /* found */
941 else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
942 /* entry is idle and not in any heap, expire it */
943 struct dn_flow_queue *old_q = q ;
944
945 if (prev != NULL)
946 prev->next = q = q->next ;
947 else
948 fs->rq[i] = q = q->next ;
949 fs->rq_elements-- ;
950 FREE(old_q, M_DUMMYNET);
951 continue ;
952 }
953 prev = q ;
954 q = q->next ;
955 }
956 if (q && prev != NULL) { /* found and not in front */
957 prev->next = q->next ;
958 q->next = fs->rq[i] ;
959 fs->rq[i] = q ;
960 }
961 }
962 if (q == NULL) { /* no match, need to allocate a new entry */
963 q = create_queue(fs, i);
964 if (q != NULL)
965 q->id = *id ;
966 }
967 return q ;
968 }
969
970 static int
971 red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
972 {
973 /*
974 * RED algorithm
975 *
976 * RED calculates the average queue size (avg) using a low-pass filter
977 * with an exponential weighted (w_q) moving average:
978 * avg <- (1-w_q) * avg + w_q * q_size
979 * where q_size is the queue length (measured in bytes or packets).
980 *
981 * If q_size == 0, we compute the idle time for the link, and set
982 * avg = (1 - w_q)^(idle/s)
983 * where s is the time needed for transmitting a medium-sized packet.
984 *
985 * Now, if avg < min_th the packet is enqueued.
986 * If avg > max_th the packet is dropped. Otherwise, the packet is
987 * dropped with probability P function of avg.
988 *
989 */
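/*
 * Worked example (added, the numbers are hypothetical): with
 * max_p = 0.1, min_th = 5 and max_th = 15 packets, the linear region
 * gives c_1 = 0.1/10 and c_2 = 0.1*5/10, so an average queue of 10
 * packets yields p_b = 0.01*10 - 0.05 = 0.05, i.e. roughly one drop
 * every 20 packets (further modulated by q->count below).
 */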
990
991 int64_t p_b = 0;
992 /* queue in bytes or packets ? */
993 u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
994
995 DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
996
997 /* average queue size estimation */
998 if (q_size != 0) {
999 /*
1000 * queue is not empty, avg <- avg + (q_size - avg) * w_q
1001 */
1002 int diff = SCALE(q_size) - q->avg;
1003 int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
1004
1005 q->avg += (int) v;
1006 } else {
1007 /*
1008 * queue is empty, find out how long the queue has been
1009 * empty and use a lookup table for computing
1010 * (1 - w_q)^(idle_time/s) where s is the time to send a
1011 * (small) packet.
1012 * XXX check wraps...
1013 */
1014 if (q->avg) {
1015 u_int t = (curr_time - q->q_time) / fs->lookup_step;
1016
1017 q->avg = (t < fs->lookup_depth) ?
1018 SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
1019 }
1020 }
1021 DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1022
1023 /* should i drop ? */
1024
1025 if (q->avg < fs->min_th) {
1026 q->count = -1;
1027 return 0; /* accept packet ; */
1028 }
1029 if (q->avg >= fs->max_th) { /* average queue >= max threshold */
1030 if (fs->flags_fs & DN_IS_GENTLE_RED) {
1031 /*
1032 * According to Gentle-RED, if avg is greater than max_th the
1033 * packet is dropped with a probability
1034 * p_b = c_3 * avg - c_4
1035 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1036 */
1037 p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1038 } else {
1039 q->count = -1;
1040 DPRINTF(("dummynet: - drop"));
1041 return 1 ;
1042 }
1043 } else if (q->avg > fs->min_th) {
1044 /*
1045 * we compute p_b using the linear dropping function p_b = c_1 *
1046 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1047 * max_p * min_th / (max_th - min_th)
1048 */
1049 p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1050 }
1051 if (fs->flags_fs & DN_QSIZE_IS_BYTES)
1052 p_b = (p_b * len) / fs->max_pkt_size;
1053 if (++q->count == 0)
1054 q->random = random() & 0xffff;
1055 else {
1056 /*
1057 * q->count counts packets arrived since last drop, so a greater
1058 * value of q->count means a greater packet drop probability.
1059 */
1060 if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1061 q->count = 0;
1062 DPRINTF(("dummynet: - red drop"));
1063 /* after a drop we calculate a new random value */
1064 q->random = random() & 0xffff;
1065 return 1; /* drop */
1066 }
1067 }
1068 /* end of RED algorithm */
1069 return 0 ; /* accept */
1070 }
1071
1072 static __inline
1073 struct dn_flow_set *
1074 locate_flowset(int pipe_nr, struct ip_fw *rule)
1075 {
1076 struct dn_flow_set *fs;
1077 ipfw_insn *cmd = rule->cmd + rule->act_ofs;
1078
1079 if (cmd->opcode == O_LOG)
1080 cmd += F_LEN(cmd);
1081
1082 bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));
1083
1084 if (fs != NULL)
1085 return fs;
1086
1087 if (cmd->opcode == O_QUEUE) {
1088 for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
1089 ;
1090 }
1091 else {
1092 struct dn_pipe *p1;
1093 for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
1094 ;
1095 if (p1 != NULL)
1096 fs = &(p1->fs) ;
1097 }
1098 /* record for the future */
1099 bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));
1100
1101 return fs ;
1102 }
1103
1104 /*
1105 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1106 * depending on whether WF2Q or fixed bw is used.
1107 *
1108 * pipe_nr pipe or queue the packet is destined for.
1109 * dir where shall we send the packet after dummynet.
1110 * m the mbuf with the packet
1111 * ifp the 'ifp' parameter from the caller.
1112 * NULL in ip_input, destination interface in ip_output,
1113 * real_dst in bdg_forward
1114 * ro route parameter (only used in ip_output, NULL otherwise)
1115 * dst destination address, only used by ip_output
1116 * rule matching rule, in case of multiple passes
1117 * flags flags from the caller, only used in ip_output
1118 *
1119 */
1120 static int
1121 dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
1122 {
1123 struct dn_pkt_tag *pkt;
1124 struct m_tag *mtag;
1125 struct dn_flow_set *fs;
1126 struct dn_pipe *pipe ;
1127 u_int64_t len = m->m_pkthdr.len ;
1128 struct dn_flow_queue *q = NULL ;
1129 int is_pipe;
1130
1131 #if IPFW2
1132 ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs;
1133
1134 if (cmd->opcode == O_LOG)
1135 cmd += F_LEN(cmd);
1136 is_pipe = (cmd->opcode == O_PIPE);
1137 #else
1138 is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
1139 #endif
1140
1141 pipe_nr &= 0xffff ;
1142
1143 lck_mtx_lock(dn_mutex);
1144
1145 /*
1146 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1147 */
1148 fs = locate_flowset(pipe_nr, fwa->rule);
1149 if (fs == NULL)
1150 goto dropit ; /* this queue/pipe does not exist! */
1151 pipe = fs->pipe ;
1152 if (pipe == NULL) { /* must be a queue, try find a matching pipe */
1153 for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
1154 pipe = pipe->next)
1155 ;
1156 if (pipe != NULL)
1157 fs->pipe = pipe ;
1158 else {
1159 printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1160 fs->parent_nr, fs->fs_nr);
1161 goto dropit ;
1162 }
1163 }
1164 q = find_queue(fs, &(fwa->f_id));
1165 if ( q == NULL )
1166 goto dropit ; /* cannot allocate queue */
1167 /*
1168 * update statistics, then check reasons to drop pkt
1169 */
1170 q->tot_bytes += len ;
1171 q->tot_pkts++ ;
1172 if ( fs->plr && random() < fs->plr )
1173 goto dropit ; /* random pkt drop */
1174 if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1175 if (q->len_bytes > fs->qsize)
1176 goto dropit ; /* queue size overflow */
1177 } else {
1178 if (q->len >= fs->qsize)
1179 goto dropit ; /* queue count overflow */
1180 }
1181 if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1182 goto dropit ;
1183
1184 /* XXX expensive to zero, see if we can remove it*/
1185 mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1186 sizeof(struct dn_pkt_tag), M_NOWAIT);
1187 if ( mtag == NULL )
1188 goto dropit ; /* cannot allocate packet header */
1189 m_tag_prepend(m, mtag); /* attach to mbuf chain */
1190
1191 pkt = (struct dn_pkt_tag *)(mtag+1);
1192 bzero(pkt, sizeof(struct dn_pkt_tag));
1193 /* ok, i can handle the pkt now... */
1194 /* build and enqueue packet + parameters */
1195 pkt->rule = fwa->rule ;
1196 pkt->dn_dir = dir ;
1197
1198 pkt->ifp = fwa->oif;
1199 if (dir == DN_TO_IP_OUT) {
1200 /*
1201 * We need to copy *ro because for ICMP pkts (and maybe others)
1202 * the caller passed a pointer into the stack; dst might also be
1203 * a pointer into *ro so it needs to be updated.
1204 */
1205 lck_mtx_lock(rt_mtx);
1206 pkt->ro = *(fwa->ro);
1207 if (fwa->ro->ro_rt)
1208 fwa->ro->ro_rt->rt_refcnt++ ;
1209 if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */
1210 fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
1211 lck_mtx_unlock(rt_mtx);
1212
1213 pkt->dn_dst = fwa->dst;
1214 pkt->flags = fwa->flags;
1215 }
1216 if (q->head == NULL)
1217 q->head = m;
1218 else
1219 q->tail->m_nextpkt = m;
1220 q->tail = m;
1221 q->len++;
1222 q->len_bytes += len ;
1223
1224 if ( q->head != m ) /* flow was not idle, we are done */
1225 goto done;
1226 /*
1227 * If we reach this point the flow was previously idle, so we need
1228 * to schedule it. This involves different actions for fixed-rate or
1229 * WF2Q queues.
1230 */
1231 if (is_pipe) {
1232 /*
1233 * Fixed-rate queue: just insert into the ready_heap.
1234 */
1235 dn_key t = 0 ;
1236 if (pipe->bandwidth)
1237 t = SET_TICKS(m, q, pipe);
1238 q->sched_time = curr_time ;
1239 if (t == 0) /* must process it now */
1240 ready_event( q );
1241 else
1242 heap_insert(&ready_heap, curr_time + t , q );
1243 } else {
1244 /*
1245 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1246 * set S to the virtual time V for the controlling pipe, and update
1247 * the sum of weights for the pipe; otherwise, remove flow from
1248 * idle_heap and set S to max(F,V).
1249 * Second, compute finish time F = S + len/weight.
1250 * Third, if pipe was idle, update V=max(S, V).
1251 * Fourth, count one more backlogged flow.
1252 */
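/*
 * Small example (added for illustration): if the pipe's virtual time V
 * is 100 and an idle flow with weight 2 enqueues a 1000-byte packet, it
 * gets S = 100 and F = 100 + (1000<<MY_M)/2; doubling the weight halves
 * the distance from S to F, which is how weights turn into bandwidth
 * shares.
 */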
1253 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1254 q->S = pipe->V ;
1255 pipe->sum += fs->weight ; /* add weight of new queue */
1256 } else {
1257 heap_extract(&(pipe->idle_heap), q);
1258 q->S = MAX64(q->F, pipe->V ) ;
1259 }
1260 q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1261
1262 if (pipe->not_eligible_heap.elements == 0 &&
1263 pipe->scheduler_heap.elements == 0)
1264 pipe->V = MAX64 ( q->S, pipe->V );
1265 fs->backlogged++ ;
1266 /*
1267 * Look at eligibility. A flow is not eligible if S>V (when
1268 * this happens, it means that there is some other flow already
1269 * scheduled for the same pipe, so the scheduler_heap cannot be
1270 * empty). If the flow is not eligible we just store it in the
1271 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1272 * and possibly invoke ready_event_wfq() right now if there is
1273 * leftover credit.
1274 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1275 * and for all flows in not_eligible_heap (NEH), S_i > V .
1276 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1277 * we only need to look into NEH.
1278 */
1279 if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
1280 if (pipe->scheduler_heap.elements == 0)
1281 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1282 heap_insert(&(pipe->not_eligible_heap), q->S, q);
1283 } else {
1284 heap_insert(&(pipe->scheduler_heap), q->F, q);
1285 if (pipe->numbytes >= 0) { /* pipe is idle */
1286 if (pipe->scheduler_heap.elements != 1)
1287 printf("dummynet: OUCH! pipe should have been idle!\n");
1288 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1289 pipe->pipe_nr, (int)(q->F >> MY_M)));
1290 pipe->sched_time = curr_time ;
1291 ready_event_wfq(pipe);
1292 }
1293 }
1294 }
1295 done:
1296 lck_mtx_unlock(dn_mutex);
1297 return 0;
1298
1299 dropit:
1300 if (q)
1301 q->drops++ ;
1302 lck_mtx_unlock(dn_mutex);
1303 m_freem(m);
1304 return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
1305 }
1306
1307 /*
1308 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1309 * Doing this would probably save us the initial bzero of dn_pkt
1310 */
1311 #define DN_FREE_PKT(_m) do { \
1312 struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
1313 if (tag) { \
1314 struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \
1315 if (n->ro.ro_rt) \
1316 rtfree(n->ro.ro_rt); \
1317 } \
1318 m_tag_delete(_m, tag); \
1319 m_freem(_m); \
1320 } while (0)
1321
1322 /*
1323 * Dispose all packets and flow_queues on a flow_set.
1324 * If all=1, also remove red lookup table and other storage,
1325 * including the descriptor itself.
1326 * For the flow_set embedded in a dn_pipe, the caller MUST also clean up ready_heap...
1327 */
1328 static void
1329 purge_flow_set(struct dn_flow_set *fs, int all)
1330 {
1331 struct dn_flow_queue *q, *qn ;
1332 int i ;
1333
1334 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1335
1336 for (i = 0 ; i <= fs->rq_size ; i++ ) {
1337 for (q = fs->rq[i] ; q ; q = qn ) {
1338 struct mbuf *m, *mnext;
1339
1340 mnext = q->head;
1341 while ((m = mnext) != NULL) {
1342 mnext = m->m_nextpkt;
1343 DN_FREE_PKT(m);
1344 }
1345 qn = q->next ;
1346 FREE(q, M_DUMMYNET);
1347 }
1348 fs->rq[i] = NULL ;
1349 }
1350 fs->rq_elements = 0 ;
1351 if (all) {
1352 /* RED - free lookup table */
1353 if (fs->w_q_lookup)
1354 FREE(fs->w_q_lookup, M_DUMMYNET);
1355 if (fs->rq)
1356 FREE(fs->rq, M_DUMMYNET);
1357 /* if this fs is not part of a pipe, free it */
1358 if (fs->pipe && fs != &(fs->pipe->fs) )
1359 FREE(fs, M_DUMMYNET);
1360 }
1361 }
1362
1363 /*
1364 * Dispose all packets queued on a pipe (not a flow_set).
1365 * Also free all resources associated with a pipe, which is about
1366 * to be deleted.
1367 */
1368 static void
1369 purge_pipe(struct dn_pipe *pipe)
1370 {
1371 struct mbuf *m, *mnext;
1372
1373 purge_flow_set( &(pipe->fs), 1 );
1374
1375 mnext = pipe->head;
1376 while ((m = mnext) != NULL) {
1377 mnext = m->m_nextpkt;
1378 DN_FREE_PKT(m);
1379 }
1380
1381 heap_free( &(pipe->scheduler_heap) );
1382 heap_free( &(pipe->not_eligible_heap) );
1383 heap_free( &(pipe->idle_heap) );
1384 }
1385
1386 /*
1387 * Delete all pipes and heaps returning memory. Must also
1388 * remove references from all ipfw rules to all pipes.
1389 */
1390 static void
1391 dummynet_flush()
1392 {
1393 struct dn_pipe *curr_p, *p ;
1394 struct dn_flow_set *fs, *curr_fs;
1395
1396 lck_mtx_lock(dn_mutex);
1397
1398 /* remove all references to pipes ...*/
1399 flush_pipe_ptrs(NULL);
1400 /* prevent future matches... */
1401 p = all_pipes ;
1402 all_pipes = NULL ;
1403 fs = all_flow_sets ;
1404 all_flow_sets = NULL ;
1405 /* and free heaps so we don't have unwanted events */
1406 heap_free(&ready_heap);
1407 heap_free(&wfq_ready_heap);
1408 heap_free(&extract_heap);
1409
1410 /*
1411 * Now purge all queued pkts and delete all pipes
1412 */
1413 /* scan and purge all flow_sets. */
1414 for ( ; fs ; ) {
1415 curr_fs = fs ;
1416 fs = fs->next ;
1417 purge_flow_set(curr_fs, 1);
1418 }
1419 for ( ; p ; ) {
1420 purge_pipe(p);
1421 curr_p = p ;
1422 p = p->next ;
1423 FREE(curr_p, M_DUMMYNET);
1424 }
1425 lck_mtx_unlock(dn_mutex);
1426 }
1427
1428
1429 extern struct ip_fw *ip_fw_default_rule ;
1430 static void
1431 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
1432 {
1433 int i ;
1434 struct dn_flow_queue *q ;
1435 struct mbuf *m ;
1436
1437 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
1438 for (q = fs->rq[i] ; q ; q = q->next )
1439 for (m = q->head ; m ; m = m->m_nextpkt ) {
1440 struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1441 if (pkt->rule == r)
1442 pkt->rule = ip_fw_default_rule ;
1443 }
1444 }
1445 /*
1446 * when a firewall rule is deleted, scan all queues and re-point packets
1447 * matching this rule to the default rule.
1448 */
1449 void
1450 dn_rule_delete(void *r)
1451 {
1452 struct dn_pipe *p ;
1453 struct dn_flow_set *fs ;
1454 struct dn_pkt_tag *pkt ;
1455 struct mbuf *m ;
1456
1457 lck_mtx_lock(dn_mutex);
1458
1459 /*
1460 * If the rule references a queue (dn_flow_set), then scan
1461 * the flow set, otherwise scan pipes. Should do either, but doing
1462 * both does not harm.
1463 */
1464 for ( fs = all_flow_sets ; fs ; fs = fs->next )
1465 dn_rule_delete_fs(fs, r);
1466 for ( p = all_pipes ; p ; p = p->next ) {
1467 fs = &(p->fs) ;
1468 dn_rule_delete_fs(fs, r);
1469 for (m = p->head ; m ; m = m->m_nextpkt ) {
1470 pkt = dn_tag_get(m) ;
1471 if (pkt->rule == r)
1472 pkt->rule = ip_fw_default_rule ;
1473 }
1474 }
1475 lck_mtx_unlock(dn_mutex);
1476 }
1477
1478 /*
1479 * setup RED parameters
1480 */
1481 static int
1482 config_red(struct dn_flow_set *p, struct dn_flow_set * x)
1483 {
1484 int i;
1485
1486 x->w_q = p->w_q;
1487 x->min_th = SCALE(p->min_th);
1488 x->max_th = SCALE(p->max_th);
1489 x->max_p = p->max_p;
1490
1491 x->c_1 = p->max_p / (p->max_th - p->min_th);
1492 x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1493 if (x->flags_fs & DN_IS_GENTLE_RED) {
1494 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1495 x->c_4 = (SCALE(1) - 2 * p->max_p);
1496 }
1497
1498 /* if the lookup table already exists, free it and create it again */
1499 if (x->w_q_lookup) {
1500 FREE(x->w_q_lookup, M_DUMMYNET);
1501 x->w_q_lookup = NULL ;
1502 }
1503 if (red_lookup_depth == 0) {
1504 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1505 FREE(x, M_DUMMYNET);
1506 return EINVAL;
1507 }
1508 x->lookup_depth = red_lookup_depth;
1509 x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
1510 M_DUMMYNET, M_DONTWAIT);
1511 if (x->w_q_lookup == NULL) {
1512 printf("dummynet: sorry, cannot allocate red lookup table\n");
1513 FREE(x, M_DUMMYNET);
1514 return ENOSPC;
1515 }
1516
1517 /* fill the lookup table with (1 - w_q)^x */
1518 x->lookup_step = p->lookup_step ;
1519 x->lookup_weight = p->lookup_weight ;
1520 x->w_q_lookup[0] = SCALE(1) - x->w_q;
1521 for (i = 1; i < x->lookup_depth; i++)
1522 x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1523 if (red_avg_pkt_size < 1)
1524 red_avg_pkt_size = 512 ;
1525 x->avg_pkt_size = red_avg_pkt_size ;
1526 if (red_max_pkt_size < 1)
1527 red_max_pkt_size = 1500 ;
1528 x->max_pkt_size = red_max_pkt_size ;
1529 return 0 ;
1530 }
1531
1532 static int
1533 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1534 {
1535 if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
1536 int l = pfs->rq_size;
1537
1538 if (l == 0)
1539 l = dn_hash_size;
1540 if (l < 4)
1541 l = 4;
1542 else if (l > DN_MAX_HASH_SIZE)
1543 l = DN_MAX_HASH_SIZE;
1544 x->rq_size = l;
1545 } else /* one is enough for null mask */
1546 x->rq_size = 1;
1547 x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1548 M_DUMMYNET, M_DONTWAIT | M_ZERO);
1549 if (x->rq == NULL) {
1550 printf("dummynet: sorry, cannot allocate queue\n");
1551 return ENOSPC;
1552 }
1553 x->rq_elements = 0;
1554 return 0 ;
1555 }
1556
1557 static void
1558 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1559 {
1560 x->flags_fs = src->flags_fs;
1561 x->qsize = src->qsize;
1562 x->plr = src->plr;
1563 x->flow_mask = src->flow_mask;
1564 if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1565 if (x->qsize > 1024*1024)
1566 x->qsize = 1024*1024 ;
1567 } else {
1568 if (x->qsize == 0)
1569 x->qsize = 50 ;
1570 if (x->qsize > 100)
1571 x->qsize = 50 ;
1572 }
1573 /* configuring RED */
1574 if ( x->flags_fs & DN_IS_RED )
1575 config_red(src, x) ; /* XXX should check errors */
1576 }
1577
1578 /*
1579 * setup pipe or queue parameters.
1580 */
1581
1582 static int
1583 config_pipe(struct dn_pipe *p)
1584 {
1585 int i, r;
1586 struct dn_flow_set *pfs = &(p->fs);
1587 struct dn_flow_queue *q;
1588
1589 /*
1590 * The config program passes parameters as follows:
1591 * bw = bits/second (0 means no limits),
1592 * delay = ms, must be translated into ticks.
1593 * qsize = slots/bytes
1594 */
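/*
 * Example (added): with hz = 100 a requested delay of 30 ms becomes
 * 30*100/1000 = 3 ticks; dummynet_get() performs the reverse conversion
 * when reporting the configuration back to user space.
 */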
1595 p->delay = ( p->delay * hz ) / 1000 ;
1596 /* We need either a pipe number or a flow_set number */
1597 if (p->pipe_nr == 0 && pfs->fs_nr == 0)
1598 return EINVAL ;
1599 if (p->pipe_nr != 0 && pfs->fs_nr != 0)
1600 return EINVAL ;
1601 if (p->pipe_nr != 0) { /* this is a pipe */
1602 struct dn_pipe *x, *a, *b;
1603
1604 lck_mtx_lock(dn_mutex);
1605 /* locate pipe */
1606 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1607 a = b , b = b->next) ;
1608
1609 if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
1610 x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ;
1611 if (x == NULL) {
1612 lck_mtx_unlock(dn_mutex);
1613 printf("dummynet: no memory for new pipe\n");
1614 return ENOSPC;
1615 }
1616 x->pipe_nr = p->pipe_nr;
1617 x->fs.pipe = x ;
1618 /* idle_heap is the only one from which we extract from the middle.
1619 */
1620 x->idle_heap.size = x->idle_heap.elements = 0 ;
1621 x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
1622 } else {
1623 x = b;
1624 /* Flush accumulated credit for all queues */
1625 for (i = 0; i <= x->fs.rq_size; i++)
1626 for (q = x->fs.rq[i]; q; q = q->next)
1627 q->numbytes = 0;
1628 }
1629
1630 x->bandwidth = p->bandwidth ;
1631 x->numbytes = 0; /* just in case... */
1632 bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
1633 x->ifp = NULL ; /* reset interface ptr */
1634 x->delay = p->delay ;
1635 set_fs_parms(&(x->fs), pfs);
1636
1637
1638 if ( x->fs.rq == NULL ) { /* a new pipe */
1639 r = alloc_hash(&(x->fs), pfs) ;
1640 if (r) {
1641 lck_mtx_unlock(dn_mutex);
1642 FREE(x, M_DUMMYNET);
1643 return r ;
1644 }
1645 x->next = b ;
1646 if (a == NULL)
1647 all_pipes = x ;
1648 else
1649 a->next = x ;
1650 }
1651 lck_mtx_unlock(dn_mutex);
1652 } else { /* config queue */
1653 struct dn_flow_set *x, *a, *b ;
1654
1655 lck_mtx_lock(dn_mutex);
1656 /* locate flow_set */
1657 for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
1658 a = b , b = b->next) ;
1659
1660 if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
1661 if (pfs->parent_nr == 0) { /* need link to a pipe */
1662 lck_mtx_unlock(dn_mutex);
1663 return EINVAL ;
1664 }
1665 x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
1666 if (x == NULL) {
1667 lck_mtx_unlock(dn_mutex);
1668 printf("dummynet: no memory for new flow_set\n");
1669 return ENOSPC;
1670 }
1671 x->fs_nr = pfs->fs_nr;
1672 x->parent_nr = pfs->parent_nr;
1673 x->weight = pfs->weight ;
1674 if (x->weight == 0)
1675 x->weight = 1 ;
1676 else if (x->weight > 100)
1677 x->weight = 100 ;
1678 } else {
1679 /* Change parent pipe not allowed; must delete and recreate */
1680 if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
1681 lck_mtx_unlock(dn_mutex);
1682 return EINVAL ;
1683 }
1684 x = b;
1685 }
1686 set_fs_parms(x, pfs);
1687
1688 if ( x->rq == NULL ) { /* a new flow_set */
1689 r = alloc_hash(x, pfs) ;
1690 if (r) {
1691 lck_mtx_unlock(dn_mutex);
1692 FREE(x, M_DUMMYNET);
1693 return r ;
1694 }
1695 x->next = b;
1696 if (a == NULL)
1697 all_flow_sets = x;
1698 else
1699 a->next = x;
1700 }
1701 lck_mtx_unlock(dn_mutex);
1702 }
1703 return 0 ;
1704 }
1705
1706 /*
1707 * Helper function to remove from a heap queues which are linked to
1708 * a flow_set about to be deleted.
1709 */
1710 static void
1711 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
1712 {
1713 int i = 0, found = 0 ;
1714 for (; i < h->elements ;)
1715 if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
1716 h->elements-- ;
1717 h->p[i] = h->p[h->elements] ;
1718 found++ ;
1719 } else
1720 i++ ;
1721 if (found)
1722 heapify(h);
1723 }
1724
1725 /*
1726 * helper function to remove a pipe from a heap (can be there at most once)
1727 */
1728 static void
1729 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
1730 {
1731 if (h->elements > 0) {
1732 int i = 0 ;
1733 for (i=0; i < h->elements ; i++ ) {
1734 if (h->p[i].object == p) { /* found it */
1735 h->elements-- ;
1736 h->p[i] = h->p[h->elements] ;
1737 heapify(h);
1738 break ;
1739 }
1740 }
1741 }
1742 }
1743
1744 /*
1745 * drain all queues. Called in case of severe mbuf shortage.
1746 */
1747 void
1748 dummynet_drain()
1749 {
1750 struct dn_flow_set *fs;
1751 struct dn_pipe *p;
1752 struct mbuf *m, *mnext;
1753
1754 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1755
1756 heap_free(&ready_heap);
1757 heap_free(&wfq_ready_heap);
1758 heap_free(&extract_heap);
1759 /* remove all references to this pipe from flow_sets */
1760 for (fs = all_flow_sets; fs; fs= fs->next )
1761 purge_flow_set(fs, 0);
1762
1763 for (p = all_pipes; p; p= p->next ) {
1764 purge_flow_set(&(p->fs), 0);
1765
1766 mnext = p->head;
1767 while ((m = mnext) != NULL) {
1768 mnext = m->m_nextpkt;
1769 DN_FREE_PKT(m);
1770 }
1771 p->head = p->tail = NULL ;
1772 }
1773 }
1774
1775 /*
1776 * Fully delete a pipe or a queue, cleaning up associated info.
1777 */
1778 static int
1779 delete_pipe(struct dn_pipe *p)
1780 {
1781 if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
1782 return EINVAL ;
1783 if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
1784 return EINVAL ;
1785 if (p->pipe_nr != 0) { /* this is an old-style pipe */
1786 struct dn_pipe *a, *b;
1787 struct dn_flow_set *fs;
1788
1789 lck_mtx_lock(dn_mutex);
1790 /* locate pipe */
1791 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1792 a = b , b = b->next) ;
1793 if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
1794 lck_mtx_unlock(dn_mutex);
1795 return EINVAL ; /* not found */
1796 }
1797
1798 /* unlink from list of pipes */
1799 if (a == NULL)
1800 all_pipes = b->next ;
1801 else
1802 a->next = b->next ;
1803 /* remove references to this pipe from the ip_fw rules. */
1804 flush_pipe_ptrs(&(b->fs));
1805
1806 /* remove all references to this pipe from flow_sets */
1807 for (fs = all_flow_sets; fs; fs= fs->next )
1808 if (fs->pipe == b) {
1809 printf("dummynet: ++ ref to pipe %d from fs %d\n",
1810 p->pipe_nr, fs->fs_nr);
1811 fs->pipe = NULL ;
1812 purge_flow_set(fs, 0);
1813 }
1814 fs_remove_from_heap(&ready_heap, &(b->fs));
1815 purge_pipe(b); /* remove all data associated to this pipe */
1816 /* remove references to this pipe from extract_heap and wfq_ready_heap */
1817 pipe_remove_from_heap(&extract_heap, b);
1818 pipe_remove_from_heap(&wfq_ready_heap, b);
1819 lck_mtx_unlock(dn_mutex);
1820
1821 FREE(b, M_DUMMYNET);
1822 } else { /* this is a WF2Q queue (dn_flow_set) */
1823 struct dn_flow_set *a, *b;
1824
1825 lck_mtx_lock(dn_mutex);
1826 /* locate set */
1827 for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
1828 a = b , b = b->next) ;
1829 if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
1830 lck_mtx_unlock(dn_mutex);
1831 return EINVAL ; /* not found */
1832 }
1833
1834 if (a == NULL)
1835 all_flow_sets = b->next ;
1836 else
1837 a->next = b->next ;
1838 /* remove references to this flow_set from the ip_fw rules. */
1839 flush_pipe_ptrs(b);
1840
1841 if (b->pipe != NULL) {
1842 /* Update total weight on parent pipe and cleanup parent heaps */
1843 b->pipe->sum -= b->weight * b->backlogged ;
1844 fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
1845 fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
1846 #if 1 /* XXX should i remove from idle_heap as well ? */
1847 fs_remove_from_heap(&(b->pipe->idle_heap), b);
1848 #endif
1849 }
1850 purge_flow_set(b, 1);
1851 lck_mtx_unlock(dn_mutex);
1852 }
1853 return 0 ;
1854 }
1855
1856 /*
1857 * helper function used to copy data from kernel in DUMMYNET_GET
1858 */
1859 static char *
1860 dn_copy_set(struct dn_flow_set *set, char *bp)
1861 {
1862 int i, copied = 0 ;
1863 struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
1864
1865 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1866
1867 for (i = 0 ; i <= set->rq_size ; i++)
1868 for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
1869 if (q->hash_slot != i)
1870 printf("dummynet: ++ at %d: wrong slot (have %d, "
1871 "should be %d)\n", copied, q->hash_slot, i);
1872 if (q->fs != set)
1873 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1874 i, q->fs, set);
1875 copied++ ;
1876 bcopy(q, qp, sizeof( *q ) );
1877 /* cleanup pointers */
1878 qp->next = NULL ;
1879 qp->head = qp->tail = NULL ;
1880 qp->fs = NULL ;
1881 }
1882 if (copied != set->rq_elements)
1883 printf("dummynet: ++ wrong count, have %d should be %d\n",
1884 copied, set->rq_elements);
1885 return (char *)qp ;
1886 }
1887
1888 static size_t
1889 dn_calc_size(void)
1890 {
1891 struct dn_flow_set *set ;
1892 struct dn_pipe *p ;
1893 size_t size ;
1894
1895 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1896
1897 /*
1898 * compute size of data structures: list of pipes and flow_sets.
1899 */
1900 for (p = all_pipes, size = 0 ; p ; p = p->next )
1901 size += sizeof( *p ) +
1902 p->fs.rq_elements * sizeof(struct dn_flow_queue);
1903 for (set = all_flow_sets ; set ; set = set->next )
1904 size += sizeof ( *set ) +
1905 set->rq_elements * sizeof(struct dn_flow_queue);
1906 return size ;
1907 }
1908
1909 static int
1910 dummynet_get(struct sockopt *sopt)
1911 {
1912 char *buf, *bp ; /* bp is the "copy-pointer" */
1913 size_t size ;
1914 struct dn_flow_set *set ;
1915 struct dn_pipe *p ;
1916 int error=0, i ;
1917
1918 /* XXX lock held too long */
1919 lck_mtx_lock(dn_mutex);
1920 /*
1921 * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
1922 * cannot use this flag while holding a mutex.
1923 */
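/*
 * (Added note) The loop below releases dn_mutex around the M_WAITOK
 * allocation and then re-checks dn_calc_size(); if the pipe/flow_set
 * lists changed while the lock was dropped, the buffer is freed and the
 * sizing is retried, up to 10 times.
 */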
1924 for (i = 0; i < 10; i++) {
1925 size = dn_calc_size();
1926 lck_mtx_unlock(dn_mutex);
1927 buf = _MALLOC(size, M_TEMP, M_WAITOK);
1928 lck_mtx_lock(dn_mutex);
1929 if (size == dn_calc_size())
1930 break;
1931 FREE(buf, M_TEMP);
1932 buf = NULL;
1933 }
1934 if (buf == NULL) {
1935 lck_mtx_unlock(dn_mutex);
1936 return ENOBUFS ;
1937 }
1938 for (p = all_pipes, bp = buf ; p ; p = p->next ) {
1939 struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;
1940
1941 /*
1942 * copy pipe descriptor into *bp, convert delay back to ms,
1943 * then copy the flow_set descriptor(s) one at a time.
1944 * After each flow_set, copy the queue descriptor it owns.
1945 */
1946 bcopy(p, bp, sizeof( *p ) );
1947 pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
1948 /*
1949 * XXX the following is a hack based on ->next being the
1950 * first field in dn_pipe and dn_flow_set. The correct
1951 * solution would be to move the dn_flow_set to the beginning
1952 * of struct dn_pipe.
1953 */
1954 pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
1955 /* clean pointers */
1956 pipe_bp->head = pipe_bp->tail = NULL ;
1957 pipe_bp->fs.next = NULL ;
1958 pipe_bp->fs.pipe = NULL ;
1959 pipe_bp->fs.rq = NULL ;
1960
1961 bp += sizeof( *p ) ;
1962 bp = dn_copy_set( &(p->fs), bp );
1963 }
1964 for (set = all_flow_sets ; set ; set = set->next ) {
1965 struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
1966 bcopy(set, bp, sizeof( *set ) );
1967 /* XXX same hack as above */
1968 fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
1969 fs_bp->pipe = NULL ;
1970 fs_bp->rq = NULL ;
1971 bp += sizeof( *set ) ;
1972 bp = dn_copy_set( set, bp );
1973 }
1974 lck_mtx_unlock(dn_mutex);
1975
1976 error = sooptcopyout(sopt, buf, size);
1977 FREE(buf, M_TEMP);
1978 return error ;
1979 }
1980
1981 /*
1982 * Handler for the various dummynet socket options (get, flush, config, del)
1983 */
1984 static int
1985 ip_dn_ctl(struct sockopt *sopt)
1986 {
1987 int error = 0 ;
1988 struct dn_pipe *p, tmp_pipe;
1989
1990 /* Disallow sets in really-really secure mode. */
1991 if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
1992 return (EPERM);
1993
1994 switch (sopt->sopt_name) {
1995 default :
1996 printf("dummynet: -- unknown option %d", sopt->sopt_name);
1997 return EINVAL ;
1998
1999 case IP_DUMMYNET_GET :
2000 error = dummynet_get(sopt);
2001 break ;
2002
2003 case IP_DUMMYNET_FLUSH :
2004 dummynet_flush() ;
2005 break ;
2006
2007 case IP_DUMMYNET_CONFIGURE :
2008 p = &tmp_pipe ;
2009 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2010 if (error)
2011 break ;
2012 error = config_pipe(p);
2013 break ;
2014
2015 case IP_DUMMYNET_DEL : /* remove a pipe or queue */
2016 p = &tmp_pipe ;
2017 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2018 if (error)
2019 break ;
2020
2021 error = delete_pipe(p);
2022 break ;
2023 }
2024 return error ;
2025 }
2026
2027 void
2028 ip_dn_init(void)
2029 {
2030 /* setup locks */
2031 dn_mutex_grp_attr = lck_grp_attr_alloc_init();
2032 dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
2033 dn_mutex_attr = lck_attr_alloc_init();
2034 lck_attr_setdefault(dn_mutex_attr);
2035
2036 if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) {
2037 printf("ip_dn_init: can't alloc dn_mutex\n");
2038 return;
2039 }
2040
2041 all_pipes = NULL ;
2042 all_flow_sets = NULL ;
2043 ready_heap.size = ready_heap.elements = 0 ;
2044 ready_heap.offset = 0 ;
2045
2046 wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
2047 wfq_ready_heap.offset = 0 ;
2048
2049 extract_heap.size = extract_heap.elements = 0 ;
2050 extract_heap.offset = 0 ;
2051 ip_dn_ctl_ptr = ip_dn_ctl;
2052 ip_dn_io_ptr = dummynet_io;
2053 ip_dn_ruledel_ptr = dn_rule_delete;
2054
2055 timeout(dummynet, NULL, 1);
2056 }