]>
git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/ip_dummynet.c
039bef66f8b299f8554dcecaa4d81e1f72e66c37
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
31 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
32 * Portions Copyright (c) 2000 Akamba Corp.
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
45 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
46 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
47 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
48 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
49 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
50 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
51 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
52 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
53 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
59 #define DUMMYNET_DEBUG
62 * This module implements IP dummynet, a bandwidth limiter/delay emulator
63 * used in conjunction with the ipfw package.
64 * Description of the data structures used is in ip_dummynet.h
65 * Here you mainly find the following blocks of code:
66 * + variable declarations;
67 * + heap management functions;
68 * + scheduler and dummynet functions;
69 * + configuration and initialization.
71 * NOTA BENE: critical sections are protected by the "dummynet lock".
73 * Most important Changes:
75 * 010124: Fixed WF2Q behaviour
76 * 010122: Fixed spl protection.
77 * 000601: WF2Q support
78 * 000106: large rewrite, use heaps to handle very many pipes.
79 * 980513: initial release
81 * include files marked with XXX are probably not needed
84 #include <sys/param.h>
85 #include <sys/systm.h>
86 #include <sys/malloc.h>
88 #include <sys/queue.h> /* XXX */
89 #include <sys/kernel.h>
90 #include <sys/socket.h>
91 #include <sys/socketvar.h>
93 #include <sys/sysctl.h>
95 #include <net/route.h>
96 #include <net/kpi_protocol.h>
97 #include <netinet/in.h>
98 #include <netinet/in_systm.h>
99 #include <netinet/in_var.h>
100 #include <netinet/ip.h>
101 #include <netinet/ip_fw.h>
102 #include <netinet/ip_dummynet.h>
103 #include <netinet/ip_var.h>
106 #include <netinet/if_ether.h> /* for struct arpcom */
107 #include <net/bridge.h>
111 * We keep a private variable for the simulation time, but we could
112 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
114 static dn_key curr_time
= 0 ; /* current simulation time */
116 /* this is for the timer that fires to call dummynet() - we only enable the timer when
117 there are packets to process, otherwise it's disabled */
118 static int timer_enabled
= 0;
120 static int dn_hash_size
= 64 ; /* default hash size */
122 /* statistics on number of queue searches and search steps */
123 static int searches
, search_steps
;
124 static int pipe_expire
= 1 ; /* expire queue if empty */
125 static int dn_max_ratio
= 16 ; /* max queues/buckets ratio */
127 static int red_lookup_depth
= 256; /* RED - default lookup table depth */
128 static int red_avg_pkt_size
= 512; /* RED - default medium packet size */
129 static int red_max_pkt_size
= 1500; /* RED - default max packet size */
132 * Three heaps contain queues and pipes that the scheduler handles:
134 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
136 * wfq_ready_heap contains the pipes associated with WF2Q flows
138 * extract_heap contains pipes associated with delay lines.
141 static struct dn_heap ready_heap
, extract_heap
, wfq_ready_heap
;
143 static int heap_init(struct dn_heap
*h
, int size
) ;
144 static int heap_insert (struct dn_heap
*h
, dn_key key1
, void *p
);
145 static void heap_extract(struct dn_heap
*h
, void *obj
);
147 static void transmit_event(struct dn_pipe
*pipe
);
148 static void ready_event(struct dn_flow_queue
*q
);
150 static struct dn_pipe
*all_pipes
= NULL
; /* list of all pipes */
151 static struct dn_flow_set
*all_flow_sets
= NULL
;/* list of all flow_sets */
154 SYSCTL_NODE(_net_inet_ip
, OID_AUTO
, dummynet
,
155 CTLFLAG_RW
, 0, "Dummynet");
156 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, hash_size
,
157 CTLFLAG_RW
, &dn_hash_size
, 0, "Default hash table size");
158 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, curr_time
,
159 CTLFLAG_RD
, &curr_time
, 0, "Current tick");
160 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, ready_heap
,
161 CTLFLAG_RD
, &ready_heap
.size
, 0, "Size of ready heap");
162 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, extract_heap
,
163 CTLFLAG_RD
, &extract_heap
.size
, 0, "Size of extract heap");
164 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, searches
,
165 CTLFLAG_RD
, &searches
, 0, "Number of queue searches");
166 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, search_steps
,
167 CTLFLAG_RD
, &search_steps
, 0, "Number of queue search steps");
168 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, expire
,
169 CTLFLAG_RW
, &pipe_expire
, 0, "Expire queue if empty");
170 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, max_chain_len
,
171 CTLFLAG_RW
, &dn_max_ratio
, 0,
172 "Max ratio between dynamic queues and buckets");
173 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, red_lookup_depth
,
174 CTLFLAG_RD
, &red_lookup_depth
, 0, "Depth of RED lookup table");
175 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, red_avg_pkt_size
,
176 CTLFLAG_RD
, &red_avg_pkt_size
, 0, "RED Medium packet size");
177 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, red_max_pkt_size
,
178 CTLFLAG_RD
, &red_max_pkt_size
, 0, "RED Max packet size");
181 #ifdef DUMMYNET_DEBUG
182 int dummynet_debug
= 0;
184 SYSCTL_INT(_net_inet_ip_dummynet
, OID_AUTO
, debug
, CTLFLAG_RW
, &dummynet_debug
,
185 0, "control debugging printfs");
187 #define DPRINTF(X) if (dummynet_debug) printf X
193 lck_grp_t
*dn_mutex_grp
;
194 lck_grp_attr_t
*dn_mutex_grp_attr
;
195 lck_attr_t
*dn_mutex_attr
;
198 static int config_pipe(struct dn_pipe
*p
);
199 static int ip_dn_ctl(struct sockopt
*sopt
);
201 static void dummynet(void *);
202 static void dummynet_flush(void);
203 void dummynet_drain(void);
204 static ip_dn_io_t dummynet_io
;
205 static void dn_rule_delete(void *);
207 int if_tx_rdy(struct ifnet
*ifp
);
209 extern lck_mtx_t
*rt_mtx
; /* route global lock */
212 * Heap management functions.
214 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
215 * Some macros help finding parent/children so we can optimize them.
217 * heap_init() is called to expand the heap when needed.
218 * Increment size in blocks of 16 entries.
219 * XXX failure to allocate a new element is a pretty bad failure
220 * as we basically stall a whole queue forever!!
221 * Returns 1 on error, 0 on success
223 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
224 #define HEAP_LEFT(x) ( 2*(x) + 1 )
225 #define HEAP_IS_LEFT(x) ( (x) & 1 )
226 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
227 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
228 #define HEAP_INCREMENT 15
231 heap_init(struct dn_heap
*h
, int new_size
)
233 struct dn_heap_entry
*p
;
235 if (h
->size
>= new_size
) {
236 printf("dummynet: heap_init, Bogus call, have %d want %d\n",
240 new_size
= (new_size
+ HEAP_INCREMENT
) & ~HEAP_INCREMENT
;
241 p
= _MALLOC(new_size
* sizeof(*p
), M_DUMMYNET
, M_DONTWAIT
);
243 printf("dummynet: heap_init, resize %d failed\n", new_size
);
244 return 1 ; /* error */
247 bcopy(h
->p
, p
, h
->size
* sizeof(*p
) );
248 FREE(h
->p
, M_DUMMYNET
);
256 * Insert element in heap. Normally, p != NULL, we insert p in
257 * a new position and bubble up. If p == NULL, then the element is
258 * already in place, and key is the position where to start the
260 * Returns 1 on failure (cannot allocate new heap entry)
262 * If offset > 0 the position (index, int) of the element in the heap is
263 * also stored in the element itself at the given offset in bytes.
265 #define SET_OFFSET(heap, node) \
266 if (heap->offset > 0) \
267 *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
269 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
271 #define RESET_OFFSET(heap, node) \
272 if (heap->offset > 0) \
273 *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
275 heap_insert(struct dn_heap
*h
, dn_key key1
, void *p
)
277 int son
= h
->elements
;
279 if (p
== NULL
) /* data already there, set starting point */
281 else { /* insert new element at the end, possibly resize */
283 if (son
== h
->size
) /* need resize... */
284 if (heap_init(h
, h
->elements
+1) )
285 return 1 ; /* failure... */
286 h
->p
[son
].object
= p
;
287 h
->p
[son
].key
= key1
;
290 while (son
> 0) { /* bubble up */
291 int father
= HEAP_FATHER(son
) ;
292 struct dn_heap_entry tmp
;
294 if (DN_KEY_LT( h
->p
[father
].key
, h
->p
[son
].key
) )
295 break ; /* found right position */
296 /* son smaller than father, swap and repeat */
297 HEAP_SWAP(h
->p
[son
], h
->p
[father
], tmp
) ;
306 * remove top element from heap, or obj if obj != NULL
309 heap_extract(struct dn_heap
*h
, void *obj
)
311 int child
, father
, max
= h
->elements
- 1 ;
314 printf("dummynet: warning, extract from empty heap 0x%p\n", h
);
317 father
= 0 ; /* default: move up smallest child */
318 if (obj
!= NULL
) { /* extract specific element, index is at offset */
320 panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
321 father
= *((int *)((char *)obj
+ h
->offset
)) ;
322 if (father
< 0 || father
>= h
->elements
) {
323 printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
324 father
, h
->elements
);
325 panic("dummynet: heap_extract");
328 RESET_OFFSET(h
, father
);
329 child
= HEAP_LEFT(father
) ; /* left child */
330 while (child
<= max
) { /* valid entry */
331 if (child
!= max
&& DN_KEY_LT(h
->p
[child
+1].key
, h
->p
[child
].key
) )
332 child
= child
+1 ; /* take right child, otherwise left */
333 h
->p
[father
] = h
->p
[child
] ;
334 SET_OFFSET(h
, father
);
336 child
= HEAP_LEFT(child
) ; /* left child for next loop */
341 * Fill hole with last entry and bubble up, reusing the insert code
343 h
->p
[father
] = h
->p
[max
] ;
344 heap_insert(h
, father
, NULL
); /* this one cannot fail */
350 * change object position and update references
351 * XXX this one is never used!
354 heap_move(struct dn_heap
*h
, dn_key new_key
, void *object
)
358 int max
= h
->elements
-1 ;
359 struct dn_heap_entry buf
;
362 panic("cannot move items on this heap");
364 i
= *((int *)((char *)object
+ h
->offset
));
365 if (DN_KEY_LT(new_key
, h
->p
[i
].key
) ) { /* must move up */
366 h
->p
[i
].key
= new_key
;
367 for (; i
>0 && DN_KEY_LT(new_key
, h
->p
[(temp
= HEAP_FATHER(i
))].key
) ;
368 i
= temp
) { /* bubble up */
369 HEAP_SWAP(h
->p
[i
], h
->p
[temp
], buf
) ;
372 } else { /* must move down */
373 h
->p
[i
].key
= new_key
;
374 while ( (temp
= HEAP_LEFT(i
)) <= max
) { /* found left child */
375 if ((temp
!= max
) && DN_KEY_GT(h
->p
[temp
].key
, h
->p
[temp
+1].key
))
376 temp
++ ; /* select child with min key */
377 if (DN_KEY_GT(new_key
, h
->p
[temp
].key
)) { /* go down */
378 HEAP_SWAP(h
->p
[i
], h
->p
[temp
], buf
) ;
387 #endif /* heap_move, unused */
390 * heapify() will reorganize data inside an array to maintain the
391 * heap property. It is needed when we delete a bunch of entries.
394 heapify(struct dn_heap
*h
)
398 for (i
= 0 ; i
< h
->elements
; i
++ )
399 heap_insert(h
, i
, NULL
) ;
403 * cleanup the heap and free data structure
406 heap_free(struct dn_heap
*h
)
409 FREE(h
->p
, M_DUMMYNET
);
410 bzero(h
, sizeof(*h
) );
414 * --- end of heap management functions ---
418 * Return the mbuf tag holding the dummynet state. As an optimization
419 * this is assumed to be the first tag on the list. If this turns out
420 * wrong we'll need to search the list.
422 static struct dn_pkt_tag
*
423 dn_tag_get(struct mbuf
*m
)
425 struct m_tag
*mtag
= m_tag_first(m
);
426 /* KASSERT(mtag != NULL &&
427 mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
428 mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
429 ("packet on dummynet queue w/o dummynet tag!"));
431 return (struct dn_pkt_tag
*)(mtag
+1);
435 * Scheduler functions:
437 * transmit_event() is called when the delay-line needs to enter
438 * the scheduler, either because of existing pkts getting ready,
439 * or new packets entering the queue. The event handled is the delivery
440 * time of the packet.
442 * ready_event() does something similar with fixed-rate queues, and the
443 * event handled is the finish time of the head pkt.
445 * wfq_ready_event() does something similar with WF2Q queues, and the
446 * event handled is the start time of the head pkt.
448 * In all cases, we make sure that the data structures are consistent
449 * before passing pkts out, because this might trigger recursive
450 * invocations of the procedures.
453 transmit_event(struct dn_pipe
*pipe
)
456 struct dn_pkt_tag
*pkt
;
459 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
461 while ( (m
= pipe
->head
) ) {
463 if ( !DN_KEY_LEQ(pkt
->output_time
, curr_time
) )
466 * first unlink, then call procedures, since ip_input() can invoke
467 * ip_output() and viceversa, thus causing nested calls
469 pipe
->head
= m
->m_nextpkt
;
472 /* XXX: drop the lock for now to avoid LOR's */
473 lck_mtx_unlock(dn_mutex
);
474 switch (pkt
->dn_dir
) {
476 struct route tmp_rt
= pkt
->ro
;
477 (void)ip_output(m
, NULL
, NULL
, pkt
->flags
, NULL
);
479 rtfree(tmp_rt
.ro_rt
);
484 proto_inject(PF_INET
, m
);
490 * The bridge requires/assumes the Ethernet header is
491 * contiguous in the first mbuf header. Insure this is true.
494 if (m
->m_len
< ETHER_HDR_LEN
&&
495 (m
= m_pullup(m
, ETHER_HDR_LEN
)) == NULL
) {
496 printf("dummynet/bridge: pullup fail, dropping pkt\n");
499 m
= bdg_forward_ptr(m
, pkt
->ifp
);
501 /* somebody unloaded the bridge module. Drop pkt */
503 printf("dummynet: dropping bridged packet trapped in pipe\n");
510 printf("dummynet: bad switch %d!\n", pkt
->dn_dir
);
514 lck_mtx_lock(dn_mutex
);
516 /* if there are leftover packets, put into the heap for next event */
517 if ( (m
= pipe
->head
) ) {
519 /* XXX should check errors on heap_insert, by draining the
520 * whole pipe p and hoping in the future we are more successful
522 heap_insert(&extract_heap
, pkt
->output_time
, pipe
);
527 * the following macro computes how many ticks we have to wait
528 * before being able to transmit a packet. The credit is taken from
529 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
532 /* hz is 100, which gives a granularity of 10ms in the old timer.
533 * The timer has been changed to fire every 1ms, so the use of
534 * hz has been modified here. All instances of hz have been left
535 * in place but adjusted by a factor of 10 so that hz is functionally
538 #define SET_TICKS(_m, q, p) \
539 ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
543 * extract pkt from queue, compute output time (could be now)
544 * and put into delay line (p_queue)
547 move_pkt(struct mbuf
*pkt
, struct dn_flow_queue
*q
,
548 struct dn_pipe
*p
, int len
)
550 struct dn_pkt_tag
*dt
= dn_tag_get(pkt
);
552 q
->head
= pkt
->m_nextpkt
;
554 q
->len_bytes
-= len
;
556 dt
->output_time
= curr_time
+ p
->delay
;
561 p
->tail
->m_nextpkt
= pkt
;
563 p
->tail
->m_nextpkt
= NULL
;
567 * ready_event() is invoked every time the queue must enter the
568 * scheduler, either because the first packet arrives, or because
569 * a previously scheduled event fired.
570 * On invokation, drain as many pkts as possible (could be 0) and then
571 * if there are leftover packets reinsert the pkt in the scheduler.
574 ready_event(struct dn_flow_queue
*q
)
577 struct dn_pipe
*p
= q
->fs
->pipe
;
580 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
583 printf("dummynet: ready_event- pipe is gone\n");
586 p_was_empty
= (p
->head
== NULL
) ;
589 * schedule fixed-rate queues linked to this pipe:
590 * Account for the bw accumulated since last scheduling, then
591 * drain as many pkts as allowed by q->numbytes and move to
592 * the delay line (in p) computing output time.
593 * bandwidth==0 (no limit) means we can drain the whole queue,
594 * setting len_scaled = 0 does the job.
596 q
->numbytes
+= ( curr_time
- q
->sched_time
) * p
->bandwidth
;
597 while ( (pkt
= q
->head
) != NULL
) {
598 int len
= pkt
->m_pkthdr
.len
;
599 int len_scaled
= p
->bandwidth
? len
*8*(hz
*10) : 0 ;
600 if (len_scaled
> q
->numbytes
)
602 q
->numbytes
-= len_scaled
;
603 move_pkt(pkt
, q
, p
, len
);
606 * If we have more packets queued, schedule next ready event
607 * (can only occur when bandwidth != 0, otherwise we would have
608 * flushed the whole queue in the previous loop).
609 * To this purpose we record the current time and compute how many
610 * ticks to go for the finish time of the packet.
612 if ( (pkt
= q
->head
) != NULL
) { /* this implies bandwidth != 0 */
613 dn_key t
= SET_TICKS(pkt
, q
, p
); /* ticks i have to wait */
614 q
->sched_time
= curr_time
;
615 heap_insert(&ready_heap
, curr_time
+ t
, (void *)q
);
616 /* XXX should check errors on heap_insert, and drain the whole
617 * queue on error hoping next time we are luckier.
619 } else { /* RED needs to know when the queue becomes empty */
620 q
->q_time
= curr_time
;
624 * If the delay line was empty call transmit_event(p) now.
625 * Otherwise, the scheduler will take care of it.
632 * Called when we can transmit packets on WF2Q queues. Take pkts out of
633 * the queues at their start time, and enqueue into the delay line.
634 * Packets are drained until p->numbytes < 0. As long as
635 * len_scaled >= p->numbytes, the packet goes into the delay line
636 * with a deadline p->delay. For the last packet, if p->numbytes<0,
637 * there is an additional delay.
640 ready_event_wfq(struct dn_pipe
*p
)
642 int p_was_empty
= (p
->head
== NULL
) ;
643 struct dn_heap
*sch
= &(p
->scheduler_heap
);
644 struct dn_heap
*neh
= &(p
->not_eligible_heap
) ;
646 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
648 if (p
->if_name
[0] == 0) /* tx clock is simulated */
649 p
->numbytes
+= ( curr_time
- p
->sched_time
) * p
->bandwidth
;
650 else { /* tx clock is for real, the ifq must be empty or this is a NOP */
651 if (p
->ifp
&& p
->ifp
->if_snd
.ifq_head
!= NULL
)
654 DPRINTF(("dummynet: pipe %d ready from %s --\n",
655 p
->pipe_nr
, p
->if_name
));
660 * While we have backlogged traffic AND credit, we need to do
661 * something on the queue.
663 while ( p
->numbytes
>=0 && (sch
->elements
>0 || neh
->elements
>0) ) {
664 if (sch
->elements
> 0) { /* have some eligible pkts to send out */
665 struct dn_flow_queue
*q
= sch
->p
[0].object
;
666 struct mbuf
*pkt
= q
->head
;
667 struct dn_flow_set
*fs
= q
->fs
;
668 u_int64_t len
= pkt
->m_pkthdr
.len
;
669 int len_scaled
= p
->bandwidth
? len
*8*(hz
*10) : 0 ;
671 heap_extract(sch
, NULL
); /* remove queue from heap */
672 p
->numbytes
-= len_scaled
;
673 move_pkt(pkt
, q
, p
, len
);
675 p
->V
+= (len
<<MY_M
) / p
->sum
; /* update V */
676 q
->S
= q
->F
; /* update start time */
677 if (q
->len
== 0) { /* Flow not backlogged any more */
679 heap_insert(&(p
->idle_heap
), q
->F
, q
);
680 } else { /* still backlogged */
682 * update F and position in backlogged queue, then
683 * put flow in not_eligible_heap (we will fix this later).
685 len
= (q
->head
)->m_pkthdr
.len
;
686 q
->F
+= (len
<<MY_M
)/(u_int64_t
) fs
->weight
;
687 if (DN_KEY_LEQ(q
->S
, p
->V
))
688 heap_insert(neh
, q
->S
, q
);
690 heap_insert(sch
, q
->F
, q
);
694 * now compute V = max(V, min(S_i)). Remember that all elements in sch
695 * have by definition S_i <= V so if sch is not empty, V is surely
696 * the max and we must not update it. Conversely, if sch is empty
697 * we only need to look at neh.
699 if (sch
->elements
== 0 && neh
->elements
> 0)
700 p
->V
= MAX64 ( p
->V
, neh
->p
[0].key
);
701 /* move from neh to sch any packets that have become eligible */
702 while (neh
->elements
> 0 && DN_KEY_LEQ(neh
->p
[0].key
, p
->V
) ) {
703 struct dn_flow_queue
*q
= neh
->p
[0].object
;
704 heap_extract(neh
, NULL
);
705 heap_insert(sch
, q
->F
, q
);
708 if (p
->if_name
[0] != '\0') {/* tx clock is from a real thing */
709 p
->numbytes
= -1 ; /* mark not ready for I/O */
713 if (sch
->elements
== 0 && neh
->elements
== 0 && p
->numbytes
>= 0
714 && p
->idle_heap
.elements
> 0) {
716 * no traffic and no events scheduled. We can get rid of idle-heap.
720 for (i
= 0 ; i
< p
->idle_heap
.elements
; i
++) {
721 struct dn_flow_queue
*q
= p
->idle_heap
.p
[i
].object
;
728 p
->idle_heap
.elements
= 0 ;
731 * If we are getting clocks from dummynet (not a real interface) and
732 * If we are under credit, schedule the next ready event.
733 * Also fix the delivery time of the last packet.
735 if (p
->if_name
[0]==0 && p
->numbytes
< 0) { /* this implies bandwidth >0 */
736 dn_key t
=0 ; /* number of ticks i have to wait */
738 if (p
->bandwidth
> 0)
739 t
= ( p
->bandwidth
-1 - p
->numbytes
) / p
->bandwidth
;
740 dn_tag_get(p
->tail
)->output_time
+= t
;
741 p
->sched_time
= curr_time
;
742 heap_insert(&wfq_ready_heap
, curr_time
+ t
, (void *)p
);
743 /* XXX should check errors on heap_insert, and drain the whole
744 * queue on error hoping next time we are luckier.
748 * If the delay line was empty call transmit_event(p) now.
749 * Otherwise, the scheduler will take care of it.
756 * This is called every 1ms. It is used to
757 * increment the current tick counter and schedule expired events.
760 dummynet(void * __unused unused
)
762 void *p
; /* generic parameter to handler */
764 struct dn_heap
*heaps
[3];
770 heaps
[0] = &ready_heap
; /* fixed-rate queues */
771 heaps
[1] = &wfq_ready_heap
; /* wfq queues */
772 heaps
[2] = &extract_heap
; /* delay line */
774 lck_mtx_lock(dn_mutex
);
776 /* make all time measurements in milliseconds (ms) -
777 * here we convert secs and usecs to msecs (just divide the
778 * usecs and take the closest whole number).
781 curr_time
= (tv
.tv_sec
* 1000) + (tv
.tv_usec
/ 1000);
783 for (i
=0; i
< 3 ; i
++) {
785 while (h
->elements
> 0 && DN_KEY_LEQ(h
->p
[0].key
, curr_time
) ) {
786 if (h
->p
[0].key
> curr_time
)
787 printf("dummynet: warning, heap %d is %d ticks late\n",
788 i
, (int)(curr_time
- h
->p
[0].key
));
789 p
= h
->p
[0].object
; /* store a copy before heap_extract */
790 heap_extract(h
, NULL
); /* need to extract before processing */
794 struct dn_pipe
*pipe
= p
;
795 if (pipe
->if_name
[0] != '\0')
796 printf("dummynet: bad ready_event_wfq for pipe %s\n",
804 /* sweep pipes trying to expire idle flow_queues */
805 for (pe
= all_pipes
; pe
; pe
= pe
->next
)
806 if (pe
->idle_heap
.elements
> 0 &&
807 DN_KEY_LT(pe
->idle_heap
.p
[0].key
, pe
->V
) ) {
808 struct dn_flow_queue
*q
= pe
->idle_heap
.p
[0].object
;
810 heap_extract(&(pe
->idle_heap
), NULL
);
811 q
->S
= q
->F
+ 1 ; /* mark timestamp as invalid */
812 pe
->sum
-= q
->fs
->weight
;
815 /* check the heaps to see if there's still stuff in there, and
816 * only set the timer if there are packets to process
819 for (i
=0; i
< 3 ; i
++) {
821 if (h
->elements
> 0) { // set the timer
823 ts
.tv_nsec
= 1 * 1000000; // 1ms
825 bsd_timeout(dummynet
, NULL
, &ts
);
830 lck_mtx_unlock(dn_mutex
);
834 * called by an interface when tx_rdy occurs.
837 if_tx_rdy(struct ifnet
*ifp
)
841 lck_mtx_lock(dn_mutex
);
842 for (p
= all_pipes
; p
; p
= p
->next
)
847 sprintf(buf
, "%s%d",ifp
->if_name
, ifp
->if_unit
);
848 for (p
= all_pipes
; p
; p
= p
->next
)
849 if (!strcmp(p
->if_name
, buf
) ) {
851 DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf
));
856 DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp
->if_name
,
857 ifp
->if_unit
, ifp
->if_snd
.ifq_len
));
858 p
->numbytes
= 0 ; /* mark ready for I/O */
861 lck_mtx_lock(dn_mutex
);
867 * Unconditionally expire empty queues in case of shortage.
868 * Returns the number of queues freed.
871 expire_queues(struct dn_flow_set
*fs
)
873 struct dn_flow_queue
*q
, *prev
;
874 int i
, initial_elements
= fs
->rq_elements
;
875 struct timeval timenow
;
877 getmicrotime(&timenow
);
879 if (fs
->last_expired
== timenow
.tv_sec
)
881 fs
->last_expired
= timenow
.tv_sec
;
882 for (i
= 0 ; i
<= fs
->rq_size
; i
++) /* last one is overflow */
883 for (prev
=NULL
, q
= fs
->rq
[i
] ; q
!= NULL
; )
884 if (q
->head
!= NULL
|| q
->S
!= q
->F
+1) {
887 } else { /* entry is idle, expire it */
888 struct dn_flow_queue
*old_q
= q
;
891 prev
->next
= q
= q
->next
;
893 fs
->rq
[i
] = q
= q
->next
;
895 FREE(old_q
, M_DUMMYNET
);
897 return initial_elements
- fs
->rq_elements
;
901 * If room, create a new queue and put at head of slot i;
902 * otherwise, create or use the default queue.
904 static struct dn_flow_queue
*
905 create_queue(struct dn_flow_set
*fs
, int i
)
907 struct dn_flow_queue
*q
;
909 if (fs
->rq_elements
> fs
->rq_size
* dn_max_ratio
&&
910 expire_queues(fs
) == 0) {
912 * No way to get room, use or create overflow queue.
915 if ( fs
->rq
[i
] != NULL
)
918 q
= _MALLOC(sizeof(*q
), M_DUMMYNET
, M_DONTWAIT
| M_ZERO
);
920 printf("dummynet: sorry, cannot allocate queue for new flow\n");
925 q
->next
= fs
->rq
[i
] ;
926 q
->S
= q
->F
+ 1; /* hack - mark timestamp as invalid */
933 * Given a flow_set and a pkt in last_pkt, find a matching queue
934 * after appropriate masking. The queue is moved to front
935 * so that further searches take less time.
937 static struct dn_flow_queue
*
938 find_queue(struct dn_flow_set
*fs
, struct ipfw_flow_id
*id
)
940 int i
= 0 ; /* we need i and q for new allocations */
941 struct dn_flow_queue
*q
, *prev
;
943 if ( !(fs
->flags_fs
& DN_HAVE_FLOW_MASK
) )
946 /* first, do the masking */
947 id
->dst_ip
&= fs
->flow_mask
.dst_ip
;
948 id
->src_ip
&= fs
->flow_mask
.src_ip
;
949 id
->dst_port
&= fs
->flow_mask
.dst_port
;
950 id
->src_port
&= fs
->flow_mask
.src_port
;
951 id
->proto
&= fs
->flow_mask
.proto
;
952 id
->flags
= 0 ; /* we don't care about this one */
953 /* then, hash function */
954 i
= ( (id
->dst_ip
) & 0xffff ) ^
955 ( (id
->dst_ip
>> 15) & 0xffff ) ^
956 ( (id
->src_ip
<< 1) & 0xffff ) ^
957 ( (id
->src_ip
>> 16 ) & 0xffff ) ^
958 (id
->dst_port
<< 1) ^ (id
->src_port
) ^
960 i
= i
% fs
->rq_size
;
961 /* finally, scan the current list for a match */
963 for (prev
=NULL
, q
= fs
->rq
[i
] ; q
; ) {
965 if (id
->dst_ip
== q
->id
.dst_ip
&&
966 id
->src_ip
== q
->id
.src_ip
&&
967 id
->dst_port
== q
->id
.dst_port
&&
968 id
->src_port
== q
->id
.src_port
&&
969 id
->proto
== q
->id
.proto
&&
970 id
->flags
== q
->id
.flags
)
972 else if (pipe_expire
&& q
->head
== NULL
&& q
->S
== q
->F
+1 ) {
973 /* entry is idle and not in any heap, expire it */
974 struct dn_flow_queue
*old_q
= q
;
977 prev
->next
= q
= q
->next
;
979 fs
->rq
[i
] = q
= q
->next
;
981 FREE(old_q
, M_DUMMYNET
);
987 if (q
&& prev
!= NULL
) { /* found and not in front */
988 prev
->next
= q
->next
;
989 q
->next
= fs
->rq
[i
] ;
993 if (q
== NULL
) { /* no match, need to allocate a new entry */
994 q
= create_queue(fs
, i
);
1002 red_drops(struct dn_flow_set
*fs
, struct dn_flow_queue
*q
, int len
)
1007 * RED calculates the average queue size (avg) using a low-pass filter
1008 * with an exponential weighted (w_q) moving average:
1009 * avg <- (1-w_q) * avg + w_q * q_size
1010 * where q_size is the queue length (measured in bytes or * packets).
1012 * If q_size == 0, we compute the idle time for the link, and set
1013 * avg = (1 - w_q)^(idle/s)
1014 * where s is the time needed for transmitting a medium-sized packet.
1016 * Now, if avg < min_th the packet is enqueued.
1017 * If avg > max_th the packet is dropped. Otherwise, the packet is
1018 * dropped with probability P function of avg.
1023 /* queue in bytes or packets ? */
1024 u_int q_size
= (fs
->flags_fs
& DN_QSIZE_IS_BYTES
) ? q
->len_bytes
: q
->len
;
1026 DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time
, q_size
));
1028 /* average queue size estimation */
1031 * queue is not empty, avg <- avg + (q_size - avg) * w_q
1033 int diff
= SCALE(q_size
) - q
->avg
;
1034 int64_t v
= SCALE_MUL((int64_t) diff
, (int64_t) fs
->w_q
);
1039 * queue is empty, find for how long the queue has been
1040 * empty and use a lookup table for computing
1041 * (1 - * w_q)^(idle_time/s) where s is the time to send a
1043 * XXX check wraps...
1046 u_int t
= (curr_time
- q
->q_time
) / fs
->lookup_step
;
1048 q
->avg
= (t
< fs
->lookup_depth
) ?
1049 SCALE_MUL(q
->avg
, fs
->w_q_lookup
[t
]) : 0;
1052 DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q
->avg
)));
1054 /* should i drop ? */
1056 if (q
->avg
< fs
->min_th
) {
1058 return 0; /* accept packet ; */
1060 if (q
->avg
>= fs
->max_th
) { /* average queue >= max threshold */
1061 if (fs
->flags_fs
& DN_IS_GENTLE_RED
) {
1063 * According to Gentle-RED, if avg is greater than max_th the
1064 * packet is dropped with a probability
1065 * p_b = c_3 * avg - c_4
1066 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1068 p_b
= SCALE_MUL((int64_t) fs
->c_3
, (int64_t) q
->avg
) - fs
->c_4
;
1071 DPRINTF(("dummynet: - drop"));
1074 } else if (q
->avg
> fs
->min_th
) {
1076 * we compute p_b using the linear dropping function p_b = c_1 *
1077 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1078 * max_p * min_th / (max_th - min_th)
1080 p_b
= SCALE_MUL((int64_t) fs
->c_1
, (int64_t) q
->avg
) - fs
->c_2
;
1082 if (fs
->flags_fs
& DN_QSIZE_IS_BYTES
)
1083 p_b
= (p_b
* len
) / fs
->max_pkt_size
;
1084 if (++q
->count
== 0)
1085 q
->random
= random() & 0xffff;
1088 * q->count counts packets arrived since last drop, so a greater
1089 * value of q->count means a greater packet drop probability.
1091 if (SCALE_MUL(p_b
, SCALE((int64_t) q
->count
)) > q
->random
) {
1093 DPRINTF(("dummynet: - red drop"));
1094 /* after a drop we calculate a new random value */
1095 q
->random
= random() & 0xffff;
1096 return 1; /* drop */
1099 /* end of RED algorithm */
1100 return 0 ; /* accept */
1104 struct dn_flow_set
*
1105 locate_flowset(int pipe_nr
, struct ip_fw
*rule
)
1107 struct dn_flow_set
*fs
;
1108 ipfw_insn
*cmd
= rule
->cmd
+ rule
->act_ofs
;
1110 if (cmd
->opcode
== O_LOG
)
1113 bcopy(& ((ipfw_insn_pipe
*)cmd
)->pipe_ptr
, &fs
, sizeof(fs
));
1118 if (cmd
->opcode
== O_QUEUE
) {
1119 for (fs
=all_flow_sets
; fs
&& fs
->fs_nr
!= pipe_nr
; fs
=fs
->next
)
1124 for (p1
= all_pipes
; p1
&& p1
->pipe_nr
!= pipe_nr
; p1
= p1
->next
)
1129 /* record for the future */
1130 bcopy(&fs
, & ((ipfw_insn_pipe
*)cmd
)->pipe_ptr
, sizeof(fs
));
1136 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1137 * depending on whether WF2Q or fixed bw is used.
1139 * pipe_nr pipe or queue the packet is destined for.
1140 * dir where shall we send the packet after dummynet.
1141 * m the mbuf with the packet
1142 * ifp the 'ifp' parameter from the caller.
1143 * NULL in ip_input, destination interface in ip_output,
1144 * real_dst in bdg_forward
1145 * ro route parameter (only used in ip_output, NULL otherwise)
1146 * dst destination address, only used by ip_output
1147 * rule matching rule, in case of multiple passes
1148 * flags flags from the caller, only used in ip_output
1152 dummynet_io(struct mbuf
*m
, int pipe_nr
, int dir
, struct ip_fw_args
*fwa
)
1154 struct dn_pkt_tag
*pkt
;
1156 struct dn_flow_set
*fs
;
1157 struct dn_pipe
*pipe
;
1158 u_int64_t len
= m
->m_pkthdr
.len
;
1159 struct dn_flow_queue
*q
= NULL
;
1165 ipfw_insn
*cmd
= fwa
->rule
->cmd
+ fwa
->rule
->act_ofs
;
1167 if (cmd
->opcode
== O_LOG
)
1169 is_pipe
= (cmd
->opcode
== O_PIPE
);
1171 is_pipe
= (fwa
->rule
->fw_flg
& IP_FW_F_COMMAND
) == IP_FW_F_PIPE
;
1176 lck_mtx_lock(dn_mutex
);
1178 /* make all time measurements in milliseconds (ms) -
1179 * here we convert secs and usecs to msecs (just divide the
1180 * usecs and take the closest whole number).
1183 curr_time
= (tv
.tv_sec
* 1000) + (tv
.tv_usec
/ 1000);
1186 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1188 fs
= locate_flowset(pipe_nr
, fwa
->rule
);
1190 goto dropit
; /* this queue/pipe does not exist! */
1192 if (pipe
== NULL
) { /* must be a queue, try find a matching pipe */
1193 for (pipe
= all_pipes
; pipe
&& pipe
->pipe_nr
!= fs
->parent_nr
;
1199 printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1200 fs
->parent_nr
, fs
->fs_nr
);
1204 q
= find_queue(fs
, &(fwa
->f_id
));
1206 goto dropit
; /* cannot allocate queue */
1208 * update statistics, then check reasons to drop pkt
1210 q
->tot_bytes
+= len
;
1212 if ( fs
->plr
&& random() < fs
->plr
)
1213 goto dropit
; /* random pkt drop */
1214 if ( fs
->flags_fs
& DN_QSIZE_IS_BYTES
) {
1215 if (q
->len_bytes
> fs
->qsize
)
1216 goto dropit
; /* queue size overflow */
1218 if (q
->len
>= fs
->qsize
)
1219 goto dropit
; /* queue count overflow */
1221 if ( fs
->flags_fs
& DN_IS_RED
&& red_drops(fs
, q
, len
) )
1224 /* XXX expensive to zero, see if we can remove it*/
1225 mtag
= m_tag_alloc(KERNEL_MODULE_TAG_ID
, KERNEL_TAG_TYPE_DUMMYNET
,
1226 sizeof(struct dn_pkt_tag
), M_NOWAIT
);
1228 goto dropit
; /* cannot allocate packet header */
1229 m_tag_prepend(m
, mtag
); /* attach to mbuf chain */
1231 pkt
= (struct dn_pkt_tag
*)(mtag
+1);
1232 bzero(pkt
, sizeof(struct dn_pkt_tag
));
1233 /* ok, i can handle the pkt now... */
1234 /* build and enqueue packet + parameters */
1235 pkt
->rule
= fwa
->rule
;
1238 pkt
->ifp
= fwa
->oif
;
1239 if (dir
== DN_TO_IP_OUT
) {
1241 * We need to copy *ro because for ICMP pkts (and maybe others)
1242 * the caller passed a pointer into the stack; dst might also be
1243 * a pointer into *ro so it needs to be updated.
1245 lck_mtx_lock(rt_mtx
);
1246 pkt
->ro
= *(fwa
->ro
);
1248 fwa
->ro
->ro_rt
->rt_refcnt
++ ;
1249 if (fwa
->dst
== (struct sockaddr_in
*)&fwa
->ro
->ro_dst
) /* dst points into ro */
1250 fwa
->dst
= (struct sockaddr_in
*)&(pkt
->ro
.ro_dst
) ;
1251 lck_mtx_unlock(rt_mtx
);
1253 pkt
->dn_dst
= fwa
->dst
;
1254 pkt
->flags
= fwa
->flags
;
1256 if (q
->head
== NULL
)
1259 q
->tail
->m_nextpkt
= m
;
1262 q
->len_bytes
+= len
;
1264 if ( q
->head
!= m
) /* flow was not idle, we are done */
1267 * If we reach this point the flow was previously idle, so we need
1268 * to schedule it. This involves different actions for fixed-rate or
1273 * Fixed-rate queue: just insert into the ready_heap.
1276 if (pipe
->bandwidth
)
1277 t
= SET_TICKS(m
, q
, pipe
);
1278 q
->sched_time
= curr_time
;
1279 if (t
== 0) /* must process it now */
1282 heap_insert(&ready_heap
, curr_time
+ t
, q
);
1285 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1286 * set S to the virtual time V for the controlling pipe, and update
1287 * the sum of weights for the pipe; otherwise, remove flow from
1288 * idle_heap and set S to max(F,V).
1289 * Second, compute finish time F = S + len/weight.
1290 * Third, if pipe was idle, update V=max(S, V).
1291 * Fourth, count one more backlogged flow.
1293 if (DN_KEY_GT(q
->S
, q
->F
)) { /* means timestamps are invalid */
1295 pipe
->sum
+= fs
->weight
; /* add weight of new queue */
1297 heap_extract(&(pipe
->idle_heap
), q
);
1298 q
->S
= MAX64(q
->F
, pipe
->V
) ;
1300 q
->F
= q
->S
+ ( len
<<MY_M
)/(u_int64_t
) fs
->weight
;
1302 if (pipe
->not_eligible_heap
.elements
== 0 &&
1303 pipe
->scheduler_heap
.elements
== 0)
1304 pipe
->V
= MAX64 ( q
->S
, pipe
->V
);
1307 * Look at eligibility. A flow is not eligibile if S>V (when
1308 * this happens, it means that there is some other flow already
1309 * scheduled for the same pipe, so the scheduler_heap cannot be
1310 * empty). If the flow is not eligible we just store it in the
1311 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1312 * and possibly invoke ready_event_wfq() right now if there is
1314 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1315 * and for all flows in not_eligible_heap (NEH), S_i > V .
1316 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1317 * we only need to look into NEH.
1319 if (DN_KEY_GT(q
->S
, pipe
->V
) ) { /* not eligible */
1320 if (pipe
->scheduler_heap
.elements
== 0)
1321 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1322 heap_insert(&(pipe
->not_eligible_heap
), q
->S
, q
);
1324 heap_insert(&(pipe
->scheduler_heap
), q
->F
, q
);
1325 if (pipe
->numbytes
>= 0) { /* pipe is idle */
1326 if (pipe
->scheduler_heap
.elements
!= 1)
1327 printf("dummynet: OUCH! pipe should have been idle!\n");
1328 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1329 pipe
->pipe_nr
, (int)(q
->F
>> MY_M
)));
1330 pipe
->sched_time
= curr_time
;
1331 ready_event_wfq(pipe
);
1336 /* start the timer and set global if not already set */
1337 if (!timer_enabled
) {
1339 ts
.tv_nsec
= 1 * 1000000; // 1ms
1341 bsd_timeout(dummynet
, NULL
, &ts
);
1344 lck_mtx_unlock(dn_mutex
);
1350 lck_mtx_unlock(dn_mutex
);
1352 return ( (fs
&& (fs
->flags_fs
& DN_NOERROR
)) ? 0 : ENOBUFS
);
1356 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1357 * Doing this would probably save us the initial bzero of dn_pkt
1359 #define DN_FREE_PKT(_m) do { \
1360 struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
1362 struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \
1364 rtfree(n->ro.ro_rt); \
1366 m_tag_delete(_m, tag); \
1371 * Dispose all packets and flow_queues on a flow_set.
1372 * If all=1, also remove red lookup table and other storage,
1373 * including the descriptor itself.
1374 * For the one in dn_pipe MUST also cleanup ready_heap...
1377 purge_flow_set(struct dn_flow_set
*fs
, int all
)
1379 struct dn_flow_queue
*q
, *qn
;
1382 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
1384 for (i
= 0 ; i
<= fs
->rq_size
; i
++ ) {
1385 for (q
= fs
->rq
[i
] ; q
; q
= qn
) {
1386 struct mbuf
*m
, *mnext
;
1389 while ((m
= mnext
) != NULL
) {
1390 mnext
= m
->m_nextpkt
;
1394 FREE(q
, M_DUMMYNET
);
1398 fs
->rq_elements
= 0 ;
1400 /* RED - free lookup table */
1402 FREE(fs
->w_q_lookup
, M_DUMMYNET
);
1404 FREE(fs
->rq
, M_DUMMYNET
);
1405 /* if this fs is not part of a pipe, free it */
1406 if (fs
->pipe
&& fs
!= &(fs
->pipe
->fs
) )
1407 FREE(fs
, M_DUMMYNET
);
1412 * Dispose all packets queued on a pipe (not a flow_set).
1413 * Also free all resources associated to a pipe, which is about
1417 purge_pipe(struct dn_pipe
*pipe
)
1419 struct mbuf
*m
, *mnext
;
1421 purge_flow_set( &(pipe
->fs
), 1 );
1424 while ((m
= mnext
) != NULL
) {
1425 mnext
= m
->m_nextpkt
;
1429 heap_free( &(pipe
->scheduler_heap
) );
1430 heap_free( &(pipe
->not_eligible_heap
) );
1431 heap_free( &(pipe
->idle_heap
) );
1435 * Delete all pipes and heaps returning memory. Must also
1436 * remove references from all ipfw rules to all pipes.
1441 struct dn_pipe
*curr_p
, *p
;
1442 struct dn_flow_set
*fs
, *curr_fs
;
1444 lck_mtx_lock(dn_mutex
);
1446 /* remove all references to pipes ...*/
1447 flush_pipe_ptrs(NULL
);
1448 /* prevent future matches... */
1451 fs
= all_flow_sets
;
1452 all_flow_sets
= NULL
;
1453 /* and free heaps so we don't have unwanted events */
1454 heap_free(&ready_heap
);
1455 heap_free(&wfq_ready_heap
);
1456 heap_free(&extract_heap
);
1459 * Now purge all queued pkts and delete all pipes
1461 /* scan and purge all flow_sets. */
1465 purge_flow_set(curr_fs
, 1);
1471 FREE(curr_p
, M_DUMMYNET
);
1473 lck_mtx_unlock(dn_mutex
);
1477 extern struct ip_fw
*ip_fw_default_rule
;
1479 dn_rule_delete_fs(struct dn_flow_set
*fs
, void *r
)
1482 struct dn_flow_queue
*q
;
1485 for (i
= 0 ; i
<= fs
->rq_size
; i
++) /* last one is ovflow */
1486 for (q
= fs
->rq
[i
] ; q
; q
= q
->next
)
1487 for (m
= q
->head
; m
; m
= m
->m_nextpkt
) {
1488 struct dn_pkt_tag
*pkt
= dn_tag_get(m
) ;
1490 pkt
->rule
= ip_fw_default_rule
;
1494 * when a firewall rule is deleted, scan all queues and remove the flow-id
1495 * from packets matching this rule.
1498 dn_rule_delete(void *r
)
1501 struct dn_flow_set
*fs
;
1502 struct dn_pkt_tag
*pkt
;
1505 lck_mtx_lock(dn_mutex
);
1508 * If the rule references a queue (dn_flow_set), then scan
1509 * the flow set, otherwise scan pipes. Should do either, but doing
1510 * both does not harm.
1512 for ( fs
= all_flow_sets
; fs
; fs
= fs
->next
)
1513 dn_rule_delete_fs(fs
, r
);
1514 for ( p
= all_pipes
; p
; p
= p
->next
) {
1516 dn_rule_delete_fs(fs
, r
);
1517 for (m
= p
->head
; m
; m
= m
->m_nextpkt
) {
1518 pkt
= dn_tag_get(m
) ;
1520 pkt
->rule
= ip_fw_default_rule
;
1523 lck_mtx_unlock(dn_mutex
);
1527 * setup RED parameters
1530 config_red(struct dn_flow_set
*p
, struct dn_flow_set
* x
)
1535 x
->min_th
= SCALE(p
->min_th
);
1536 x
->max_th
= SCALE(p
->max_th
);
1537 x
->max_p
= p
->max_p
;
1539 x
->c_1
= p
->max_p
/ (p
->max_th
- p
->min_th
);
1540 x
->c_2
= SCALE_MUL(x
->c_1
, SCALE(p
->min_th
));
1541 if (x
->flags_fs
& DN_IS_GENTLE_RED
) {
1542 x
->c_3
= (SCALE(1) - p
->max_p
) / p
->max_th
;
1543 x
->c_4
= (SCALE(1) - 2 * p
->max_p
);
1546 /* if the lookup table already exist, free and create it again */
1547 if (x
->w_q_lookup
) {
1548 FREE(x
->w_q_lookup
, M_DUMMYNET
);
1549 x
->w_q_lookup
= NULL
;
1551 if (red_lookup_depth
== 0) {
1552 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1553 FREE(x
, M_DUMMYNET
);
1556 x
->lookup_depth
= red_lookup_depth
;
1557 x
->w_q_lookup
= (u_int
*) _MALLOC(x
->lookup_depth
* sizeof(int),
1558 M_DUMMYNET
, M_DONTWAIT
);
1559 if (x
->w_q_lookup
== NULL
) {
1560 printf("dummynet: sorry, cannot allocate red lookup table\n");
1561 FREE(x
, M_DUMMYNET
);
1565 /* fill the lookup table with (1 - w_q)^x */
1566 x
->lookup_step
= p
->lookup_step
;
1567 x
->lookup_weight
= p
->lookup_weight
;
1568 x
->w_q_lookup
[0] = SCALE(1) - x
->w_q
;
1569 for (i
= 1; i
< x
->lookup_depth
; i
++)
1570 x
->w_q_lookup
[i
] = SCALE_MUL(x
->w_q_lookup
[i
- 1], x
->lookup_weight
);
1571 if (red_avg_pkt_size
< 1)
1572 red_avg_pkt_size
= 512 ;
1573 x
->avg_pkt_size
= red_avg_pkt_size
;
1574 if (red_max_pkt_size
< 1)
1575 red_max_pkt_size
= 1500 ;
1576 x
->max_pkt_size
= red_max_pkt_size
;
1581 alloc_hash(struct dn_flow_set
*x
, struct dn_flow_set
*pfs
)
1583 if (x
->flags_fs
& DN_HAVE_FLOW_MASK
) { /* allocate some slots */
1584 int l
= pfs
->rq_size
;
1590 else if (l
> DN_MAX_HASH_SIZE
)
1591 l
= DN_MAX_HASH_SIZE
;
1593 } else /* one is enough for null mask */
1595 x
->rq
= _MALLOC((1 + x
->rq_size
) * sizeof(struct dn_flow_queue
*),
1596 M_DUMMYNET
, M_DONTWAIT
| M_ZERO
);
1597 if (x
->rq
== NULL
) {
1598 printf("dummynet: sorry, cannot allocate queue\n");
1606 set_fs_parms(struct dn_flow_set
*x
, struct dn_flow_set
*src
)
1608 x
->flags_fs
= src
->flags_fs
;
1609 x
->qsize
= src
->qsize
;
1611 x
->flow_mask
= src
->flow_mask
;
1612 if (x
->flags_fs
& DN_QSIZE_IS_BYTES
) {
1613 if (x
->qsize
> 1024*1024)
1614 x
->qsize
= 1024*1024 ;
1621 /* configuring RED */
1622 if ( x
->flags_fs
& DN_IS_RED
)
1623 config_red(src
, x
) ; /* XXX should check errors */
1627 * setup pipe or queue parameters.
1631 config_pipe(struct dn_pipe
*p
)
1634 struct dn_flow_set
*pfs
= &(p
->fs
);
1635 struct dn_flow_queue
*q
;
1638 * The config program passes parameters as follows:
1639 * bw = bits/second (0 means no limits),
1640 * delay = ms, must be translated into ticks.
1641 * qsize = slots/bytes
1643 p
->delay
= ( p
->delay
* (hz
*10) ) / 1000 ;
1644 /* We need either a pipe number or a flow_set number */
1645 if (p
->pipe_nr
== 0 && pfs
->fs_nr
== 0)
1647 if (p
->pipe_nr
!= 0 && pfs
->fs_nr
!= 0)
1649 if (p
->pipe_nr
!= 0) { /* this is a pipe */
1650 struct dn_pipe
*x
, *a
, *b
;
1652 lck_mtx_lock(dn_mutex
);
1654 for (a
= NULL
, b
= all_pipes
; b
&& b
->pipe_nr
< p
->pipe_nr
;
1655 a
= b
, b
= b
->next
) ;
1657 if (b
== NULL
|| b
->pipe_nr
!= p
->pipe_nr
) { /* new pipe */
1658 x
= _MALLOC(sizeof(struct dn_pipe
), M_DUMMYNET
, M_DONTWAIT
| M_ZERO
) ;
1660 lck_mtx_unlock(dn_mutex
);
1661 printf("dummynet: no memory for new pipe\n");
1664 x
->pipe_nr
= p
->pipe_nr
;
1666 /* idle_heap is the only one from which we extract from the middle.
1668 x
->idle_heap
.size
= x
->idle_heap
.elements
= 0 ;
1669 x
->idle_heap
.offset
=OFFSET_OF(struct dn_flow_queue
, heap_pos
);
1672 /* Flush accumulated credit for all queues */
1673 for (i
= 0; i
<= x
->fs
.rq_size
; i
++)
1674 for (q
= x
->fs
.rq
[i
]; q
; q
= q
->next
)
1678 x
->bandwidth
= p
->bandwidth
;
1679 x
->numbytes
= 0; /* just in case... */
1680 bcopy(p
->if_name
, x
->if_name
, sizeof(p
->if_name
) );
1681 x
->ifp
= NULL
; /* reset interface ptr */
1682 x
->delay
= p
->delay
;
1683 set_fs_parms(&(x
->fs
), pfs
);
1686 if ( x
->fs
.rq
== NULL
) { /* a new pipe */
1687 r
= alloc_hash(&(x
->fs
), pfs
) ;
1689 lck_mtx_unlock(dn_mutex
);
1690 FREE(x
, M_DUMMYNET
);
1699 lck_mtx_unlock(dn_mutex
);
1700 } else { /* config queue */
1701 struct dn_flow_set
*x
, *a
, *b
;
1703 lck_mtx_lock(dn_mutex
);
1704 /* locate flow_set */
1705 for (a
=NULL
, b
=all_flow_sets
; b
&& b
->fs_nr
< pfs
->fs_nr
;
1706 a
= b
, b
= b
->next
) ;
1708 if (b
== NULL
|| b
->fs_nr
!= pfs
->fs_nr
) { /* new */
1709 if (pfs
->parent_nr
== 0) { /* need link to a pipe */
1710 lck_mtx_unlock(dn_mutex
);
1713 x
= _MALLOC(sizeof(struct dn_flow_set
), M_DUMMYNET
, M_DONTWAIT
| M_ZERO
);
1715 lck_mtx_unlock(dn_mutex
);
1716 printf("dummynet: no memory for new flow_set\n");
1719 x
->fs_nr
= pfs
->fs_nr
;
1720 x
->parent_nr
= pfs
->parent_nr
;
1721 x
->weight
= pfs
->weight
;
1724 else if (x
->weight
> 100)
1727 /* Change parent pipe not allowed; must delete and recreate */
1728 if (pfs
->parent_nr
!= 0 && b
->parent_nr
!= pfs
->parent_nr
) {
1729 lck_mtx_unlock(dn_mutex
);
1734 set_fs_parms(x
, pfs
);
1736 if ( x
->rq
== NULL
) { /* a new flow_set */
1737 r
= alloc_hash(x
, pfs
) ;
1739 lck_mtx_unlock(dn_mutex
);
1740 FREE(x
, M_DUMMYNET
);
1749 lck_mtx_unlock(dn_mutex
);
1755 * Helper function to remove from a heap queues which are linked to
1756 * a flow_set about to be deleted.
1759 fs_remove_from_heap(struct dn_heap
*h
, struct dn_flow_set
*fs
)
1761 int i
= 0, found
= 0 ;
1762 for (; i
< h
->elements
;)
1763 if ( ((struct dn_flow_queue
*)h
->p
[i
].object
)->fs
== fs
) {
1765 h
->p
[i
] = h
->p
[h
->elements
] ;
1774 * helper function to remove a pipe from a heap (can be there at most once)
1777 pipe_remove_from_heap(struct dn_heap
*h
, struct dn_pipe
*p
)
1779 if (h
->elements
> 0) {
1781 for (i
=0; i
< h
->elements
; i
++ ) {
1782 if (h
->p
[i
].object
== p
) { /* found it */
1784 h
->p
[i
] = h
->p
[h
->elements
] ;
1793 * drain all queues. Called in case of severe mbuf shortage.
1798 struct dn_flow_set
*fs
;
1800 struct mbuf
*m
, *mnext
;
1802 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
1804 heap_free(&ready_heap
);
1805 heap_free(&wfq_ready_heap
);
1806 heap_free(&extract_heap
);
1807 /* remove all references to this pipe from flow_sets */
1808 for (fs
= all_flow_sets
; fs
; fs
= fs
->next
)
1809 purge_flow_set(fs
, 0);
1811 for (p
= all_pipes
; p
; p
= p
->next
) {
1812 purge_flow_set(&(p
->fs
), 0);
1815 while ((m
= mnext
) != NULL
) {
1816 mnext
= m
->m_nextpkt
;
1819 p
->head
= p
->tail
= NULL
;
1824 * Fully delete a pipe or a queue, cleaning up associated info.
1827 delete_pipe(struct dn_pipe
*p
)
1829 if (p
->pipe_nr
== 0 && p
->fs
.fs_nr
== 0)
1831 if (p
->pipe_nr
!= 0 && p
->fs
.fs_nr
!= 0)
1833 if (p
->pipe_nr
!= 0) { /* this is an old-style pipe */
1834 struct dn_pipe
*a
, *b
;
1835 struct dn_flow_set
*fs
;
1837 lck_mtx_lock(dn_mutex
);
1839 for (a
= NULL
, b
= all_pipes
; b
&& b
->pipe_nr
< p
->pipe_nr
;
1840 a
= b
, b
= b
->next
) ;
1841 if (b
== NULL
|| (b
->pipe_nr
!= p
->pipe_nr
) ) {
1842 lck_mtx_unlock(dn_mutex
);
1843 return EINVAL
; /* not found */
1846 /* unlink from list of pipes */
1848 all_pipes
= b
->next
;
1851 /* remove references to this pipe from the ip_fw rules. */
1852 flush_pipe_ptrs(&(b
->fs
));
1854 /* remove all references to this pipe from flow_sets */
1855 for (fs
= all_flow_sets
; fs
; fs
= fs
->next
)
1856 if (fs
->pipe
== b
) {
1857 printf("dummynet: ++ ref to pipe %d from fs %d\n",
1858 p
->pipe_nr
, fs
->fs_nr
);
1860 purge_flow_set(fs
, 0);
1862 fs_remove_from_heap(&ready_heap
, &(b
->fs
));
1863 purge_pipe(b
); /* remove all data associated to this pipe */
1864 /* remove reference to here from extract_heap and wfq_ready_heap */
1865 pipe_remove_from_heap(&extract_heap
, b
);
1866 pipe_remove_from_heap(&wfq_ready_heap
, b
);
1867 lck_mtx_unlock(dn_mutex
);
1869 FREE(b
, M_DUMMYNET
);
1870 } else { /* this is a WF2Q queue (dn_flow_set) */
1871 struct dn_flow_set
*a
, *b
;
1873 lck_mtx_lock(dn_mutex
);
1875 for (a
= NULL
, b
= all_flow_sets
; b
&& b
->fs_nr
< p
->fs
.fs_nr
;
1876 a
= b
, b
= b
->next
) ;
1877 if (b
== NULL
|| (b
->fs_nr
!= p
->fs
.fs_nr
) ) {
1878 lck_mtx_unlock(dn_mutex
);
1879 return EINVAL
; /* not found */
1883 all_flow_sets
= b
->next
;
1886 /* remove references to this flow_set from the ip_fw rules. */
1889 if (b
->pipe
!= NULL
) {
1890 /* Update total weight on parent pipe and cleanup parent heaps */
1891 b
->pipe
->sum
-= b
->weight
* b
->backlogged
;
1892 fs_remove_from_heap(&(b
->pipe
->not_eligible_heap
), b
);
1893 fs_remove_from_heap(&(b
->pipe
->scheduler_heap
), b
);
1894 #if 1 /* XXX should i remove from idle_heap as well ? */
1895 fs_remove_from_heap(&(b
->pipe
->idle_heap
), b
);
1898 purge_flow_set(b
, 1);
1899 lck_mtx_unlock(dn_mutex
);
1905 * helper function used to copy data from kernel in DUMMYNET_GET
1908 dn_copy_set(struct dn_flow_set
*set
, char *bp
)
1911 struct dn_flow_queue
*q
, *qp
= (struct dn_flow_queue
*)bp
;
1913 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
1915 for (i
= 0 ; i
<= set
->rq_size
; i
++)
1916 for (q
= set
->rq
[i
] ; q
; q
= q
->next
, qp
++ ) {
1917 if (q
->hash_slot
!= i
)
1918 printf("dummynet: ++ at %d: wrong slot (have %d, "
1919 "should be %d)\n", copied
, q
->hash_slot
, i
);
1921 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1924 bcopy(q
, qp
, sizeof( *q
) );
1925 /* cleanup pointers */
1927 qp
->head
= qp
->tail
= NULL
;
1930 if (copied
!= set
->rq_elements
)
1931 printf("dummynet: ++ wrong count, have %d should be %d\n",
1932 copied
, set
->rq_elements
);
1939 struct dn_flow_set
*set
;
1943 lck_mtx_assert(dn_mutex
, LCK_MTX_ASSERT_OWNED
);
1946 * compute size of data structures: list of pipes and flow_sets.
1948 for (p
= all_pipes
, size
= 0 ; p
; p
= p
->next
)
1949 size
+= sizeof( *p
) +
1950 p
->fs
.rq_elements
* sizeof(struct dn_flow_queue
);
1951 for (set
= all_flow_sets
; set
; set
= set
->next
)
1952 size
+= sizeof ( *set
) +
1953 set
->rq_elements
* sizeof(struct dn_flow_queue
);
1958 dummynet_get(struct sockopt
*sopt
)
1960 char *buf
, *bp
; /* bp is the "copy-pointer" */
1962 struct dn_flow_set
*set
;
1966 /* XXX lock held too long */
1967 lck_mtx_lock(dn_mutex
);
1969 * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
1970 * cannot use this flag while holding a mutex.
1972 for (i
= 0; i
< 10; i
++) {
1973 size
= dn_calc_size();
1974 lck_mtx_unlock(dn_mutex
);
1975 buf
= _MALLOC(size
, M_TEMP
, M_WAITOK
);
1976 lck_mtx_lock(dn_mutex
);
1977 if (size
== dn_calc_size())
1983 lck_mtx_unlock(dn_mutex
);
1986 for (p
= all_pipes
, bp
= buf
; p
; p
= p
->next
) {
1987 struct dn_pipe
*pipe_bp
= (struct dn_pipe
*)bp
;
1990 * copy pipe descriptor into *bp, convert delay back to ms,
1991 * then copy the flow_set descriptor(s) one at a time.
1992 * After each flow_set, copy the queue descriptor it owns.
1994 bcopy(p
, bp
, sizeof( *p
) );
1995 pipe_bp
->delay
= (pipe_bp
->delay
* 1000) / (hz
*10) ;
1997 * XXX the following is a hack based on ->next being the
1998 * first field in dn_pipe and dn_flow_set. The correct
1999 * solution would be to move the dn_flow_set to the beginning
2000 * of struct dn_pipe.
2002 pipe_bp
->next
= (struct dn_pipe
*)DN_IS_PIPE
;
2003 /* clean pointers */
2004 pipe_bp
->head
= pipe_bp
->tail
= NULL
;
2005 pipe_bp
->fs
.next
= NULL
;
2006 pipe_bp
->fs
.pipe
= NULL
;
2007 pipe_bp
->fs
.rq
= NULL
;
2009 bp
+= sizeof( *p
) ;
2010 bp
= dn_copy_set( &(p
->fs
), bp
);
2012 for (set
= all_flow_sets
; set
; set
= set
->next
) {
2013 struct dn_flow_set
*fs_bp
= (struct dn_flow_set
*)bp
;
2014 bcopy(set
, bp
, sizeof( *set
) );
2015 /* XXX same hack as above */
2016 fs_bp
->next
= (struct dn_flow_set
*)DN_IS_QUEUE
;
2017 fs_bp
->pipe
= NULL
;
2019 bp
+= sizeof( *set
) ;
2020 bp
= dn_copy_set( set
, bp
);
2022 lck_mtx_unlock(dn_mutex
);
2024 error
= sooptcopyout(sopt
, buf
, size
);
2030 * Handler for the various dummynet socket options (get, flush, config, del)
2033 ip_dn_ctl(struct sockopt
*sopt
)
2036 struct dn_pipe
*p
, tmp_pipe
;
2038 /* Disallow sets in really-really secure mode. */
2039 if (sopt
->sopt_dir
== SOPT_SET
&& securelevel
>= 3)
2042 switch (sopt
->sopt_name
) {
2044 printf("dummynet: -- unknown option %d", sopt
->sopt_name
);
2047 case IP_DUMMYNET_GET
:
2048 error
= dummynet_get(sopt
);
2051 case IP_DUMMYNET_FLUSH
:
2055 case IP_DUMMYNET_CONFIGURE
:
2057 error
= sooptcopyin(sopt
, p
, sizeof *p
, sizeof *p
);
2060 error
= config_pipe(p
);
2063 case IP_DUMMYNET_DEL
: /* remove a pipe or queue */
2065 error
= sooptcopyin(sopt
, p
, sizeof *p
, sizeof *p
);
2069 error
= delete_pipe(p
);
2079 dn_mutex_grp_attr
= lck_grp_attr_alloc_init();
2080 dn_mutex_grp
= lck_grp_alloc_init("dn", dn_mutex_grp_attr
);
2081 dn_mutex_attr
= lck_attr_alloc_init();
2083 if ((dn_mutex
= lck_mtx_alloc_init(dn_mutex_grp
, dn_mutex_attr
)) == NULL
) {
2084 printf("ip_dn_init: can't alloc dn_mutex\n");
2089 all_flow_sets
= NULL
;
2090 ready_heap
.size
= ready_heap
.elements
= 0 ;
2091 ready_heap
.offset
= 0 ;
2093 wfq_ready_heap
.size
= wfq_ready_heap
.elements
= 0 ;
2094 wfq_ready_heap
.offset
= 0 ;
2096 extract_heap
.size
= extract_heap
.elements
= 0 ;
2097 extract_heap
.offset
= 0 ;
2098 ip_dn_ctl_ptr
= ip_dn_ctl
;
2099 ip_dn_io_ptr
= dummynet_io
;
2100 ip_dn_ruledel_ptr
= dn_rule_delete
;