/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.24.2.11 2001/02/09 23:18:08 luigi Exp $
 */
/*
 * This module implements IP dummynet, a bandwidth limiter/delay emulator
 * used in conjunction with the ipfw package.
 * Description of the data structures used is in ip_dummynet.h
 * Here you mainly find the following blocks of code:
 *  + variable declarations;
 *  + heap management functions;
 *  + scheduler and dummynet functions;
 *  + configuration and initialization.
 *
 * NOTA BENE: critical sections are protected by splimp()/splx()
 *    pairs. One would think that splnet() is enough as for most of
 *    the netinet code, but it is not so because when used with
 *    bridging, dummynet is invoked at splimp().
 *
 * Most important Changes:
 *
 * 010124: Fixed WF2Q behaviour
 * 010122: Fixed spl protection.
 * 000601: WF2Q support
 * 000106: large rewrite, use heaps to handle very many pipes.
 * 980513: initial release
 *
 * include files marked with XXX are probably not needed
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>			/* XXX */
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>
#include <netinet/if_ether.h> /* for struct arpcom */
#include <net/bridge.h>
/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
 */
static dn_key curr_time = 0 ; /* current simulation time */

static int dn_hash_size = 64 ;	/* default hash size */

/* statistics on number of queue searches and search steps */
static int searches, search_steps ;
static int pipe_expire = 1 ;   /* expire queue if empty */
static int dn_max_ratio = 16 ; /* max queues/buckets ratio */

static int red_lookup_depth = 256;	/* RED - default lookup table depth */
static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
static int red_max_pkt_size = 1500;     /* RED - default max packet size */
/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows
 *
 * extract_heap contains pipes associated with delay lines.
 */
static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
static int heap_init(struct dn_heap *h, int size) ;
static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

static void transmit_event(struct dn_pipe *pipe);
static void ready_event(struct dn_flow_queue *q);

static struct dn_pipe *all_pipes = NULL ;	/* list of all pipes */
static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
	CTLFLAG_RW, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
	CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
	CTLFLAG_RD, &curr_time, 0, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
	CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
	CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
	CTLFLAG_RD, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
	CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
	CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
	CTLFLAG_RW, &dn_max_ratio, 0,
	"Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
	CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
	CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
	CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
static int config_pipe(struct dn_pipe *p);
static int ip_dn_ctl(struct sockopt *sopt);

static void rt_unref(struct rtentry *);
static void dummynet(void *);
static void dummynet_flush(void);
void dummynet_drain(void);
int if_tx_rdy(struct ifnet *ifp);
/*
 * ip_fw_chain is used when deleting a pipe, because ipfw rules can
 * hold references to the pipe.
 */
extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain_head;
static void
rt_unref(struct rtentry *rt)
{
    if (rt == NULL)
	return ;
    if (rt->rt_refcnt <= 0)
	printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt);
    rtfree(rt);
}
/*
 * Heap management functions.
 *
 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
 * Some macros help finding parent/children so we can optimize them.
 *
 * heap_init() is called to expand the heap when needed.
 * Increment size in blocks of 16 entries.
 * XXX failure to allocate a new element is a pretty bad failure
 * as we basically stall a whole queue forever!!
 * Returns 1 on error, 0 on success
 */
#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
#define HEAP_LEFT(x) ( 2*(x) + 1 )
#define HEAP_IS_LEFT(x) ( (x) & 1 )
#define HEAP_RIGHT(x) ( 2*(x) + 2 )
#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
#define HEAP_INCREMENT	15
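
/*
 * A quick worked example of the index arithmetic above (illustrative,
 * not part of the original code): for node 3, HEAP_FATHER(3) = (3-1)/2
 * = 1, HEAP_LEFT(3) = 7, HEAP_RIGHT(3) = 8, and HEAP_IS_LEFT(3) holds
 * because left children always get odd indices. The sketch below,
 * guarded out like heap_move() further down, just checks the identities.
 */
#if 0 /* illustrative sketch, never compiled */
static void
heap_index_example(void)
{
    int x = 3 ;
    /* children of x are 2x+1 and 2x+2, and both point back to x */
    if (HEAP_FATHER(HEAP_LEFT(x)) != x || HEAP_FATHER(HEAP_RIGHT(x)) != x)
	printf("heap index identities violated\n");
    /* a left child always has an odd index */
    if (!HEAP_IS_LEFT(HEAP_LEFT(x)))
	printf("left child should have an odd index\n");
}
#endif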
static int
heap_init(struct dn_heap *h, int new_size)
{
    struct dn_heap_entry *p;

    if (h->size >= new_size ) {
	printf("heap_init, Bogus call, have %d want %d\n",
		h->size, new_size);
	return 0 ;
    }
    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
    p = _MALLOC(new_size * sizeof(*p), M_IPFW, M_DONTWAIT );
    if (p == NULL) {
	printf(" heap_init, resize %d failed\n", new_size );
	return 1 ; /* error */
    }
    if (h->size > 0) {
	bcopy(h->p, p, h->size * sizeof(*p) );
	FREE(h->p, M_IPFW);
    }
    h->p = p ;
    h->size = new_size ;
    return 0 ;
}
/*
 * Insert element in heap. Normally, p != NULL, we insert p in
 * a new position and bubble up. If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry)
 *
 * If offset > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
#define SET_OFFSET(heap, node) \
    if (heap->offset > 0) \
	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) \
    if (heap->offset > 0) \
	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
    int son = h->elements ;

    if (p == NULL)	/* data already there, set starting point */
	son = key1 ;
    else {		/* insert new element at the end, possibly resize */
	if (son == h->size) /* need resize... */
	    if (heap_init(h, h->elements+1) )
		return 1 ; /* failure... */
	h->p[son].object = p ;
	h->p[son].key = key1 ;
	h->elements++ ;
    }
    while (son > 0) {				/* bubble up */
	int father = HEAP_FATHER(son) ;
	struct dn_heap_entry tmp ;

	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
	    break ; /* found right position */
	/* son smaller than father, swap and repeat */
	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
	SET_OFFSET(h, son);
	son = father ;
    }
    SET_OFFSET(h, son);
    return 0 ;
}
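
/*
 * Example of the bubble-up above (illustrative, not in the original
 * source): inserting key 2 into a heap whose array is [5, 9, 7] appends
 * it at son=3; HEAP_FATHER(3)=1 holds key 9 > 2, so they swap, then
 * father 0 holds key 5 > 2 and they swap again, giving [2, 5, 7, 9].
 * With p == NULL the same loop is reused by heap_extract() and
 * heapify() to re-settle an element already stored at position key1.
 */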
/*
 * remove top element from heap, or obj if obj != NULL
 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
    int child, father, max = h->elements - 1 ;

    if (max < 0) {
	printf("warning, extract from empty heap 0x%p\n", h);
	return ;
    }
    father = 0 ; /* default: move up smallest child */
    if (obj != NULL) { /* extract specific element, index is at offset */
	if (h->offset <= 0)
	    panic("*** heap_extract from middle not supported on this heap!!!\n");
	father = *((int *)((char *)obj + h->offset)) ;
	if (father < 0 || father >= h->elements) {
	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
		father, h->elements);
	    panic("heap_extract");
	}
    }
    RESET_OFFSET(h, father);
    child = HEAP_LEFT(father) ;		/* left child */
    while (child <= max) {		/* valid entry */
	if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
	    child = child+1 ;		/* take right child, otherwise left */
	h->p[father] = h->p[child] ;
	SET_OFFSET(h, father);
	father = child ;
	child = HEAP_LEFT(child) ;	/* left child for next loop */
    }
    h->elements-- ;
    if (father != max) {
	/*
	 * Fill hole with last entry and bubble up, reusing the insert code
	 */
	h->p[father] = h->p[max] ;
	heap_insert(h, father, NULL); /* this one cannot fail */
    }
}
#if 0 /* heap_move is never used */
/*
 * change object position and update references
 * XXX this one is never used!
 */
static void
heap_move(struct dn_heap *h, dn_key new_key, void *object)
{
    int temp ;
    int i ;
    int max = h->elements-1 ;
    struct dn_heap_entry buf ;

    if (h->offset <= 0)
	panic("cannot move items on this heap");

    i = *((int *)((char *)object + h->offset));
    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
	h->p[i].key = new_key ;
	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
		 i = temp ) { /* bubble up */
	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
	    SET_OFFSET(h, i);
	}
    } else {		/* must move down */
	h->p[i].key = new_key ;
	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
		temp++ ; /* select child with min key */
	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
		SET_OFFSET(h, i);
	    } else
		break ;
	    i = temp ;
	}
    }
    SET_OFFSET(h, i);
}
#endif /* heap_move, unused */
/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property. It is needed when we delete a bunch of entries.
 */
static void
heapify(struct dn_heap *h)
{
    int i ;

    for (i = 0 ; i < h->elements ; i++ )
	heap_insert(h, i , NULL) ;
}
/*
 * cleanup the heap and free data structure
 */
static void
heap_free(struct dn_heap *h)
{
    if (h->size > 0 )
	FREE(h->p, M_IPFW);
    bzero(h, sizeof(*h) );
}

/*
 * --- end of heap management functions ---
 */
/*
 * Scheduler functions:
 *
 * transmit_event() is called when the delay-line needs to enter
 * the scheduler, either because of existing pkts getting ready,
 * or new packets entering the queue. The event handled is the delivery
 * time of the packet.
 *
 * ready_event() does something similar with fixed-rate queues, and the
 * event handled is the finish time of the head pkt.
 *
 * wfq_ready_event() does something similar with WF2Q queues, and the
 * event handled is the start time of the head pkt.
 *
 * In all cases, we make sure that the data structures are consistent
 * before passing pkts out, because this might trigger recursive
 * invocations of the procedures.
 */
static void
transmit_event(struct dn_pipe *pipe)
{
    struct dn_pkt *pkt ;

    while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) {
	/*
	 * first unlink, then call procedures, since ip_input() can invoke
	 * ip_output() and viceversa, thus causing nested calls
	 */
	pipe->head = DN_NEXT(pkt) ;

	/*
	 * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf
	 * (NOT A REAL one, just a small block of malloc'ed memory) with
	 *     m_type = MT_DUMMYNET
	 *     m_next = actual mbuf to be processed by ip_input/output
	 *     m_data = the matching rule
	 * and some other fields.
	 * The block IS FREED HERE because it contains parameters passed
	 * to the called routine.
	 */
	switch (pkt->dn_dir) {
	case DN_TO_IP_OUT:
	    (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL);
	    rt_unref (pkt->ro.ro_rt) ;
	    break ;

	case DN_TO_IP_IN :
	    ip_input((struct mbuf *)pkt) ;
	    break ;

	case DN_TO_BDG_FWD : {
	    struct mbuf *m = (struct mbuf *)pkt ;
	    struct ether_header *eh ;

	    if (pkt->dn_m->m_len < ETHER_HDR_LEN
		&& (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) {
		printf("dummynet/bridge: pullup fail, dropping pkt\n");
		break;
	    }
	    /*
	     * same as ether_input, make eh be a pointer into the mbuf
	     */
	    eh = mtod(pkt->dn_m, struct ether_header *);
	    m_adj(pkt->dn_m, ETHER_HDR_LEN);
	    /*
	     * bdg_forward() wants a pointer to the pseudo-mbuf-header, but
	     * on return it will supply the pointer to the actual packet
	     * (originally pkt->dn_m, but could be something else now) if
	     * it has not consumed it.
	     */
	    m = bdg_forward(m, eh, pkt->ifp);
	    if (m)
		m_freem(m);
	    }
	    break ;

	default:
	    printf("dummynet: bad switch %d!\n", pkt->dn_dir);
	    m_freem(pkt->dn_m);
	    break ;
	}
	FREE(pkt, M_IPFW);
    }
    /* if there are leftover packets, put into the heap for next event */
    if ( (pkt = pipe->head) )
	heap_insert(&extract_heap, pkt->output_time, pipe ) ;
    /* XXX should check errors on heap_insert, by draining the
     * whole pipe p and hoping in the future we are more successful
     */
}
/*
 * the following macro computes how many ticks we have to wait
 * before being able to transmit a packet. The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
 */
#define SET_TICKS(pkt, q, p)	\
    (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
	    p->bandwidth ;
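
/*
 * Worked example for SET_TICKS (illustrative, not in the original
 * source): with hz = 100 ticks/s, a 1500-byte packet on a 1 Mbit/s
 * pipe costs 1500*8*100 = 1,200,000 credit units, so starting from an
 * empty credit (numbytes == 0) the macro yields
 * (1,200,000 + 1,000,000 - 1) / 1,000,000 = 2 ticks, i.e. the 12 ms
 * transmission time rounded up to the next tick boundary.
 */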
/*
 * extract pkt from queue, compute output time (could be now)
 * and put into delay line (p_queue)
 */
static void
move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q,
	struct dn_pipe *p, int len)
{
    q->head = DN_NEXT(pkt) ;
    q->len-- ;
    q->len_bytes -= len ;

    pkt->output_time = curr_time + p->delay ;

    if (p->head == NULL)
	p->head = pkt;
    else
	DN_NEXT(p->tail) = pkt;
    p->tail = pkt;
    DN_NEXT(p->tail) = NULL ;
}
/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invokation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q)
{
    struct dn_pkt *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    if (p == NULL) {
	printf("ready_event- pipe is gone\n");
	return ;
    }
    p_was_empty = (p->head == NULL) ;

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->dn_m->m_pkthdr.len;
	int len_scaled = p->bandwidth ? len*8*hz : 0 ;
	if (len_scaled > q->numbytes )
	    break ;
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else	/* RED needs to know when the queue becomes empty */
	q->q_time = curr_time ;
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
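
/*
 * Note on units in the credit computation above (illustrative, not in
 * the original source): q->numbytes accrues bandwidth (bit/s) times
 * elapsed ticks, while len_scaled = len*8*hz expresses a packet in the
 * same bit*tick-per-second unit. At hz = 100 on a 1 Mbit/s pipe, one
 * tick earns 1,000,000 units and a 1500-byte packet costs 1,200,000,
 * so a full-size packet needs a bit more than one tick of credit; this
 * is why a single pass of the loop may legitimately drain zero packets.
 */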
/*
 * Called when we can transmit packets on WF2Q queues. Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0. As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay. For the last packet, if p->numbytes<0,
 * there is an additional delay.
 */
static void
ready_event_wfq(struct dn_pipe *p)
{
    int p_was_empty = (p->head == NULL) ;
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap) ;

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DEB(printf("pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name);)
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct dn_pkt *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->dn_m->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    p->V += (len<<MY_M) / p->sum ; /* update V */
	    q->S = q->F ; /* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->dn_m->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;
	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ; /* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * If we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	p->tail->output_time += t ;
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
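
/*
 * Worked example of the virtual-time bookkeeping above (illustrative,
 * not in the original source): suppose two backlogged flows with
 * weights 1 and 3 share the pipe, so p->sum = 4. Each 1000-byte packet
 * sent advances V by (1000<<MY_M)/4, while the sending flow's finish
 * time F advances by (1000<<MY_M)/weight. The weight-3 flow's F grows
 * a third as fast, so it re-enters the scheduler heap sooner and, for
 * equal-size packets, is served three times as often.
 */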
/*
 * This is called once per tick, or HZ times per second. It is used to
 * increment the current tick counter and schedule expired events.
 */
static void
dummynet(void * __unused unused)
{
    void *p ; /* generic parameter to handler */
    struct dn_heap *h ;
    int s ;
    struct dn_heap *heaps[3];
    int i;
    struct dn_pipe *pe ;

    heaps[0] = &ready_heap ;		/* fixed-rate queues */
    heaps[1] = &wfq_ready_heap ;	/* wfq queues */
    heaps[2] = &extract_heap ;		/* delay line */
    s = splimp(); /* see note on top, splnet() is not enough */
    curr_time++ ;
    for (i=0; i < 3 ; i++) {
	h = heaps[i];
	while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
	    DDB(if (h->p[0].key > curr_time)
		printf("-- dummynet: warning, heap %d is %d ticks late\n",
		    i, (int)(curr_time - h->p[0].key));)
	    p = h->p[0].object ; /* store a copy before heap_extract */
	    heap_extract(h, NULL); /* need to extract before processing */
	    if (i == 0)
		ready_event(p) ;
	    else if (i == 1) {
		struct dn_pipe *pipe = p;
		if (pipe->if_name[0] != '\0')
		    printf("*** bad ready_event_wfq for pipe %s\n",
			pipe->if_name);
		else
		    ready_event_wfq(p) ;
	    } else
		transmit_event(p);
	}
    }
    /* sweep pipes trying to expire idle flow_queues */
    for (pe = all_pipes ; pe ; pe = pe->next )
	if (pe->idle_heap.elements > 0 &&
		DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
	    struct dn_flow_queue *q = pe->idle_heap.p[0].object ;

	    heap_extract(&(pe->idle_heap), NULL);
	    q->S = q->F + 1 ; /* mark timestamp as invalid */
	    pe->sum -= q->fs->weight ;
	}
    splx(s);
    timeout(dummynet, NULL, 1);
}
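
/*
 * Note (illustrative): dummynet() re-arms itself via timeout(dummynet,
 * NULL, 1), so the whole emulator runs off the system clock at hz
 * events per second. curr_time therefore counts ticks, and every
 * dn_key deadline stored in the three heaps is expressed in that same
 * unit.
 */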
/*
 * called by an interface when tx_rdy occurs.
 */
int
if_tx_rdy(struct ifnet *ifp)
{
    struct dn_pipe *p;

    for (p = all_pipes ; p ; p = p->next )
	if (p->ifp == ifp)
	    break ;
    if (p == NULL) {
	char buf[32];
	sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit);
	for (p = all_pipes ; p ; p = p->next )
	    if (!strcmp(p->if_name, buf) ) {
		p->ifp = ifp ;
		DEB(printf("++ tx rdy from %s (now found)\n", buf);)
		break ;
	    }
    }
    if (p != NULL) {
	DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
		ifp->if_unit, ifp->if_snd.ifq_len);)
	p->numbytes = 0 ; /* mark ready for I/O */
	ready_event_wfq(p);
    }
    return 0;
}
/*
 * Unconditionally expire empty queues in case of shortage.
 * Returns the number of queues freed.
 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;

    if (fs->last_expired == time_second)
	return 0 ;
    fs->last_expired = time_second ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    if (q->head != NULL || q->S != q->F+1) {
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_IPFW);
	    }
    return initial_elements - fs->rq_elements ;
}
/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
    struct dn_flow_queue *q ;

    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
	    expire_queues(fs) == 0) {
	/*
	 * No way to get room, use or create overflow queue.
	 */
	i = fs->rq_size ;
	if ( fs->rq[i] != NULL )
	    return fs->rq[i] ;
    }
    q = _MALLOC(sizeof(*q), M_IPFW, M_DONTWAIT) ;
    if (q == NULL) {
	printf("sorry, cannot allocate queue for new flow\n");
	return NULL ;
    }
    bzero(q, sizeof(*q) );	/* needed */
    q->fs = fs ;
    q->hash_slot = i ;
    q->next = fs->rq[i] ;
    q->S = q->F + 1;   /* hack - mark timestamp as invalid */
    fs->rq[i] = q ;
    fs->rq_elements++ ;
    return q ;
}
/*
 * Given a flow_set and a pkt in last_pkt, find a matching queue
 * after appropriate masking. The queue is moved to front
 * so that further searches take less time.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs)
{
    int i = 0 ; /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;

    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
	q = fs->rq[0] ;
    else {
	/* first, do the masking */
	last_pkt.dst_ip &= fs->flow_mask.dst_ip ;
	last_pkt.src_ip &= fs->flow_mask.src_ip ;
	last_pkt.dst_port &= fs->flow_mask.dst_port ;
	last_pkt.src_port &= fs->flow_mask.src_port ;
	last_pkt.proto &= fs->flow_mask.proto ;
	last_pkt.flags = 0 ; /* we don't care about this one */
	/* then, hash function */
	i = ( (last_pkt.dst_ip) & 0xffff ) ^
	    ( (last_pkt.dst_ip >> 15) & 0xffff ) ^
	    ( (last_pkt.src_ip << 1) & 0xffff ) ^
	    ( (last_pkt.src_ip >> 16 ) & 0xffff ) ^
	    (last_pkt.dst_port << 1) ^ (last_pkt.src_port) ^
	    (last_pkt.proto );
	i = i % fs->rq_size ;
	/* finally, scan the current list for a match */
	searches++ ;
	for (prev=NULL, q = fs->rq[i] ; q ; ) {
	    search_steps++;
	    if (bcmp(&last_pkt, &(q->id), sizeof(q->id) ) == 0)
		break ; /* found */
	    else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
		/* entry is idle and not in any heap, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_IPFW);
		continue ;
	    }
	    prev = q ;
	    q = q->next ;
	}
	if (q && prev != NULL) { /* found and not in front */
	    prev->next = q->next ;
	    q->next = fs->rq[i] ;
	    fs->rq[i] = q ;
	}
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
	q = create_queue(fs, i);
	if (q != NULL)
	    q->id = last_pkt ;
    }
    return q ;
}
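
/*
 * The hash above folds the masked flow id into 16 bits and reduces it
 * modulo rq_size. A standalone restatement of the same folding
 * (illustrative sketch only; it assumes the dn_flow_id layout used by
 * last_pkt):
 */
#if 0 /* illustrative sketch, never compiled */
static int
flow_hash_example(struct dn_flow_id *id, int rq_size)
{
    int i = ( (id->dst_ip) & 0xffff ) ^
	( (id->dst_ip >> 15) & 0xffff ) ^
	( (id->src_ip << 1) & 0xffff ) ^
	( (id->src_ip >> 16 ) & 0xffff ) ^
	(id->dst_port << 1) ^ (id->src_port) ^
	(id->proto );
    return i % rq_size ;	/* bucket index, 0 .. rq_size-1 */
}
#endif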
static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
     * RED calculates the average queue size (avg) using a low-pass filter
     * with an exponential weighted (w_q) moving average:
     * 	avg  <-  (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or * packets).
     *
     * If q_size == 0, we compute the idle time for the link, and set
     *	avg = (1 - w_q)^(idle/s)
     * where s is the time needed for transmitting a medium-sized packet.
     *
     * Now, if avg < min_th the packet is enqueued.
     * If avg > max_th the packet is dropped. Otherwise, the packet is
     * dropped with probability P function of avg.
     */

    int64_t p_b = 0;
    /* queue in bytes or packets ? */
    u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;

    DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);)

    /* average queue size estimation */
    if (q_size != 0) {
	/*
	 * queue is not empty, avg <- avg + (q_size - avg) * w_q
	 */
	int diff = SCALE(q_size) - q->avg;
	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);

	q->avg += (int) v;
    } else {
	/*
	 * queue is empty, find for how long the queue has been
	 * empty and use a lookup table for computing
	 * (1 - * w_q)^(idle_time/s) where s is the time to send a
	 * (small) packet.
	 */
	if (q->avg) {
	    u_int t = (curr_time - q->q_time) / fs->lookup_step;

	    q->avg = (t < fs->lookup_depth) ?
		    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
	}
    }
    DEB(printf("avg: %u ", SCALE_VAL(q->avg));)

    /* should i drop ? */

    if (q->avg < fs->min_th) {
	q->count = -1;
	return 0; /* accept packet ; */
    }
    if (q->avg >= fs->max_th) { /* average queue >=  max threshold */
	if (fs->flags_fs & DN_IS_GENTLE_RED) {
	    /*
	     * According to Gentle-RED, if avg is greater than max_th the
	     * packet is dropped with a probability
	     *	p_b = c_3 * avg - c_4
	     * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
	     */
	    p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
	} else {
	    q->count = -1;
	    return 1 ;
	}
    } else if (q->avg > fs->min_th) {
	/*
	 * we compute p_b using the linear dropping function p_b = c_1 *
	 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
	 * max_p * min_th / (max_th - min_th)
	 */
	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
    }
    if (fs->flags_fs & DN_QSIZE_IS_BYTES)
	p_b = (p_b * len) / fs->max_pkt_size;
    if (++q->count == 0)
	q->random = random() & 0xffff;
    else {
	/*
	 * q->count counts packets arrived since last drop, so a greater
	 * value of q->count means a greater packet drop probability.
	 */
	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
	    q->count = 0;
	    DEB(printf("- red drop");)
	    /* after a drop we calculate a new random value */
	    q->random = random() & 0xffff;
	    return 1;    /* drop */
	}
    }
    /* end of RED algorithm */
    return 0 ; /* accept */
}
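
/*
 * Worked example of the RED computation above (illustrative, ignoring
 * the fixed-point SCALE factors): with w_q = 0.002, avg = 10 and an
 * instantaneous q_size of 20, the filter moves avg by (20-10)*0.002 =
 * 0.02 toward the current size. With min_th = 5, max_th = 15 and
 * max_p = 0.1, an avg of 10 lands on the linear branch: c_1 = 0.1/10,
 * c_2 = 0.1*5/10, so p_b = 0.01*10 - 0.05 = 0.05; scaling p_b by
 * q->count then makes an actual drop increasingly likely the longer
 * the queue has gone without one.
 */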
struct dn_flow_set *
locate_flowset(int pipe_nr, struct ip_fw_chain *rule)
{
    struct dn_flow_set *fs = NULL ;

    if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE )
	for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
	    ;
    else {
	struct dn_pipe *p1;
	for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
	    ;
	if (p1 != NULL)
	    fs = &(p1->fs) ;
    }
    if (fs != NULL)
	rule->rule->pipe_ptr = fs ; /* record for the future */
    return fs ;
}
/*
 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
 * depending on whether WF2Q or fixed bw is used.
 */
static int
dummynet_io(int pipe_nr, int dir,	/* pipe_nr can also be a fs_nr */
	struct mbuf *m, struct ifnet *ifp, struct route *ro,
	struct sockaddr_in *dst,
	struct ip_fw_chain *rule, int flags)
{
    struct dn_pkt *pkt;
    struct dn_flow_set *fs;
    struct dn_pipe *pipe ;
    u_int64_t len = m->m_pkthdr.len ;
    struct dn_flow_queue *q = NULL ;
    int s ;

    s = splimp();
    if ( (fs = rule->rule->pipe_ptr) == NULL ) {
	fs = locate_flowset(pipe_nr, rule);
	if (fs == NULL)
	    goto dropit ;	/* this queue/pipe does not exist! */
    }
    pipe = fs->pipe ;
    if (pipe == NULL) { /* must be a queue, try find a matching pipe */
	for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
		 pipe = pipe->next)
	    ;
	if (pipe != NULL)
	    fs->pipe = pipe ;
	else {
	    printf("No pipe %d for queue %d, drop pkt\n",
		fs->parent_nr, fs->fs_nr);
	    goto dropit ;
	}
    }
    q = find_queue(fs);
    if ( q == NULL )
	goto dropit ;		/* cannot allocate queue */
    /*
     * update statistics, then check reasons to drop pkt
     */
    q->tot_bytes += len ;
    q->tot_pkts++ ;
    if ( fs->plr && random() < fs->plr )
	goto dropit ;		/* random pkt drop */
    if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
	if (q->len_bytes > fs->qsize)
	    goto dropit ;	/* queue size overflow */
    } else {
	if (q->len >= fs->qsize)
	    goto dropit ;	/* queue count overflow */
    }
    if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
	goto dropit ;

    pkt = (struct dn_pkt *)_MALLOC(sizeof (*pkt), M_IPFW, M_NOWAIT) ;
    if ( pkt == NULL )
	goto dropit ;		/* cannot allocate packet header */
    /* ok, i can handle the pkt now... */
    bzero(pkt, sizeof(*pkt) ); /* XXX expensive, see if we can remove it*/
    /* build and enqueue packet + parameters */
    pkt->hdr.mh_type = MT_DUMMYNET ;
    (struct ip_fw_chain *)pkt->hdr.mh_data = rule ;
    DN_NEXT(pkt) = NULL;
    pkt->dn_m = m;
    pkt->dn_dir = dir ;

    pkt->ifp = ifp;
    if (dir == DN_TO_IP_OUT) {
	/*
	 * We need to copy *ro because for ICMP pkts (and maybe others)
	 * the caller passed a pointer into the stack; dst might also be
	 * a pointer into *ro so it needs to be updated.
	 */
	pkt->ro = *ro;
	if (ro->ro_rt)
	    ro->ro_rt->rt_refcnt++ ;
	if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */
	    dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;

	pkt->dn_dst = dst;
	pkt->flags = flags ;
    }
    if (q->head == NULL)
	q->head = pkt;
    else
	DN_NEXT(q->tail) = pkt;
    q->tail = pkt;
    q->len++;
    q->len_bytes += len ;

    if ( q->head != pkt )	/* flow was not idle, we are done */
	goto done;
    /*
     * If we reach this point the flow was previously idle, so we need
     * to schedule it. This involves different actions for fixed-rate or
     * WF2Q queues.
     */
    if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) {
	/*
	 * Fixed-rate queue: just insert into the ready_heap.
	 */
	dn_key t = 0 ;
	if (pipe->bandwidth)
	    t = SET_TICKS(pkt, q, pipe);
	q->sched_time = curr_time ;
	if (t == 0)	/* must process it now */
	    ready_event( q );
	else
	    heap_insert(&ready_heap, curr_time + t , q );
    } else {
	/*
	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
	 * set S to the virtual time V for the controlling pipe, and update
	 * the sum of weights for the pipe; otherwise, remove flow from
	 * idle_heap and set S to max(F,V).
	 * Second, compute finish time F = S + len/weight.
	 * Third, if pipe was idle, update V=max(S, V).
	 * Fourth, count one more backlogged flow.
	 */
	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
	    q->S = pipe->V ;
	    pipe->sum += fs->weight ; /* add weight of new queue */
	} else {
	    heap_extract(&(pipe->idle_heap), q);
	    q->S = MAX64(q->F, pipe->V) ;
	}
	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;

	if (pipe->not_eligible_heap.elements == 0 &&
		pipe->scheduler_heap.elements == 0)
	    pipe->V = MAX64 ( q->S, pipe->V );
	fs->backlogged++ ;
	/*
	 * Look at eligibility. A flow is not eligibile if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the scheduler_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
	 * and possibly invoke ready_event_wfq() right now if there is
	 * leftover credit.
	 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
	 * and for all flows in not_eligible_heap (NEH), S_i > V .
	 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
	 * we only need to look into NEH.
	 */
	if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
	    if (pipe->scheduler_heap.elements == 0)
		printf("++ ouch! not eligible but empty scheduler!\n");
	    heap_insert(&(pipe->not_eligible_heap), q->S, q);
	} else {
	    heap_insert(&(pipe->scheduler_heap), q->F, q);
	    if (pipe->numbytes >= 0) { /* pipe is idle */
		if (pipe->scheduler_heap.elements != 1)
		    printf("*** OUCH! pipe should have been idle!\n");
		DEB(printf("Waking up pipe %d at %d\n",
			pipe->pipe_nr, (int)(q->F >> MY_M)); )
		pipe->sched_time = curr_time ;
		ready_event_wfq(pipe);
	    }
	}
    }
done:
    splx(s);
    return 0;

dropit:
    splx(s);
    if (q)
	q->drops++ ;
    m_freem(m);
    return ENOBUFS ;
}
/*
 * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
 * Doing this would probably save us the initial bzero of dn_pkt
 */
#define DN_FREE_PKT(pkt)	{		\
	struct dn_pkt *n = pkt ;		\
	rt_unref ( n->ro.ro_rt ) ;		\
	m_freem(n->dn_m);			\
	pkt = DN_NEXT(n) ;			\
	FREE(n, M_IPFW) ;	}
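
/*
 * Note (illustrative): the macro advances its argument to DN_NEXT(n)
 * before freeing n, so callers can drain a whole list with the bare
 * loop "for (pkt = q->head ; pkt ; ) DN_FREE_PKT(pkt) ;", as the purge
 * routines below do.
 */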
/*
 * Dispose all packets and flow_queues on a flow_set.
 * If all=1, also remove red lookup table and other storage,
 * including the descriptor itself.
 * For the one in dn_pipe MUST also cleanup ready_heap...
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
    struct dn_pkt *pkt ;
    struct dn_flow_queue *q, *qn ;
    int i ;

    for (i = 0 ; i <= fs->rq_size ; i++ ) {
	for (q = fs->rq[i] ; q ; q = qn ) {
	    for (pkt = q->head ; pkt ; )
		DN_FREE_PKT(pkt) ;
	    qn = q->next ;
	    FREE(q, M_IPFW);
	}
	fs->rq[i] = NULL ;
    }
    fs->rq_elements = 0 ;
    if (all) {
	/* RED - free lookup table */
	if (fs->w_q_lookup)
	    FREE(fs->w_q_lookup, M_IPFW);
	if (fs->rq)
	    FREE(fs->rq, M_IPFW);
	/* if this fs is not part of a pipe, free it */
	if (fs->pipe && fs != &(fs->pipe->fs) )
	    FREE(fs, M_IPFW);
    }
}
/*
 * Dispose all packets queued on a pipe (not a flow_set).
 * Also free all resources associated to a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
    struct dn_pkt *pkt ;

    purge_flow_set( &(pipe->fs), 1 );

    for (pkt = pipe->head ; pkt ; )
	DN_FREE_PKT(pkt) ;

    heap_free( &(pipe->scheduler_heap) );
    heap_free( &(pipe->not_eligible_heap) );
    heap_free( &(pipe->idle_heap) );
}
/*
 * Delete all pipes and heaps returning memory. Must also
 * remove references from all ipfw rules to all pipes.
 */
static void
dummynet_flush()
{
    struct dn_pipe *curr_p, *p ;
    struct ip_fw_chain *chain ;
    struct dn_flow_set *fs, *curr_fs ;
    int s ;

    s = splimp() ;

    /* remove all references to pipes ...*/
    LIST_FOREACH(chain, &ip_fw_chain_head, next)
	chain->rule->pipe_ptr = NULL ;
    /* prevent future matches... */
    p = all_pipes ;
    all_pipes = NULL ;
    fs = all_flow_sets ;
    all_flow_sets = NULL ;
    /* and free heaps so we don't have unwanted events */
    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);
    splx(s) ;
    /*
     * Now purge all queued pkts and delete all pipes
     */
    /* scan and purge all flow_sets. */
    for ( ; fs ; ) {
	curr_fs = fs ;
	fs = fs->next ;
	purge_flow_set(curr_fs, 1);
    }
    for ( ; p ; ) {
	purge_pipe(p);
	curr_p = p ;
	p = p->next ;
	FREE(curr_p, M_IPFW);
    }
}
extern struct ip_fw_chain *ip_fw_default_rule ;

static void
dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
{
    int i ;
    struct dn_flow_queue *q ;
    struct dn_pkt *pkt ;

    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
	for (q = fs->rq[i] ; q ; q = q->next )
	    for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) )
		if (pkt->hdr.mh_data == r)
		    pkt->hdr.mh_data = (void *)ip_fw_default_rule ;
}
/*
 * when a firewall rule is deleted, scan all queues and remove the flow-id
 * from packets matching this rule.
 */
void
dn_rule_delete(void *r)
{
    struct dn_pipe *p ;
    struct dn_pkt *pkt ;
    struct dn_flow_set *fs ;

    /*
     * If the rule references a queue (dn_flow_set), then scan
     * the flow set, otherwise scan pipes. Should do either, but doing
     * both does not harm.
     */
    for ( fs = all_flow_sets ; fs ; fs = fs->next )
	dn_rule_delete_fs(fs, r);
    for ( p = all_pipes ; p ; p = p->next ) {
	fs = &(p->fs) ;
	dn_rule_delete_fs(fs, r);
	for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) )
	    if (pkt->hdr.mh_data == r)
		pkt->hdr.mh_data = (void *)ip_fw_default_rule ;
    }
}
/*
 * setup RED parameters
 */
static int
config_red(struct dn_flow_set *p, struct dn_flow_set * x)
{
    int i;

    x->w_q = p->w_q;
    x->min_th = SCALE(p->min_th);
    x->max_th = SCALE(p->max_th);
    x->max_p = p->max_p;

    x->c_1 = p->max_p / (p->max_th - p->min_th);
    x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
    if (x->flags_fs & DN_IS_GENTLE_RED) {
	x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
	x->c_4 = (SCALE(1) - 2 * p->max_p);
    }

    /* if the lookup table already exist, free and create it again */
    if (x->w_q_lookup)
	FREE(x->w_q_lookup, M_IPFW);
    if (red_lookup_depth == 0) {
	printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0");
	FREE(x, M_IPFW);
	return EINVAL;
    }
    x->lookup_depth = red_lookup_depth;
    x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
	    M_IPFW, M_DONTWAIT);
    if (x->w_q_lookup == NULL) {
	printf("sorry, cannot allocate red lookup table\n");
	FREE(x, M_IPFW);
	return ENOSPC;
    }

    /* fill the lookup table with (1 - w_q)^x */
    x->lookup_step = p->lookup_step;
    x->lookup_weight = p->lookup_weight;
    x->w_q_lookup[0] = SCALE(1) - x->w_q;
    for (i = 1; i < x->lookup_depth; i++)
	x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
    if (red_avg_pkt_size < 1)
	red_avg_pkt_size = 512 ;
    x->avg_pkt_size = red_avg_pkt_size ;
    if (red_max_pkt_size < 1)
	red_max_pkt_size = 1500 ;
    x->max_pkt_size = red_max_pkt_size ;
    return 0 ;
}
static int
alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
{
    if (x->flags_fs & DN_HAVE_FLOW_MASK) {     /* allocate some slots */
	int l = pfs->rq_size;

	if (l == 0)
	    l = dn_hash_size;
	if (l < 4)
	    l = 4;
	else if (l > 1024)
	    l = 1024;
	x->rq_size = l;
    } else                  /* one is enough for null mask */
	x->rq_size = 1;
    x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
	    M_IPFW, M_DONTWAIT);
    if (x->rq == NULL) {
	printf("sorry, cannot allocate queue\n");
	return ENOSPC;
    }
    bzero(x->rq, (1+x->rq_size) * sizeof(struct dn_flow_queue *));
    x->rq_elements = 0;
    return 0 ;
}
static void
set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
{
    x->flags_fs = src->flags_fs;
    x->qsize = src->qsize;
    x->plr = src->plr;
    x->flow_mask = src->flow_mask;
    if (x->flags_fs & DN_QSIZE_IS_BYTES) {
	if (x->qsize > 1024*1024)
	    x->qsize = 1024*1024 ;
    } else {
	if (x->qsize == 0)
	    x->qsize = 50 ;
	if (x->qsize > 100)
	    x->qsize = 50 ;
    }
    /* configuring RED */
    if ( x->flags_fs & DN_IS_RED )
	config_red(src, x) ;    /* XXX should check errors */
}
/*
 * setup pipe or queue parameters.
 */
static int
config_pipe(struct dn_pipe *p)
{
    int s ;
    struct dn_flow_set *pfs = &(p->fs);

    /*
     * The config program passes parameters as follows:
     * bw = bits/second (0 means no limits),
     * delay = ms, must be translated into ticks.
     * qsize = slots/bytes
     */
    p->delay = ( p->delay * hz ) / 1000 ;
    /* We need either a pipe number or a flow_set number */
    if (p->pipe_nr == 0 && pfs->fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && pfs->fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is a pipe */
	struct dn_pipe *x, *a, *b;
	/* locate pipe */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
	    x = _MALLOC(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ;
	    if (x == NULL) {
		printf("ip_dummynet.c: no memory for new pipe\n");
		return ENOSPC;
	    }
	    bzero(x, sizeof(struct dn_pipe));
	    x->pipe_nr = p->pipe_nr;
	    x->fs.pipe = x ;
	    /* idle_heap is the only one from which we extract from the middle.
	     */
	    x->idle_heap.size = x->idle_heap.elements = 0 ;
	    x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
	} else
	    x = b;

	x->bandwidth = p->bandwidth ;
	x->numbytes = 0; /* just in case... */
	bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
	x->ifp = NULL ; /* reset interface ptr */
	x->delay = p->delay ;
	set_fs_parms(&(x->fs), pfs);

	if ( x->fs.rq == NULL ) { /* a new pipe */
	    s = alloc_hash(&(x->fs), pfs) ;
	    if (s) {
		FREE(x, M_IPFW);
		return s ;
	    }
	    s = splimp() ;
	    x->next = b ;
	    if (a == NULL)
		all_pipes = x ;
	    else
		a->next = x ;
	    splx(s);
	}
    } else { /* config queue */
	struct dn_flow_set *x, *a, *b ;

	/* locate flow_set */
	for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
	    if (pfs->parent_nr == 0)	/* need link to a pipe */
		return EINVAL ;
	    x = _MALLOC(sizeof(struct dn_flow_set), M_IPFW, M_DONTWAIT);
	    if (x == NULL) {
		printf("ip_dummynet.c: no memory for new flow_set\n");
		return ENOSPC;
	    }
	    bzero(x, sizeof(struct dn_flow_set));
	    x->fs_nr = pfs->fs_nr;
	    x->parent_nr = pfs->parent_nr;
	    x->weight = pfs->weight ;
	    if (x->weight == 0)
		x->weight = 1 ;
	    else if (x->weight > 100)
		x->weight = 100 ;
	} else {
	    /* Change parent pipe not allowed; must delete and recreate */
	    if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr)
		return EINVAL ;
	    x = b;
	}
	set_fs_parms(x, pfs);

	if ( x->rq == NULL ) { /* a new flow_set */
	    s = alloc_hash(x, pfs) ;
	    if (s) {
		FREE(x, M_IPFW);
		return s ;
	    }
	    s = splimp() ;
	    x->next = b;
	    if (a == NULL)
		all_flow_sets = x;
	    else
		a->next = x;
	    splx(s);
	}
    }
    return 0 ;
}
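
/*
 * Note on the a/b scan used above (illustrative, not in the original
 * source): this is the file's standard pattern for keeping the
 * singly-linked pipe and flow_set lists sorted by number. After the
 * empty-bodied for loop, 'b' points at the first node whose key is >=
 * the one sought (or NULL) and 'a' is its predecessor, so insertion is
 * just "x->next = b" followed by updating either the list head or
 * "a->next". delete_pipe() below relies on the same invariant.
 */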
/*
 * Helper function to remove from a heap queues which are linked to
 * a flow_set about to be deleted.
 */
static void
fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
{
    int i = 0, found = 0 ;
    for (; i < h->elements ;)
	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
	    h->elements-- ;
	    h->p[i] = h->p[h->elements] ;
	    found++ ;
	} else
	    i++ ;
    if (found)
	heapify(h);
}
/*
 * helper function to remove a pipe from a heap (can be there at most once)
 */
static void
pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
{
    if (h->elements > 0) {
	int i ;
	for (i=0; i < h->elements ; i++ ) {
	    if (h->p[i].object == p) { /* found it */
		h->elements-- ;
		h->p[i] = h->p[h->elements] ;
		heapify(h);
		break ;
	    }
	}
    }
}
/*
 * drain all queues. Called in case of severe mbuf shortage.
 */
void
dummynet_drain()
{
    struct dn_flow_set *fs;
    struct dn_pipe *p;
    struct dn_pkt *pkt;

    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);
    /* remove all references to this pipe from flow_sets */
    for (fs = all_flow_sets; fs; fs= fs->next )
	purge_flow_set(fs, 0);

    for (p = all_pipes; p; p= p->next ) {
	purge_flow_set(&(p->fs), 0);
	for (pkt = p->head ; pkt ; )
	    DN_FREE_PKT(pkt) ;
	p->head = p->tail = NULL ;
    }
}
/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 */
static int
delete_pipe(struct dn_pipe *p)
{
    int s ;
    struct ip_fw_chain *chain ;

    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is an old-style pipe */
	struct dn_pipe *a, *b;
	struct dn_flow_set *fs;

	/* locate pipe */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->pipe_nr != p->pipe_nr) )
	    return EINVAL ; /* not found */

	s = splimp() ;

	/* unlink from list of pipes */
	if (a == NULL)
	    all_pipes = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this pipe from the ip_fw rules. */
	LIST_FOREACH(chain, &ip_fw_chain_head, next)
	    if (chain->rule->pipe_ptr == &(b->fs))
		chain->rule->pipe_ptr = NULL ;

	/* remove all references to this pipe from flow_sets */
	for (fs = all_flow_sets; fs; fs= fs->next )
	    if (fs->pipe == b) {
		printf("++ ref to pipe %d from fs %d\n",
			p->pipe_nr, fs->fs_nr);
		fs->pipe = NULL ;
		purge_flow_set(fs, 0);
	    }
	fs_remove_from_heap(&ready_heap, &(b->fs));
	purge_pipe(b);	/* remove all data associated to this pipe */
	/* remove reference to here from extract_heap and wfq_ready_heap */
	pipe_remove_from_heap(&extract_heap, b);
	pipe_remove_from_heap(&wfq_ready_heap, b);
	splx(s);
	FREE(b, M_IPFW);
    } else { /* this is a WF2Q queue (dn_flow_set) */
	struct dn_flow_set *a, *b;

	/* locate set */
	for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->fs_nr != p->fs.fs_nr) )
	    return EINVAL ; /* not found */

	s = splimp() ;
	if (a == NULL)
	    all_flow_sets = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this flow_set from the ip_fw rules. */
	LIST_FOREACH(chain, &ip_fw_chain_head, next)
	    if (chain->rule->pipe_ptr == b)
		chain->rule->pipe_ptr = NULL ;

	if (b->pipe != NULL) {
	    /* Update total weight on parent pipe and cleanup parent heaps */
	    b->pipe->sum -= b->weight * b->backlogged ;
	    fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
	    fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
#if 1	/* XXX should i remove from idle_heap as well ? */
	    fs_remove_from_heap(&(b->pipe->idle_heap), b);
#endif
	}
	purge_flow_set(b, 1);
	splx(s);
    }
    return 0 ;
}
/*
 * helper function used to copy data from kernel in DUMMYNET_GET
 */
static char *
dn_copy_set(struct dn_flow_set *set, char *bp)
{
    int i, copied = 0 ;
    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;

    for (i = 0 ; i <= set->rq_size ; i++)
	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
	    if (q->hash_slot != i)
		printf("++ at %d: wrong slot (have %d, "
		    "should be %d)\n", copied, q->hash_slot, i);
	    if (q->fs != set)
		printf("++ at %d: wrong fs ptr (have %p, should be %p)\n",
			i, q->fs, set);
	    copied++ ;
	    bcopy(q, qp, sizeof( *q ) );
	    /* cleanup pointers */
	    qp->next = NULL ;
	    qp->head = qp->tail = NULL ;
	}
    if (copied != set->rq_elements)
	printf("++ wrong count, have %d should be %d\n",
	    copied, set->rq_elements);
    return (char *)qp ;
}
static int
dummynet_get(struct sockopt *sopt)
{
    char *buf, *bp ; /* bp is the "copy-pointer" */
    size_t size ;
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    int s, error = 0 ;

    s = splimp();
    /*
     * compute size of data structures: list of pipes and flow_sets.
     */
    for (p = all_pipes, size = 0 ; p ; p = p->next )
	size += sizeof( *p ) +
	    p->fs.rq_elements * sizeof(struct dn_flow_queue);
    for (set = all_flow_sets ; set ; set = set->next )
	size += sizeof ( *set ) +
	    set->rq_elements * sizeof(struct dn_flow_queue);
    buf = _MALLOC(size, M_TEMP, M_DONTWAIT);
    if (buf == 0) {
	splx(s);
	return ENOBUFS ;
    }
    for (p = all_pipes, bp = buf ; p ; p = p->next ) {
	struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;

	/*
	 * copy pipe descriptor into *bp, convert delay back to ms,
	 * then copy the flow_set descriptor(s) one at a time.
	 * After each flow_set, copy the queue descriptor it owns.
	 */
	bcopy(p, bp, sizeof( *p ) );
	pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set. The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 */
	pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
	/* clean pointers */
	pipe_bp->head = pipe_bp->tail = NULL ;
	pipe_bp->fs.next = NULL ;
	pipe_bp->fs.pipe = NULL ;
	pipe_bp->fs.rq = NULL ;

	bp += sizeof( *p ) ;
	bp = dn_copy_set( &(p->fs), bp );
    }
    for (set = all_flow_sets ; set ; set = set->next ) {
	struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
	bcopy(set, bp, sizeof( *set ) );
	/* XXX same hack as above */
	fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
	fs_bp->pipe = NULL ;
	fs_bp->rq = NULL ;
	bp += sizeof( *set ) ;
	bp = dn_copy_set( set, bp );
    }
    splx(s);
    error = sooptcopyout(sopt, buf, size );
    FREE(buf, M_TEMP);
    return error ;
}
/*
 * Handler for the various dummynet socket options (get, flush, config, del)
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
    int error = 0 ;
    struct dn_pipe *p, tmp_pipe ;

    /* Disallow sets in really-really secure mode. */
    if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
	return (EPERM);

    switch (sopt->sopt_name) {
    default :
	printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name);
	return EINVAL ;

    case IP_DUMMYNET_GET :
	error = dummynet_get(sopt);
	break ;

    case IP_DUMMYNET_FLUSH :
	dummynet_flush() ;
	break ;

    case IP_DUMMYNET_CONFIGURE :
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
	if (error)
	    break ;
	error = config_pipe(p);
	break ;

    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
	if (error)
	    break ;
	error = delete_pipe(p);
	break ;
    }
    return error ;
}
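
/*
 * For reference, a userland configurator such as ipfw(8) drives this
 * handler through raw-IP socket options. A minimal sketch (illustrative
 * only; error handling and the full dn_pipe setup are omitted):
 */
#if 0 /* illustrative userland sketch, never compiled here */
    int sock = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    struct dn_pipe pipe;

    bzero(&pipe, sizeof pipe);
    pipe.pipe_nr = 1;
    pipe.bandwidth = 1000000;	/* 1 Mbit/s */
    pipe.delay = 50;		/* ms; config_pipe() converts to ticks */
    setsockopt(sock, IPPROTO_IP, IP_DUMMYNET_CONFIGURE, &pipe, sizeof pipe);
#endif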
1882 printf("DUMMYNET initialized (010124)\n");
1884 all_flow_sets
= NULL
;
1885 ready_heap
.size
= ready_heap
.elements
= 0 ;
1886 ready_heap
.offset
= 0 ;
1888 wfq_ready_heap
.size
= wfq_ready_heap
.elements
= 0 ;
1889 wfq_ready_heap
.offset
= 0 ;
1891 extract_heap
.size
= extract_heap
.elements
= 0 ;
1892 extract_heap
.offset
= 0 ;
1893 ip_dn_ctl_ptr
= ip_dn_ctl
;
1894 timeout(dummynet
, NULL
, 1);
static ip_dn_ctl_t *old_dn_ctl_ptr ;

static int
dummynet_modevent(module_t mod, int type, void *data)
{
	int s ;
	switch (type) {
	case MOD_LOAD:
		s = splimp();
		old_dn_ctl_ptr = ip_dn_ctl_ptr;
		ip_dn_init();
		splx(s);
		break;
	case MOD_UNLOAD:
		s = splimp();
		ip_dn_ctl_ptr = old_dn_ctl_ptr;
		splx(s);
		dummynet_flush();
		break ;
	default:
		break ;
	}
	return 0 ;
}

static moduledata_t dummynet_mod = {
	"dummynet",
	dummynet_modevent,
	NULL
} ;
DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);